TroglodyteDerivations committed on
Commit
f2d49da
1 Parent(s): ac5129d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +375 -0
app.py ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # bluesky_languages_streamlit.py
2
+
3
+ import streamlit as st
4
+ from datasets import load_dataset
5
+ from langdetect import detect
6
+ import pandas as pd
7
+ import plotly.express as px
8
+ import folium
9
+ from streamlit_folium import folium_static
10
+
11
# Hugging Face dataset that supplies the posts analysed by this app.
DATASET_ID = "alpindale/two-million-bluesky-posts"


def detect_language(text):
    """Return the language code that langdetect assigns to ``text``.

    Detection is best-effort: if ``detect`` raises, the sentinel string
    ``'unknown'`` is returned instead of propagating the error.
    """
    try:
        return detect(text)
    except Exception:
        # Still deliberately broad, but unlike a bare ``except`` this lets
        # KeyboardInterrupt and SystemExit stop the app.
        return 'unknown'


@st.cache_data
def load_language_counts():
    """Load the posts and count how many were detected in each language.

    The work is wrapped in ``st.cache_data`` so the download and the
    per-post language detection are not repeated on every script rerun.

    Returns:
        A DataFrame with one row per detected language and the columns
        ``language`` and ``count``, in the order ``value_counts`` yields.
    """
    dataset = load_dataset(DATASET_ID)

    # Add a 'language' column to the 'train' split, one detection per post.
    tagged = dataset['train'].map(
        lambda post: {'language': detect_language(post['text'])}
    )

    languages = pd.Series(tagged['language'])
    counts = pd.DataFrame(languages.value_counts()).reset_index()
    counts.columns = ['language', 'count']
    return counts


# Per-language post counts used by every chart and table below.
language_counts = load_language_counts()
33
+
34
# Representative country for each detected language. The keys are language
# codes (as returned by langdetect), not country codes, so 'ca' is Catalan
# and 'cy' is Welsh.
language_to_country = {
    'en': 'United States',
    'ja': 'Japan',
    'unknown': 'Unknown',
    'es': 'Spain',
    'pt': 'Portugal',
    'fr': 'France',
    'de': 'Germany',
    'ko': 'South Korea',
    'nl': 'Netherlands',
    'it': 'Italy',
    'pl': 'Poland',
    'so': 'Somalia',
    'af': 'South Africa',
    'ru': 'Russia',
    'ca': 'Spain',  # Catalan; previously mapped to Canada by mistake
    'tr': 'Turkey',
    'no': 'Norway',
    'id': 'Indonesia',
    'fi': 'Finland',
    'da': 'Denmark',
    'cy': 'United Kingdom',  # Welsh; previously mapped to Cyprus by mistake
    'tl': 'Philippines',
    'sv': 'Sweden',
    'th': 'Thailand',
    'ro': 'Romania',
    'et': 'Estonia',
    'sw': 'Kenya',
    'vi': 'Vietnam',
    'el': 'Greece',
    'zh-cn': 'China',
    'hr': 'Croatia',
    'cs': 'Czech Republic',
    'ur': 'Pakistan',
    'sk': 'Slovakia',
    'sl': 'Slovenia',
    'uk': 'Ukraine',
    'hu': 'Hungary',
    'he': 'Israel',
    'sq': 'Albania',
    'bg': 'Bulgaria',
    'lt': 'Lithuania',
    'lv': 'Latvia',
    'ar': 'Saudi Arabia',
    'fa': 'Iran',
    'zh-tw': 'Taiwan',
    'mk': 'North Macedonia',
    'hi': 'India',
    'bn': 'Bangladesh',
    'ne': 'Nepal',
    'ml': 'India',
    'ta': 'India',
    'kn': 'India',
    'pa': 'India',
    'mr': 'India',
    'te': 'India',
    'gu': 'India',
}

# Languages without an entry above are grouped under 'Unknown'.
language_counts['country'] = (
    language_counts['language'].map(language_to_country).fillna('Unknown')
)

# Pie chart of posts per detected language.
fig_languages = px.pie(
    language_counts,
    values='count',
    names='language',
    title='Language Distribution of Posts',
)

# Pie chart of posts per mapped country.
fig_countries = px.pie(
    language_counts,
    values='count',
    names='country',
    title='Country Distribution of Posts',
)

# World-level folium map that receives one marker per country below.
world_map = folium.Map(location=[20, 0], zoom_start=2)

# Marker colour per continent; continents not listed fall back to 'gray'.
continent_colors = {
    'Africa': 'red',
    'Asia': 'green',
    'Europe': 'blue',
    'North America': 'purple',
    'Oceania': 'orange',
    'South America': 'black',
}

# Marker icon per continent (currently the same globe icon everywhere).
continent_icons = {
    'Africa': 'fa-globe',
    'Asia': 'fa-globe',
    'Europe': 'fa-globe',
    'North America': 'fa-globe',
    'Oceania': 'fa-globe',
    'South America': 'fa-globe',
}

# Simplified country -> continent lookup. Built once at import time instead
# of on every get_continent() call.
_COUNTRY_TO_CONTINENT = {
    'United States': 'North America',
    'Japan': 'Asia',
    'Unknown': 'Unknown',
    'Spain': 'Europe',
    'Portugal': 'Europe',
    'France': 'Europe',
    'Germany': 'Europe',
    'South Korea': 'Asia',
    'Netherlands': 'Europe',
    'Italy': 'Europe',
    'Poland': 'Europe',
    'Somalia': 'Africa',
    'South Africa': 'Africa',
    'Russia': 'Europe',
    'Canada': 'North America',
    'Turkey': 'Asia',
    'Norway': 'Europe',
    'Indonesia': 'Asia',
    'Finland': 'Europe',
    'Denmark': 'Europe',
    'Cyprus': 'Asia',
    'Philippines': 'Asia',
    'Sweden': 'Europe',
    'Thailand': 'Asia',
    'Romania': 'Europe',
    'Estonia': 'Europe',
    'Kenya': 'Africa',
    'Vietnam': 'Asia',
    'Greece': 'Europe',
    'China': 'Asia',
    'Croatia': 'Europe',
    'Czech Republic': 'Europe',
    'Pakistan': 'Asia',
    'Slovakia': 'Europe',
    'Slovenia': 'Europe',
    'Ukraine': 'Europe',
    'Hungary': 'Europe',
    'Israel': 'Asia',
    'Albania': 'Europe',
    'Bulgaria': 'Europe',
    'Lithuania': 'Europe',
    'Latvia': 'Europe',
    'Saudi Arabia': 'Asia',
    'Iran': 'Asia',
    'Taiwan': 'Asia',
    'North Macedonia': 'Europe',
    'India': 'Asia',
    'Bangladesh': 'Asia',
    'Nepal': 'Asia',
    'United Kingdom': 'Europe',
    'Malaysia': 'Asia',
    'Singapore': 'Asia',
    'Brunei': 'Asia',
    'Cambodia': 'Asia',
    'Laos': 'Asia',
    'Myanmar': 'Asia',
    'Timor-Leste': 'Asia',
    'Papua New Guinea': 'Oceania',
    'Australia': 'Oceania',
    'New Zealand': 'Oceania',
    'Fiji': 'Oceania',
    'Solomon Islands': 'Oceania',
    'Vanuatu': 'Oceania',
    'Samoa': 'Oceania',
    'Tonga': 'Oceania',
    'Kiribati': 'Oceania',
    'Tuvalu': 'Oceania',
    'Nauru': 'Oceania',
    'Palau': 'Oceania',
    'Marshall Islands': 'Oceania',
    'Micronesia': 'Oceania',
    'Guam': 'Oceania',
    'Northern Mariana Islands': 'Oceania',
    'Puerto Rico': 'North America',
    'Dominican Republic': 'North America',
    'Haiti': 'North America',
    'Jamaica': 'North America',
    'Cuba': 'North America',
    'Bahamas': 'North America',
    'Barbados': 'North America',
    'Trinidad and Tobago': 'North America',
    'Grenada': 'North America',
    'Saint Vincent and the Grenadines': 'North America',
    'Antigua and Barbuda': 'North America',
    'Saint Kitts and Nevis': 'North America',
    'Belize': 'North America',
    'Costa Rica': 'North America',
    'El Salvador': 'North America',
    'Guatemala': 'North America',
    'Honduras': 'North America',
    'Nicaragua': 'North America',
    'Panama': 'North America',
    'Argentina': 'South America',
    'Bolivia': 'South America',
    'Brazil': 'South America',
    'Chile': 'South America',
    'Colombia': 'South America',
    'Ecuador': 'South America',
    'Guyana': 'South America',
    'Paraguay': 'South America',
    'Peru': 'South America',
    'Suriname': 'South America',
    'Uruguay': 'South America',
    'Venezuela': 'South America',
}


def get_continent(country):
    """Return the continent name for ``country``.

    Args:
        country: Country name as used in ``language_to_country``.

    Returns:
        The continent string from the lookup table, or ``'Unknown'`` when
        the country has no entry.
    """
    return _COUNTRY_TO_CONTINENT.get(country, 'Unknown')


# Approximate [latitude, longitude] marker position per country
# (simplified). Defined once, outside the marker loop.
country_coordinates = {
    'United States': [37.0902, -95.7129],
    'Japan': [36.2048, 138.2529],
    'Unknown': [0, 0],
    'Spain': [40.4637, -3.7492],
    'Portugal': [39.3999, -8.2245],
    'France': [46.6034, 1.8883],
    'Germany': [51.1657, 10.4515],
    'South Korea': [35.9078, 127.7669],
    'Netherlands': [52.1326, 5.2913],
    'Italy': [41.8719, 12.5674],
    'Poland': [51.9194, 19.1451],
    'Somalia': [5.1521, 46.1996],
    'South Africa': [-30.5595, 22.9375],
    'Russia': [61.5240, 105.3188],
    'Canada': [56.1304, -106.3468],
    'Turkey': [38.9637, 35.2433],
    'Norway': [60.4720, 8.4689],
    'Indonesia': [-0.7893, 113.9213],
    'Finland': [61.9241, 25.7482],
    'Denmark': [56.2639, 9.5018],
    'Cyprus': [35.1264, 33.4299],
    'Philippines': [12.8797, 121.7740],
    'Sweden': [60.1282, 18.6435],
    'Thailand': [15.8700, 100.9925],
    'Romania': [45.9432, 24.9668],
    'Estonia': [58.5953, 25.0136],
    'Kenya': [0.0236, 37.9062],
    'Vietnam': [14.0583, 108.2772],
    'Greece': [39.0742, 21.8243],
    'China': [35.8617, 104.1954],
    'Croatia': [45.1000, 15.2000],
    'Czech Republic': [49.8175, 15.4730],
    'Pakistan': [30.3753, 69.3451],
    'Slovakia': [48.6690, 19.6990],
    'Slovenia': [46.1512, 14.9955],
    'Ukraine': [48.3794, 31.1656],
    'Hungary': [47.1625, 19.5033],
    'Israel': [31.0461, 34.8516],
    'Albania': [41.1533, 20.1683],
    'Bulgaria': [42.7339, 25.4858],
    'Lithuania': [55.1694, 23.8813],
    'Latvia': [56.8796, 24.6032],
    'Saudi Arabia': [23.8859, 45.0792],
    'Iran': [32.4279, 53.6880],
    'Taiwan': [23.6978, 120.9605],
    'North Macedonia': [41.6086, 21.7453],
    'India': [20.5937, 78.9629],
    'Bangladesh': [23.6850, 90.3563],
    'Nepal': [28.3949, 84.1240],
    'United Kingdom': [55.3781, -3.4360],
    'Malaysia': [4.2105, 101.9758],
    'Singapore': [1.3521, 103.8198],
    'Brunei': [4.5353, 114.7277],
    'Cambodia': [12.5657, 104.9910],
    'Laos': [19.8563, 102.4955],
    'Myanmar': [21.9162, 95.9560],
    'Timor-Leste': [-8.8742, 125.7275],
    'Papua New Guinea': [-6.3149, 143.9555],
    'Australia': [-25.2744, 133.7751],
    'New Zealand': [-40.9006, 174.8860],
    'Fiji': [-17.7134, 178.0650],
    'Solomon Islands': [-9.6457, 160.1562],
    'Vanuatu': [-15.3767, 166.9592],
    'Samoa': [-13.7590, -172.1046],
    'Tonga': [-21.1790, -175.1982],
    'Kiribati': [1.4518, 172.9717],
    'Tuvalu': [-7.1095, 177.6493],
    'Nauru': [-0.5228, 166.9315],
    'Palau': [7.5150, 134.5825],
    'Marshall Islands': [7.1315, 171.1845],
    'Micronesia': [7.4256, 150.5508],
    'Guam': [13.4443, 144.7937],
    'Northern Mariana Islands': [15.0979, 145.6739],
    'Puerto Rico': [18.2208, -66.5901],
    'Dominican Republic': [18.7357, -70.1627],
    'Haiti': [18.9712, -72.2852],
    'Jamaica': [18.1096, -77.2975],
    'Cuba': [21.5218, -77.7812],
    'Bahamas': [25.0343, -77.3963],
    'Barbados': [13.1939, -59.5432],
    'Trinidad and Tobago': [10.6918, -61.2225],
    'Grenada': [12.2627, -61.6041],
    'Saint Vincent and the Grenadines': [12.9843, -61.2872],
    'Antigua and Barbuda': [17.0608, -61.7964],
    'Saint Kitts and Nevis': [17.3578, -62.7830],
    'Belize': [17.1899, -88.4976],
    'Costa Rica': [9.7489, -83.7534],
    'El Salvador': [13.7942, -88.8965],
    'Guatemala': [15.7835, -90.2308],
    'Honduras': [15.1997, -86.2419],
    'Nicaragua': [12.8654, -85.2072],
    'Panama': [8.5380, -80.7821],
    'Argentina': [-38.4161, -63.6167],
    'Bolivia': [-16.2902, -63.5887],
    'Brazil': [-14.2350, -51.9253],
    'Chile': [-35.6751, -71.5430],
    'Colombia': [4.5709, -74.2973],
    'Ecuador': [-1.8312, -78.1834],
    'Guyana': [4.8604, -58.9302],
    'Paraguay': [-23.4425, -58.4438],
    'Peru': [-9.1900, -75.0152],
    'Suriname': [3.9193, -56.0278],
    'Uruguay': [-32.5228, -55.7658],
    'Venezuela': [6.4238, -66.5897],
}

# Several languages map to the same country (for example every Indian
# language, or everything that falls back to 'Unknown'). Sum their counts
# first so each country gets exactly one marker carrying its full total,
# rather than a stack of markers with partial counts.
country_totals = language_counts.groupby('country', sort=False)['count'].sum()

# Add one marker per country.
for country, count in country_totals.items():
    continent = get_continent(country)
    color = continent_colors.get(continent, 'gray')
    icon = continent_icons.get(continent, 'fa-globe')

    # Countries without known coordinates are placed at [0, 0].
    coordinates = country_coordinates.get(country, [0, 0])

    # Marker whose pop-up shows the country name and its post count.
    folium.Marker(
        location=coordinates,
        popup=f"Country: {country}<br>Count: {count}",
        icon=folium.Icon(color=color, icon=icon, prefix='fa'),
    ).add_to(world_map)
360
+
361
# Streamlit page: the title, then each section heading followed by the
# element that renders its content.
st.title("Bluesky Posts Language and Country Distribution")

sections = (
    ("### Language Distribution", st.plotly_chart, fig_languages),
    ("### Country Distribution", st.plotly_chart, fig_countries),
    ("### World Map of Posts", folium_static, world_map),
    ("### Language Counts", st.dataframe, language_counts),
)

for heading, render, content in sections:
    st.write(heading)
    render(content)
375
+