|
|
|
|
|
from datasets import load_dataset |
|
from langdetect import detect |
|
import pandas as pd |
|
import plotly.express as px |
|
import folium |
|
|
|
|
|
|
|
# Load the Bluesky posts corpus through the Hugging Face `datasets` library.
# The result is indexed by split name further down (this script only reads
# and rewrites the 'train' split).
dataset = load_dataset(
    "alpindale/two-million-bluesky-posts",
)
|
|
|
|
|
|
|
def detect_language(text):
    """Return the language code that ``detect`` assigns to *text*.

    Detection is best-effort: if ``detect`` raises, the sentinel string
    ``'unknown'`` is returned instead of propagating the error, so one
    undetectable post cannot abort a bulk run.

    Args:
        text: The post text to classify.

    Returns:
        Whatever ``detect(text)`` returns, or ``'unknown'`` when it raises.
    """
    try:
        return detect(text)
    # Deliberately ``Exception`` rather than a bare ``except``: a bare clause
    # would also swallow KeyboardInterrupt / SystemExit, making a long
    # mapping run impossible to interrupt and silently mislabelling the row.
    except Exception:
        return 'unknown'
|
|
|
|
|
def _add_language(example):
    """Build the extra 'language' column value for a single post."""
    return {'language': detect_language(example['text'])}


# Tag every row of the train split with its detected language code.
dataset['train'] = dataset['train'].map(_add_language)

# Collect the per-post codes into a pandas Series so they can be tallied.
languages = pd.Series(dataset['train']['language'])

# One row per language code with the number of posts carrying that code.
language_counts = languages.value_counts().reset_index()
language_counts.columns = ['language', 'count']
|
|
|
|
|
|
|
# Heuristic mapping from a detected language code to one representative
# country. Keys are *language* codes (ISO 639-1 style, plus the 'unknown'
# sentinel produced when detection fails), not country codes, so look-alike
# country codes must not be confused with them:
#   'ca' is Catalan (not Canada) and 'cy' is Welsh (not Cyprus).
language_to_country = {
    'en': 'United States',
    'ja': 'Japan',
    'unknown': 'Unknown',
    'es': 'Spain',
    'pt': 'Portugal',
    'fr': 'France',
    'de': 'Germany',
    'ko': 'South Korea',
    'nl': 'Netherlands',
    'it': 'Italy',
    'pl': 'Poland',
    'so': 'Somalia',
    'af': 'South Africa',
    'ru': 'Russia',
    'ca': 'Spain',  # Catalan
    'tr': 'Turkey',
    'no': 'Norway',
    'id': 'Indonesia',
    'fi': 'Finland',
    'da': 'Denmark',
    'cy': 'United Kingdom',  # Welsh
    'tl': 'Philippines',
    'sv': 'Sweden',
    'th': 'Thailand',
    'ro': 'Romania',
    'et': 'Estonia',
    'sw': 'Kenya',
    'vi': 'Vietnam',
    'el': 'Greece',
    'zh-cn': 'China',
    'hr': 'Croatia',
    'cs': 'Czech Republic',
    'ur': 'Pakistan',
    'sk': 'Slovakia',
    'sl': 'Slovenia',
    'uk': 'Ukraine',
    'hu': 'Hungary',
    'he': 'Israel',
    'sq': 'Albania',
    'bg': 'Bulgaria',
    'lt': 'Lithuania',
    'lv': 'Latvia',
    'ar': 'Saudi Arabia',
    'fa': 'Iran',
    'zh-tw': 'Taiwan',
    'mk': 'North Macedonia',
    'hi': 'India',
    'bn': 'Bangladesh',
    'ne': 'Nepal',
    'ml': 'India',
    'ta': 'India',
    'kn': 'India',
    'pa': 'India',
    'mr': 'India',
    'te': 'India',
    'gu': 'India',
}
|
|
|
|
|
# Attach a country to every language row. Series.map yields NaN for any code
# that is missing from language_to_country, so those rows are labelled
# 'Unknown'. The column is assigned exactly once: assigning the raw map
# result again afterwards would throw the fill away and put the NaNs back.
language_counts['country'] = (
    language_counts['language'].map(language_to_country).fillna('Unknown')
)
|
|
|
|
|
|
|
# Pie chart of post counts keyed by detected language code.
fig_languages = px.pie(
    language_counts,
    names='language',
    values='count',
    title='Language Distribution of Posts',
)
fig_languages.show()

# Same counts, but sliced by the country assigned to each language.
fig_countries = px.pie(
    language_counts,
    names='country',
    values='count',
    title='Country Distribution of Posts',
)
fig_countries.show()

# Write both charts to HTML files, languages first.
for figure, html_path in (
    (fig_languages, 'fig_languages.html'),
    (fig_countries, 'fig_countries.html'),
):
    figure.write_html(html_path)

# Dump the underlying table for a quick textual check.
print(language_counts)
|
|
|
import folium |
|
import pandas as pd |
|
|
|
|
|
# Base map that the per-country markers are added to; the view starts
# zoomed out (zoom level 2) and centred on [20, 0].
world_map = folium.Map(
    zoom_start=2,
    location=[20, 0],
)
|
|
|
|
|
# Marker colour for each continent name returned by the continent lookup.
continent_colors = {
    'Africa': 'red',
    'Asia': 'green',
    'Europe': 'blue',
    'North America': 'purple',
    'Oceania': 'orange',
    'South America': 'black',
}

# Marker icon for each continent. Every continent currently uses the same
# globe glyph, so the table is derived from the colour table's keys.
continent_icons = dict.fromkeys(continent_colors, 'fa-globe')
|
|
|
|
|
# Country name -> continent name. Kept at module level so the table is built
# once instead of on every get_continent() call.
_COUNTRY_TO_CONTINENT = {
    'United States': 'North America',
    'Japan': 'Asia',
    'Unknown': 'Unknown',
    'Spain': 'Europe',
    'Portugal': 'Europe',
    'France': 'Europe',
    'Germany': 'Europe',
    'South Korea': 'Asia',
    'Netherlands': 'Europe',
    'Italy': 'Europe',
    'Poland': 'Europe',
    'Somalia': 'Africa',
    'South Africa': 'Africa',
    'Russia': 'Europe',
    'Canada': 'North America',
    'Turkey': 'Asia',
    'Norway': 'Europe',
    'Indonesia': 'Asia',
    'Finland': 'Europe',
    'Denmark': 'Europe',
    'Cyprus': 'Asia',
    'Philippines': 'Asia',
    'Sweden': 'Europe',
    'Thailand': 'Asia',
    'Romania': 'Europe',
    'Estonia': 'Europe',
    'Kenya': 'Africa',
    'Vietnam': 'Asia',
    'Greece': 'Europe',
    'China': 'Asia',
    'Croatia': 'Europe',
    'Czech Republic': 'Europe',
    'Pakistan': 'Asia',
    'Slovakia': 'Europe',
    'Slovenia': 'Europe',
    'Ukraine': 'Europe',
    'Hungary': 'Europe',
    'Israel': 'Asia',
    'Albania': 'Europe',
    'Bulgaria': 'Europe',
    'Lithuania': 'Europe',
    'Latvia': 'Europe',
    'Saudi Arabia': 'Asia',
    'Iran': 'Asia',
    'Taiwan': 'Asia',
    'North Macedonia': 'Europe',
    'United Kingdom': 'Europe',
    'India': 'Asia',
    'Bangladesh': 'Asia',
    'Nepal': 'Asia',
    'Malaysia': 'Asia',
    'Singapore': 'Asia',
    'Brunei': 'Asia',
    'Cambodia': 'Asia',
    'Laos': 'Asia',
    'Myanmar': 'Asia',
    'Timor-Leste': 'Asia',
    'Papua New Guinea': 'Oceania',
    'Australia': 'Oceania',
    'New Zealand': 'Oceania',
    'Fiji': 'Oceania',
    'Solomon Islands': 'Oceania',
    'Vanuatu': 'Oceania',
    'Samoa': 'Oceania',
    'Tonga': 'Oceania',
    'Kiribati': 'Oceania',
    'Tuvalu': 'Oceania',
    'Nauru': 'Oceania',
    'Palau': 'Oceania',
    'Marshall Islands': 'Oceania',
    'Micronesia': 'Oceania',
    'Guam': 'Oceania',
    'Northern Mariana Islands': 'Oceania',
    'Puerto Rico': 'North America',
    'Dominican Republic': 'North America',
    'Haiti': 'North America',
    'Jamaica': 'North America',
    'Cuba': 'North America',
    'Bahamas': 'North America',
    'Barbados': 'North America',
    'Trinidad and Tobago': 'North America',
    'Grenada': 'North America',
    'Saint Vincent and the Grenadines': 'North America',
    'Antigua and Barbuda': 'North America',
    'Saint Kitts and Nevis': 'North America',
    'Belize': 'North America',
    'Costa Rica': 'North America',
    'El Salvador': 'North America',
    'Guatemala': 'North America',
    'Honduras': 'North America',
    'Nicaragua': 'North America',
    'Panama': 'North America',
    'Argentina': 'South America',
    'Bolivia': 'South America',
    'Brazil': 'South America',
    'Chile': 'South America',
    'Colombia': 'South America',
    'Ecuador': 'South America',
    'Guyana': 'South America',
    'Paraguay': 'South America',
    'Peru': 'South America',
    'Suriname': 'South America',
    'Uruguay': 'South America',
    'Venezuela': 'South America',
}


def get_continent(country):
    """Return the continent name for *country*.

    Args:
        country: A country name spelled as in the lookup table
            (e.g. ``'South Korea'``).

    Returns:
        The continent name, or ``'Unknown'`` when *country* is not in the
        table (this includes the ``'Unknown'`` placeholder itself and
        non-string keys such as NaN).
    """
    return _COUNTRY_TO_CONTINENT.get(country, 'Unknown')
|
|
|
|
|
# Marker position for each country, passed to folium as `location`
# (values look like approximate [latitude, longitude] centre points).
# Defined once, ahead of the loop, instead of being rebuilt per iteration.
country_coordinates = {
    'United States': [37.0902, -95.7129],
    'Japan': [36.2048, 138.2529],
    'Unknown': [0, 0],
    'Spain': [40.4637, -3.7492],
    'Portugal': [39.3999, -8.2245],
    'France': [46.6034, 1.8883],
    'Germany': [51.1657, 10.4515],
    'South Korea': [35.9078, 127.7669],
    'Netherlands': [52.1326, 5.2913],
    'Italy': [41.8719, 12.5674],
    'Poland': [51.9194, 19.1451],
    'Somalia': [5.1521, 46.1996],
    'South Africa': [-30.5595, 22.9375],
    'Russia': [61.5240, 105.3188],
    'Canada': [56.1304, -106.3468],
    'Turkey': [38.9637, 35.2433],
    'Norway': [60.4720, 8.4689],
    'Indonesia': [-0.7893, 113.9213],
    'Finland': [61.9241, 25.7482],
    'Denmark': [56.2639, 9.5018],
    'Cyprus': [35.1264, 33.4299],
    'Philippines': [12.8797, 121.7740],
    'Sweden': [60.1282, 18.6435],
    'Thailand': [15.8700, 100.9925],
    'Romania': [45.9432, 24.9668],
    'Estonia': [58.5953, 25.0136],
    'Kenya': [0.0236, 37.9062],
    'Vietnam': [14.0583, 108.2772],
    'Greece': [39.0742, 21.8243],
    'China': [35.8617, 104.1954],
    'Croatia': [45.1000, 15.2000],
    'Czech Republic': [49.8175, 15.4730],
    'Pakistan': [30.3753, 69.3451],
    'Slovakia': [48.6690, 19.6990],
    'Slovenia': [46.1512, 14.9955],
    'Ukraine': [48.3794, 31.1656],
    'Hungary': [47.1625, 19.5033],
    'Israel': [31.0461, 34.8516],
    'Albania': [41.1533, 20.1683],
    'Bulgaria': [42.7339, 25.4858],
    'Lithuania': [55.1694, 23.8813],
    'Latvia': [56.8796, 24.6032],
    'Saudi Arabia': [23.8859, 45.0792],
    'Iran': [32.4279, 53.6880],
    'Taiwan': [23.6978, 120.9605],
    'North Macedonia': [41.6086, 21.7453],
    'United Kingdom': [55.3781, -3.4360],
    'India': [20.5937, 78.9629],
    'Bangladesh': [23.6850, 90.3563],
    'Nepal': [28.3949, 84.1240],
    'Malaysia': [4.2105, 101.9758],
    'Singapore': [1.3521, 103.8198],
    'Brunei': [4.5353, 114.7277],
    'Cambodia': [12.5657, 104.9910],
    'Laos': [19.8563, 102.4955],
    'Myanmar': [21.9162, 95.9560],
    'Timor-Leste': [-8.8742, 125.7275],
    'Papua New Guinea': [-6.3149, 143.9555],
    'Australia': [-25.2744, 133.7751],
    'New Zealand': [-40.9006, 174.8860],
    'Fiji': [-17.7134, 178.0650],
    'Solomon Islands': [-9.6457, 160.1562],
    'Vanuatu': [-15.3767, 166.9592],
    'Samoa': [-13.7590, -172.1046],
    'Tonga': [-21.1790, -175.1982],
    'Kiribati': [1.4518, 172.9717],
    'Tuvalu': [-7.1095, 177.6493],
    'Nauru': [-0.5228, 166.9315],
    'Palau': [7.5150, 134.5825],
    'Marshall Islands': [7.1315, 171.1845],
    'Micronesia': [7.4256, 150.5508],
    'Guam': [13.4443, 144.7937],
    'Northern Mariana Islands': [15.0979, 145.6739],
    'Puerto Rico': [18.2208, -66.5901],
    'Dominican Republic': [18.7357, -70.1627],
    'Haiti': [18.9712, -72.2852],
    'Jamaica': [18.1096, -77.2975],
    'Cuba': [21.5218, -77.7812],
    'Bahamas': [25.0343, -77.3963],
    'Barbados': [13.1939, -59.5432],
    'Trinidad and Tobago': [10.6918, -61.2225],
    'Grenada': [12.2627, -61.6041],
    'Saint Vincent and the Grenadines': [12.9843, -61.2872],
    'Antigua and Barbuda': [17.0608, -61.7964],
    'Saint Kitts and Nevis': [17.3578, -62.7830],
    'Belize': [17.1899, -88.4976],
    'Costa Rica': [9.7489, -83.7534],
    'El Salvador': [13.7942, -88.8965],
    'Guatemala': [15.7835, -90.2308],
    'Honduras': [15.1997, -86.2419],
    'Nicaragua': [12.8654, -85.2072],
    'Panama': [8.5380, -80.7821],
    'Argentina': [-38.4161, -63.6167],
    'Bolivia': [-16.2902, -63.5887],
    'Brazil': [-14.2350, -51.9253],
    'Chile': [-35.6751, -71.5430],
    'Colombia': [4.5709, -74.2973],
    'Ecuador': [-1.8312, -78.1834],
    'Guyana': [4.8604, -58.9302],
    'Paraguay': [-23.4425, -58.4438],
    'Peru': [-9.1900, -75.0152],
    'Suriname': [3.9193, -56.0278],
    'Uruguay': [-32.5228, -55.7658],
    'Venezuela': [6.4238, -66.5897],
}

# language_counts has one row per *language*, and several languages can share
# a country. Placing one marker per row would stack markers on the same spot,
# each showing only a partial count labelled as the country's count. Sum the
# rows per country first so each country gets a single marker with its total.
# Rows whose country is missing (NaN) are folded into 'Unknown' rather than
# being dropped by groupby; sort=False keeps first-appearance order.
country_totals = (
    language_counts['count']
    .groupby(language_counts['country'].fillna('Unknown'), sort=False)
    .sum()
)

for country, count in country_totals.items():
    continent = get_continent(country)
    # Continents without a configured style fall back to a gray globe.
    color = continent_colors.get(continent, 'gray')
    icon = continent_icons.get(continent, 'fa-globe')
    # Countries without known coordinates are placed at [0, 0].
    coordinates = country_coordinates.get(country, [0, 0])

    folium.Marker(
        location=coordinates,
        popup=f"Country: {country}<br>Count: {count}",
        icon=folium.Icon(color=color, icon=icon, prefix='fa'),
    ).add_to(world_map)

# Write the finished map to an HTML file.
world_map.save('world_map.html')
|
|
|
|
|
|
|
|
|
|