TroglodyteDerivations's picture
Create app.py
f2d49da verified
# bluesky_languages_streamlit.py
import streamlit as st
from datasets import load_dataset
from langdetect import detect
import pandas as pd
import plotly.express as px
import folium
from streamlit_folium import folium_static
# Hub repository id of the Bluesky post corpus this app analyses.
_DATASET_REPO = "alpindale/two-million-bluesky-posts"

# Fetch the corpus; the rest of the script reads its 'train' split.
dataset = load_dataset(_DATASET_REPO)
# Function to detect language
def detect_language(text):
    """Return the language code detected for *text*, or ``'unknown'``.

    Args:
        text: The post body to classify.

    Returns:
        Whatever ``detect`` returns for *text*, or the literal string
        ``'unknown'`` when *text* is not a non-blank string or when
        detection raises.
    """
    # Blank or non-string input cannot be classified; answer directly rather
    # than paying for a raised-and-swallowed exception on every such row.
    if not isinstance(text, str) or not text.strip():
        return 'unknown'
    try:
        return detect(text)
    except Exception:
        # Best-effort: any detection failure is reported as 'unknown'.
        # Catching `Exception` instead of using a bare `except` lets
        # KeyboardInterrupt / SystemExit still stop a long-running map.
        return 'unknown'
# Tag every row of the 'train' split with the language detected in its 'text'.
def _with_language(post):
    """Return the ``language`` column value for a single dataset row."""
    return {'language': detect_language(post['text'])}


dataset['train'] = dataset['train'].map(_with_language)

# Pull the new column out as a pandas Series so it can be tallied.
languages = pd.Series(dataset['train']['language'])

# One row per language code with the number of posts detected as that language.
language_counts = languages.value_counts().to_frame().reset_index()
language_counts.columns = ['language', 'count']
# Mapping from detected language code to a single representative country.
# Keys are *language* codes (the values produced by language detection, plus
# the 'unknown' sentinel), not country codes, so look-alike ISO country codes
# must not be assumed: 'ca' is Catalan and 'cy' is Welsh.
language_to_country = {
    'en': 'United States',
    'ja': 'Japan',
    'unknown': 'Unknown',
    'es': 'Spain',
    'pt': 'Portugal',
    'fr': 'France',
    'de': 'Germany',
    'ko': 'South Korea',
    'nl': 'Netherlands',
    'it': 'Italy',
    'pl': 'Poland',
    'so': 'Somalia',
    'af': 'South Africa',
    'ru': 'Russia',
    'ca': 'Spain',  # Catalan (previously mislabelled as Canada)
    'tr': 'Turkey',
    'no': 'Norway',
    'id': 'Indonesia',
    'fi': 'Finland',
    'da': 'Denmark',
    'cy': 'United Kingdom',  # Welsh (previously mislabelled as Cyprus)
    'tl': 'Philippines',
    'sv': 'Sweden',
    'th': 'Thailand',
    'ro': 'Romania',
    'et': 'Estonia',
    'sw': 'Kenya',
    'vi': 'Vietnam',
    'el': 'Greece',
    'zh-cn': 'China',
    'hr': 'Croatia',
    'cs': 'Czech Republic',
    'ur': 'Pakistan',
    'sk': 'Slovakia',
    'sl': 'Slovenia',
    'uk': 'Ukraine',
    'hu': 'Hungary',
    'he': 'Israel',
    'sq': 'Albania',
    'bg': 'Bulgaria',
    'lt': 'Lithuania',
    'lv': 'Latvia',
    'ar': 'Saudi Arabia',
    'fa': 'Iran',
    'zh-tw': 'Taiwan',
    'mk': 'North Macedonia',
    'hi': 'India',
    'bn': 'Bangladesh',
    'ne': 'Nepal',
    'ml': 'India',
    'ta': 'India',
    'kn': 'India',
    'pa': 'India',
    'mr': 'India',
    'te': 'India',
    'gu': 'India',
}
# Resolve each language code to a country label; codes that have no entry in
# the lookup table end up as 'Unknown'.
language_counts['country'] = (
    language_counts['language']
    .map(language_to_country)
    .fillna('Unknown')
)

# Two pie charts over the same frame: one sliced by language, one by country.
fig_languages = px.pie(
    data_frame=language_counts,
    names='language',
    values='count',
    title='Language Distribution of Posts',
)
fig_countries = px.pie(
    data_frame=language_counts,
    names='country',
    values='count',
    title='Country Distribution of Posts',
)

# Base Folium map showing the whole world (centre [20, 0], zoom level 2).
world_map = folium.Map(zoom_start=2, location=[20, 0])
# Marker colour used for each continent.
continent_colors = {
    'Africa': 'red',
    'Asia': 'green',
    'Europe': 'blue',
    'North America': 'purple',
    'Oceania': 'orange',
    'South America': 'black',
}

# Marker icon used for each continent; every continent currently shares the
# same 'fa-globe' icon name, so the table is derived from the colour keys.
continent_icons = dict.fromkeys(continent_colors, 'fa-globe')
# Simplified mapping of countries to continents.
# Built once at import time instead of on every get_continent() call.
_COUNTRY_TO_CONTINENT = {
    'United States': 'North America',
    'Japan': 'Asia',
    'Unknown': 'Unknown',
    'Spain': 'Europe',
    'Portugal': 'Europe',
    'France': 'Europe',
    'Germany': 'Europe',
    'South Korea': 'Asia',
    'Netherlands': 'Europe',
    'Italy': 'Europe',
    'Poland': 'Europe',
    'Somalia': 'Africa',
    'South Africa': 'Africa',
    'Russia': 'Europe',
    'Canada': 'North America',
    'Turkey': 'Asia',
    'Norway': 'Europe',
    'Indonesia': 'Asia',
    'Finland': 'Europe',
    'Denmark': 'Europe',
    'Cyprus': 'Asia',
    'Philippines': 'Asia',
    'Sweden': 'Europe',
    'Thailand': 'Asia',
    'Romania': 'Europe',
    'Estonia': 'Europe',
    'Kenya': 'Africa',
    'Vietnam': 'Asia',
    'Greece': 'Europe',
    'China': 'Asia',
    'Croatia': 'Europe',
    'Czech Republic': 'Europe',
    'Pakistan': 'Asia',
    'Slovakia': 'Europe',
    'Slovenia': 'Europe',
    'Ukraine': 'Europe',
    'Hungary': 'Europe',
    'Israel': 'Asia',
    'Albania': 'Europe',
    'Bulgaria': 'Europe',
    'Lithuania': 'Europe',
    'Latvia': 'Europe',
    'Saudi Arabia': 'Asia',
    'Iran': 'Asia',
    'Taiwan': 'Asia',
    'North Macedonia': 'Europe',
    'India': 'Asia',
    'Bangladesh': 'Asia',
    'Nepal': 'Asia',
    'Malaysia': 'Asia',
    'Singapore': 'Asia',
    'Brunei': 'Asia',
    'Cambodia': 'Asia',
    'Laos': 'Asia',
    'Myanmar': 'Asia',
    'Timor-Leste': 'Asia',
    'Papua New Guinea': 'Oceania',
    'Australia': 'Oceania',
    'New Zealand': 'Oceania',
    'Fiji': 'Oceania',
    'Solomon Islands': 'Oceania',
    'Vanuatu': 'Oceania',
    'Samoa': 'Oceania',
    'Tonga': 'Oceania',
    'Kiribati': 'Oceania',
    'Tuvalu': 'Oceania',
    'Nauru': 'Oceania',
    'Palau': 'Oceania',
    'Marshall Islands': 'Oceania',
    'Micronesia': 'Oceania',
    'Guam': 'Oceania',
    'Northern Mariana Islands': 'Oceania',
    'Puerto Rico': 'North America',
    'Dominican Republic': 'North America',
    'Haiti': 'North America',
    'Jamaica': 'North America',
    'Cuba': 'North America',
    'Bahamas': 'North America',
    'Barbados': 'North America',
    'Trinidad and Tobago': 'North America',
    'Grenada': 'North America',
    'Saint Vincent and the Grenadines': 'North America',
    'Antigua and Barbuda': 'North America',
    'Saint Kitts and Nevis': 'North America',
    'Belize': 'North America',
    'Costa Rica': 'North America',
    'El Salvador': 'North America',
    'Guatemala': 'North America',
    'Honduras': 'North America',
    'Nicaragua': 'North America',
    'Panama': 'North America',
    'Argentina': 'South America',
    'Bolivia': 'South America',
    'Brazil': 'South America',
    'Chile': 'South America',
    'Colombia': 'South America',
    'Ecuador': 'South America',
    'Guyana': 'South America',
    'Paraguay': 'South America',
    'Peru': 'South America',
    'Suriname': 'South America',
    'Uruguay': 'South America',
    'Venezuela': 'South America',
    # Home of Welsh (language code 'cy'), one of the languages detection can report.
    'United Kingdom': 'Europe',
}


# Function to get continent from country
def get_continent(country):
    """Return the continent name for *country*.

    Args:
        country: Country name spelled as in the lookup table above
            (e.g. ``'United States'``).

    Returns:
        The continent string from the table, or ``'Unknown'`` when the
        country has no entry.
    """
    return _COUNTRY_TO_CONTINENT.get(country, 'Unknown')
# Approximate [latitude, longitude] at which each country's marker is placed
# (simplified: one point per country). Defined once, outside the marker loop.
country_coordinates = {
    'United States': [37.0902, -95.7129],
    'Japan': [36.2048, 138.2529],
    'Unknown': [0, 0],
    'Spain': [40.4637, -3.7492],
    'Portugal': [39.3999, -8.2245],
    'France': [46.6034, 1.8883],
    'Germany': [51.1657, 10.4515],
    'South Korea': [35.9078, 127.7669],
    'Netherlands': [52.1326, 5.2913],
    'Italy': [41.8719, 12.5674],
    'Poland': [51.9194, 19.1451],
    'Somalia': [5.1521, 46.1996],
    'South Africa': [-30.5595, 22.9375],
    'Russia': [61.5240, 105.3188],
    'Canada': [56.1304, -106.3468],
    'Turkey': [38.9637, 35.2433],
    'Norway': [60.4720, 8.4689],
    'Indonesia': [-0.7893, 113.9213],
    'Finland': [61.9241, 25.7482],
    'Denmark': [56.2639, 9.5018],
    'Cyprus': [35.1264, 33.4299],
    'Philippines': [12.8797, 121.7740],
    'Sweden': [60.1282, 18.6435],
    'Thailand': [15.8700, 100.9925],
    'Romania': [45.9432, 24.9668],
    'Estonia': [58.5953, 25.0136],
    'Kenya': [0.0236, 37.9062],
    'Vietnam': [14.0583, 108.2772],
    'Greece': [39.0742, 21.8243],
    'China': [35.8617, 104.1954],
    'Croatia': [45.1000, 15.2000],
    'Czech Republic': [49.8175, 15.4730],
    'Pakistan': [30.3753, 69.3451],
    'Slovakia': [48.6690, 19.6990],
    'Slovenia': [46.1512, 14.9955],
    'Ukraine': [48.3794, 31.1656],
    'Hungary': [47.1625, 19.5033],
    'Israel': [31.0461, 34.8516],
    'Albania': [41.1533, 20.1683],
    'Bulgaria': [42.7339, 25.4858],
    'Lithuania': [55.1694, 23.8813],
    'Latvia': [56.8796, 24.6032],
    'Saudi Arabia': [23.8859, 45.0792],
    'Iran': [32.4279, 53.6880],
    'Taiwan': [23.6978, 120.9605],
    'North Macedonia': [41.6086, 21.7453],
    'India': [20.5937, 78.9629],
    'Bangladesh': [23.6850, 90.3563],
    'Nepal': [28.3949, 84.1240],
    'Malaysia': [4.2105, 101.9758],
    'Singapore': [1.3521, 103.8198],
    'Brunei': [4.5353, 114.7277],
    'Cambodia': [12.5657, 104.9910],
    'Laos': [19.8563, 102.4955],
    'Myanmar': [21.9162, 95.9560],
    'Timor-Leste': [-8.8742, 125.7275],
    'Papua New Guinea': [-6.3149, 143.9555],
    'Australia': [-25.2744, 133.7751],
    'New Zealand': [-40.9006, 174.8860],
    'Fiji': [-17.7134, 178.0650],
    'Solomon Islands': [-9.6457, 160.1562],
    'Vanuatu': [-15.3767, 166.9592],
    'Samoa': [-13.7590, -172.1046],
    'Tonga': [-21.1790, -175.1982],
    'Kiribati': [1.4518, 172.9717],
    'Tuvalu': [-7.1095, 177.6493],
    'Nauru': [-0.5228, 166.9315],
    'Palau': [7.5150, 134.5825],
    'Marshall Islands': [7.1315, 171.1845],
    'Micronesia': [7.4256, 150.5508],
    'Guam': [13.4443, 144.7937],
    'Northern Mariana Islands': [15.0979, 145.6739],
    'Puerto Rico': [18.2208, -66.5901],
    'Dominican Republic': [18.7357, -70.1627],
    'Haiti': [18.9712, -72.2852],
    'Jamaica': [18.1096, -77.2975],
    'Cuba': [21.5218, -77.7812],
    'Bahamas': [25.0343, -77.3963],
    'Barbados': [13.1939, -59.5432],
    'Trinidad and Tobago': [10.6918, -61.2225],
    'Grenada': [12.2627, -61.6041],
    'Saint Vincent and the Grenadines': [12.9843, -61.2872],
    'Antigua and Barbuda': [17.0608, -61.7964],
    'Saint Kitts and Nevis': [17.3578, -62.7830],
    'Belize': [17.1899, -88.4976],
    'Costa Rica': [9.7489, -83.7534],
    'El Salvador': [13.7942, -88.8965],
    'Guatemala': [15.7835, -90.2308],
    'Honduras': [15.1997, -86.2419],
    'Nicaragua': [12.8654, -85.2072],
    'Panama': [8.5380, -80.7821],
    'Argentina': [-38.4161, -63.6167],
    'Bolivia': [-16.2902, -63.5887],
    'Brazil': [-14.2350, -51.9253],
    'Chile': [-35.6751, -71.5430],
    'Colombia': [4.5709, -74.2973],
    'Ecuador': [-1.8312, -78.1834],
    'Guyana': [4.8604, -58.9302],
    'Paraguay': [-23.4425, -58.4438],
    'Peru': [-9.1900, -75.0152],
    'Suriname': [3.9193, -56.0278],
    'Uruguay': [-32.5228, -55.7658],
    'Venezuela': [6.4238, -66.5897],
    # Home of Welsh (language code 'cy'), one of the languages detection can report.
    'United Kingdom': [55.3781, -3.4360],
}

# Add one marker per country.
# language_counts holds one row per *language*, and several languages can
# share a country (e.g. the Indian languages). Summing per country first
# avoids stacking several markers on the same point, each showing only one
# language's count under a "Country: ..." label.
country_totals = language_counts.groupby('country', sort=False)['count'].sum()
for country, count in country_totals.items():
    continent = get_continent(country)
    # Continents without a configured style fall back to a gray globe.
    color = continent_colors.get(continent, 'gray')
    icon = continent_icons.get(continent, 'fa-globe')
    # Countries without known coordinates are placed at [0, 0].
    coordinates = country_coordinates.get(country, [0, 0])
    # Create a marker with pop-up information
    folium.Marker(
        location=coordinates,
        popup=f"Country: {country}<br>Count: {count}",
        icon=folium.Icon(color=color, icon=icon, prefix='fa'),
    ).add_to(world_map)
# Streamlit app
def _render_page():
    """Draw the page: title, both pie charts, the world map, then the counts table."""
    st.title("Bluesky Posts Language and Country Distribution")
    # (heading, renderer, object to render), drawn top to bottom in this order.
    sections = (
        ("### Language Distribution", st.plotly_chart, fig_languages),
        ("### Country Distribution", st.plotly_chart, fig_countries),
        ("### World Map of Posts", folium_static, world_map),
        ("### Language Counts", st.dataframe, language_counts),
    )
    for heading, render, payload in sections:
        st.write(heading)
        render(payload)


_render_page()