loubnabnl's picture
loubnabnl HF staff
fix syntax highlighting
e4c83c2
raw
history blame
2.73 kB
import streamlit as st
import json
import pandas as pd
from datasets import load_dataset
# Page chrome: wide layout, same text for the browser tab and the on-page title.
st.set_page_config(page_title="The Stack data Inspection", layout="wide")
st.title("The Stack data Inspection")

# One row per extension; the "extension" and "language" columns are read here.
df = pd.read_csv("extension_distribution.csv")
all_extensions = df["extension"].tolist()

# Group extensions by language, preserving the row order of the CSV.
tags = {}
for language, extension in zip(df["language"], df["extension"]):
    tags.setdefault(language, []).append(extension)
all_languages = list(tags)
@st.cache()
def load_data(language, ext):
    """Load the inspection samples for one language/extension pair.

    Args:
        language: Language name; first directory level below ``data/``.
        ext: File extension; second directory level below ``data/``.

    Returns:
        The ``train`` split of ``loubnabnl/the-stack-inspection-data``
        restricted to ``data/{language}/{ext}``.
    """
    subset_dir = f"data/{language}/{ext}"
    return load_dataset(
        "loubnabnl/the-stack-inspection-data",
        data_dir=subset_dir,
        split="train",
    )
# Two narrow columns hold the pickers; the third, wider column is left unused.
col1, col2, _ = st.columns([1, 1, 4])

chosen_language = col1.selectbox(
    label="Select a programming language",
    options=all_languages,
    index=0,
)
# The extension choices depend on the language picked above.
chosen_ext = col2.selectbox(
    label="Select an extension",
    options=tags[chosen_language],
    index=0,
)
# Load the samples for the current selection and find the non-lexable ones.
samples = load_data(chosen_language, chosen_ext)
max_docs = len(samples)
# Attach each row's position as "idx" so it is still available after filtering.
samples = samples.add_column("idx", range(max_docs))
not_lexed = samples.filter(lambda sample: not sample["lexable"])
indexes_not_lexed = not_lexed["idx"]
# Summary of the selected extension, built from its row in the statistics table.
st.markdown("### Information about the extension:")
ext_stats = df[df["extension"] == chosen_ext]
low_alphanum_count = ext_stats["low_alphanum_count"].values[0]
long_lines_count = ext_stats["long_lines_count"].values[0]
non_lexable_count = ext_stats["non_lexable_count"].values[0]
text = (
    f"Extension {chosen_ext} has {max_docs} files, "
    f"{low_alphanum_count} with very low alphanumeric ratio, "
    f"{long_lines_count} with very long lines, "
    f"and {non_lexable_count} are not lexable. "
    f"These files are at indexes: {indexes_not_lexed}."
)
st.markdown(text)
# Index picker in the narrower left column, bounded to 0 .. max_docs - 1.
col_1, _ = st.columns([2, 4])
index_example = col_1.number_input(
    f"Extension {chosen_ext} has {max_docs} files, choose one to visualize:",
    min_value=0,
    max_value=max_docs - 1,
    value=0,
    step=1,
)
# Describe the flags of the chosen example.
example = samples[index_example]
st.markdown("#### Information about the chosen example:")
# Each phrase is derived from the flag it is printed next to: "low_alphanum"
# for the alphanumeric-ratio clause and "long_lines" for the long-lines clause.
# (These two lookups were previously swapped, so the sentence reported each
# property under the other's description.)
text_alpha = "**has**" if example["low_alphanum"] else "doesn't have"
text_lines = "**has**" if example["long_lines"] else "doesn't have"
# Bold marks the noteworthy case, which for lexability is the negative one.
text_lexer = "is" if example["lexable"] else "**isn't**"
st.markdown(
    f"Example {index_example} {text_alpha} a very low alphanumeric ratio, "
    f"{text_lines} very long lines, and {text_lexer} lexable."
)
# Show the file: highlighted code when it is lexable, a plain write otherwise.
st.markdown("#### File content:")
if example["lexable"]:
    st.code(example["content"], language=chosen_language)
else:
    st.write(f"File can't be lexed so we remove syntax highlighting.\nContent:\n {example['content']}")