loubnabnl (HF staff) committed
Commit 2be75e8
1 Parent(s): c872f77

Update app.py

Files changed (1):
  1. app.py +52 -45
app.py CHANGED
@@ -1,61 +1,68 @@
-"""
-This code was adapted from https://huggingface.co/spaces/HugoLaurencon/examples_before_after_pii/
-"""
-
 import streamlit as st
 import json
 import pandas as pd
+from datasets import load_dataset
+
+st.set_page_config(page_title="The Stack data Inspection", layout="wide")
+st.title("The Stack data Inspection")
 
-st.set_page_config(page_title="PII Visualization", layout="wide")
-st.title("PII Visualization")
-
-tags = ["KEY", "IP_ADDRESS", "EMAIL"]
-types = ["False positives", "False negatives"]
-matches = {"False negatives": "fn", "False positives": "fp"}
+df = pd.read_csv("extension_distribution.csv")
+all_extensions = df["extension"].tolist()
+tags = {}
+for index, row in df.iterrows():
+    if row["language"] not in tags:
+        tags[row["language"]] = []
+    tags[row["language"]].append(row["extension"])
+all_languages = list(tags.keys())
+
 
 @st.cache()
-def load_data():
-    with open(f"data/{chosen_tag.lower()}_detections_{matches[chosen_type]}.json", "r") as f:
-        samples = json.load(f)
-    return samples
+def load_data(language, ext):
+    ds = load_dataset("loubnabnl/the-stack-inspection-data", data_dir=f"data/{language}/{ext}", split="train")
+    return ds
 
 col1, col2, col3 = st.columns([1, 1, 4])
 with col1:
-    chosen_type = st.selectbox(
-        label="Select the type of detections",
-        options=types,
+    chosen_language = st.selectbox(
+        label="Select a programming language",
+        options=all_languages,
         index=0)
 with col2:
-    chosen_tag = st.selectbox(
-        label="Select the PII TAG",
-        options=tags,
+    chosen_ext = st.selectbox(
+        label="Select an extension",
+        options=tags[chosen_language],
         index=0)
 
-samples = load_data()
+samples = load_data(chosen_language, chosen_ext)
 max_docs = len(samples)
+samples = samples.add_column("idx", range(len(samples)))
+not_lexed = samples.filter(lambda x: not x['lexable'])
+indexes_not_lexed = not_lexed['idx']
 
-col1, col2 = st.columns([2, 4])
-with col1:
-    index_example = st.number_input(f"Index of the chosen example from the existing {max_docs}", min_value=0, max_value=max_docs-1, value=0, step=1)
-
-st.write("Scroll down to visualize PII detections highlighted in yellow, we split the text at the start and end of the key to highlight it.")
-
-detection = samples[index_example]
-delimiter = f"PI:{matches[chosen_type].upper()}"
-count = detection.count(delimiter)
-
-st.subheader(f"{count} {chosen_type.lower()} for {chosen_tag} tag in example {index_example}:")
-
-subparts = []
-advance, found = 0, 0
-last_part = detection
-while found < count:
-    start = advance + last_part.index(delimiter)
-    end = advance + last_part.index("END_PI") + 6
-    st.code(detection[advance:start])
-    st.markdown("<span style=\"background-color: #FFFF00\">" + detection[start:end] + "</span>", unsafe_allow_html=True)
-    last_part = detection[end:]
-    advance = end
-    found += 1
-st.code(last_part)
+# info about extension
+st.markdown("### Information about the extension:")
+text = f"Extension {chosen_ext} has {max_docs} files, {df[df['extension'] == chosen_ext]['low_alphanum_count'].values[0]} with very low alphanumeric ratio, \
+    {df[df['extension'] == chosen_ext]['long_lines_count'].values[0]} with very long lines, and {df[df['extension'] == chosen_ext]['non_lexable_count'].values[0]} \
+    are not lexable. These files are at indexes: {indexes_not_lexed}."
+st.markdown(text)
+
+col_1, col_2 = st.columns([2, 4])
+with col_1:
+    index_example = st.number_input(f"Extension {chosen_ext} has {max_docs} files, choose one to visualize:", min_value=0, max_value=max_docs-1, value=0, step=1)
+
+st.write(f"Example chosen: {index_example}")
+# info about the chosen example
+example = samples[index_example]
+st.markdown("#### Information about the chosen example:")
+text_alpha = "**has**" if example['low_alphanum'] else "doesn't have"
+text_lines = "**has**" if example['long_lines'] else "doesn't have"
+text_lexer = "is" if example['lexable'] else "**isn't**"
+
+st.markdown(f"Example {index_example} {text_alpha} a very low alphanumeric ratio, \
+    {text_lines} very long lines, and {text_lexer} lexable.")
+
+st.markdown("#### File content:")
+
+st.code(example["content"], language=chosen_language)
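
For quick checks outside Streamlit, the same loading and filtering path added in this commit can be exercised directly with the datasets library. The snippet below is a minimal sketch, not part of the commit: the ("python", "py") language/extension pair is only a hypothetical example, and it assumes the data/<language>/<extension> layout and the lexable, low_alphanum, long_lines, and content columns referenced in the diff.

from datasets import load_dataset

# Hypothetical example values; use any language/extension pair that exists
# under data/<language>/<extension> in loubnabnl/the-stack-inspection-data.
language, ext = "python", "py"

# Same call as load_data() in the new app.py: one folder per language/extension.
ds = load_dataset("loubnabnl/the-stack-inspection-data", data_dir=f"data/{language}/{ext}", split="train")

# Keep original positions, then look at the files the lexer rejected,
# mirroring the filtering done in the app.
ds = ds.add_column("idx", list(range(len(ds))))
not_lexed = ds.filter(lambda x: not x["lexable"])
print(f"{len(not_lexed)} of {len(ds)} files are not lexable; indexes: {not_lexed['idx']}")

# Inspect one example the same way the app renders it.
example = ds[0]
print(example["low_alphanum"], example["long_lines"], example["lexable"])
print(example["content"][:500])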