loubnabnl (HF staff) committed
Commit 2be75e8
1 Parent(s): c872f77

Update app.py

Files changed (1):
  1. app.py +52 -45
app.py CHANGED
@@ -1,61 +1,68 @@
-"""
-This code was adapted from https://huggingface.co/spaces/HugoLaurencon/examples_before_after_pii/
-"""
-
 import streamlit as st
 import json
 import pandas as pd
+from datasets import load_dataset
+
+st.set_page_config(page_title="The Stack data Inspection", layout="wide")
+st.title("The Stack data Inspection")
 
-st.set_page_config(page_title="PII Visualization", layout="wide")
-st.title("PII Visualization")
-
-tags = ["KEY", "IP_ADDRESS", "EMAIL"]
-types = ["False positives", "False negatives"]
-matches = {"False negatives": "fn", "False positives": "fp"}
+df = pd.read_csv("extension_distribution.csv")
+all_extensions = df["extension"].tolist()
+tags = {}
+for index, row in df.iterrows():
+    if row["language"] not in tags:
+        tags[row["language"]] = []
+    tags[row["language"]].append(row["extension"])
+all_languages = list(tags.keys())
+
 
 @st.cache()
-def load_data():
-    with open(f"data/{chosen_tag.lower()}_detections_{matches[chosen_type]}.json", "r") as f:
-        samples = json.load(f)
-    return samples
+def load_data(language, ext):
+    ds = load_dataset("loubnabnl/the-stack-inspection-data", data_dir=f"data/{language}/{ext}", split="train")
+    return ds
 
 col1, col2, col3 = st.columns([1, 1, 4])
 with col1:
-    chosen_type = st.selectbox(
-        label="Select the type of detections",
-        options=types,
+    chosen_language = st.selectbox(
+        label="Select a programming language",
+        options=all_languages,
         index=0)
 with col2:
-    chosen_tag = st.selectbox(
-        label="Select the PII TAG",
-        options=tags,
+    chosen_ext = st.selectbox(
+        label="Select an extension",
+        options=tags[chosen_language],
         index=0)
 
-samples = load_data()
+samples = load_data(chosen_language, chosen_ext)
 max_docs = len(samples)
+samples = samples.add_column("idx", range(len(samples)))
+not_lexed = samples.filter(lambda x: not x['lexable'])
+indexes_not_lexed = not_lexed['idx']
 
-col1, col2 = st.columns([2, 4])
-with col1:
-    index_example = st.number_input(f"Index of the chosen example from the existing {max_docs}", min_value=0, max_value=max_docs-1, value=0, step=1)
-
-st.write("Scroll down to visualize PII detections highlighted in yellow, we split the text at the start and end of the key to highlight it.")
-
-detection = samples[index_example]
-delimiter = f"PI:{matches[chosen_type].upper()}"
-count = detection.count(delimiter)
-
-st.subheader(f"{count} {chosen_type.lower()} for {chosen_tag} tag in example {index_example}:")
-
-subparts = []
-advance, found = 0, 0
-last_part = detection
-while found < count:
-    start = advance + last_part.index(delimiter)
-    end = advance + last_part.index("END_PI") + 6
-    st.code(detection[advance:start])
-    st.markdown("<span style=\"background-color: #FFFF00\">" + detection[start:end] + "</span>", unsafe_allow_html=True)
-    last_part = detection[end:]
-    advance = end
-    found += 1
-st.code(last_part)
+# info about extension
+st.markdown("### Information about the extension:")
+text = f"Extension {chosen_ext} has {max_docs} files, {df[df['extension'] == chosen_ext]['low_alphanum_count'].values[0]} with very low alphanumeric ratio, \
+    {df[df['extension'] == chosen_ext]['long_lines_count'].values[0]} with very long lines, and {df[df['extension'] == chosen_ext]['non_lexable_count'].values[0]} \
+    are not lexable. These files are at indexes: {indexes_not_lexed}."
+st.markdown(text)
+
+col_1, col_2 = st.columns([2, 4])
+with col_1:
+    index_example = st.number_input(f"Extension {chosen_ext} has {max_docs} files, choose one to visualize:", min_value=0, max_value=max_docs-1, value=0, step=1)
+
+st.write(f"Example chosen: {index_example}")
+# info about the chosen example
+example = samples[index_example]
+st.markdown("#### Information about the chosen example:")
+text_alpha = "**has**" if example['low_alphanum'] else "doesn't have"
+text_lines = "**has**" if example['long_lines'] else "doesn't have"
+text_lexer = "is" if example['lexable'] else "**isn't**"
+
+st.markdown(f"Example {index_example} {text_alpha} a very low alphanumeric ratio, \
+    {text_lines} very long lines, and {text_lexer} lexable.")
+
+st.markdown("#### File content:")
+
+st.code(example["content"], language=chosen_language)
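
For quick checks outside Streamlit, the same loading and filtering path added in this commit can be exercised directly with the datasets library. The snippet below is a minimal sketch, not part of the commit: the ("python", "py") language/extension pair is only a hypothetical example, and it assumes the data/<language>/<extension> layout and the lexable, low_alphanum, long_lines, and content columns referenced in the diff.

from datasets import load_dataset

# Hypothetical example values; use any language/extension pair that exists
# under data/<language>/<extension> in loubnabnl/the-stack-inspection-data.
language, ext = "python", "py"

# Same call as load_data() in the new app.py: one folder per language/extension.
ds = load_dataset("loubnabnl/the-stack-inspection-data", data_dir=f"data/{language}/{ext}", split="train")

# Keep original positions, then look at the files the lexer rejected,
# mirroring the filtering done in the app.
ds = ds.add_column("idx", list(range(len(ds))))
not_lexed = ds.filter(lambda x: not x["lexable"])
print(f"{len(not_lexed)} of {len(ds)} files are not lexable; indexes: {not_lexed['idx']}")

# Inspect one example the same way the app renders it.
example = ds[0]
print(example["low_alphanum"], example["long_lines"], example["lexable"])
print(example["content"][:500])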