loubnabnl HF staff committed on
Commit
66a3725
·
1 Parent(s): b10125a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -19
app.py CHANGED
@@ -8,6 +8,7 @@ st.title("The Stack data Inspection")
8
 
9
  df = pd.read_csv("extension_distribution.csv")
10
  all_extensions = df["extension"].tolist()
 
11
  tags = {}
12
  for index, row in df.iterrows():
13
  if row["language"] not in tags:
@@ -18,26 +19,32 @@ all_languages = list(tags.keys())
18
 
19
  @st.cache()
20
  def load_data(language, ext):
21
- ds = load_dataset("loubnabnl/the-stack-inspection-data", data_dir=f"data/{language}/{ext}", split="train")
 
 
 
 
22
  return ds
23
 
24
- col1, col2, col3 = st.columns([1, 1, 4])
 
25
  with col1:
26
  chosen_language = st.selectbox(
27
- label="Select a programming language",
28
- options=all_languages,
29
- index=0)
30
  with col2:
31
  chosen_ext = st.selectbox(
32
- label="Select an extension",
33
- options=tags[chosen_language],
34
- index=0)
35
 
 
36
  samples = load_data(chosen_language, chosen_ext)
37
  max_docs = len(samples)
38
  samples = samples.add_column("idx", range(len(samples)))
39
- not_lexed = samples.filter(lambda x: not x['lexable'])
40
- indexes_not_lexed = not_lexed['idx']
 
41
 
42
  # info about extension
43
  st.markdown("### Information about the extension:")
@@ -46,23 +53,30 @@ text = f"Extension {chosen_ext} has {max_docs} files, {df[df['extension'] == cho
46
  are not lexable. These files are at indexes: {indexes_not_lexed}."
47
  st.markdown(text)
48
 
49
- col_1, col_2 = st.columns([2, 4])
50
  with col_1:
51
- index_example = st.number_input(f"Extension {chosen_ext} has {max_docs} files, choose one to visualize:", min_value=0, max_value=max_docs-1, value=0, step=1)
 
 
 
 
 
 
52
 
53
 
54
  # info about the chosen example
55
  example = samples[index_example]
56
  st.markdown("#### Information about the chosen example:")
57
- text_alpha = "**has**" if example['long_lines'] else "doesn't have"
58
- text_lines = "**has**" if example['low_alphanum'] else "doesn't have"
59
- text_lexer = "is" if example['lexable'] else "**isn't**"
60
 
 
 
 
 
61
 
62
- st.markdown(f"Example {index_example} {text_alpha} a very low alphanumeric ratio, \
63
- {text_lines} very long lines, and {text_lexer} lexable.")
64
 
 
65
  st.markdown("#### File content:")
66
-
67
  st.code(example["content"], language=chosen_language)
68
-
 
8
 
9
  df = pd.read_csv("extension_distribution.csv")
10
  all_extensions = df["extension"].tolist()
11
+
12
  tags = {}
13
  for index, row in df.iterrows():
14
  if row["language"] not in tags:
 
19
 
20
  @st.cache()
21
  def load_data(language, ext):
22
+ ds = load_dataset(
23
+ "loubnabnl/the-stack-inspection-data",
24
+ data_dir=f"data/{language}/{ext}",
25
+ split="train",
26
+ )
27
  return ds
28
 
29
+
30
+ col1, col2, _ = st.columns([1, 1, 4])
31
  with col1:
32
  chosen_language = st.selectbox(
33
+ label="Select a programming language", options=all_languages, index=0
34
+ )
 
35
  with col2:
36
  chosen_ext = st.selectbox(
37
+ label="Select an extension", options=tags[chosen_language], index=0
38
+ )
39
+
40
 
41
+ # load the dataset and get indexes of non lexable files
42
  samples = load_data(chosen_language, chosen_ext)
43
  max_docs = len(samples)
44
  samples = samples.add_column("idx", range(len(samples)))
45
+ not_lexed = samples.filter(lambda x: not x["lexable"])
46
+ indexes_not_lexed = not_lexed["idx"]
47
+
48
 
49
  # info about extension
50
  st.markdown("### Information about the extension:")
 
53
  are not lexable. These files are at indexes: {indexes_not_lexed}."
54
  st.markdown(text)
55
 
56
+ col_1, _ = st.columns([2, 4])
57
  with col_1:
58
+ index_example = st.number_input(
59
+ f"Extension {chosen_ext} has {max_docs} files, choose one to visualize:",
60
+ min_value=0,
61
+ max_value=max_docs - 1,
62
+ value=0,
63
+ step=1,
64
+ )
65
 
66
 
67
  # info about the chosen example
68
  example = samples[index_example]
69
  st.markdown("#### Information about the chosen example:")
70
+ text_alpha = "**has**" if example["long_lines"] else "doesn't have"
71
+ text_lines = "**has**" if example["low_alphanum"] else "doesn't have"
72
+ text_lexer = "is" if example["lexable"] else "**isn't**"
73
 
74
+ st.markdown(
75
+ f"Example {index_example} {text_alpha} a very low alphanumeric ratio, \
76
+ {text_lines} very long lines, and {text_lexer} lexable."
77
+ )
78
 
 
 
79
 
80
+ # display file content
81
  st.markdown("#### File content:")
 
82
  st.code(example["content"], language=chosen_language)