loubnabnl HF staff committed on
Commit
249a7b0
·
1 Parent(s): 666dbec

cache filtering

Browse files
Files changed (1) hide show
  1. app.py +10 -14
app.py CHANGED
@@ -17,15 +17,19 @@ for index, row in df.iterrows():
17
  all_languages = list(tags.keys())
18
 
19
 
20
-
21
- @st.cache()
22
- def load_data(language, ext):
23
- ds = load_dataset(
24
  "loubnabnl/the-stack-inspection-data",
25
  data_dir=f"data/{language}/{ext}",
26
  split="train",
27
  )
28
- return ds
 
 
 
 
 
29
 
30
  col1, col2, _ = st.columns([1, 1, 4])
31
  with col1:
@@ -45,15 +49,7 @@ max_mean_line_length = st.sidebar.slider("Maximum average line length", 0, 500,
45
  st.sidebar.markdown("Printed files have `max_line_length` and `average_line_length` larger than the selected values.\
46
  `alphanumeric_fraction` is smaller than the selected value.")
47
 
48
- # load and filter dataset
49
- samples = load_data(chosen_language, chosen_ext)
50
-
51
- samples = samples.filter(lambda x: x["alphanum_fraction"] < min_alphanum)
52
- samples = samples.filter(lambda x: x["max_line_length"] > max_line_length)
53
- samples = samples.filter(lambda x: x["avg_line_length"] > max_mean_line_length)
54
-
55
- if not_lexable:
56
- samples = samples.filter(lambda x: not x["lexable"])
57
 
58
  max_docs = len(samples)
59
 
 
17
  all_languages = list(tags.keys())
18
 
19
 
20
+ @st.cache(max_entries=100)
21
+ def load_data(language, ext, min_alphanum, max_line_length, max_mean_line_length, non_lexable):
22
+ samples = load_dataset(
 
23
  "loubnabnl/the-stack-inspection-data",
24
  data_dir=f"data/{language}/{ext}",
25
  split="train",
26
  )
27
+ samples = samples.filter(lambda x: x["alphanum_fraction"] < min_alphanum)
28
+ samples = samples.filter(lambda x: x["max_line_length"] > max_line_length)
29
+ samples = samples.filter(lambda x: x["avg_line_length"] > max_mean_line_length)
30
+ if non_lexable:
31
+ samples = samples.filter(lambda x: not x["lexable"])
32
+ return samples
33
 
34
  col1, col2, _ = st.columns([1, 1, 4])
35
  with col1:
 
49
  st.sidebar.markdown("Printed files have `max_line_length` and `average_line_length` larger than the selected values.\
50
  `alphanumeric_fraction` is smaller than the selected value.")
51
 
52
+ samples = load_data(chosen_language, chosen_ext, min_alphanum, max_line_length, max_mean_line_length, not_lexable)
 
 
 
 
 
 
 
 
53
 
54
  max_docs = len(samples)
55