the-stack-inspection

Sleeping

loubnabnl HF staff committed on Feb 14, 2023

Commit

249a7b0

1 Parent(s): 666dbec

cache filtering

Files changed (1) hide show

app.py CHANGED Viewed

@@ -17,15 +17,19 @@ for index, row in df.iterrows():
 all_languages = list(tags.keys())
-@st.cache()
-def load_data(language, ext):
-    ds = load_dataset(
         "loubnabnl/the-stack-inspection-data",
         data_dir=f"data/{language}/{ext}",
         split="train",
     )
-    return ds
 col1, col2, _ = st.columns([1, 1, 4])
 with col1:
@@ -45,15 +49,7 @@ max_mean_line_length = st.sidebar.slider("Maximum average line length", 0, 500,
 st.sidebar.markdown("Printed files have `max_line_length`  and `average_line_length` larger than the selected values.\
 `alphanumeric_fraction` is smaller than the selected value.")
-# load and filter dataset
-samples = load_data(chosen_language, chosen_ext)
-samples = samples.filter(lambda x: x["alphanum_fraction"] < min_alphanum)
-samples = samples.filter(lambda x: x["max_line_length"] > max_line_length)
-samples = samples.filter(lambda x: x["avg_line_length"] > max_mean_line_length)
-if not_lexable:
-    samples = samples.filter(lambda x: not x["lexable"])
 max_docs = len(samples)

 all_languages = list(tags.keys())
+@st.cache(max_entries=100)
+def load_data(language, ext, min_alphanum, max_line_length, max_mean_line_length, non_lexable):
+    samples = load_dataset(
         "loubnabnl/the-stack-inspection-data",
         data_dir=f"data/{language}/{ext}",
         split="train",
     )
+    samples = samples.filter(lambda x: x["alphanum_fraction"] < min_alphanum)
+    samples = samples.filter(lambda x: x["max_line_length"] > max_line_length)
+    samples = samples.filter(lambda x: x["avg_line_length"] > max_mean_line_length)
+    if non_lexable:
+        samples = samples.filter(lambda x: not x["lexable"])
+    return samples
 col1, col2, _ = st.columns([1, 1, 4])
 with col1:
 st.sidebar.markdown("Printed files have `max_line_length`  and `average_line_length` larger than the selected values.\
 `alphanumeric_fraction` is smaller than the selected value.")
+samples = load_data(chosen_language, chosen_ext, min_alphanum, max_line_length, max_mean_line_length, not_lexable)
 max_docs = len(samples)