File size: 2,153 Bytes
215f60a
 
724b1ea
 
 
 
 
 
215f60a
724b1ea
 
215f60a
724b1ea
 
 
 
 
 
215f60a
724b1ea
 
215f60a
724b1ea
 
 
 
215f60a
 
 
 
 
724b1ea
 
 
 
215f60a
 
 
 
 
 
 
 
 
 
 
 
724b1ea
215f60a
724b1ea
 
215f60a
724b1ea
 
215f60a
724b1ea
215f60a
 
 
 
724b1ea
215f60a
 
 
 
 
 
724b1ea
215f60a
724b1ea
 
 
 
215f60a
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import json
import os

import streamlit as st
import streamlit.components.v1 as components

# Directory the per-dataset example files are read from.
DATA_PATH: str = "data"
# Directory the per-dataset "<dataset>_bad_examples.jsonl" files are written to.
BAD_EXAMPLES_PATH: str = "bad_examples"


def load_jsonl(file_path):
    """Load a JSON Lines file into a list.

    Args:
        file_path: Path of the file to read. Every non-blank line must hold
            one complete JSON document.

    Returns:
        A list with one parsed value per non-blank line, in file order.
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        # Whitespace-only lines (typically a trailing empty line) carry no
        # record; json.loads would raise on them, so they are skipped.
        return [json.loads(line) for line in f if line.strip()]


# Seed the position of the example being reviewed, but only when the session
# does not already hold one, so an existing position is kept.
if "idx" not in st.session_state:
    st.session_state["idx"] = 0


def get_next_item():
    """Advance the session's example index (``st.session_state.idx``) by one."""
    st.session_state.idx = st.session_state.idx + 1


def save_and_get_next_item(sample, issue):
    """Record ``sample`` as a bad example, then move on to the next one.

    Args:
        sample: The example record. It is modified in place: ``issue`` is
            stored under its ``"issue"`` key before it is serialized.
        issue: The reviewer's description of what is wrong with the example.

    The record is appended as one JSON line to the bad-examples file of the
    currently selected dataset (module-level ``dataset``).
    """
    sample["issue"] = issue

    target = f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl"
    # Append mode keeps the records saved earlier.
    with open(target, "a") as out:
        line = json.dumps(sample) + "\n"
        out.write(line)

    get_next_item()


# Names of the datasets offered in the sidebar selector. Each name is also
# the prefix of that dataset's file names under DATA_PATH / BAD_EXAMPLES_PATH.
datasets = [
    "gutenberg_raw",
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
    "reddit_threaded",
    "cc_filtered_text",
]
dataset = st.sidebar.selectbox("Dataset", datasets)

# The file carries a ".json" extension but is read line by line through
# load_jsonl, i.e. as one JSON document per line.
examples_file = f"{DATA_PATH}/{dataset}_examples_with_stats.json"
data = load_jsonl(examples_file)

# Sidebar controls for the selected dataset's bad-examples file.
bad_examples_file = f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl"

# Create the output directory and the bad-examples file if they do not exist
# yet. Without the directory, opening the file below raises FileNotFoundError.
# Append mode creates a missing file and leaves an existing one unchanged.
os.makedirs(BAD_EXAMPLES_PATH, exist_ok=True)
with open(bad_examples_file, "a") as f:
    pass

# Removing "idx" from the session state lets the "idx" guard near the top of
# the script set it back to 0 on the next run.
st.sidebar.button("Reset Index", on_click=lambda: st.session_state.__delitem__("idx"))

# The file is only read here, so it is opened read-only (text mode).
with open(bad_examples_file, "r") as f:
    st.sidebar.download_button(
        "Download bad example JSON file", f, file_name=f"{dataset}_bad_examples.jsonl"
    )

# Opening in "w" mode truncates the file; it is closed again right away.
st.sidebar.button(
    "Clear bad examples file",
    on_click=lambda: open(bad_examples_file, "w").close(),
)

def _save_with_submitted_issue(sample):
    """Save ``sample`` with the issue text submitted through the form.

    Callback arguments are fixed when the button is rendered, before the
    reviewer has typed anything, so the text is read from the session state
    (under the text input's key) at the time the callback runs instead.
    """
    save_and_get_next_item(sample, st.session_state["issue_text"])


# Review form: shows the current example and records the reviewer's verdict.
if st.session_state.idx >= len(data):
    # The index is past the last example (all reviewed, an empty file, or a
    # switch to a shorter dataset); indexing `data` would raise IndexError.
    st.info("No more examples in this dataset. Use 'Reset Index' to start over.")
else:
    with st.form(key="bad_form", clear_on_submit=True):
        sample = data[st.session_state.idx]
        text = sample["text"]
        st.text_area(f"text id: {st.session_state.idx}", text, height=500)

        # The key makes the submitted text available to the BAD callback via
        # st.session_state. The value returned here is the one known at
        # render time and must not be used as the issue for this example.
        issue = st.text_input(
            "What's wrong with this example? (leave blank if example is fine)",
            key="issue_text",
        )

        # GOOD only advances to the next example.
        good = st.form_submit_button(
            "GOOD", on_click=get_next_item, use_container_width=True
        )
        # BAD saves the example with the submitted issue text, then advances.
        bad = st.form_submit_button(
            "BAD",
            on_click=_save_with_submitted_issue,
            args=(sample,),
            use_container_width=True,
        )