leonard-dls committed on
Commit
6136624
·
1 Parent(s): 5c4ad30
__pycache__/app.cpython-310.pyc ADDED
Binary file (4.18 kB). View file
 
app.py CHANGED
@@ -4,17 +4,31 @@ import random
4
  import gradio as gr
5
  from difflib import SequenceMatcher
6
 
7
- file_path = "dataset.jsonl"
8
- similarity_threshold = 0.85
9
- current_index = 0
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  description_text = """
12
  This Space is inspired by [Luis Hunt's](https://www.linkedin.com/posts/louiswhunt_see-below-for-6882-pages-of-mmlu-and-gsm8k-activity-7281011488692047872-fWCE?utm_source=share&utm_medium=member_desktop) post.
13
  He highlights how current top performing models from major vendors are contaminated with benchmark data that is supposed to be used to assess their performance.
 
 
 
14
 
15
- This space aims to partially reproduce this work. I chose to look at the contamination of **Qwen/Qwen2.5-14B** by **GSM8K** dataset.
 
16
 
17
- I found **729** GSM8K Example that had a least a 0.9 text similarity ratio between generated an original.
18
  """
19
 
20
 
@@ -29,14 +43,11 @@ def find_similar_chunks(original, output):
29
  left = j + n
30
  if j+n < len(output) - 1:
31
  highlighted_sequence.append((output[j+n:], None))
32
-
33
  return highlighted_sequence
34
 
35
- with open(file_path, "r") as file:
36
- examples = [json.loads(line) for line in file if json.loads(line)["similarity_ratio"] > similarity_threshold]
37
-
38
- def next_example():
39
- new_example = random.choice(examples)
40
 
41
  highlighted_output = find_similar_chunks(new_example["original"], new_example["output"])
42
  return(
@@ -49,30 +60,53 @@ def next_example():
49
  ]
50
  )
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  with gr.Blocks() as demo:
53
  with gr.Row():
54
  with gr.Column(scale=1):
55
  gr.Markdown(description_text)
56
  with gr.Column(scale=1):
57
  pass
58
- prompt = gr.Textbox(
59
- label="Prompt",
60
- interactive=False,
61
- value=examples[current_index]["prompt"],
62
- )
 
 
 
 
 
 
 
 
 
63
  with gr.Row():
64
  with gr.Column(scale=4):
65
  original = gr.Textbox(
66
  label="Original",
67
  interactive=False,
68
- value=examples[current_index]["original"],
69
  )
70
  with gr.Column(scale=4):
71
  output = gr.HighlightedText(
72
  label="Output",
73
  color_map={"1": "yellow"},
74
- value=find_similar_chunks(examples[current_index]["original"],
75
- examples[current_index]["output"]),
76
  )
77
 
78
  with gr.Row():
@@ -80,20 +114,23 @@ with gr.Blocks() as demo:
80
  similarity = gr.Textbox(
81
  label="Similarity ratio",
82
  interactive=False,
83
- value=examples[current_index]["similarity_ratio"],
84
  )
85
  with gr.Column(scale=1):
86
  seed = gr.Textbox(
87
  label="Seed",
88
  interactive=False,
89
- value=examples[current_index]["seed"],
90
  )
91
 
92
  next_btn = gr.Button("Anoter example")
93
 
94
  next_btn.click(fn=next_example,
95
- outputs=[prompt, original, output, similarity, seed])
96
-
97
 
 
 
 
98
 
99
  demo.launch()
 
4
  import gradio as gr
5
  from difflib import SequenceMatcher
6
 
7
def _load_jsonl(path):
    """Read a JSON Lines file and return one parsed object per non-blank line.

    Blank lines (e.g. a trailing newline at the end of the file) are skipped
    instead of being passed to ``json.loads``, which would raise on them.
    """
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(path, "r", encoding="utf-8") as file:
        return [json.loads(line) for line in file if line.strip()]


# Parsed records for each model, one entry per line of the JSONL file.
qwen_dict = _load_jsonl("qwen_gsm8k_output.jsonl")
phi4_dict = _load_jsonl("phi4_gsm8k_output.jsonl")

# Maps the model name shown in the UI dropdown to its list of records.
models_data = {
    "Qwen/Qwen2.5-14B": qwen_dict,
    "microsoft/phi-4": phi4_dict,
}

# Example and model displayed when the app first loads.
starting_index = 0
starting_model = next(iter(models_data))  # first model, in insertion order
20
+
21
 
22
# Markdown introduction rendered at the top of the page via gr.Markdown.
description_text = """
This Space is inspired by [Louis Hunt's](https://www.linkedin.com/posts/louiswhunt_see-below-for-6882-pages-of-mmlu-and-gsm8k-activity-7281011488692047872-fWCE?utm_source=share&utm_medium=member_desktop) post.
He highlights how current top-performing models from major vendors are contaminated with benchmark data that is supposed to be used to assess their performance.
This Space aims to partially reproduce this work.

I chose to look at the contamination of **Qwen/Qwen2.5-14B** and **microsoft/phi-4** by the **GSM8K** dataset.

For **Qwen/Qwen2.5-14B** I found **729** GSM8K examples that had at least a 0.9 text similarity ratio between the generated and original text.
For **microsoft/phi-4** I found **172** GSM8K examples that had at least a 0.9 text similarity ratio between the generated and original text.

"""
33
 
34
 
 
43
  left = j + n
44
  if j+n < len(output) - 1:
45
  highlighted_sequence.append((output[j+n:], None))
46
+ highlighted_sequence = highlighted_sequence[:-1]
47
  return highlighted_sequence
48
 
49
+ def next_example(selected_model):
50
+ new_example = random.choice(models_data[selected_model])
 
 
 
51
 
52
  highlighted_output = find_similar_chunks(new_example["original"], new_example["output"])
53
  return(
 
60
  ]
61
  )
62
 
63
def change_model(selected_model):
    """Return the display values for the starting example of a model.

    Looks up the record at ``starting_index`` in ``models_data`` for
    ``selected_model`` and returns, in order: the prompt, the original text,
    the highlighted output produced by ``find_similar_chunks``, the
    similarity ratio, and the seed. The order matches the ``outputs`` list
    wired to the dropdown's ``change`` event.
    """
    record = models_data[selected_model][starting_index]

    # Compute the highlighting first, then assemble the component values.
    highlighted = find_similar_chunks(record["original"], record["output"])

    values = [record["prompt"], record["original"], highlighted]
    values.append(record["similarity_ratio"])
    values.append(record["seed"])
    return values
76
+
77
  with gr.Blocks() as demo:
78
  with gr.Row():
79
  with gr.Column(scale=1):
80
  gr.Markdown(description_text)
81
  with gr.Column(scale=1):
82
  pass
83
+ with gr.Row():
84
+ with gr.Column(scale=1):
85
+ selected_model = gr.Dropdown(
86
+ [model_name for model_name in models_data.keys()],
87
+ value=[model_name for model_name in models_data.keys()][0],
88
+ interactive=True,
89
+ label="Model"
90
+ )
91
+ with gr.Column(scale=4):
92
+ prompt = gr.Textbox(
93
+ label="Prompt",
94
+ interactive=False,
95
+ value=models_data[starting_model][starting_index]["prompt"],
96
+ )
97
  with gr.Row():
98
  with gr.Column(scale=4):
99
  original = gr.Textbox(
100
  label="Original",
101
  interactive=False,
102
+ value=models_data[starting_model][starting_index]["original"],
103
  )
104
  with gr.Column(scale=4):
105
  output = gr.HighlightedText(
106
  label="Output",
107
  color_map={"1": "yellow"},
108
+ value=find_similar_chunks(models_data[starting_model][starting_index]["original"],
109
+ models_data[starting_model][starting_index]["output"]),
110
  )
111
 
112
  with gr.Row():
 
114
  similarity = gr.Textbox(
115
  label="Similarity ratio",
116
  interactive=False,
117
+ value=models_data[starting_model][starting_index]["similarity_ratio"],
118
  )
119
  with gr.Column(scale=1):
120
  seed = gr.Textbox(
121
  label="Seed",
122
  interactive=False,
123
+ value=models_data[starting_model][starting_index]["seed"],
124
  )
125
 
126
# Button whose click handler (next_example) shows another example for the selected model.
next_btn = gr.Button("Another example")
127
 
128
  next_btn.click(fn=next_example,
129
+ inputs=[selected_model],
130
+ outputs=[prompt, original, output, similarity, seed])
131
 
132
+ selected_model.change(fn=change_model,
133
+ inputs=[selected_model],
134
+ outputs=[prompt, original, output, similarity, seed])
135
 
136
  demo.launch()
phi4_gsm8k_output.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
dataset.jsonl → qwen_gsm8k_output.jsonl RENAMED
File without changes