Spaces:

leonard-dls
/

benchmark_data_contamination

Running

App Files Files Community

leonard-dls committed 3 days ago

Commit

6136624

1 Parent(s): 5c4ad30

add phi4

Browse files

Files changed (4) hide show

__pycache__/app.cpython-310.pyc +0 -0
app.py +60 -23
phi4_gsm8k_output.jsonl +0 -0
dataset.jsonl → qwen_gsm8k_output.jsonl +0 -0

__pycache__/app.cpython-310.pyc ADDED Viewed

Binary file (4.18 kB). View file

app.py CHANGED Viewed

@@ -4,17 +4,31 @@ import random
 import gradio as gr
 from difflib import SequenceMatcher
-file_path = "dataset.jsonl"
-similarity_threshold = 0.85
-current_index = 0
 description_text = """
 This Space is inspired by [Luis Hunt's](https://www.linkedin.com/posts/louiswhunt_see-below-for-6882-pages-of-mmlu-and-gsm8k-activity-7281011488692047872-fWCE?utm_source=share&utm_medium=member_desktop) post.
 He highlights how current top performing models from major vendors are contaminated with benchmark data that is supposed to be used to assess their performance.
-This space aims to partially reproduce this work. I chose to look at the contamination of **Qwen/Qwen2.5-14B** by **GSM8K** dataset.
-I found **729** GSM8K Example that had a least a 0.9 text similarity ratio between generated an original.
 """
@@ -29,14 +43,11 @@ def find_similar_chunks(original, output):
         left = j + n
     if j+n < len(output) - 1:
         highlighted_sequence.append((output[j+n:], None))
     return highlighted_sequence
-with open(file_path, "r") as file:
-    examples = [json.loads(line) for line in file if json.loads(line)["similarity_ratio"] > similarity_threshold]
-def next_example():
-    new_example = random.choice(examples)
     highlighted_output = find_similar_chunks(new_example["original"], new_example["output"])
     return(
@@ -49,30 +60,53 @@ def next_example():
         ]
     )
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown(description_text)
         with gr.Column(scale=1):
             pass
-    prompt = gr.Textbox(
-        label="Prompt",
-        interactive=False,
-        value=examples[current_index]["prompt"],
-    )
     with gr.Row():
         with gr.Column(scale=4):
             original = gr.Textbox(
                 label="Original",
                 interactive=False,
-                value=examples[current_index]["original"],
             )
         with gr.Column(scale=4):
             output = gr.HighlightedText(
                 label="Output",
                 color_map={"1": "yellow"},
-                value=find_similar_chunks(examples[current_index]["original"],
-                                        examples[current_index]["output"]),
             )
         with gr.Row():
@@ -80,20 +114,23 @@ with gr.Blocks() as demo:
                 similarity = gr.Textbox(
                     label="Similarity ratio",
                     interactive=False,
-                    value=examples[current_index]["similarity_ratio"],
                 )
             with gr.Column(scale=1):
                 seed = gr.Textbox(
                     label="Seed",
                     interactive=False,
-                    value=examples[current_index]["seed"],
                 )
     next_btn = gr.Button("Anoter example")
     next_btn.click(fn=next_example,
-                    outputs=[prompt, original, output, similarity, seed])
 demo.launch()

 import gradio as gr
 from difflib import SequenceMatcher
+with open("qwen_gsm8k_output.jsonl", "r") as file:  # read as JSONL: each line is parsed on its own below
+    qwen_dict = [json.loads(line) for line in file]  # NOTE: despite the "_dict" name this is a list, one parsed record per line
+with open("phi4_gsm8k_output.jsonl", "r") as file:  # same loading scheme for the phi-4 results
+    phi4_dict = [json.loads(line) for line in file]  # NOTE: also a list of parsed records
+models_data = {  # model display name -> list of records; the keys also populate the "Model" dropdown
+    "Qwen/Qwen2.5-14B" : qwen_dict,
+    "microsoft/phi-4" : phi4_dict
+}
+starting_index = 0  # index of the record shown at startup and by change_model after a model switch
+starting_model = [model_name for model_name in models_data.keys()][0]  # first key in insertion order, i.e. "Qwen/Qwen2.5-14B"
 description_text = """
 This Space is inspired by [Luis Hunt's](https://www.linkedin.com/posts/louiswhunt_see-below-for-6882-pages-of-mmlu-and-gsm8k-activity-7281011488692047872-fWCE?utm_source=share&utm_medium=member_desktop) post.
 He highlights how current top performing models from major vendors are contaminated with benchmark data that is supposed to be used to assess their performance.
+This space aims to partially reproduce this work.
+I chose to look at the contamination of **Qwen/Qwen2.5-14B** and **microsoft/phi-4** by **GSM8K** dataset.
+For **Qwen/Qwen2.5-14B** I found **729** GSM8K examples that had at least a 0.9 text similarity ratio between the generated and original text.
+For **microsoft/phi-4** I found **172** GSM8K examples that had at least a 0.9 text similarity ratio between the generated and original text.
 """
         left = j + n
     if j+n < len(output) - 1:
         highlighted_sequence.append((output[j+n:], None))
+    highlighted_sequence = highlighted_sequence[:-1]
     return highlighted_sequence
+def next_example(selected_model):
+    new_example = random.choice(models_data[selected_model])
     highlighted_output = find_similar_chunks(new_example["original"], new_example["output"])
     return(
         ]
     )
+def change_model(selected_model):  # callback wired to selected_model.change: resets the view for the chosen model
+    example = models_data[selected_model][starting_index]  # always the record at starting_index (0), not a random one
+    highlighted_output = find_similar_chunks(example["original"], example["output"])  # value for the "Output" gr.HighlightedText
+    return(
+        [  # order must match outputs=[prompt, original, output, similarity, seed] in the .change() wiring
+            example["prompt"],
+            example["original"],
+            highlighted_output,
+            example["similarity_ratio"],
+            example["seed"]
+        ]
+    )
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown(description_text)
         with gr.Column(scale=1):
             pass
+    with gr.Row():
+        with gr.Column(scale=1):
+            selected_model = gr.Dropdown(
+                [model_name for model_name in models_data.keys()],
+                value=[model_name for model_name in models_data.keys()][0],
+                interactive=True,
+                label="Model"
+            )
+        with gr.Column(scale=4):
+            prompt = gr.Textbox(
+                label="Prompt",
+                interactive=False,
+                value=models_data[starting_model][starting_index]["prompt"],
+            )
     with gr.Row():
         with gr.Column(scale=4):
             original = gr.Textbox(
                 label="Original",
                 interactive=False,
+                value=models_data[starting_model][starting_index]["original"],
             )
         with gr.Column(scale=4):
             output = gr.HighlightedText(
                 label="Output",
                 color_map={"1": "yellow"},
+                value=find_similar_chunks(models_data[starting_model][starting_index]["original"],
+                                        models_data[starting_model][starting_index]["output"]),
             )
         with gr.Row():
                 similarity = gr.Textbox(
                     label="Similarity ratio",
                     interactive=False,
+                    value=models_data[starting_model][starting_index]["similarity_ratio"],
                 )
             with gr.Column(scale=1):
                 seed = gr.Textbox(
                     label="Seed",
                     interactive=False,
+                    value=models_data[starting_model][starting_index]["seed"],
                 )
     next_btn = gr.Button("Anoter example")
     next_btn.click(fn=next_example,
+                   inputs=[selected_model],
+                   outputs=[prompt, original, output, similarity, seed])
+    selected_model.change(fn=change_model,
+                          inputs=[selected_model],
+                          outputs=[prompt, original, output, similarity, seed])
 demo.launch()

phi4_gsm8k_output.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

dataset.jsonl → qwen_gsm8k_output.jsonl RENAMED Viewed

File without changes