|
import json |
|
import random |
|
|
|
import gradio as gr |
|
from difflib import SequenceMatcher |
|
|
|
def _load_jsonl(path):
    """Parse a JSON Lines file into a list with one entry per non-blank line.

    Args:
        path: Location of the file to read.

    Returns:
        A list holding the result of ``json.loads`` for each non-blank line,
        in file order.
    """
    # Explicit encoding so decoding does not depend on the platform default.
    with open(path, "r", encoding="utf-8") as file:
        # Whitespace-only lines (e.g. a trailing blank line) are skipped
        # because json.loads() rejects them.
        return [json.loads(line) for line in file if line.strip()]


# Despite the `_dict` suffix, each of these is a list of parsed lines.
qwen_dict = _load_jsonl("qwen_gsm8k_output.jsonl")
phi4_dict = _load_jsonl("phi4_gsm8k_output.jsonl")

# Model name -> list of parsed records for that model.
models_data = {
    "Qwen/Qwen2.5-14B": qwen_dict,
    "microsoft/phi-4": phi4_dict,
}

# Record index and model shown initially; the model is the first key of
# `models_data`.
starting_index = 0
starting_model = next(iter(models_data))
|
|
|
|
|
# Markdown shown at the top of the page (rendered with gr.Markdown).
description_text = """
This Space is inspired by [Louis Hunt's](https://www.linkedin.com/posts/louiswhunt_see-below-for-6882-pages-of-mmlu-and-gsm8k-activity-7281011488692047872-fWCE?utm_source=share&utm_medium=member_desktop) post.
He highlights how current top performing models from major vendors are contaminated with benchmark data that is supposed to be used to assess their performance.
This space aims to partially reproduce this work.

I chose to look at the contamination of **Qwen/Qwen2.5-14B** and **microsoft/phi-4** by the **GSM8K** dataset.

For **Qwen/Qwen2.5-14B** I found **729** GSM8K examples that had at least a 0.9 text similarity ratio between generated and original.
For **microsoft/phi-4** I found **172** GSM8K examples that had at least a 0.9 text similarity ratio between generated and original.

"""
|
|
|
|
|
def find_similar_chunks(original, output):
    """Split `output` into ordered chunks flagged by whether they match `original`.

    The two sequences are aligned with ``difflib.SequenceMatcher``. Every
    matching block found in `output` becomes a chunk labelled ``1``; the
    stretches of `output` between matching blocks become chunks labelled
    ``None``.

    Args:
        original: Reference sequence to compare against.
        output: Sequence to segment.

    Returns:
        A list of ``(chunk, label)`` tuples that cover `output` from start to
        end, in order. The list is empty when `output` is empty.
    """
    matcher = SequenceMatcher(None, original, output)
    highlighted_sequence = []
    left = 0  # Index in `output` just past the last chunk emitted so far.
    for _, start, size in matcher.get_matching_blocks():
        if left < start:
            # Unmatched stretch between the previous block and this one.
            highlighted_sequence.append((output[left:start], None))
        # The final block is always a zero-length sentinel positioned at
        # len(output): it flushes the unmatched tail through the branch above
        # and must not produce an (empty) highlighted chunk itself.
        if size:
            highlighted_sequence.append((output[start:start + size], 1))
        left = start + size
    return highlighted_sequence
|
|
|
def next_example(selected_model):
    """Return the display fields of a randomly chosen record of a model.

    Args:
        selected_model: Key into the module-level `models_data` mapping.

    Returns:
        A five-item list: the record's "prompt", its "original", its "output"
        split into chunks by `find_similar_chunks`, its "similarity_ratio"
        and its "seed".
    """
    record = random.choice(models_data[selected_model])
    marked_output = find_similar_chunks(record["original"], record["output"])

    prompt_text = record["prompt"]
    original_text = record["original"]
    ratio = record["similarity_ratio"]
    seed_value = record["seed"]
    return [prompt_text, original_text, marked_output, ratio, seed_value]
|
|
|
def change_model(selected_model):
    """Return the display fields of a model's record at `starting_index`.

    Args:
        selected_model: Key into the module-level `models_data` mapping.

    Returns:
        A five-item list: the record's "prompt", its "original", its "output"
        split into chunks by `find_similar_chunks`, its "similarity_ratio"
        and its "seed".
    """
    record = models_data[selected_model][starting_index]
    marked_output = find_similar_chunks(record["original"], record["output"])

    prompt_text = record["prompt"]
    original_text = record["original"]
    ratio = record["similarity_ratio"]
    seed_value = record["seed"]
    return [prompt_text, original_text, marked_output, ratio, seed_value]
|
|
|
# Record whose fields populate the components before any user interaction.
_initial_example = models_data[starting_model][starting_index]

with gr.Blocks() as demo:
    # Header row: the description next to an intentionally empty column of
    # the same scale.
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(description_text)
        with gr.Column(scale=1):
            pass

    # Model selector and the prompt of the current record.
    with gr.Row():
        with gr.Column(scale=1):
            selected_model = gr.Dropdown(
                list(models_data),
                value=starting_model,
                interactive=True,
                label="Model",
            )
        with gr.Column(scale=4):
            prompt = gr.Textbox(
                label="Prompt",
                interactive=False,
                value=_initial_example["prompt"],
            )

    # Reference text next to the model output split into chunks.
    with gr.Row():
        with gr.Column(scale=4):
            original = gr.Textbox(
                label="Original",
                interactive=False,
                value=_initial_example["original"],
            )
        with gr.Column(scale=4):
            # NOTE(review): color_map is keyed by the string "1"; confirm the
            # labels emitted by find_similar_chunks are matched against it as
            # intended.
            output = gr.HighlightedText(
                label="Output",
                color_map={"1": "yellow"},
                value=find_similar_chunks(
                    _initial_example["original"],
                    _initial_example["output"],
                ),
            )

    # Metadata of the current record.
    with gr.Row():
        with gr.Column(scale=1):
            similarity = gr.Textbox(
                label="Similarity ratio",
                interactive=False,
                value=_initial_example["similarity_ratio"],
            )
        with gr.Column(scale=1):
            seed = gr.Textbox(
                label="Seed",
                interactive=False,
                value=_initial_example["seed"],
            )

    next_btn = gr.Button("Another example")

    # Both handlers return their values in the same order as `outputs`.
    next_btn.click(
        fn=next_example,
        inputs=[selected_model],
        outputs=[prompt, original, output, similarity, seed],
    )
    selected_model.change(
        fn=change_model,
        inputs=[selected_model],
        outputs=[prompt, original, output, similarity, seed],
    )

demo.launch()