# polish_eq-bench / plot_results.py
import csv
import json

import matplotlib.pyplot as plt
import pandas as pd

def create_performance_plot(csv_path='benchmark_results.csv', metadata_path='metadata.json'):
    # Define whitelist of interesting models (partial matches against the Model Path column)
    WHITELIST = [
        'Meta-Llama-3.1-70B-Instruct'
    ]
    # Read the benchmark results with error handling for inconsistent rows
    valid_rows = []
    expected_fields = 14  # Number of expected fields in each row
    with open(csv_path, 'r') as f:
        reader = csv.reader(f)
        header = next(reader)  # Get header row
        # Strip whitespace from header names
        header = [h.strip() for h in header]
        for row in reader:
            if len(row) == expected_fields:  # Only keep rows with correct number of fields
                # Strip whitespace from values
                valid_rows.append([val.strip() for val in row])

    # Create DataFrame from valid rows
    df = pd.DataFrame(valid_rows, columns=header)
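    # Columns referenced below (a subset of the 14 CSV fields): 'Model Path', 'Benchmark Score',
    # 'Num Questions Parseable', 'Run ID', 'Benchmark Completed'. Header names are taken from the
    # CSV itself; this list is only for orientation.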

    # Read model sizes from metadata
    with open(metadata_path, 'r') as f:
        metadata = json.load(f)
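    # Assumed metadata.json layout (illustrative, not verified against the repo): a flat mapping
    # from model path, optionally suffixed with ",max_length=4096", to model size in billions of
    # parameters, e.g. {"meta-llama/Meta-Llama-3.1-70B-Instruct": 70}.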

    # Process the data
    # Keep only successful runs (where Benchmark Score is not FAILED)
    df = df[df['Benchmark Score'] != 'FAILED']
    df = df[df['Benchmark Score'].notna()]

    # Convert score to numeric, handling invalid values
    df['Benchmark Score'] = pd.to_numeric(df['Benchmark Score'], errors='coerce')
    df = df[df['Benchmark Score'].notna()]  # Remove rows where conversion failed

    # Convert Num Questions Parseable to numeric and calculate adjusted score
    df['Num Questions Parseable'] = pd.to_numeric(df['Num Questions Parseable'], errors='coerce')
    df['Benchmark Score'] = df['Benchmark Score'] * (df['Num Questions Parseable'] / 171)
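    # The adjusted score down-weights runs whose answers could not be parsed: the raw score is
    # multiplied by the parseable fraction, with 171 assumed to be the benchmark's total question count.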

    # For each model, keep only the latest run
    df['Run ID'] = df['Run ID'].fillna('')
    df['timestamp'] = pd.to_datetime(df['Benchmark Completed'])
    df = df.sort_values('timestamp')
    df = df.drop_duplicates(subset=['Model Path'], keep='last')

    # Get model sizes
    def get_model_size(model_path):
        # Try exact match first
        if model_path in metadata:
            return metadata[model_path]
        # Try with max_length suffix
        if f"{model_path},max_length=4096" in metadata:
            return metadata[f"{model_path},max_length=4096"]
        return None

    # Print models without size before filtering
    print("\nModels without size assigned:")
    models_without_size = df[df['Model Path'].apply(get_model_size).isna()]
    for model in models_without_size['Model Path']:
        print(f"- {model}")

    df['Model Size'] = df['Model Path'].apply(get_model_size)
    df = df[df['Model Size'].notna()]

    # Remove extreme outliers (scores that are clearly errors)
    q1 = df['Benchmark Score'].quantile(0.25)
    q3 = df['Benchmark Score'].quantile(0.75)
    iqr = q3 - q1
    df = df[
        (df['Benchmark Score'] >= q1 - 1.5 * iqr) &
        (df['Benchmark Score'] <= q3 + 1.5 * iqr)
    ]
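    # This is the standard 1.5 * IQR (Tukey) fence: scores outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
    # are treated as erroneous runs and dropped.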

    # Find models on Pareto frontier
    sizes = sorted(df['Model Size'].unique())
    frontier_points = []
    max_score = float('-inf')
    frontier_models = set()
    for size in sizes:
        # Get scores for models of this size or smaller
        subset = df[df['Model Size'] <= size]
        if len(subset) > 0:
            max_score_idx = subset['Benchmark Score'].idxmax()
            current_max = subset.loc[max_score_idx, 'Benchmark Score']
            if current_max > max_score:
                max_score = current_max
                frontier_points.append((size, max_score))
                frontier_models.add(subset.loc[max_score_idx, 'Model Path'])
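    # A model joins the frontier only when it beats the best score seen at any smaller size,
    # so the dashed frontier line plotted below is monotonically non-decreasing in model size.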

    # Filter models - keep those on Pareto frontier or matching whitelist
    df['Keep'] = False
    for idx, row in df.iterrows():
        if row['Model Path'] in frontier_models:
            df.loc[idx, 'Keep'] = True
        else:
            for pattern in WHITELIST:
                if pattern in row['Model Path']:
                    df.loc[idx, 'Keep'] = True
                    break
    df = df[df['Keep']]

    # Create the plot
    fig = plt.figure(figsize=(12, 8))

    # Create scatter plot
    plt.scatter(df['Model Size'],
                df['Benchmark Score'],
                alpha=0.6)

    # Add labels for points
    for idx, row in df.iterrows():
        # Get model name - either last part of path or full name for special cases
        model_name = row['Model Path'].split('/')[-1]
        if any(pattern in row['Model Path'] for pattern in ['gpt-3', 'gpt-4']):
            model_name = row['Model Path']
        plt.annotate(model_name,
                     (row['Model Size'], row['Benchmark Score']),
                     xytext=(5, 5), textcoords='offset points',
                     fontsize=8,
                     bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', pad=0.5))

    # Plot the Pareto frontier line
    if frontier_points:
        frontier_x, frontier_y = zip(*frontier_points)
        plt.plot(frontier_x, frontier_y, 'r--', label='Pareto frontier')

    # Add vertical line for consumer GPU budget
    plt.axvline(x=12, color='gray', linestyle=':', label='Consumer-budget GPU limit',
                ymin=-0.15, clip_on=False)
    plt.text(12, -0.15, 'Consumer-budget\nGPU (24GB) limit\nin half precision',
             horizontalalignment='center', verticalalignment='top',
             transform=plt.gca().get_xaxis_transform())
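    # Rationale for x=12: at half precision (2 bytes per parameter) a 12B-parameter model needs
    # roughly 24 GB for its weights alone, i.e. a full consumer-grade 24 GB GPU.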

    # Customize the plot
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.xlabel('Model Size (billions of parameters)')
    plt.ylabel('Benchmark Score')
    plt.title('Model Performance vs Size (Pareto Frontier)')

    # Add legend
    plt.legend()

    # Adjust layout to prevent label cutoff
    plt.tight_layout()

    return fig

if __name__ == "__main__":
    # When run as a script, save the plot to a file
    fig = create_performance_plot()
    fig.savefig('model_performance.png', dpi=300, bbox_inches='tight')
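
# Example usage (hypothetical file names): build the figure from a different results set and
# display it interactively instead of saving:
#   fig = create_performance_plot(csv_path='other_results.csv', metadata_path='other_metadata.json')
#   plt.show()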