# polish_eq-bench / plot_results.py
import csv
import json

import matplotlib.pyplot as plt
import pandas as pd

def create_performance_plot(csv_path='benchmark_results.csv', metadata_path='metadata.json'):
    # Define whitelist of interesting models (partial matches against the Model Path column)
    WHITELIST = [
        'Meta-Llama-3.1-70B-Instruct'
    ]
    # Read the benchmark results with error handling for inconsistent rows
    valid_rows = []
    expected_fields = 14  # Number of expected fields in each row
    with open(csv_path, 'r') as f:
        reader = csv.reader(f)
        header = next(reader)  # Get header row
        # Strip whitespace from header names
        header = [h.strip() for h in header]
        for row in reader:
            if len(row) == expected_fields:  # Only keep rows with correct number of fields
                # Strip whitespace from values
                valid_rows.append([val.strip() for val in row])

    # Create DataFrame from valid rows
    df = pd.DataFrame(valid_rows, columns=header)
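    # Columns referenced below (a subset of the 14 CSV fields): 'Model Path', 'Benchmark Score',
    # 'Num Questions Parseable', 'Run ID', 'Benchmark Completed'. Header names are taken from the
    # CSV itself; this list is only for orientation.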

    # Read model sizes from metadata
    with open(metadata_path, 'r') as f:
        metadata = json.load(f)
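    # Assumed metadata.json layout (illustrative, not verified against the repo): a flat mapping
    # from model path, optionally suffixed with ",max_length=4096", to model size in billions of
    # parameters, e.g. {"meta-llama/Meta-Llama-3.1-70B-Instruct": 70}.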

    # Process the data
    # Keep only successful runs (where Benchmark Score is not FAILED)
    df = df[df['Benchmark Score'] != 'FAILED']
    df = df[df['Benchmark Score'].notna()]

    # Convert score to numeric, handling invalid values
    df['Benchmark Score'] = pd.to_numeric(df['Benchmark Score'], errors='coerce')
    df = df[df['Benchmark Score'].notna()]  # Remove rows where conversion failed

    # Convert Num Questions Parseable to numeric and calculate adjusted score
    df['Num Questions Parseable'] = pd.to_numeric(df['Num Questions Parseable'], errors='coerce')
    df['Benchmark Score'] = df['Benchmark Score'] * (df['Num Questions Parseable'] / 171)
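    # The adjusted score down-weights runs whose answers could not be parsed: the raw score is
    # multiplied by the parseable fraction, with 171 assumed to be the benchmark's total question count.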

    # For each model, keep only the latest run
    df['Run ID'] = df['Run ID'].fillna('')
    df['timestamp'] = pd.to_datetime(df['Benchmark Completed'])
    df = df.sort_values('timestamp')
    df = df.drop_duplicates(subset=['Model Path'], keep='last')

    # Get model sizes
    def get_model_size(model_path):
        # Try exact match first
        if model_path in metadata:
            return metadata[model_path]
        # Try with max_length suffix
        if f"{model_path},max_length=4096" in metadata:
            return metadata[f"{model_path},max_length=4096"]
        return None

    # Print models without size before filtering
    print("\nModels without size assigned:")
    models_without_size = df[df['Model Path'].apply(get_model_size).isna()]
    for model in models_without_size['Model Path']:
        print(f"- {model}")

    df['Model Size'] = df['Model Path'].apply(get_model_size)
    df = df[df['Model Size'].notna()]

    # Remove extreme outliers (scores that are clearly errors)
    q1 = df['Benchmark Score'].quantile(0.25)
    q3 = df['Benchmark Score'].quantile(0.75)
    iqr = q3 - q1
    df = df[
        (df['Benchmark Score'] >= q1 - 1.5 * iqr) &
        (df['Benchmark Score'] <= q3 + 1.5 * iqr)
    ]
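    # This is the standard 1.5 * IQR (Tukey) fence: scores outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
    # are treated as erroneous runs and dropped.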

    # Find models on Pareto frontier
    sizes = sorted(df['Model Size'].unique())
    frontier_points = []
    max_score = float('-inf')
    frontier_models = set()
    for size in sizes:
        # Get scores for models of this size or smaller
        subset = df[df['Model Size'] <= size]
        if len(subset) > 0:
            max_score_idx = subset['Benchmark Score'].idxmax()
            current_max = subset.loc[max_score_idx, 'Benchmark Score']
            if current_max > max_score:
                max_score = current_max
                frontier_points.append((size, max_score))
                frontier_models.add(subset.loc[max_score_idx, 'Model Path'])
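    # A model joins the frontier only when it beats the best score seen at any smaller size,
    # so the dashed frontier line plotted below is monotonically non-decreasing in model size.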

    # Filter models - keep those on Pareto frontier or matching whitelist
    df['Keep'] = False
    for idx, row in df.iterrows():
        if row['Model Path'] in frontier_models:
            df.loc[idx, 'Keep'] = True
        else:
            for pattern in WHITELIST:
                if pattern in row['Model Path']:
                    df.loc[idx, 'Keep'] = True
                    break
    df = df[df['Keep']]

    # Create the plot
    fig = plt.figure(figsize=(12, 8))

    # Create scatter plot
    plt.scatter(df['Model Size'],
                df['Benchmark Score'],
                alpha=0.6)

    # Add labels for points
    for idx, row in df.iterrows():
        # Get model name - either last part of path or full name for special cases
        model_name = row['Model Path'].split('/')[-1]
        if any(pattern in row['Model Path'] for pattern in ['gpt-3', 'gpt-4']):
            model_name = row['Model Path']
        plt.annotate(model_name,
                     (row['Model Size'], row['Benchmark Score']),
                     xytext=(5, 5), textcoords='offset points',
                     fontsize=8,
                     bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', pad=0.5))

    # Plot the Pareto frontier line
    if frontier_points:
        frontier_x, frontier_y = zip(*frontier_points)
        plt.plot(frontier_x, frontier_y, 'r--', label='Pareto frontier')

    # Add vertical line for consumer GPU budget
    plt.axvline(x=12, color='gray', linestyle=':', label='Consumer-budget GPU limit',
                ymin=-0.15, clip_on=False)
    plt.text(12, -0.15, 'Consumer-budget\nGPU (24GB) limit\nin half precision',
             horizontalalignment='center', verticalalignment='top',
             transform=plt.gca().get_xaxis_transform())
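    # Rationale for x=12: at half precision (2 bytes per parameter) a 12B-parameter model needs
    # roughly 24 GB for its weights alone, i.e. a full consumer-grade 24 GB GPU.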

    # Customize the plot
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.xlabel('Model Size (billions of parameters)')
    plt.ylabel('Benchmark Score')
    plt.title('Model Performance vs Size (Pareto Frontier)')

    # Add legend
    plt.legend()

    # Adjust layout to prevent label cutoff
    plt.tight_layout()

    return fig

if __name__ == "__main__":
    # When run as a script, save the plot to a file
    fig = create_performance_plot()
    fig.savefig('model_performance.png', dpi=300, bbox_inches='tight')
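
# Example usage (hypothetical file names): build the figure from a different results set and
# display it interactively instead of saving:
#   fig = create_performance_plot(csv_path='other_results.csv', metadata_path='other_metadata.json')
#   plt.show()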