feat(scoring): use model size as direct multiplier
- docs/ranking_system.md  +77 -0
- src/core/scoring.py  +5 -13
docs/ranking_system.md
ADDED
@@ -0,0 +1,77 @@
# Device Ranking System

## Overview
The ranking system implements a multi-dimensional approach to evaluate and compare device performance across different aspects of LLM (GGUF) model runs.

## Scoring Algorithm

### Standard Benchmark Conditions
```python
PP_CONFIG = 512  # Standard prompt processing token count
TG_CONFIG = 128  # Standard token generation count

# Component Weights
TG_WEIGHT = 0.6  # Token generation weight (60%)
PP_WEIGHT = 0.4  # Prompt processing weight (40%)
```
- PP given 40% weight as it's a one-time cost per prompt
- TG given higher weight (60%) as it represents ongoing performance

### Quantization Quality Factors
```python
QUANT_TIERS = {
    "F16": 1.0,
    "F32": 1.0,
    "Q8": 0.8,
    "Q6": 0.6,
    "Q5": 0.5,
    "Q4": 0.4,
    "Q3": 0.3,
    "Q2": 0.2,
    "Q1": 0.1,
}
```

- Linear scale from 0.1 to 1.0 based on quantization level
- F16/F32 are considered 1.0 (this skews the results a bit towards quantization)

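The scoring code looks this tier up from the model's ID via `get_quantization_tier` (not part of this commit). As a rough illustration only, a minimal standalone sketch, assuming the tier label is embedded in the GGUF model ID (e.g. `Q4` in `...-Q4_K_M.gguf`); the helper name, regex, and fallback value here are hypothetical, not the repo's actual implementation:

```python
import re
from typing import Dict

QUANT_TIERS: Dict[str, float] = {
    "F16": 1.0, "F32": 1.0, "Q8": 0.8, "Q6": 0.6, "Q5": 0.5,
    "Q4": 0.4, "Q3": 0.3, "Q2": 0.2, "Q1": 0.1,
}

def quant_factor_for(model_id: str, default: float = 0.5) -> float:
    """Return the quality factor for the first quant label found in a model ID."""
    match = re.search(r"F16|F32|Q[1-8]", model_id.upper())
    return QUANT_TIERS[match.group(0)] if match else default

# e.g. quant_factor_for("llama-3.2-1b-q4_k_m.gguf") -> 0.4
#      quant_factor_for("phi-2-f16.gguf")           -> 1.0
```
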
### Performance Score Formula
The final performance score is calculated in three steps (a worked example follows the list):

1. **Base Performance**:
```
base_score = (TG_speed * TG_WEIGHT + PP_speed * PP_WEIGHT)
```

2. **Size and Quantization Adjustment**:
```
# Direct multiplication by model size (in billions of parameters)
performance_score = base_score * model_size * quant_factor
```
- Linear multiplier by model size

3. **Normalization**:
```
normalized_score = (performance_score / max_performance_score) * 100
```

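To make the steps concrete, a worked example with hypothetical numbers (a 3B-parameter model quantized to Q4, benchmarked at 20 t/s generation and 150 t/s prompt processing):

```python
TG_WEIGHT, PP_WEIGHT = 0.6, 0.4

tg_speed = 20.0     # token generation, tokens/s (hypothetical)
pp_speed = 150.0    # prompt processing, tokens/s (hypothetical)
model_size = 3.0    # billions of parameters
quant_factor = 0.4  # Q4 tier

base_score = tg_speed * TG_WEIGHT + pp_speed * PP_WEIGHT    # 12.0 + 60.0 = 72.0
performance_score = base_score * model_size * quant_factor  # 72.0 * 3.0 * 0.4 = 86.4

# Assuming the best device/model combination scored 432.0:
max_performance_score = 432.0
normalized_score = performance_score / max_performance_score * 100  # 20.0
```
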
### Filtering
- Only benchmarks matching the standard conditions are considered (see the sketch below):
  - PP_CONFIG (512) tokens for prompt processing
  - TG_CONFIG (128) tokens for token generation

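A minimal sketch of that filter, assuming the raw benchmark DataFrame carries the run configuration in columns named `PP Config` and `TG Config` (hypothetical names; the app's actual columns may differ):

```python
import pandas as pd

PP_CONFIG = 512  # standard prompt processing token count
TG_CONFIG = 128  # standard token generation count

def filter_standard_runs(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only rows benchmarked at the standard PP/TG token counts."""
    mask = (df["PP Config"] == PP_CONFIG) & (df["TG Config"] == TG_CONFIG)
    return df.loc[mask].copy()
```
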
## Data Aggregation Strategy

### Primary Grouping
- Groups data by `Normalized Device ID` and `Platform`
- Uses normalized device IDs to ensure consistent device identification across different submissions

```python
def normalize_device_id(device_info: dict) -> str:
    if device_info["systemName"].lower() == "ios":
        return f"iOS/{device_info['model']}"

    memory_tier = f"{device_info['totalMemory'] // (1024**3)}GB"
    return f"{device_info['brand']}/{device_info['model']}/{memory_tier}"
```
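For illustration, here is how a couple of hypothetical `device_info` payloads (field values made up) would be normalized by the function above:

```python
# Hypothetical inputs; only the fields used by normalize_device_id are shown.
ios_device = {"systemName": "iOS", "model": "iPhone15,2"}
android_device = {
    "systemName": "Android",
    "brand": "samsung",
    "model": "SM-S918B",
    "totalMemory": 8 * 1024**3,  # memory reported in bytes
}

print(normalize_device_id(ios_device))      # iOS/iPhone15,2
print(normalize_device_id(android_device))  # samsung/SM-S918B/8GB
```

Bucketing non-iOS devices by whole-GB memory tiers keeps submissions from the same model but different RAM configurations in separate groups.
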
src/core/scoring.py
CHANGED

```diff
@@ -20,8 +20,8 @@ def get_default_quant_tiers() -> Dict[str, float]:
         "Q6": 0.6,  # Still fancy
         "Q5": 0.5,  # The "medium rare" of quantization
         "Q4": 0.4,  # Gets the job done
-        "Q3": 0.3,  # Nice try
-        "Q2": 0.2,  # eh
+        "Q3": 0.3,  # Nice try
+        "Q2": 0.2,  # eh
         "Q1": 0.1,  # At this point, just use a Magic 8-Ball
     }
 
@@ -36,7 +36,6 @@ class StandardBenchmarkConditions:
     # Weights for different components in scoring
     TG_WEIGHT: float = 0.6  # Token generation weight
     PP_WEIGHT: float = 0.4  # Prompt processing weight
-    SIZE_BONUS_FACTOR: float = 0.2  # Bonus factor for model size
 
     # Quantization quality tiers
     QUANT_TIERS: Dict[str, float] = field(default_factory=get_default_quant_tiers)
@@ -83,7 +82,7 @@ def calculate_performance_score(df: pd.DataFrame) -> pd.DataFrame:
     This function computes a normalized performance score taking into account:
     - Token generation speed
     - Prompt processing speed
-    - Model size
+    - Model size (direct multiplier)
     - Quantization quality
 
     Only considers benchmarks that match the standard conditions:
@@ -114,31 +113,25 @@ def calculate_performance_score(df: pd.DataFrame) -> pd.DataFrame:
         df["quant_factor"] = df["Model ID"].apply(
             lambda x: get_quantization_tier(x, std)
         )
-        df["size_factor"] = df["Model Size"] / df["Model Size"].max()
         return df
 
     # Calculate base metrics (no normalization needed as we're using standard conditions)
     standard_df["normalized_tg"] = standard_df["Token Generation"]
     standard_df["normalized_pp"] = standard_df["Prompt Processing"]
 
-    # Model size factor (bonus for larger models)
-    standard_df["size_factor"] = (
-        standard_df["Model Size"] / standard_df["Model Size"].max()
-    )
-
     # Quantization quality factor
     standard_df["quant_factor"] = standard_df["Model ID"].apply(
         lambda x: get_quantization_tier(x, std)
     )
 
-    # Combined performance score
+    # Combined performance score using model size as direct multiplier
    standard_df["performance_score"] = (
         (
             standard_df["normalized_tg"] * std.TG_WEIGHT
             + standard_df["normalized_pp"] * std.PP_WEIGHT
         )
+        * standard_df["Model Size"]  # Direct size multiplier
         * standard_df["quant_factor"]  # Apply quantization penalty
-        * (1 + standard_df["size_factor"] * std.SIZE_BONUS_FACTOR)  # Apply size bonus
     )
 
     # Normalize final score to 0-100 range
@@ -157,7 +150,6 @@ def calculate_performance_score(df: pd.DataFrame) -> pd.DataFrame:
             "Model ID",
             "performance_score",
             "quant_factor",
-            "size_factor",
         ]
     ],
     on=["Device", "Platform", "Model ID"],
```