agh123 committed
Commit 19c7047 · 1 Parent(s): 6a390d7

feat(scoring): use model size as direct multiplier

Files changed (2)
  1. docs/ranking_system.md +77 -0
  2. src/core/scoring.py +5 -13
docs/ranking_system.md ADDED
@@ -0,0 +1,77 @@
+ # Device Ranking System
+
+ ## Overview
+ The ranking system implements a multi-dimensional approach to evaluating and comparing device performance across different aspects of LLM (GGUF) model runs.
+
+ ## Scoring Algorithm
+
+ ### Standard Benchmark Conditions
+ ```python
+ PP_CONFIG = 512  # Standard prompt processing token count
+ TG_CONFIG = 128  # Standard token generation count
+
+ # Component Weights
+ TG_WEIGHT = 0.6  # Token generation weight (60%)
+ PP_WEIGHT = 0.4  # Prompt processing weight (40%)
+ ```
+ - PP is given a 40% weight because it is a one-time cost per prompt
+ - TG is given the higher 60% weight because it represents ongoing performance
+
+ ### Quantization Quality Factors
+ ```python
+ QUANT_TIERS = {
+     "F16": 1.0,
+     "F32": 1.0,
+     "Q8": 0.8,
+     "Q6": 0.6,
+     "Q5": 0.5,
+     "Q4": 0.4,
+     "Q3": 0.3,
+     "Q2": 0.2,
+     "Q1": 0.1,
+ }
+ ```
+
+ - Linear scale from 0.1 to 1.0 based on quantization level
+ - F16/F32 are both treated as 1.0 (this skews the results a bit towards quantization)
+
+
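The `get_quantization_tier` helper used in `src/core/scoring.py` below is not shown in this commit. A minimal sketch of how such a lookup could work, assuming the tier token (`Q4`, `F16`, ...) appears verbatim in the GGUF model ID; the function name and the fallback value are assumptions, not repository code:

```python
# Sketch only: assumes the quantization tier token occurs in the model ID string.
QUANT_TIERS = {
    "F16": 1.0, "F32": 1.0, "Q8": 0.8, "Q6": 0.6, "Q5": 0.5,
    "Q4": 0.4, "Q3": 0.3, "Q2": 0.2, "Q1": 0.1,
}

def lookup_quant_factor(model_id: str, default: float = 1.0) -> float:
    """Return the factor for the first tier token found in the model ID."""
    name = model_id.upper()
    for tier, factor in QUANT_TIERS.items():
        if tier in name:
            return factor
    return default  # unknown quantization: treated as full quality (assumption)

print(lookup_quant_factor("llama-2-7b.Q4_K_M.gguf"))  # 0.4
print(lookup_quant_factor("phi-2.F16.gguf"))          # 1.0
```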
+ ### Performance Score Formula
+ The final performance score is calculated in three steps (a worked example follows the list):
+
+ 1. **Base Performance**:
+ ```
+ base_score = (TG_speed * TG_WEIGHT + PP_speed * PP_WEIGHT)
+ ```
+
+ 2. **Size and Quantization Adjustment**:
+ ```
+ # Direct multiplication by model size (in billions)
+ performance_score = base_score * model_size * quant_factor
+ ```
+ - Model size acts as a direct linear multiplier
+
+ 3. **Normalization**:
+ ```
+ normalized_score = (performance_score / max_performance_score) * 100
+ ```
+
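A worked example of the three steps, using made-up speeds for a single benchmark row (a 7B model at Q4); only the weights and the formulas come from this document:

```python
TG_WEIGHT, PP_WEIGHT = 0.6, 0.4

# Illustrative inputs (not real benchmark data)
tg_speed = 12.0     # tokens/s at TG_CONFIG = 128
pp_speed = 85.0     # tokens/s at PP_CONFIG = 512
model_size = 7.0    # billions of parameters
quant_factor = 0.4  # Q4 tier

base_score = tg_speed * TG_WEIGHT + pp_speed * PP_WEIGHT    # 12*0.6 + 85*0.4 = 41.2
performance_score = base_score * model_size * quant_factor  # 41.2 * 7 * 0.4 = 115.36

max_performance_score = 230.72  # assumed best score across all devices
normalized_score = performance_score / max_performance_score * 100  # 50.0
```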
+ ### Filtering
+ - Only benchmarks matching standard conditions are considered:
+   - PP_CONFIG (512) tokens for prompt processing
+   - TG_CONFIG (128) tokens for token generation
+
+ ## Data Aggregation Strategy
+
+ ### Primary Grouping
+ - Groups data by `Normalized Device ID` and `Platform`
+ - Uses normalized device IDs to ensure consistent device identification across different submissions
+
+ ```python
+ def normalize_device_id(device_info: dict) -> str:
+     if device_info["systemName"].lower() == "ios":
+         return f"iOS/{device_info['model']}"
+
+     memory_tier = f"{device_info['totalMemory'] // (1024**3)}GB"
+     return f"{device_info['brand']}/{device_info['model']}/{memory_tier}"
+ ```
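For illustration, how `normalize_device_id` behaves on two hypothetical submissions, and how the resulting key might feed the grouping; the device dicts, the DataFrame column names, and the `max` reduction are assumptions, not taken from the repository:

```python
# Uses normalize_device_id from the block above; the device dicts are made up.
ios_device = {"systemName": "iOS", "model": "iPhone 15 Pro"}
android_device = {
    "systemName": "Android",
    "brand": "samsung",
    "model": "SM-S918B",
    "totalMemory": 8 * 1024**3,  # reported in bytes -> "8GB" tier
}

print(normalize_device_id(ios_device))      # iOS/iPhone 15 Pro
print(normalize_device_id(android_device))  # samsung/SM-S918B/8GB

# Hypothetical aggregation (the doc does not say whether the best or the mean score is kept):
# df.groupby(["Normalized Device ID", "Platform"], as_index=False)["performance_score"].max()
```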
src/core/scoring.py CHANGED
@@ -20,8 +20,8 @@ def get_default_quant_tiers() -> Dict[str, float]:
         "Q6": 0.6,  # Still fancy
         "Q5": 0.5,  # The "medium rare" of quantization
         "Q4": 0.4,  # Gets the job done
-        "Q3": 0.3,  # Nice try
-        "Q2": 0.2,  # eh
+        "Q3": 0.3,  # Nice try
+        "Q2": 0.2,  # eh
         "Q1": 0.1,  # At this point, just use a Magic 8-Ball
     }
 
@@ -36,7 +36,6 @@ class StandardBenchmarkConditions:
     # Weights for different components in scoring
     TG_WEIGHT: float = 0.6  # Token generation weight
     PP_WEIGHT: float = 0.4  # Prompt processing weight
-    SIZE_BONUS_FACTOR: float = 0.2  # Bonus factor for model size
 
     # Quantization quality tiers
     QUANT_TIERS: Dict[str, float] = field(default_factory=get_default_quant_tiers)
@@ -83,7 +82,7 @@ def calculate_performance_score(df: pd.DataFrame) -> pd.DataFrame:
     This function computes a normalized performance score taking into account:
     - Token generation speed
     - Prompt processing speed
-    - Model size
+    - Model size (direct multiplier)
     - Quantization quality
 
     Only considers benchmarks that match the standard conditions:
@@ -114,31 +113,25 @@ def calculate_performance_score(df: pd.DataFrame) -> pd.DataFrame:
         df["quant_factor"] = df["Model ID"].apply(
             lambda x: get_quantization_tier(x, std)
         )
-        df["size_factor"] = df["Model Size"] / df["Model Size"].max()
         return df
 
     # Calculate base metrics (no normalization needed as we're using standard conditions)
     standard_df["normalized_tg"] = standard_df["Token Generation"]
     standard_df["normalized_pp"] = standard_df["Prompt Processing"]
 
-    # Model size factor (bonus for larger models)
-    standard_df["size_factor"] = (
-        standard_df["Model Size"] / standard_df["Model Size"].max()
-    )
-
     # Quantization quality factor
     standard_df["quant_factor"] = standard_df["Model ID"].apply(
         lambda x: get_quantization_tier(x, std)
     )
 
-    # Combined performance score
+    # Combined performance score using model size as direct multiplier
     standard_df["performance_score"] = (
         (
             standard_df["normalized_tg"] * std.TG_WEIGHT
             + standard_df["normalized_pp"] * std.PP_WEIGHT
         )
+        * standard_df["Model Size"]  # Direct size multiplier
         * standard_df["quant_factor"]  # Apply quantization penalty
-        * (1 + standard_df["size_factor"] * std.SIZE_BONUS_FACTOR)  # Apply size bonus
     )
 
     # Normalize final score to 0-100 range
@@ -157,7 +150,6 @@ def calculate_performance_score(df: pd.DataFrame) -> pd.DataFrame:
                 "Model ID",
                 "performance_score",
                 "quant_factor",
-                "size_factor",
             ]
         ],
         on=["Device", "Platform", "Model ID"],
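For context, a rough before/after comparison of the two formulas on made-up numbers (both models assumed to be Q4; raw scores shown before the 0-100 normalization). This is only an illustration of why a larger model can now outrank a much faster small one:

```python
# Pre-commit:  base * quant * (1 + size/max_size * SIZE_BONUS_FACTOR)
# Post-commit: base * size * quant
TG_W, PP_W, SIZE_BONUS_FACTOR, QUANT = 0.6, 0.4, 0.2, 0.4

models = {
    # name: (token generation t/s, prompt processing t/s, size in billions) -- made up
    "1B": (60.0, 400.0, 1.0),
    "7B": (12.0, 85.0, 7.0),
}
max_size = max(size for _, _, size in models.values())

for name, (tg, pp, size) in models.items():
    base = tg * TG_W + pp * PP_W
    old = base * QUANT * (1 + (size / max_size) * SIZE_BONUS_FACTOR)
    new = base * size * QUANT
    print(f"{name}: old={old:.1f} new={new:.1f}")

# 1B: old=80.6 new=78.4   (old scheme favours the small, fast model)
# 7B: old=19.8 new=115.4  (new scheme rewards the larger model directly)
```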