Update README.md
better table markdown
![image.png](https://cdn-uploads.huggingface.co/production/uploads/6527e89a8808d80ccff88b7a/l0zLjf1QyOUpKflpEleX2.png)
README.md CHANGED
@@ -58,7 +58,7 @@ Hermes 3 is competitive, if not superior, to Llama-3.1 Instruct models at genera
 
 
 ## GPT4All:
-```
+
 | Task |Version| Metric |Value | |Stderr|
 |-------------|------:|--------|-----:|---|-----:|
 |arc_challenge| 0|acc |0.5529|± |0.0145|
@@ -73,12 +73,12 @@ Hermes 3 is competitive, if not superior, to Llama-3.1 Instruct models at genera
 |piqa | 0|acc |0.8063|± |0.0092|
 | | |acc_norm|0.8156|± |0.0090|
 |winogrande | 0|acc |0.7372|± |0.0124|
-```
+
 
 Average: 72.59
 
 ## AGIEval:
-```
+
 | Task |Version| Metric |Value | |Stderr|
 |------------------------------|------:|--------|-----:|---|-----:|
 |agieval_aqua_rat | 0|acc |0.2441|± |0.0270|
@@ -97,13 +97,12 @@ Average: 72.59
 | | |acc_norm|0.4223|± |0.0345|
 |agieval_sat_math | 0|acc |0.4000|± |0.0331|
 | | |acc_norm|0.3455|± |0.0321|
-```
+
 
 Average: 44.05
 
 ## BigBench:
 
-```
 
 | Task |Version| Metric |Value | |Stderr|
 |------------------------------------------------|------:|---------------------|-----:|---|-----:|
@@ -126,7 +125,7 @@ Average: 44.05
 |bigbench_tracking_shuffled_objects_five_objects | 0|multiple_choice_grade|0.2216|± |0.0118|
 |bigbench_tracking_shuffled_objects_seven_objects| 0|multiple_choice_grade|0.1594|± |0.0088|
 |bigbench_tracking_shuffled_objects_three_objects| 0|multiple_choice_grade|0.5367|± |0.0288|
-```
+
 
 Average: 44.13
 
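The edit is purely presentational: each results block was previously wrapped in triple-backtick code fences, so the pipe tables rendered as preformatted text; dropping the fences lets them render as real markdown tables, which is what the attached screenshot shows. The per-section `Average:` figures are presumably the arithmetic mean of the per-task scores expressed as a percentage; the sketch below illustrates that calculation using only the rows visible in this diff, so it will not reproduce the card's 72.59, and which metric (acc vs. acc_norm) feeds the card's own average is an assumption here.

```python
# Minimal sketch (assumption): a section "Average" as the mean of per-task scores, in percent.
# Only the GPT4All rows visible in this diff excerpt are included, so the result (~69.88)
# will not match the card's full-table figure of 72.59.
gpt4all_acc = {
    "arc_challenge": 0.5529,
    "piqa": 0.8063,
    "winogrande": 0.7372,
}

average = 100 * sum(gpt4all_acc.values()) / len(gpt4all_acc)
print(f"Average over visible tasks: {average:.2f}")
```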