Roman Solomatin committed on
Commit
f2e732e
·
unverified ·
1 Parent(s): df36f2a

align results with models card

Browse files
Files changed (3) hide show
  1. EXTERNAL_MODEL_RESULTS.json +0 -0
  2. config.yaml +42 -42
  3. refresh.py +9 -5
EXTERNAL_MODEL_RESULTS.json CHANGED
The diff for this file is too large to render. See raw diff
 
config.yaml CHANGED
@@ -23,7 +23,7 @@ tasks:
23
  metric: max_ap
24
  metric_description: "Average Precision (AP) based on the models similarity metric (usually cosine)"
25
  task_description: "Pair classification is the task of determining whether two texts are similar."
26
- Reranking:
27
  icon: "🥈"
28
  metric: map
29
  metric_description: "Mean Average Precision (MAP)"
@@ -345,35 +345,35 @@ boards:
345
  credits: "[Roman Solomatin](https://github.com/Samoed) and SaluteDevices: [Alena Fenogenova](https://github.com/Alenush), [Aleksandr Abramov](https://github.com/Ab1992ao), [Artem Snegirev](https://github.com/artemsnegirev), [Anna Maksimova](https://github.com/anpalmak2003), [Maria Tikhonova](https://github.com/MariyaTikhonova)"
346
  tasks:
347
  Classification:
348
- - GeoreviewClassification (rus-Cyrl)
349
- - HeadlineClassification (rus-Cyrl)
350
- - InappropriatenessClassification (rus-Cyrl)
351
- - KinopoiskClassification (rus-Cyrl)
352
- - RuReviewsClassification (rus-Cyrl)
353
- - RuSciBenchGRNTIClassification (rus-Cyrl)
354
- - RuSciBenchOECDClassification (rus-Cyrl)
355
- - MassiveIntentClassification (rus-Cyrl)
356
- - MassiveScenarioClassification (rus-Cyrl)
357
  Clustering:
358
- - GeoreviewClusteringP2P (rus-Cyrl)
359
- - RuSciBenchGRNTIClusteringP2P (rus-Cyrl)
360
- - RuSciBenchOECDClusteringP2P (rus-Cyrl)
361
  PairClassification:
362
- - TERRa (rus-Cyrl)
363
  Reranking:
364
- - RuBQReranking (rus-Cyrl)
365
- - MIRACLReranking (rus-Cyrl)
366
  Retrieval:
367
- - RiaNewsRetrieval (rus-Cyrl)
368
- - RuBQRetrieval (rus-Cyrl)
369
- - MIRACLRetrieval (rus-Cyrl)
370
  STS:
371
- - RUParaPhraserSTS (rus-Cyrl)
372
- - RuSTSBenchmarkSTS (rus-Cyrl)
373
- - STS22 (rus-Cyrl)
374
  MultilabelClassification:
375
- - CEDRClassification (rus-Cyrl)
376
- - SensitiveTopicsClassification (rus-Cyrl)
377
  se:
378
  title: Swedish
379
  language_long: Swedish
@@ -530,23 +530,23 @@ boards:
530
  metric: nDCG@10
531
  tasks:
532
  Retrieval:
533
- - AppsRetrieval (eng-Latn_python-Code)
534
- - CodeFeedbackMT (c-Code_sql-Code_python-Code_shell-Code_swift-Code_eng-Latn)
535
- - CodeFeedbackST (python-Code_javascript-Code_go-Code_ruby-Code_java-Code_php-Code_eng-Latn)
536
- - CodeSearchNetCCRetrieval (python-Code)
537
- - CodeSearchNetCCRetrieval (javascript-Code)
538
- - CodeSearchNetCCRetrieval (go-Code)
539
- - CodeSearchNetCCRetrieval (ruby-Code)
540
- - CodeSearchNetCCRetrieval (java-Code)
541
- - CodeSearchNetCCRetrieval (php-Code)
542
- - CodeSearchNetRetrieval (python-Code)
543
- - CodeSearchNetRetrieval (javascript-Code)
544
- - CodeSearchNetRetrieval (go-Code)
545
- - CodeSearchNetRetrieval (ruby-Code)
546
- - CodeSearchNetRetrieval (java-Code)
547
- - CodeSearchNetRetrieval (php-Code)
548
- - CodeTransOceanContest (python-Code_c++-Code)
549
  - CodeTransOceanDL
550
- - CosQA (eng-Latn_python-Code)
551
  - StackOverflowQA
552
- - SyntheticText2SQL (eng-Latn_sql-Code)
 
23
  metric: max_ap
24
  metric_description: "Average Precision (AP) based on the models similarity metric (usually cosine)"
25
  task_description: "Pair classification is the task of determining whether two texts are similar."
26
+ Reranking:
27
  icon: "🥈"
28
  metric: map
29
  metric_description: "Mean Average Precision (MAP)"
 
345
  credits: "[Roman Solomatin](https://github.com/Samoed) and SaluteDevices: [Alena Fenogenova](https://github.com/Alenush), [Aleksandr Abramov](https://github.com/Ab1992ao), [Artem Snegirev](https://github.com/artemsnegirev), [Anna Maksimova](https://github.com/anpalmak2003), [Maria Tikhonova](https://github.com/MariyaTikhonova)"
346
  tasks:
347
  Classification:
348
+ - GeoreviewClassification
349
+ - HeadlineClassification
350
+ - InappropriatenessClassification
351
+ - KinopoiskClassification
352
+ - RuReviewsClassification
353
+ - RuSciBenchGRNTIClassification
354
+ - RuSciBenchOECDClassification
355
+ - MassiveIntentClassification (ru)
356
+ - MassiveScenarioClassification (ru)
357
  Clustering:
358
+ - GeoreviewClusteringP2P
359
+ - RuSciBenchGRNTIClusteringP2P
360
+ - RuSciBenchOECDClusteringP2P
361
  PairClassification:
362
+ - TERRa
363
  Reranking:
364
+ - RuBQReranking
365
+ - MIRACLReranking (ru)
366
  Retrieval:
367
+ - RiaNewsRetrieval
368
+ - RuBQRetrieval
369
+ - MIRACLRetrieval (ru)
370
  STS:
371
+ - RUParaPhraserSTS
372
+ - RuSTSBenchmarkSTS
373
+ - STS22 (ru)
374
  MultilabelClassification:
375
+ - CEDRClassification
376
+ - SensitiveTopicsClassification
377
  se:
378
  title: Swedish
379
  language_long: Swedish
 
530
  metric: nDCG@10
531
  tasks:
532
  Retrieval:
533
+ - AppsRetrieval
534
+ - CodeFeedbackMT
535
+ - CodeFeedbackST
536
+ - CodeSearchNetCCRetrieval (python)
537
+ - CodeSearchNetCCRetrieval (javascript)
538
+ - CodeSearchNetCCRetrieval (go)
539
+ - CodeSearchNetCCRetrieval (ruby)
540
+ - CodeSearchNetCCRetrieval (java)
541
+ - CodeSearchNetCCRetrieval (php)
542
+ - CodeSearchNetRetrieval (python)
543
+ - CodeSearchNetRetrieval (javascript)
544
+ - CodeSearchNetRetrieval (go)
545
+ - CodeSearchNetRetrieval (ruby)
546
+ - CodeSearchNetRetrieval (java)
547
+ - CodeSearchNetRetrieval (php)
548
+ - CodeTransOceanContest
549
  - CodeTransOceanDL
550
+ - CosQA
551
  - StackOverflowQA
552
+ - SyntheticText2SQL
refresh.py CHANGED
@@ -132,11 +132,11 @@ def make_clickable_model(model_name: str, link: None | str = None) -> str:
132
 
133
 
134
  def add_lang(examples):
135
- if not (examples["eval_language"]) or (examples["eval_language"] == "default"):
136
  examples["mteb_dataset_name_with_lang"] = examples["mteb_dataset_name"]
137
  else:
138
  examples["mteb_dataset_name_with_lang"] = (
139
- examples["mteb_dataset_name"] + f' ({examples["eval_language"]})'
140
  )
141
  return examples
142
 
@@ -313,7 +313,7 @@ def get_external_model_results():
313
 
314
  # Save & cache EXTERNAL_MODEL_RESULTS
315
  with open("EXTERNAL_MODEL_RESULTS.json", "w") as f:
316
- json.dump(EXTERNAL_MODEL_RESULTS, f, indent=4)
317
 
318
  return EXTERNAL_MODEL_RESULTS
319
 
@@ -332,6 +332,10 @@ def download_or_use_cache(modelId: str):
332
  return meta
333
 
334
 
 
 
 
 
335
  def get_mteb_data(
336
  tasks: list = ["Clustering"],
337
  langs: list = [],
@@ -450,11 +454,11 @@ def get_mteb_data(
450
  try:
451
  out = [
452
  {
453
- res["dataset"]["name"].replace("MTEB ", ""): [
454
  round(score["value"], 2)
455
  for score in res["metrics"]
456
  if filter_metric_fetched(
457
- res["dataset"]["name"].replace("MTEB ", ""),
458
  score["type"],
459
  task_to_metric.get(res["task"]["type"]),
460
  res["dataset"]["split"],
 
132
 
133
 
134
  def add_lang(examples):
135
+ if not (examples["hf_subset"]) or (examples["hf_subset"] == "default"):
136
  examples["mteb_dataset_name_with_lang"] = examples["mteb_dataset_name"]
137
  else:
138
  examples["mteb_dataset_name_with_lang"] = (
139
+ examples["mteb_dataset_name"] + f' ({examples["hf_subset"]})'
140
  )
141
  return examples
142
 
 
313
 
314
  # Save & cache EXTERNAL_MODEL_RESULTS
315
  with open("EXTERNAL_MODEL_RESULTS.json", "w") as f:
316
+ json.dump(dict(sorted(EXTERNAL_MODEL_RESULTS.items())), f, indent=4)
317
 
318
  return EXTERNAL_MODEL_RESULTS
319
 
 
332
  return meta
333
 
334
 
335
+ def simplify_dataset_name(name):
336
+ return name.replace("MTEB ", "").replace(" (default)", "")
337
+
338
+
339
  def get_mteb_data(
340
  tasks: list = ["Clustering"],
341
  langs: list = [],
 
454
  try:
455
  out = [
456
  {
457
+ simplify_dataset_name(res["dataset"]["name"]): [
458
  round(score["value"], 2)
459
  for score in res["metrics"]
460
  if filter_metric_fetched(
461
+ simplify_dataset_name(res["dataset"]["name"]),
462
  score["type"],
463
  task_to_metric.get(res["task"]["type"]),
464
  res["dataset"]["split"],