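"""Fetch per-language benchmark results from Weights & Biases evaluation runs
and write them out as local JSON files."""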

import argparse
import json
import os

import wandb

curr_dir = os.path.dirname(os.path.realpath(__file__))

# ISO 639-3 codes for the languages covered by the benchmarks.
LANGUAGES = ['amh', 'eng', 'ewe', 'fra', 'hau', 'ibo', 'kin', 'lin', 'lug',
             'orm', 'sna', 'sot', 'swa', 'twi', 'wol', 'xho', 'yor', 'zul']

# Maps each benchmark name to the W&B project that stores its evaluation runs.
BENCHMARK2PROJECT = {
    "afrimmlu_direct": "african-research-collective/llm-evaluation-afrimmlu-direct",
    "afrimmlu_translate": "african-research-collective/llm-evaluation-afrimmlu-translate",
}

BENCHMARK_VERSION = 1.0


def main(args):
    api = wandb.Api()

    # All evaluation runs logged under the benchmark's W&B project.
    runs = api.runs(BENCHMARK2PROJECT[args.benchmark])
    print(runs)

    for run in runs:
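        # A run may hold results for several languages; try each one and skip any that are missing.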
        for lang in LANGUAGES:
            try:
                lang_result_key = f'{args.benchmark}_{lang}'

                results = {lang_result_key: {}}
                config = {}
                versions = {}
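
                # Summary metrics are logged under keys of the form '<benchmark>_<lang>/<metric>'.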
                summary = run.summary._json_dict
                for metric in ('acc', 'acc_stderr', 'f1', 'f1_stderr'):
                    results[lang_result_key][metric] = summary[f'{lang_result_key}/{metric}']

                versions[lang_result_key] = BENCHMARK_VERSION
                versions['wandb_run_name'] = run.name
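
                # Copy over the CLI configuration the evaluation run was launched with.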
                cli_configs = run.config['cli_configs']
                for key in ('model', 'model_args', 'batch_size', 'device', 'model_dtype',
                            'numpy_seed', 'torch_seed', 'random_seed', 'fewshot_seed'):
                    config[key] = cli_configs[key]

                final_json_object = {
                    'results': results,
                    'versions': versions,
                    'config': config,
                }

                # model_args typically looks like 'pretrained=<org>/<model>,...';
                # keep only the final path component as the model name.
                pretrained_model = config['model_args'].split(',')[0].split('=')[1].split('/')[-1]
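
                # Assumption: the 'evals/<benchmark>/' output tree next to this
                # script may not exist yet, so create it before writing.
                os.makedirs(os.path.join(curr_dir, 'evals', args.benchmark), exist_ok=True)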

                out_path = os.path.join(curr_dir, 'evals', args.benchmark,
                                        f'{args.benchmark}_{lang}-{pretrained_model}.json')
                with open(out_path, 'w') as f:
                    json.dump(final_json_object, f, indent=2)

            except KeyError as e:
                # The run has no results (or config entry) for this language; skip it.
                print(f"KeyError: {e}")
                continue


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--benchmark', type=str, required=True,
                        choices=list(BENCHMARK2PROJECT))
    args = parser.parse_args()
    main(args)
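
# Example invocation (script name illustrative):
#   python export_wandb_results.py --benchmark afrimmlu_direct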