open_Deep-Research / scripts /gaia_scorer.py
m-ric's picture
m-ric HF staff
Start agent
b97f2de
import re
import string
import warnings
def normalize_number_str(number_str: str) -> float:
# we replace these common units and commas to allow
# conversion to float
for char in ["$", "%", ","]:
number_str = number_str.replace(char, "")
try:
return float(number_str)
except ValueError:
print(f"String {number_str} cannot be normalized to number str.")
return float("inf")
def split_string(
s: str,
char_list: list[str] = [",", ";"],
) -> list[str]:
pattern = f"[{''.join(char_list)}]"
return re.split(pattern, s)
def is_float(element: any) -> bool:
try:
float(element)
return True
except ValueError:
return False
def question_scorer(
model_answer: str,
ground_truth: str,
) -> bool:
# if gt is a number
if is_float(ground_truth):
normalized_answer = normalize_number_str(str(model_answer))
return normalized_answer == float(ground_truth)
# if gt is a list
elif any(char in ground_truth for char in [",", ";"]):
# question with the fish: normalization removes punct
gt_elems = split_string(ground_truth)
ma_elems = split_string(model_answer)
# check length is the same
if len(gt_elems) != len(ma_elems):
warnings.warn("Answer lists have different lengths, returning False.", UserWarning)
return False
# compare each element as float or str
comparisons = []
for ma_elem, gt_elem in zip(ma_elems, gt_elems):
if is_float(gt_elem):
normalized_ma_elem = normalize_number_str(ma_elem)
comparisons.append(normalized_ma_elem == float(gt_elem))
else:
# we do not remove punct since comparisons can include punct
comparisons.append(
normalize_str(ma_elem, remove_punct=False) == normalize_str(gt_elem, remove_punct=False)
)
return all(comparisons)
# if gt is a str
else:
return normalize_str(model_answer) == normalize_str(ground_truth)
def check_prediction_contains_answer_letters_in_order(prediction, true_answer):
prediction = prediction.lower()
true_answer = true_answer.lower()
if len(prediction) > len(true_answer) * 3:
return False
i = 0
for letter in true_answer:
if letter in prediction[i:]:
i += prediction[i:].index(letter)
else:
return False
return True
def check_close_call(prediction, true_answer, is_correct):
if is_correct:
return True
else:
if is_float(true_answer):
return is_correct
else:
if (
check_prediction_contains_answer_letters_in_order(str(prediction), str(true_answer))
and len(str(true_answer)) * 0.5 <= len(str(prediction)) <= len(str(true_answer)) * 2
):
print(f"Close call: {prediction} vs {true_answer}")
return True
else:
return False
def normalize_str(input_str, remove_punct=True) -> str:
"""
Normalize a string by:
- Removing all white spaces
- Optionally removing punctuation (if remove_punct is True)
- Converting to lowercase
Parameters:
- input_str: str, the string to normalize
- remove_punct: bool, whether to remove punctuation (default: True)
Returns:
- str, the normalized string
"""
# Remove all white spaces. Required e.g for seagull vs. sea gull
no_spaces = re.sub(r"\s", "", input_str)
# Remove punctuation, if specified.
if remove_punct:
translator = str.maketrans("", "", string.punctuation)
return no_spaces.lower().translate(translator)
else:
return no_spaces.lower()