import re
[docs]
def strip_answer(answer):
    """Remove boilerplate tokens from a model answer and trim whitespace.

    The literal substrings ``The``, ``If``, ``[INST]``, ``[/INST]``,
    ``<Img>`` and ``</Img>`` are deleted wherever they occur, then leading
    and trailing whitespace is stripped.

    Args:
        answer: Raw answer text.

    Returns:
        The answer with the tokens above removed and surrounding whitespace
        stripped.
    """
    # Plain substring replacement is used on purpose: as a regular
    # expression, "[INST]" is a character class that matches any single
    # I, N, S or T (and "[/INST]" additionally "/"), which would delete
    # those letters from the whole answer instead of removing the tag.
    for token in ("The", "If", "[INST]", "[/INST]", "<Img>", "</Img>"):
        answer = answer.replace(token, "")
    return answer.strip()
[docs]
# Characters deleted by remove_special_characters(). Built once at import
# time; the pieces are concatenated into a single character class.
_SPECIAL_CHARACTERS = re.compile(
    r"["
    r"-`\\【】\*\$、,。.;:?!"  # ASCII and CJK punctuation ("-" first => literal)
    r"\uff0c\uff1b\uff1a\uff1f\uff01"  # full-width , ; : ? !
    r"\s"  # any whitespace, newlines included
    r"\u4e00-\u9fff"  # CJK unified ideographs
    r"0-9①②③④⑤⑥⑦"  # ASCII digits and circled digits 1-7
    r"\[\]<>a-z=(){}'"  # brackets, lowercase ASCII letters, "=", single quote
    r'"'  # double quote
    r"]+"
)


def remove_special_characters(text):
    """Delete punctuation, whitespace, digits, CJK and lowercase characters.

    Every run of characters from the set above is removed. Characters that
    are not in the set (for example uppercase ASCII letters, ``_`` or ``+``)
    are kept in their original order.

    Args:
        text: String to clean.

    Returns:
        ``text`` with all listed characters removed.
    """
    return _SPECIAL_CHARACTERS.sub("", text)
[docs]
def process_multiple_choice(answer):
    """Extract the selected option letter(s) from a free-form answer.

    The answer is first passed through ``strip_answer``. If it then starts
    with a single uppercase letter followed by a period, that letter is
    returned. Otherwise the text after the first matching marker phrase
    (tried in the order listed below) is kept, cleaned with
    ``remove_special_characters``, and every uppercase ASCII letter left
    on its last line is concatenated and returned.

    Args:
        answer: Raw answer text.

    Returns:
        A string of uppercase letters; empty when none are found.
    """
    answer = strip_answer(answer)

    # Fast path: reply of the form "B. ...".
    leading_option = re.match(r"^([A-Z])\.", answer)
    if leading_option:
        return leading_option.group(1)

    # Order matters: only the first marker (in this order) that occurs in
    # the answer is used, and the text after its last occurrence is kept.
    answer_markers = (
        "boxed",
        "Answer:",
        "Answer is",
        "answer is",
        "option is",
        "Correct option",
        "correct option",
        "Answer",
        "answer",
        "故选",
        "选择",
        "正确选项为",
        "答案选",
        "答案为",
        "答案是",
        "因此",
        "答案",
    )
    marker = next((m for m in answer_markers if m in answer), None)
    if marker is not None:
        answer = answer.split(marker)[-1]

    cleaned = remove_special_characters(answer)
    # Keep the last line. NOTE(review): remove_special_characters, as
    # written in this file, already strips whitespace including newlines,
    # so this split is expected to leave the text unchanged.
    last_line = cleaned.split("\n")[-1]
    return "".join(re.findall(r"[A-Z]", last_line))
[docs]
# Unit suffixes recognised by remove_unit().
_UNITS = (
    "cm",
    "m",
    "km",
    "mm",
    "s",
    "h",
    "kg",
    "g",
    "l",
    "ml",
    "mol",
    "厘米",
    "米",
    "千米",
    "°",
    "毫米",
    "月",
    "秒",
    "小时",
    "克",
    "千克",
    "升",
    "毫升",
    "摩尔",
)

# A number (optional sign, optional decimal part) followed by exactly one
# unit, with optional whitespace around and between them. Longer units are
# listed first so that e.g. "mm" is tried before "m".
_NUMBER_WITH_UNIT = re.compile(
    r"\s*([+-]?\d+(?:\.\d+)?)\s*(?:"
    + "|".join(re.escape(unit) for unit in sorted(_UNITS, key=len, reverse=True))
    + r")\s*"
)


def remove_unit(value):
    """Strip a trailing unit from a purely numeric quantity.

    If ``value`` consists of a number followed by one of the known units
    (for example ``"12cm"``, ``"3.5 kg"`` or ``"10千克"``), only the number
    is returned. Any other string is returned unchanged.

    Args:
        value: Answer string to inspect.

    Returns:
        The numeric part as a string when ``value`` is number-plus-unit,
        otherwise ``value`` itself.
    """
    match = _NUMBER_WITH_UNIT.fullmatch(value)
    return match.group(1) if match else value
[docs]
def convert_circled_numbers(text):
    """Replace the circled numerals ①–⑩ with their plain decimal digits.

    Args:
        text: String that may contain circled numerals.

    Returns:
        ``text`` with each of ①..⑩ replaced by "1".."10"; all other
        characters are left untouched.
    """
    circled_to_plain = str.maketrans(
        {
            "①": "1",
            "②": "2",
            "③": "3",
            "④": "4",
            "⑤": "5",
            "⑥": "6",
            "⑦": "7",
            "⑧": "8",
            "⑨": "9",
            "⑩": "10",
        }
    )
    return text.translate(circled_to_plain)
[docs]
def normalize_string(raw_answer):
    r"""Normalize a free-form answer string into a canonical short form.

    Steps, applied in order:

    1. When the text contains no ``$``, the markers ``\times``, ``×``,
       ``不对`` and ``不正确`` are replaced with ``错误``.
    2. ``\text{...}`` wrappers are unwrapped.
    3. Full-width punctuation is mapped to ASCII, ``$`` is dropped and a
       few synonyms are unified (see ``replace_dict``).
    4. Circled numerals are converted with ``convert_circled_numbers``.
    5. If the whole string equals one of a few true/false spellings it is
       mapped to its canonical form (``strict_replace_dict``).
    6. The text after the first matching marker phrase is kept.
    7. A leading ``:`` and one trailing ``.``, ``,``, ``:`` or ``;`` are
       removed, and ``remove_unit`` is applied.

    Args:
        raw_answer: Answer text to normalize.

    Returns:
        The normalized answer, stripped of surrounding whitespace.
    """
    if "$" not in raw_answer:
        for word in ("\\times", "不对", "不正确", "×"):
            raw_answer = raw_answer.replace(word, "错误")

    raw_answer = re.sub(r"\\text\s*\{(.*?)\}", r"\1", raw_answer)

    # Full-width punctuation keys are written as escapes so they cannot be
    # silently folded to their ASCII look-alikes (which would turn the
    # entry into a no-op).
    replace_dict = {
        "√": "正确",
        "\uff1a": ":",  # full-width colon
        "$": "",
        "\uff08": "(",  # full-width left parenthesis
        "\uff09": ")",  # full-width right parenthesis
        "\uff0c": ",",  # full-width comma
        "。": ".",
        "变小": "减小",
        "变大": "增大",
        "路程": "距离",
        "\\pi": "π",
        "\uff1e": ">",  # full-width greater-than
        "\uff1c": "<",  # full-width less-than
        "\uff1b": ";",  # full-width semicolon
    }
    for old, new in replace_dict.items():
        raw_answer = raw_answer.replace(old, new)

    # Convert circled numbers to regular numbers.
    raw_answer = convert_circled_numbers(raw_answer)

    # Applied only when the entire answer equals the key.
    strict_replace_dict = {
        "错": "错误",
        "对": "正确",
        "(F)": "F",
        "(T)": "T",
        "(正确)": "正确",
        "(错误)": "错误",
        "“T”": "T",
        "“F”": "F",
    }
    raw_answer = strict_replace_dict.get(raw_answer, raw_answer)

    # Keep the text after the first marker (in this order) that occurs in
    # the answer; more specific phrases are listed before shorter ones.
    key_words = (
        "Answer:",
        "Answer is",
        "answer is",
        "Answer",
        "answer",
        "答案为",
        "答案是",
        "解是",
        "解为",
        "答案",
        "结果",
        "为",
        "因此",
        " = ",
    )
    for key_word in key_words:
        if key_word in raw_answer:
            raw_answer = raw_answer.split(key_word)[-1]
            break

    raw_answer = raw_answer.strip()
    # Remove a leading ":" left over from phrases such as "Answer: x".
    if raw_answer.startswith(":"):
        raw_answer = raw_answer[1:]
    # Remove one trailing punctuation mark.
    if raw_answer.endswith((".", ",", ":", ";")):
        raw_answer = raw_answer[:-1]

    return remove_unit(raw_answer).strip()