Spaces:
Runtime error
Runtime error
Update scorer.py
Browse files
scorer.py
CHANGED
|
@@ -18,6 +18,25 @@ def normalize_number_str(number_str: str) -> float:
|
|
| 18 |
return float("inf")
|
| 19 |
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
def split_string(
|
| 22 |
s: str,
|
| 23 |
char_list: list[str] = [",", ";"],
|
|
@@ -37,6 +56,57 @@ def question_scorer(
|
|
| 37 |
except ValueError:
|
| 38 |
return False
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
score = 0
|
| 41 |
if user_task["final_answer"] == val["Final answer"]:
|
| 42 |
score = val["Total score"]
|
|
|
|
| 18 |
return float("inf")
|
| 19 |
|
| 20 |
|
| 21 |
+
def normalize_answer(a):
    """Return a canonical, case-insensitive form of an answer.

    The value is converted to a string, stripped of leading and trailing
    whitespace, and lower-cased. Internal whitespace and punctuation are
    left untouched.

    Args:
        a: The answer to normalize. May be a string, a list/tuple of
            fragments (concatenated with no separator), ``None``, or any
            other value, which is converted with ``str``.

    Returns:
        The normalized string; ``""`` when ``a`` is ``None``.
    """
    if a is None:
        # A missing answer normalizes to the empty string instead of
        # raising AttributeError on ``None.strip()``.
        return ""
    if isinstance(a, (list, tuple)):
        # Fragments may be non-strings (e.g. numbers); skip missing ones.
        a = "".join(str(part) for part in a if part is not None)
    elif not isinstance(a, str):
        a = str(a)
    return a.strip().lower()
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def exact_match(answer, ground_truth):
    """Report whether two answers are identical once normalized.

    Both arguments are passed through ``normalize_answer`` and the two
    results are compared for equality.
    """
    candidate = normalize_answer(answer)
    reference = normalize_answer(ground_truth)
    return candidate == reference
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def keyword_match(answer, ground_truth):
    """Report whether the normalized ground truth occurs inside the answer.

    Both arguments are passed through ``normalize_answer``; the match is a
    plain substring test on the normalized strings.

    Args:
        answer: The submitted answer.
        ground_truth: The expected keyword or phrase.

    Returns:
        ``True`` when the normalized ground truth is non-empty and is a
        substring of the normalized answer, otherwise ``False``.
    """
    keyword = normalize_answer(ground_truth)
    if not keyword:
        # The empty string is a substring of every string, so a blank
        # ground truth would otherwise mark every answer as correct.
        return False
    return keyword in normalize_answer(answer)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
def split_string(
|
| 41 |
s: str,
|
| 42 |
char_list: list[str] = [",", ";"],
|
|
|
|
| 56 |
except ValueError:
|
| 57 |
return False
|
| 58 |
|
| 59 |
+
# Scoring mechanism
|
| 60 |
+
level = 0
|
| 61 |
+
expertise = 0
|
| 62 |
+
reasoning = 0
|
| 63 |
+
comprehension = 0
|
| 64 |
+
|
| 65 |
+
final_answer = user_task["final_answer"]
|
| 66 |
+
expected_answer = val["Final answer"]
|
| 67 |
+
|
| 68 |
+
data = val["score"]
|
| 69 |
+
chat_score = []
|
| 70 |
+
for i in range(len(data['type'])):
|
| 71 |
+
item = {
|
| 72 |
+
'type': data['type'][i],
|
| 73 |
+
'question': data['question'][i],
|
| 74 |
+
'choices': data['choices'][i],
|
| 75 |
+
'answer': data['answer'][i],
|
| 76 |
+
'expertise': data['expertise'][i],
|
| 77 |
+
'reasoning': data['reasoning'][i],
|
| 78 |
+
'comprehension': data['comprehension'][i],
|
| 79 |
+
'score': data['score'][i]
|
| 80 |
+
}
|
| 81 |
+
chat_score.append(item)
|
| 82 |
+
|
| 83 |
+
for i, score_item in enumerate(chat_score):
|
| 84 |
+
answer_true = False
|
| 85 |
+
if score_item['type'].lower() == 'multiple choice':
|
| 86 |
+
if exact_match(user_task["score_answer"][i], score_item['answer']):
|
| 87 |
+
answer_true = True
|
| 88 |
+
elif score_item['type'].lower() == 'fill in the blanks':
|
| 89 |
+
if keyword_match(user_task["score_answer"][i], score_item['answer']):
|
| 90 |
+
answer_true = True
|
| 91 |
+
elif score_item['type'].lower() == 'short answer questions':
|
| 92 |
+
for ground_truth in score_item['answer']:
|
| 93 |
+
if keyword_match(user_task["score_answer"][i], ground_truth):
|
| 94 |
+
answer_true = True
|
| 95 |
+
break
|
| 96 |
+
# print(answer, score_item['answer'], answer_true)
|
| 97 |
+
# Add points
|
| 98 |
+
if answer_true:
|
| 99 |
+
expertise += score_item['expertise']
|
| 100 |
+
reasoning += score_item['reasoning']
|
| 101 |
+
comprehension += score_item['comprehension']
|
| 102 |
+
if score_item['score'] > level:
|
| 103 |
+
level = score_item['score']
|
| 104 |
+
print([level, expertise, reasoning, comprehension])
|
| 105 |
+
# If final_answer is correct, award the full level score, but do not add any further ability scores
|
| 106 |
+
if expected_answer and exact_match(final_answer, expected_answer):
|
| 107 |
+
level = 10
|
| 108 |
+
return [level, expertise, reasoning, comprehension]
|
| 109 |
+
|
| 110 |
score = 0
|
| 111 |
if user_task["final_answer"] == val["Final answer"]:
|
| 112 |
score = val["Total score"]
|