update

- .gitignore +3 -1
- pyproject.toml +14 -0
- tasks.py +6 -23
- tlem.py +2 -4
.gitignore
CHANGED
@@ -1,2 +1,4 @@
 __pycache__
-
+*.ju.py
+tests
+
pyproject.toml
ADDED
@@ -0,0 +1,14 @@
+[tool.poetry]
+name = "tlem"
+version = "0.1.0"
+description = ""
+authors = ["fecet <[email protected]>"]
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = "^3.10"
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
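(For context: this is a minimal Poetry layout, with metadata under [tool.poetry], a single Python constraint, and poetry-core declared as the PEP 517 build backend, so the package can be installed with "poetry install" or with any PEP 517 frontend such as "pip install .".)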
tasks.py
CHANGED
@@ -225,14 +225,11 @@ class Metrics:
             return {"error": "predictions and references have different " "length"}
         responses = [general_postprocess(pred) for pred in responses]
         processed_answers = [[general_postprocess(j) for j in i] for i in answers]
-
-        for pred, ans
-        if
-
-
-            matched_answers.append(ans[0])
-
-        return responses, matched_answers
+        scores = []
+        for pred, ans in zip(responses, processed_answers):
+            score = np.mean([1 if a in pred else 0 for a in ans])
+            scores.append(score)
+        return {"em": np.mean(scores)}

     def bbh_mcq(responses: list[str], answers: list[str | int]):
         if len(responses) != len(answers):
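The replacement turns the half-deleted answer-matching loop into a mean exact-match score: each prediction earns the fraction of its gold answers that appear verbatim in it. A standalone sketch of the same logic, with general_postprocess stubbed as a plain strip/lowercase (an assumption; the repo's helper does more normalization):

import numpy as np

def general_postprocess(text: str) -> str:
    # Stand-in for the repo's normalizer (assumption): strip and lowercase.
    return text.strip().lower()

def em(responses: list[str], answers: list[list[str]]) -> dict:
    if len(responses) != len(answers):
        return {"error": "predictions and references have different length"}
    responses = [general_postprocess(pred) for pred in responses]
    processed = [[general_postprocess(a) for a in ans] for ans in answers]
    scores = []
    for pred, ans in zip(responses, processed):
        # Fraction of gold answers contained verbatim in the prediction.
        scores.append(np.mean([1 if a in pred else 0 for a in ans]))
    return {"em": float(np.mean(scores))}

print(em(["the answer is paris"], [["Paris", "paris, france"]]))  # {'em': 0.5}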
@@ -624,8 +621,6 @@ Text: [PROMPT]
 Question: [QUESTION]
 Anawer:"""

-    categories = ["validation"]
-
     @classmethod
     def prompt_drop(cls, example):
         prompt = cls.icl_prompt.replace("[PROMPT]", example["passage"]).replace(
@@ -633,19 +628,7 @@ Anawer:"""
         )

         validated_answers = example["answers_spans"]["spans"]
-
-        answers = []
-        for answer_item, answer_type in zip(validated_answers, validated_types):
-            # if answer_type == "number":
-            #     answers.append(answer_item)
-            # elif any(answer_item['date'][i] for i in ['day', 'month', 'year']):
-            #     d = [answer_item['date'][i] for i in ['day', 'month', 'year']]
-            #     answers.append(' '.join(d).strip())
-            # else:
-            #     for span in answer_item['spans']:
-            #         answers.append(span)
-            answers.append(answer_item)
-        answers = list(set(answers))
+        answers = list(set(validated_answers))

         return {cls.input_column: prompt, cls.label_column: answers}

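The simplified prompt_drop keeps only the span answers and dedups them with list(set(...)). One caveat worth knowing (general Python behavior, not something this commit changes): set ordering is unspecified, so if a stable answer order ever mattered, an order-preserving dedup would use dict.fromkeys:

spans = ["Chicago", "chicago", "Chicago", "1871"]

# As in the commit: exact duplicates removed, result order unspecified.
answers = list(set(spans))

# Order-preserving alternative, if stable ordering mattered:
answers_stable = list(dict.fromkeys(spans))
print(answers_stable)  # ['Chicago', 'chicago', '1871']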
tlem.py
CHANGED
@@ -49,8 +49,7 @@ class ReasoningMetric(evaluate.Metric):
     def _compute(self, responses, references):
         return_value = getattr(Metrics, self.config_name)(responses, references)
         match return_value:
-            case
-                extract_responses, extract_references = return_value
+            case extract_responses, extract_references:
                 results = {
                     self.config_name: np.mean(
                         sync_pipe(lambda x, y: x == y)(
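The fix collapses the broken case block into a single structural pattern that matches and unpacks a 2-tuple of extracted responses and references in one step. A self-contained illustration of that pattern (names here are illustrative, not the repo's):

def compute(metric_result):
    match metric_result:
        case responses, references:
            # Matched a 2-element sequence: unpack and score pairwise equality.
            return sum(r == g for r, g in zip(responses, references)) / len(responses)
        case dict() as scores:
            # Metric already returned a score dict: pass it through unchanged.
            return scores

print(compute((["a", "b"], ["a", "c"])))  # 0.5
print(compute({"em": 1.0}))               # {'em': 1.0}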
@@ -182,8 +181,7 @@ class Suite(EvaluationSuite):
     def singleton(self, task):
         try:
             return self.tasks[self.tasks.index(task)]
-
-        except Exception as e:
+        except ValueError:
             self.tasks.append(task)
             return self.tasks[-1]

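Narrowing the handler to ValueError is the precise scope here: list.index raises exactly ValueError when the task is not yet registered, whereas the old "except Exception as e" would also have silently swallowed unrelated bugs. A minimal sketch of the same get-or-append pattern in a hypothetical standalone class (not the Suite itself):

class Registry:
    def __init__(self):
        self.tasks = []

    def singleton(self, task):
        try:
            # list.index raises ValueError when the item is missing.
            return self.tasks[self.tasks.index(task)]
        except ValueError:
            self.tasks.append(task)
            return self.tasks[-1]

r = Registry()
a = r.singleton("gsm8k")
b = r.singleton("gsm8k")     # returns the already-registered instance
print(a is b, len(r.tasks))  # True 1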