Spaces:
Running
Running
update
Browse files
tasks.py
CHANGED
|
@@ -85,6 +85,9 @@ class Task:
|
|
| 85 |
}
|
| 86 |
self.label_column = self.label_column or self.input_column
|
| 87 |
|
|
|
|
|
|
|
|
|
|
| 88 |
@cached_property
def samples(self):
    """Return the task inputs: the ``input_column`` entry of ``self.dataset``.

    Decorated with ``cached_property``, so the lookup is performed on first
    access and the result is reused afterwards.
    """
    dataset = self.dataset
    return dataset[self.input_column]
|
|
|
|
| 85 |
}
|
| 86 |
self.label_column = self.label_column or self.input_column
|
| 87 |
|
| 88 |
+
def __eq__(self, __value: object) -> bool:
    """Compare this task with another object by their ``name`` attributes.

    Args:
        __value: The object to compare against.

    Returns:
        ``True`` or ``False`` depending on whether both names are equal.
        ``NotImplemented`` when ``__value`` exposes no ``name`` attribute, so
        that Python can try the reflected comparison and otherwise treat the
        objects as unequal instead of raising ``AttributeError``.
    """
    # NOTE(review): a class that defines __eq__ without __hash__ becomes
    # unhashable unless __hash__ is provided elsewhere -- confirm on the class.
    try:
        other_name = __value.name
    except AttributeError:
        # Unrelated objects (None, ints, strings, ...) are simply "not equal".
        return NotImplemented
    return self.name == other_name
|
| 90 |
+
|
| 91 |
@cached_property
def samples(self):
    """Return the task inputs: the ``input_column`` entry of ``self.dataset``.

    Decorated with ``cached_property``, so the lookup is performed on first
    access and the result is reused afterwards.
    """
    dataset = self.dataset
    return dataset[self.input_column]
|
tlem.py
CHANGED
|
@@ -13,6 +13,7 @@ import pandas as pd
|
|
| 13 |
from .tasks import *
|
| 14 |
from .utils import *
|
| 15 |
from itertools import chain
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
class ReasoningMetric(evaluate.Metric):
|
|
@@ -78,26 +79,29 @@ class Suite(EvaluationSuite):
|
|
| 78 |
# case _:
|
| 79 |
# return list(chain(*self.suite.values()))[key]
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
def run(
|
| 82 |
self,
|
| 83 |
model_or_pipeline: Any,
|
| 84 |
-
suite=None,
|
| 85 |
) -> dict[str, float]:
|
| 86 |
self.assert_suite_nonempty()
|
| 87 |
-
if suite is None:
|
| 88 |
-
suite = self.suite
|
| 89 |
|
| 90 |
self.suite: dict[str, list[Task]]
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
results[category] = self.run(model_or_pipeline, tasks)
|
| 96 |
-
else:
|
| 97 |
-
for task in tasks:
|
| 98 |
-
results[category].update(task.run(model_or_pipeline))
|
| 99 |
-
results[category] = np.mean(list(results[category].values()))
|
| 100 |
-
return results
|
| 101 |
|
| 102 |
def get_suite(self, name) -> dict[str, Task]:
|
| 103 |
chat = False
|
|
@@ -144,6 +148,20 @@ class Suite(EvaluationSuite):
|
|
| 144 |
input_column="problem",
|
| 145 |
label_column="solution",
|
| 146 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
if isinstance(suite, Task):
|
| 148 |
suite = [suite]
|
| 149 |
if isinstance(suite, list):
|
|
|
|
| 13 |
from .tasks import *
|
| 14 |
from .utils import *
|
| 15 |
from itertools import chain
|
| 16 |
+
from copy import deepcopy
|
| 17 |
|
| 18 |
|
| 19 |
class ReasoningMetric(evaluate.Metric):
|
|
|
|
| 79 |
# case _:
|
| 80 |
# return list(chain(*self.suite.values()))[key]
|
| 81 |
|
| 82 |
+
def aggregate(self, suite):
    """Collapse a (possibly nested) suite of finished tasks into mean scores.

    Every value in ``suite`` is either a nested dict, which is aggregated
    recursively, or an iterable of tasks. For an iterable, the values of each
    task's ``result`` mapping are pooled and replaced by their ``np.mean``.

    The mapping is updated in place and also returned; pass a copy when the
    original structure must be preserved.

    Args:
        suite: Mapping from category name to a nested mapping or to an
            iterable of objects exposing a ``result`` mapping.

    Returns:
        The same ``suite`` object, with each category replaced by its mean
        score (or by a nested mapping of mean scores). A category without any
        scores becomes nan.
    """
    # Only values of existing keys are replaced, so mutating while iterating
    # over ``items()`` never changes the dict's size.
    for category, tasks in suite.items():
        if isinstance(tasks, dict):
            suite[category] = self.aggregate(tasks)
            continue

        scores = [score for task in tasks for score in task.result.values()]
        # np.mean([]) also yields nan but emits a RuntimeWarning; produce the
        # nan directly for an empty category.
        suite[category] = np.mean(scores) if scores else np.float64("nan")

    return suite
|
| 93 |
+
|
| 94 |
def run(
    self,
    model_or_pipeline: Any,
) -> dict[str, float]:
    """Run every task in ``self.tasks`` and return the aggregated scores.

    Args:
        model_or_pipeline: Passed unchanged to each task's ``run`` method.

    Returns:
        The value produced by ``self.aggregate`` for a deep copy of
        ``self.suite``.
    """
    self.assert_suite_nonempty()

    self.suite: dict[str, list[Task]]
    progress = tqdm(self.tasks)
    for task in progress:
        # Label the progress bar with the current task's name before running it.
        progress.desc = f"complete {task.name}."
        task.run(model_or_pipeline)

    # aggregate() rewrites the mapping it receives, so hand it a copy and
    # leave self.suite intact.
    snapshot = deepcopy(self.suite)
    return self.aggregate(snapshot)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
def get_suite(self, name) -> dict[str, Task]:
|
| 107 |
chat = False
|
|
|
|
| 148 |
input_column="problem",
|
| 149 |
label_column="solution",
|
| 150 |
)
|
| 151 |
+
|
| 152 |
+
case "open-leaderboard":
|
| 153 |
+
suite = {}
|
| 154 |
+
for name in [
|
| 155 |
+
"arc",
|
| 156 |
+
"hellaswag",
|
| 157 |
+
"mmlu-chat",
|
| 158 |
+
"winogrande",
|
| 159 |
+
"gsm8k",
|
| 160 |
+
# "truthful_qa",
|
| 161 |
+
"drop",
|
| 162 |
+
]:
|
| 163 |
+
suite[name] = self.get_suite(name)
|
| 164 |
+
|
| 165 |
if isinstance(suite, Task):
|
| 166 |
suite = [suite]
|
| 167 |
if isinstance(suite, list):
|