add suite
tlem.py CHANGED
@@ -6,6 +6,8 @@ except Exception as e:
 import logging
 
 from typing import Any, Optional, Protocol, Iterable, Callable
+from tqdm.auto import tqdm
+from evaluate.evaluation_suite import EvaluationSuite
 
 # %%
 
@@ -33,14 +35,18 @@ TextGenerationPipeline = Callable[[Iterable[str]], list[str]]
 from evaluate import load
 
 
+def fake_pipeline(prompts: Iterable[str]) -> list[str]:
+    return [prompt for prompt in tqdm(prompts)]
+
+
 @dataclass
 class Task:
-    dataset_name: str = "gsm8k"
-
+    dataset_name: str | tuple[str, str] = ("gsm8k", "main")
+    split: str = "test"
     # metrics: list[str] = field(default_factory=list)
-    metric_name: str | tuple[str, str] = "gsm8k"
+    metric_name: str | tuple[str, str] = ("sustech/tlem", "gsm8k")
     input_column: str = "question"
-    label_column: str = "
+    label_column: str = "answer"
     prompt: Optional[Callable | str] = None
 
     @cached_property
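
Not part of the diff: `Task.run` accepts any `TextGenerationPipeline`, i.e. a callable from an `Iterable[str]` of prompts to a `list[str]` of completions, of which `fake_pipeline` above is the trivial echo case. A minimal sketch of adapting a real model to that shape, assuming the `transformers` library and a placeholder model name:

# Illustrative sketch (not part of the diff): wrap a transformers
# text-generation pipeline so it matches the TextGenerationPipeline protocol.
# "gpt2" and the generation arguments are placeholder assumptions.
from typing import Iterable
from transformers import pipeline

generator = pipeline("text-generation", model="gpt2")

def hf_pipeline(prompts: Iterable[str]) -> list[str]:
    outputs = generator(list(prompts), max_new_tokens=256, return_full_text=False)
    # One list of candidate generations per prompt; keep the first of each.
    return [candidates[0]["generated_text"] for candidates in outputs]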
@@ -49,7 +55,12 @@ class Task:
 
     @cached_property
     def dataset(self):
-        ds = load_dataset(
+        ds = load_dataset(
+            *self.dataset_name
+            if isinstance(self.dataset_name, tuple)
+            else self.dataset_name,
+            split=self.split
+        )
         if self.prompt is not None:
             ds = ds.map(
                 lambda example: {
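
Not part of the diff: with the new defaults, the `(name, config)` tuple is unpacked into two positional arguments for `datasets.load_dataset`. A standalone sketch of the equivalent call for the default task:

# Illustrative sketch (not part of the diff): the tuple-unpacked call is
# equivalent to naming the config positionally.
from datasets import load_dataset

ds = load_dataset(*("gsm8k", "main"), split="test")  # same as load_dataset("gsm8k", "main", split="test")
print(ds.column_names)  # expected to include "question" and "answer"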
@@ -72,9 +83,11 @@ class Task:
         )
         return metric
 
-    def run(self, pipeline: TextGenerationPipeline):
+    def run(self, pipeline: TextGenerationPipeline = fake_pipeline):
         outputs = pipeline(self.samples)
-        return self.metric.compute(
+        return self.metric.compute(
+            responses=outputs, references=self.dataset[self.label_column]
+        )
 
 
 class Metrics:
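
Not part of the diff: `Task.run` now hands the pipeline outputs to the metric under the `responses`/`references` keywords. The commented-out lines in the `__main__` block further down show the same call shape; a sketch of invoking the metric directly:

# Illustrative sketch (not part of the diff): calling the gsm8k metric with
# the same keyword names Task.run uses. The exact result keys depend on the
# metric implementation and are an assumption here.
from evaluate import load

metric = load("sustech/tlem", "gsm8k")
result = metric.compute(responses=["answer is 2", "1+2"], references=["2", "3"])
print(result)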
@@ -224,7 +237,41 @@ class ReasoningMetric(evaluate.Metric):
 
         return results
 
-# %%
 
-
+class Suite(EvaluationSuite):
+    def run(
+        self, model_or_pipeline: Any, prompt: str = "{instruction}"
+    ) -> dict[str, float]:
+        self.assert_suite_nonempty()
+
+        results_all = {}
+        for task in tqdm(self.suite, desc="Running tasks"):
+            task_name = task.name
+            results = task.run(model_or_pipeline)
+            results_all[task_name] = results
+        return results_all
+
+    def __init__(self, name):
+        super().__init__(name)
+
+        self.suite = [
+            Task(
+                dataset_name=("gsm8k", "main"),
+                metric_name=("sustech/tlem", "gsm8k"),
+                input_column="question",
+                label_column="answer",
+            )
+            # TASK_REGISTRY["gsm8k"],
+            # TASK_REGISTRY["competition_math"],
+        ]
+
 
+# %%
+
+if __name__ == "__main__":
+    # metric = load("sustech/tlem", "gsm8k")
+    # output = metric.compute(responses=["answer is 2", "1+2"], references=["2", "3"])
+    # logging.info(output)
+    suite = EvaluationSuite.load("sustech/tlem")
+    suite.run(fake_pipeline)
+    # %%
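
Not part of the diff: the new `Suite` collects `Task` objects and its `run` method returns one result entry per task, keyed by task name, as in the `__main__` block above. A hedged usage sketch with a custom pipeline standing in for `fake_pipeline`:

# Illustrative sketch (not part of the diff): run the suite with any callable
# mapping an iterable of prompts to a list of strings. `my_pipeline` is a
# placeholder stand-in for a real model.
from evaluate.evaluation_suite import EvaluationSuite

def my_pipeline(prompts):
    return ["The answer is 42" for _ in prompts]

suite = EvaluationSuite.load("sustech/tlem")
results = suite.run(my_pipeline)  # dict keyed by task name, per Suite.run above
print(results)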