Commit 36e3010
Parent(s): 5408125
Refactor code for adding generic tasks

Files changed:
- src/display/utils.py (+49 -25)
- src/leaderboard/read_evals.py (+3 -1)
src/display/utils.py
CHANGED

@@ -12,14 +12,16 @@ class Task:
     benchmark: str
     metric: str
     col_name: str
+    baseline: float = 0.0
+    human_baseline: float = 0.0
 
 class Tasks(Enum):
-    arc = Task("arc:challenge", "acc_norm", "ARC")
-    hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
-    mmlu = Task("hendrycksTest", "acc", "MMLU")
-    truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA")
-    winogrande = Task("winogrande", "acc", "Winogrande")
-    gsm8k = Task("gsm8k", "acc", "GSM8K")
+    arc = Task("arc:challenge", "acc_norm", "ARC", 25.0, 80.0)
+    hellaswag = Task("hellaswag", "acc_norm", "HellaSwag", 25.0, 95.0)
+    mmlu = Task("hendrycksTest", "acc", "MMLU", 25.0, 89.8)
+    truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA", 25.0, 94.0)
+    winogrande = Task("winogrande", "acc", "Winogrande", 50.0, 94.0)
+    gsm8k = Task("gsm8k", "acc", "GSM8K", 0.21, 100)
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
@@ -75,26 +77,33 @@ baseline_row = {
     AutoEvalColumn.revision.name: "N/A",
     AutoEvalColumn.precision.name: None,
     AutoEvalColumn.merged.name: False,
-    AutoEvalColumn.average.name: 31.0,
-    AutoEvalColumn.arc.name: 25.0,
-    AutoEvalColumn.hellaswag.name: 25.0,
-    AutoEvalColumn.mmlu.name: 25.0,
-    AutoEvalColumn.truthfulqa.name: 25.0,
-    AutoEvalColumn.winogrande.name: 50.0,
-    AutoEvalColumn.gsm8k.name: 0.21,
+    #AutoEvalColumn.average.name: 31.0,
+    #AutoEvalColumn.arc.name: 25.0,
+    #AutoEvalColumn.hellaswag.name: 25.0,
+    #AutoEvalColumn.mmlu.name: 25.0,
+    #AutoEvalColumn.truthfulqa.name: 25.0,
+    #AutoEvalColumn.winogrande.name: 50.0,
+    #AutoEvalColumn.gsm8k.name: 0.21,
     AutoEvalColumn.dummy.name: "baseline",
     AutoEvalColumn.model_type.name: "",
     AutoEvalColumn.flagged.name: False,
     AutoEvalColumn.model_type_symbol.name: None,
     AutoEvalColumn.architecture.name: None,
     AutoEvalColumn.weight_type.name: None,
-    AutoEvalColumn.params.name:
-    AutoEvalColumn.likes.name:
-    AutoEvalColumn.license.name:
-    AutoEvalColumn.still_on_hub.name:
-    AutoEvalColumn.moe.name:
+    AutoEvalColumn.params.name: 0,
+    AutoEvalColumn.likes.name: 0,
+    AutoEvalColumn.license.name: "",
+    AutoEvalColumn.still_on_hub.name: False,
+    AutoEvalColumn.moe.name: False
 }
 
+baseline_list = []
+for task in Tasks:
+    baseline_row[task.name] = task.value.baseline
+    if task.value.baseline is not None:
+        baseline_list.append(task.value.baseline)
+baseline_row[AutoEvalColumn.average.name] = round(sum(baseline_list) / len(baseline_list), 2)
+
 # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
 # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
 # HellaSwag human baseline is 0.95 (source: https://deepgram.com/learn/hellaswag-llm-benchmark-guide)
@@ -107,19 +116,34 @@ human_baseline_row = {
     AutoEvalColumn.model.name: "<p>Human performance</p>",
     AutoEvalColumn.revision.name: "N/A",
     AutoEvalColumn.precision.name: None,
-    AutoEvalColumn.average.name: 92.75,
+    #AutoEvalColumn.average.name: 92.75,
     AutoEvalColumn.merged.name: False,
-    AutoEvalColumn.arc.name: 80.0,
-    AutoEvalColumn.hellaswag.name: 95.0,
-    AutoEvalColumn.mmlu.name: 89.8,
-    AutoEvalColumn.truthfulqa.name: 94.0,
-    AutoEvalColumn.winogrande.name: 94.0,
-    AutoEvalColumn.gsm8k.name: 100,
+    #AutoEvalColumn.arc.name: 80.0,
+    #AutoEvalColumn.hellaswag.name: 95.0,
+    #AutoEvalColumn.mmlu.name: 89.8,
+    #AutoEvalColumn.truthfulqa.name: 94.0,
+    #AutoEvalColumn.winogrande.name: 94.0,
+    #AutoEvalColumn.gsm8k.name: 100,
     AutoEvalColumn.dummy.name: "human_baseline",
     AutoEvalColumn.model_type.name: "",
     AutoEvalColumn.flagged.name: False,
+    AutoEvalColumn.model_type_symbol.name: None,
+    AutoEvalColumn.architecture.name: None,
+    AutoEvalColumn.weight_type.name: None,
+    AutoEvalColumn.params.name: 0,
+    AutoEvalColumn.likes.name: 0,
+    AutoEvalColumn.license.name: "",
+    AutoEvalColumn.still_on_hub.name: False,
+    AutoEvalColumn.moe.name: False
 }
 
+baseline_list = []
+for task in Tasks:
+    human_baseline_row[task.name] = task.value.human_baseline
+    if task.value.human_baseline is not None:
+        baseline_list.append(task.value.human_baseline)
+human_baseline_row[AutoEvalColumn.average.name] = round(sum(baseline_list) / len(baseline_list), 2)
+
 @dataclass
 class ModelDetails:
     name: str
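Taken in isolation, the utils.py change moves each benchmark's random and human baseline onto the Task dataclass, so baseline_row and human_baseline_row are filled by iterating Tasks instead of hard-coding one dict entry per benchmark. The sketch below is a minimal, runnable illustration of that pattern; it is trimmed to two tasks and keyed by plain strings rather than the leaderboard's AutoEvalColumn names (both simplifications for illustration, not the real module):

# Sketch only: mirrors the refactor above, not the actual leaderboard code.
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str
    baseline: float = 0.0        # random-chance score for the benchmark
    human_baseline: float = 0.0  # reported human score for the benchmark

class Tasks(Enum):
    # Subset of the real enum; adding a benchmark is now a single line here.
    arc = Task("arc:challenge", "acc_norm", "ARC", 25.0, 80.0)
    gsm8k = Task("gsm8k", "acc", "GSM8K", 0.21, 100)

# Build the baseline row generically from the task definitions.
baseline_row = {"model": "Baseline"}
baseline_list = []
for task in Tasks:
    baseline_row[task.name] = task.value.baseline
    if task.value.baseline is not None:
        baseline_list.append(task.value.baseline)
baseline_row["average"] = round(sum(baseline_list) / len(baseline_list), 2)

print(baseline_row)  # per-task baselines plus their rounded average

The same loop, pointed at human_baseline, produces human_baseline_row, which is why the hard-coded per-benchmark entries in both dicts are commented out above.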
src/leaderboard/read_evals.py
CHANGED
@@ -66,6 +66,7 @@ class EvalResult:
         results = {}
         for task in Tasks:
             task = task.value
+            """
             # We skip old mmlu entries
             wrong_mmlu_version = False
             if task.benchmark == "hendrycksTest":
@@ -81,11 +82,12 @@ class EvalResult:
                 if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
                     results[task.benchmark] = 0.0
                     continue
-
+            """
             # We average all scores of a given metric (mostly for mmlu)
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
+
 
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
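The read_evals.py change fences the old MMLU-version skip and the TruthfulQA NaN special case behind a triple-quoted string, leaving only the generic per-task averaging path. Below is a self-contained sketch of that remaining path; the data payload is invented for illustration and only its key/metric shapes imitate the harness output seen above:

# Sketch of the generic averaging kept in read_evals.py: every results entry
# whose key contains the task's benchmark name contributes its metric value.
import numpy as np

task_benchmark, task_metric = "hendrycksTest", "acc"

# Hypothetical payload, not real evaluation data.
data = {
    "results": {
        "harness|hendrycksTest-abstract_algebra|5": {"acc": 0.25},
        "harness|hendrycksTest-anatomy|5": {"acc": 0.75},
        "harness|arc:challenge|25": {"acc_norm": 0.55},  # different benchmark, ignored
    }
}

results = {}
accs = np.array([v.get(task_metric, None) for k, v in data["results"].items() if task_benchmark in k])
if accs.size != 0 and not any(acc is None for acc in accs):
    mean_acc = np.mean(accs) * 100.0  # the two hendrycksTest entries average to 50.0
    results[task_benchmark] = mean_acc

print(results)

With the per-benchmark quirks commented out, this averaging loop is the only task-specific logic left in the parser, which is what makes adding a new generic task a one-line change in the Tasks enum.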