Alpay Ariyak
committed on
Changed Bench Eval to report metrics correctly by split. Added total accuracy and renamed previously used bench_accuracy to bench_average_accuracy. (#512)
* Added "eval_" prefix
* Added total bench accuracy and renamed the previous one to bench_average_accuracy. Changed naming to use bench_split instead of always using the eval_ prefix.
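For intuition, here is a minimal sketch (not the axolotl code itself) of the distinction the two new metrics draw, using made-up per-benchmark refs/preds: the average accuracy is the unweighted mean of per-benchmark scores, while the total accuracy pools every example across benchmarks.

import numpy as np

# Hypothetical per-benchmark references/predictions, for illustration only.
combined_bench_names = {
    "bench_a": {"refs": [0, 1, 1, 0], "preds": [0, 1, 0, 0]},  # 3/4 correct
    "bench_b": {"refs": [1, 1], "preds": [1, 1]},              # 2/2 correct
}

# Per-benchmark accuracy, one score per bench_name.
per_bench = {
    name: float(np.mean(np.array(d["refs"]) == np.array(d["preds"])))
    for name, d in combined_bench_names.items()
}

# bench_average_accuracy: unweighted mean over benchmarks -> (0.75 + 1.0) / 2 = 0.875
average_accuracy = float(np.mean(list(per_bench.values())))

# bench_total_accuracy: accuracy over all pooled examples -> 5/6 ≈ 0.833
all_refs = [r for d in combined_bench_names.values() for r in d["refs"]]
all_preds = [p for d in combined_bench_names.values() for p in d["preds"]]
total_accuracy = float(np.mean(np.array(all_refs) == np.array(all_preds)))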
src/axolotl/utils/callbacks.py
CHANGED
@@ -275,7 +275,7 @@ def bench_eval_callback_factory(trainer, tokenizer):
             else:
                 dist.gather_object(local_bench_names, gathered_bench_names, dst=0)
                 bench_loss = sum(loss_bench_ranks) / sum(len_data_loader_ranks)
-                results = {"bench_loss": bench_loss}
+                results = {f"{bench_split}_bench_loss": bench_loss}

                 # Combine results from all GPUs
                 combined_bench_names: Dict[str, Dict[str, List]] = {}
@@ -287,6 +287,8 @@ def bench_eval_callback_factory(trainer, tokenizer):
                         combined_bench_names[name]["preds"].extend(data["preds"])

                 bench_scores = []
+                bench_refs = []
+                bench_preds = []
                 for (
                     bench_name
                 ) in combined_bench_names:  # pylint: disable=consider-using-dict-items
@@ -294,15 +296,20 @@ def bench_eval_callback_factory(trainer, tokenizer):
                         references=combined_bench_names[bench_name]["refs"],
                         predictions=combined_bench_names[bench_name]["preds"],
                     )["accuracy"]
+                    bench_refs.extend(combined_bench_names[bench_name]["refs"])
+                    bench_preds.extend(combined_bench_names[bench_name]["preds"])
                     if not pd.isna(bench_score):
                         results[
-                            f"bench_accuracy_{bench_name}"
+                            f"{bench_split}_bench_accuracy_{bench_name}"
                         ] = bench_score
                         bench_scores.append(bench_score)
                     else:
-                        results[f"bench_accuracy_{bench_name}"] = 0.0
+                        results[f"{bench_split}_bench_accuracy_{bench_name}"] = 0.0
                         bench_scores.append(0.0)
-                results[f"bench_accuracy"] = np.mean(bench_scores)
+                results[f"{bench_split}_bench_average_accuracy"] = np.mean(bench_scores)
+                results[f"{bench_split}_bench_total_accuracy"] = accuracy.compute(
+                    references=bench_refs, predictions=bench_preds
+                )["accuracy"]
                 trainer.log(results)

     return BenchEvalCallback
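For reference, a hedged sketch of how a logged key looks under the split-based naming, assuming a hypothetical bench_split of "eval" (another split would simply change the prefix). It uses the Hugging Face evaluate accuracy metric, as the callback does:

import evaluate

accuracy = evaluate.load("accuracy")  # same metric the callback loads

bench_split = "eval"  # assumed split name, for illustration only
refs, preds = [0, 1, 1, 0, 1, 1], [0, 1, 0, 0, 1, 1]

results = {
    f"{bench_split}_bench_total_accuracy": accuracy.compute(
        references=refs, predictions=preds
    )["accuracy"],
}
# -> {"eval_bench_total_accuracy": 0.8333...}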