Alina Lozovskaia committed
Commit: c74b7d7 · Parent(s): f86eaae

Changes as per comments

Files changed:
- app.py (+16 -0)
- src/leaderboard/read_evals.py (+23 -15)
app.py CHANGED

@@ -50,6 +50,9 @@ from src.tools.collections import update_collections
 from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
 
 
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
 # Start ephemeral Spaces on PRs (see config in README.md)
 enable_space_ci()
 
@@ -57,6 +60,19 @@ enable_space_ci()
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
 
+
+def time_diff_wrapper(func):
+    def wrapper(*args, **kwargs):
+        start_time = time.time()
+        result = func(*args, **kwargs)
+        end_time = time.time()
+        diff = end_time - start_time
+        logging.info(f"Time taken for {func.__name__}: {diff} seconds")
+        return result
+    return wrapper
+
+
+@time_diff_wrapper
 def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
     """Download dataset with exponential backoff retries."""
     attempt = 0
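The time_diff_wrapper decorator added above only measures wall-clock time and reports it through the root logger configured by the new logging.basicConfig call, so decorating download_dataset yields one INFO line per call. A minimal, self-contained illustration of that behaviour follows; slow_step is a placeholder function used for this sketch, not part of the commit.

import logging
import time

# Same logging setup the commit adds to app.py.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def time_diff_wrapper(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        diff = end_time - start_time
        logging.info(f"Time taken for {func.__name__}: {diff} seconds")
        return result
    return wrapper


@time_diff_wrapper
def slow_step():
    """Placeholder workload used only for this illustration."""
    time.sleep(0.2)


slow_step()
# Example log line (timestamp and exact duration will differ):
# 2024-05-01 12:00:00,000 - INFO - Time taken for slow_step: 0.2004... seconds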
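The second hunk ends at attempt = 0, so the body of the retry loop is not visible in this diff. Purely as a sketch, assuming the function wraps huggingface_hub.snapshot_download (an assumption, not something this commit shows), an exponential-backoff loop consistent with the signature and docstring above could look like this:

import logging
import time

from huggingface_hub import snapshot_download


def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
    """Illustrative sketch of an exponential-backoff download; not the committed body."""
    attempt = 0
    while attempt < max_attempts:
        try:
            # Assumption: the real function delegates to snapshot_download; the diff does not show this.
            snapshot_download(repo_id=repo_id, local_dir=local_dir, repo_type=repo_type)
            return local_dir
        except Exception as e:
            wait_time = backoff_factor**attempt
            logging.warning("Attempt %d failed (%s); retrying in %.1f s", attempt + 1, e, wait_time)
            time.sleep(wait_time)
            attempt += 1
    raise RuntimeError(f"Failed to download {repo_id} after {max_attempts} attempts")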
src/leaderboard/read_evals.py CHANGED

@@ -78,39 +78,47 @@ class EvalResult:
     @staticmethod
     def extract_results(data: Dict) -> Dict[str, float]:
         """
-
-        Skips entries based on specific conditions and handles NaN values appropriately.
-        Returns a dictionary with benchmarks as keys and their averaged scores as values in percentage.
+        Extract and process benchmark results from a given dict.
 
         Parameters:
-        - data (Dict):
+        - data (Dict): A dictionary containing benchmark data. This dictionary must
+          include 'versions' and 'results' keys with respective sub-data.
 
         Returns:
-        - Dict[str, float]: A dictionary
+        - Dict[str, float]: A dictionary where keys are benchmark names and values
+          are the processed average scores as percentages.
+
+        Notes:
+        - The method specifically checks for certain benchmark names to skip outdated entries.
+        - Handles NaN values by setting the corresponding benchmark result to 0.0.
+        - Averages scores across metrics for benchmarks found in the data, in a percentage format.
         """
         results = {}
         for task in Tasks:
             task = task.value
-
             # We skip old mmlu entries
             if task.benchmark == "hendrycksTest":
                 for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
                     if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
                         continue
 
-            # Some
-            if
-
-
-
+            # Some benchmark values are NaNs, mostly truthfulQA
+            # Would be more optimal (without the whole dict iteration) if benchmark name was same as key in results
+            # e.g. not harness|truthfulqa:mc|0 but truthfulqa:mc
+            for k, v in data["results"].items():
+                if task.benchmark in k:
+                    if math.isnan(float(v[task.metric])):
+                        results[task.benchmark] = 0.0
+                        continue
 
             # We average all scores of a given metric (mostly for mmlu)
-            accs = [v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k]
-            if accs or any([acc is None for acc in accs]):
+            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
+            if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
-
-
+            mean_acc = np.mean(accs) * 100.0
+            results[task.benchmark] = mean_acc
+
         return results
 
 
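To make the new NaN handling and averaging concrete, here is a small hypothetical call. The input dict and the assumption that Tasks defines a task with benchmark "hendrycksTest" and metric "acc" are illustrative only; the key shapes follow the harness naming referenced in the comments above.

from src.leaderboard.read_evals import EvalResult

# Hypothetical harness-style results file; only the shape of the keys matters here.
data = {
    "versions": {"harness|hendrycksTest-abstract_algebra|5": 1},
    "results": {
        "harness|hendrycksTest-abstract_algebra|5": {"acc": 0.25},
        "harness|hendrycksTest-anatomy|5": {"acc": 0.75},
    },
}

scores = EvalResult.extract_results(data)
# Assuming Tasks contains a task with benchmark "hendrycksTest" and metric "acc",
# both sub-scores are collected, averaged, and scaled: scores["hendrycksTest"] is 50.0.
# Tasks with no matching key in data["results"] produce accs.size == 0 and are skipped.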