hmacdope committed on
Commit
b9a3c9e
·
1 Parent(s): 26bb373

update leaderboard code

Browse files
Files changed (4) hide show
  1. cld.py +1 -1
  2. final_lb.py +0 -143
  3. intermediate_leaderboard.py +124 -0
  4. utils.py +22 -6
cld.py CHANGED
@@ -6,7 +6,7 @@ from itertools import product
6
 
7
  # Make large CLD alphabet
8
  single_chars = list(ascii_lowercase) + list(ascii_uppercase)
9
- underscore_chars = [''.join(p) for p in product(['_'], single_chars)]
10
  CLD_ALPHABET = single_chars + underscore_chars
11
 
12
  def asserts_non_significance(col: list[bool], i: int, j: int) -> bool:
 
6
 
7
  # Make large CLD alphabet
8
  single_chars = list(ascii_lowercase) + list(ascii_uppercase)
9
+ underscore_chars = [''.join(p) for p in product(['@'], single_chars)]
10
  CLD_ALPHABET = single_chars + underscore_chars
11
 
12
  def asserts_non_significance(col: list[bool], i: int, j: int) -> bool:
final_lb.py DELETED
@@ -1,143 +0,0 @@
1
- ''' Code to generate intermediate and final leadeboard '''
2
- from cld import add_cld_to_leaderboard
3
- from utils import (
4
- check_page_exists,
5
- map_metric_to_stats,
6
- fetch_dataset_df,
7
- )
8
- from about import ENDPOINTS, LB_COLS, results_repo_test
9
-
10
- from loguru import logger
11
- import pandas as pd
12
- import numpy as np
13
- from pathlib import Path
14
-
15
- ALL_EPS = ['Average'] + ENDPOINTS
16
-
17
- def build_leaderboard(df_results, df_results_raw, avg_only=True):
18
- per_ep = {}
19
- for ep in ALL_EPS:
20
- df = df_results[df_results["Endpoint"] == ep].copy()
21
- if df is None:
22
- print(f"[refresh] {ep} returned None; using empty DF")
23
- if df.empty:
24
- per_ep[ep] = pd.DataFrame(columns=LB_COLS) # Empty df
25
- continue
26
-
27
- # Make model details clickable
28
- df['model details'] = df['model_report'].apply(lambda x: validate_model_details(x)).astype(str)
29
-
30
- if ep == "Average":
31
- # MA-RAE is the average of the RAE per endpoint
32
- df = df.rename(columns={"mean_RAE": "mean_MA-RAE",
33
- "std_RAE": "std_MA-RAE"})
34
- # Delete duplicate entries before sorting (fixing case-sensitive duplicate check)
35
- df['hf_username'] = df['hf_username'].apply(lambda s: s.lower())
36
- df = df.sort_values(by="submission time", ascending=False, kind="stable")
37
- df = df.drop_duplicates(subset=['hf_username'], keep='first')
38
-
39
- # Sort by MAE-RAE
40
- sorted_df = df.sort_values(by='mean_MA-RAE', ascending=True, kind="stable")
41
- sorted_df = map_metric_to_stats(sorted_df, average=True)
42
-
43
- # Make sure Hugging Face username exists, if not, delete the row
44
- sorted_df['user_real'] = sorted_df['hf_username'].apply(validate_hf_username)
45
- sorted_df_clean = sorted_df[sorted_df['user_real']].reset_index(drop=True)
46
- # Add ranking column
47
- sorted_df_clean['rank'] = np.arange(1, len(sorted_df_clean) + 1)
48
- avg_leaderboard = sorted_df_clean.copy()
49
-
50
- # Clean raw data as well
51
- df_raw = df_results_raw[df_results_raw["Endpoint"] == ep].copy()
52
- df_raw = df_raw.rename(columns={"RAE": "MA-RAE"})
53
-
54
- df_raw['hf_username'] = df_raw['hf_username'].apply(lambda s: s.lower())
55
- df_raw = df_raw.sort_values(by="submission_time", ascending=False, kind="stable")
56
- df_raw = df_raw.drop_duplicates(subset=['hf_username','Sample'], keep='first')
57
-
58
- valid_usernames = sorted_df_clean['hf_username'].unique()
59
- df_raw_clean = df_raw[df_raw['hf_username'].isin(valid_usernames)].reset_index(drop=True)
60
-
61
- # Make sure order of raw dataframe is the same as sorted dataframe
62
- username_order = sorted_df['hf_username'].unique()
63
- df_raw_sorted = df_raw_clean.copy()
64
- df_raw_sorted['hf_username'] = pd.Categorical(
65
- df_raw_sorted['hf_username'],
66
- categories=username_order,
67
- ordered=True
68
- )
69
- df_raw_sorted = df_raw_sorted.sort_values(
70
- by=['hf_username', 'Sample'],
71
- ascending=[True, True]
72
- )
73
- df_raw_sorted['hf_username'] = df_raw_sorted['hf_username'].astype(str)
74
- df_raw_sorted = df_raw_sorted.reset_index(drop=True)
75
-
76
- avg_leaderboard = add_cld_to_leaderboard(
77
- sorted_df_clean,
78
- df_raw_sorted,
79
- "MA-RAE",
80
- )
81
- avg_cols = ["rank",
82
- "user",
83
- "CLD",
84
- "MA-RAE",
85
- "R2",
86
- "Spearman R",
87
- "Kendall's Tau",
88
- "model details"]
89
-
90
- per_ep[ep] = avg_leaderboard[avg_cols]
91
-
92
- else:
93
- if avg_only:
94
- continue
95
- # Delete duplicate entries before sorting (fixing case-sensitive duplicate check)
96
- df['hf_username'] = df['hf_username'].apply(lambda s: s.lower())
97
- df = df.sort_values(by="submission time", ascending=False, kind="stable")
98
- df = df.drop_duplicates(subset=['hf_username'], keep='first')
99
- sorted_df = df.sort_values(by="mean_MAE", ascending=True, kind="stable")
100
- sorted_df = map_metric_to_stats(sorted_df)
101
- # Make sure Hugging Face username exists, if not, delete the row
102
- sorted_df['user_real'] = sorted_df['hf_username'].apply(validate_hf_username)
103
- sorted_df_clean = sorted_df[sorted_df['user_real']]
104
- per_ep[ep] = sorted_df_clean[LB_COLS]
105
- logger.info("Finished building leaderboard data.")
106
- return per_ep
107
-
108
- def validate_hf_username(username):
109
- username = str(username).strip()
110
- hf_url = f"https://huggingface.co/{username}"
111
- return check_page_exists(hf_url, delay=1)
112
-
113
- def validate_model_details(tag):
114
- if tag is None:
115
- return "Not submitted"
116
- safe_tag = str(tag).strip()
117
- if not safe_tag.startswith("https://"):
118
- return "Invalid link"
119
- is_real_url = check_page_exists(safe_tag, delay=2)
120
- if not is_real_url:
121
- return "Invalid link"
122
- else:
123
- return safe_tag
124
-
125
- def prepare_lb_csv(save_folder:str, avg_only:bool):
126
- logger.info("Fetching data")
127
- df_latest, df_latest_raw = fetch_dataset_df(
128
- download_raw=True,
129
- test_repo=results_repo_test
130
- )
131
- logger.info("Building leaderboard")
132
- per_ep_df = build_leaderboard(df_latest, df_latest_raw, avg_only)
133
- logger.info("Saving leaderboard")
134
- for ep in ALL_EPS:
135
- if ep != "Average" and avg_only:
136
- continue
137
- df_lb = per_ep_df[ep]
138
- save_path = Path(save_folder) / f"{ep}_leaderboard.csv"
139
- df_lb.to_csv(save_path, index=False)
140
- return
141
-
142
- if __name__ == "__main__":
143
- prepare_lb_csv("intermediate_lbs", avg_only=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
intermediate_leaderboard.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from statsmodels.stats.multicomp import pairwise_tukeyhsd
2
+ from cld import cld
3
+ from utils import (
4
+ check_page_exists,
5
+ map_metric_to_stats,
6
+ fetch_dataset_df,
7
+ )
8
+ from about import ENDPOINTS, LB_COLS, results_repo_test, METRICS
9
+ import pandas as pd
10
+
11
+
12
+
13
def validate_hf_username(username):
    """Return True when a Hugging Face profile page exists for *username*.

    The username is stringified and stripped before being checked against
    ``https://huggingface.co/<username>`` via ``check_page_exists``.
    """
    profile_url = "https://huggingface.co/" + str(username).strip()
    return check_page_exists(profile_url, delay=1)
18
+
19
def validate_model_details(tag):
    """Turn a submitted model-report link into a display value.

    Returns "Not submitted" when *tag* is None, "Invalid link" for anything
    that is not an https URL resolving to a live page (checked with
    ``check_page_exists``), and the cleaned URL string otherwise.
    """
    if tag is None:
        return "Not submitted"
    link = str(tag).strip()
    # Only https links are accepted, and they must resolve to a live page.
    # Short-circuit keeps the network check from running on malformed links.
    if link.startswith("https://") and check_page_exists(link, delay=2):
        return link
    return "Invalid link"
30
+
31
+
32
+
33
def make_intermediate_lb():
    """Build the intermediate leaderboard CSV.

    Fetches the latest raw results, drops rows whose Hugging Face username
    does not resolve to a real profile, keeps only each user's most recent
    'Average'-endpoint submission, runs Tukey's HSD on the per-sample RAE
    values, attaches compact-letter-display (CLD) groupings, and writes the
    summary table to ``leaderboard_cld_results.csv``.
    """
    df_latest, df_latest_raw = fetch_dataset_df(
        download_raw=True,
        test_repo=results_repo_test
    )

    # HF username validation: query each unique username exactly once
    # (validate_hf_username performs a network request per call).
    hf_usernames = df_latest_raw["hf_username"].unique()
    valid_hf_usernames = {username: validate_hf_username(username) for username in hf_usernames}

    # print all users and their validation status
    for username, is_valid in valid_hf_usernames.items():
        print(f"Username: {username}, Valid: {is_valid}")

    df_latest_raw["hf_user_valid"] = df_latest_raw["hf_username"].map(valid_hf_usernames)
    # drop invalid usernames
    df_latest_raw = df_latest_raw[df_latest_raw["hf_user_valid"]].reset_index(drop=True)

    # make sure to only keep the latest submission per user for the 'Average' endpoint
    df_latest_raw["submission_time"] = pd.to_datetime(df_latest_raw["submission_time"])
    df_latest_raw = df_latest_raw.query("Endpoint == 'Average'")
    df_latest_raw['latest_time_per_user'] = df_latest_raw.groupby('user')['submission_time'].transform('max')
    latest_submissions_df = df_latest_raw[df_latest_raw['submission_time'] == df_latest_raw['latest_time_per_user']].copy()
    latest_submissions_df = latest_submissions_df.sort_values(
        ['RAE', 'user', 'Sample'], ascending=True
    ).reset_index(drop=True)

    # Prefix each username with a zero-padded ordinal ("001___user", ...)
    # in order of first appearance (i.e. RAE rank after the sort above) so
    # downstream group ordering is stable and rank-sorted.
    unique_users_ordered = latest_submissions_df['user'].unique()
    user_mapping = {
        user: f"{idx + 1:03d}___{user}"
        for idx, user in enumerate(unique_users_ordered)
    }
    latest_submissions_df['user'] = latest_submissions_df['user'].map(user_mapping)

    # Perform Tukey's HSD test on per-sample RAE, grouped by user.
    tukey = pairwise_tukeyhsd(endog=latest_submissions_df['RAE'], groups=latest_submissions_df['user'], alpha=0.05)
    # Use the public summary() accessor instead of the private
    # _results_table attribute; both expose the same SimpleTable whose
    # first row is the header.
    summary_data = tukey.summary().data
    tukey_df = pd.DataFrame(data=summary_data[1:], columns=summary_data[0])

    # add CLDs
    cld_dict = cld(tukey_df)

    cld_df = pd.DataFrame(cld_dict.items(), columns=["group", "letter"]).sort_values("group")
    cld_df.letter = [",".join(x) for x in cld_df.letter]
    cld_df["user"] = cld_df.group
    # Strip the "NNN___" rank prefix back off for display.
    cld_df["user_fixed"] = cld_df.group.str.split("___").str[1]

    # clean up CLD letters for extended alphabet (i.e with @ symbols)
    cld_df["fixed_letter"] = cld_df["letter"].apply(_clean_up_cld_letters)

    # gather means and stds for each metric for each user
    for metric in METRICS:
        metric_stats = latest_submissions_df.groupby('user')[metric].agg(['mean', 'std']).reset_index()
        metric_stats = metric_stats.rename(columns={'mean': f'{metric}_mean', 'std': f'{metric}_std'})
        metric_stats[f"{metric}_display"] = metric_stats.apply(
            lambda row: f"{row[f'{metric}_mean']:.4f} ± {row[f'{metric}_std']:.4f}", axis=1
        )
        cld_df = cld_df.merge(metric_stats[['user', f'{metric}_mean', f'{metric}_std', f'{metric}_display']], on='user', how='left')

    cld_subset = cld_df[['user_fixed', 'fixed_letter'] + [f'{metric}_display' for metric in METRICS]]
    cld_subset = cld_subset.rename(columns={'user_fixed': 'user', 'fixed_letter': 'CLD'})
    print(cld_subset.head())
    cld_subset.to_csv("leaderboard_cld_results.csv", index=False)


def _clean_up_cld_letters(letters: str) -> str:
    """Re-join comma-split CLD letters from the extended ('@'-prefixed) alphabet.

    NOTE(review): this assumes a two-character extended letter such as "@a"
    arrives split into a lone "@" element plus its base letter — confirm
    against the joining done by ``cld.cld`` and ``CLD_ALPHABET``.
    """
    parts = letters.split(",")
    # rejoin for late in alphabet
    if "@" in parts and len(parts) == 2:
        return "@" + parts[1]
    elif "@" in parts and len(parts) == 4:
        return "@" + parts[2] + "," + "@" + parts[3]
    return ",".join(parts)
122
+
123
+ if __name__ == "__main__":
124
+ make_intermediate_lb()
utils.py CHANGED
@@ -9,8 +9,11 @@ from loguru import logger
9
  import time
10
  import requests
11
 
12
- def check_page_exists(url: str, delay=0.2):
13
- """Checks if a web page exists at the given URL.
 
 
 
14
 
15
  Parameters
16
  ----------
@@ -18,6 +21,10 @@ def check_page_exists(url: str, delay=0.2):
18
  Url of the page
19
  delay : float, optional
20
  Seconds to wait until submitting another request, by default 0.2
 
 
 
 
21
 
22
  Returns
23
  -------
@@ -25,17 +32,26 @@ def check_page_exists(url: str, delay=0.2):
25
  If the page exists
26
  """
27
  safe_url = str(url).strip()
 
28
  # Attempt to fix url
29
  if not safe_url.startswith(('http://', 'https://')):
30
  safe_url = f"https://{safe_url}"
 
31
  try:
32
  response = requests.get(safe_url, timeout=5)
33
 
 
34
  if response.status_code == 429:
35
- print(f"Warning: Rate limit hit on {safe_url}. Waiting for 5 seconds...")
36
- time.sleep(5)
37
- return check_page_exists(safe_url, delay=delay)
38
-
 
 
 
 
 
 
39
  return response.status_code == 200
40
 
41
  except requests.exceptions.RequestException as e:
 
9
  import time
10
  import requests
11
 
12
+ import requests
13
+ import time
14
+
15
+ def check_page_exists(url: str, delay=0.2, max_retries=3, current_retries=0):
16
+ """Checks if a web page exists at the given URL with a retry limit for 429 errors.
17
 
18
  Parameters
19
  ----------
 
21
  Url of the page
22
  delay : float, optional
23
  Seconds to wait until submitting another request, by default 0.2
24
+ max_retries : int, optional
25
+ Maximum number of times to retry on a 429 error, by default 3
26
+ current_retries : int, optional
27
+ Current number of retries performed (internal counter), by default 0
28
 
29
  Returns
30
  -------
 
32
  If the page exists
33
  """
34
  safe_url = str(url).strip()
35
+
36
  # Attempt to fix url
37
  if not safe_url.startswith(('http://', 'https://')):
38
  safe_url = f"https://{safe_url}"
39
+
40
  try:
41
  response = requests.get(safe_url, timeout=5)
42
 
43
+ # Check for Rate Limit Error and retry if under the limit
44
  if response.status_code == 429:
45
+ if current_retries < max_retries:
46
+ print(f"Warning: Rate limit hit on {safe_url}. Attempt {current_retries + 1}/{max_retries}. Waiting for 5 seconds...")
47
+ time.sleep(5)
48
+ # Recurse with an incremented retry counter
49
+ return check_page_exists(safe_url, delay=delay, max_retries=max_retries, current_retries=current_retries + 1)
50
+ else:
51
+ print(f"Error: Max retries ({max_retries}) reached for rate limit on {safe_url}.")
52
+ return False # Give up after max retries
53
+
54
+ # Return True only for a successful status code (200)
55
  return response.status_code == 200
56
 
57
  except requests.exceptions.RequestException as e: