update leaderboard code
Browse files- cld.py +1 -1
- final_lb.py +0 -143
- intermediate_leaderboard.py +124 -0
- utils.py +22 -6
cld.py
CHANGED
|
@@ -6,7 +6,7 @@ from itertools import product
|
|
| 6 |
|
| 7 |
# Make large CLD alphabet
|
| 8 |
single_chars = list(ascii_lowercase) + list(ascii_uppercase)
|
| 9 |
-
underscore_chars = [''.join(p) for p in product(['
|
| 10 |
CLD_ALPHABET = single_chars + underscore_chars
|
| 11 |
|
| 12 |
def asserts_non_significance(col: list[bool], i: int, j: int) -> bool:
|
|
|
|
| 6 |
|
| 7 |
# Build an extended compact-letter-display alphabet: the 52 ASCII letters
# followed by 52 '@'-prefixed variants ('@a' ... '@Z') for when more than
# 52 distinct group letters are needed.
single_chars = [*ascii_lowercase, *ascii_uppercase]
underscore_chars = [f"@{letter}" for letter in single_chars]
CLD_ALPHABET = single_chars + underscore_chars
|
| 11 |
|
| 12 |
def asserts_non_significance(col: list[bool], i: int, j: int) -> bool:
|
final_lb.py
DELETED
|
@@ -1,143 +0,0 @@
|
|
| 1 |
-
''' Code to generate intermediate and final leadeboard '''
|
| 2 |
-
from cld import add_cld_to_leaderboard
|
| 3 |
-
from utils import (
|
| 4 |
-
check_page_exists,
|
| 5 |
-
map_metric_to_stats,
|
| 6 |
-
fetch_dataset_df,
|
| 7 |
-
)
|
| 8 |
-
from about import ENDPOINTS, LB_COLS, results_repo_test
|
| 9 |
-
|
| 10 |
-
from loguru import logger
|
| 11 |
-
import pandas as pd
|
| 12 |
-
import numpy as np
|
| 13 |
-
from pathlib import Path
|
| 14 |
-
|
| 15 |
-
ALL_EPS = ['Average'] + ENDPOINTS
|
| 16 |
-
|
| 17 |
-
def build_leaderboard(df_results, df_results_raw, avg_only=True):
|
| 18 |
-
per_ep = {}
|
| 19 |
-
for ep in ALL_EPS:
|
| 20 |
-
df = df_results[df_results["Endpoint"] == ep].copy()
|
| 21 |
-
if df is None:
|
| 22 |
-
print(f"[refresh] {ep} returned None; using empty DF")
|
| 23 |
-
if df.empty:
|
| 24 |
-
per_ep[ep] = pd.DataFrame(columns=LB_COLS) # Empty df
|
| 25 |
-
continue
|
| 26 |
-
|
| 27 |
-
# Make model details clickable
|
| 28 |
-
df['model details'] = df['model_report'].apply(lambda x: validate_model_details(x)).astype(str)
|
| 29 |
-
|
| 30 |
-
if ep == "Average":
|
| 31 |
-
# MA-RAE is the average of the RAE per endpoint
|
| 32 |
-
df = df.rename(columns={"mean_RAE": "mean_MA-RAE",
|
| 33 |
-
"std_RAE": "std_MA-RAE"})
|
| 34 |
-
# Delete duplicate entries before sorting (fixing case-sensitive duplicate check)
|
| 35 |
-
df['hf_username'] = df['hf_username'].apply(lambda s: s.lower())
|
| 36 |
-
df = df.sort_values(by="submission time", ascending=False, kind="stable")
|
| 37 |
-
df = df.drop_duplicates(subset=['hf_username'], keep='first')
|
| 38 |
-
|
| 39 |
-
# Sort by MAE-RAE
|
| 40 |
-
sorted_df = df.sort_values(by='mean_MA-RAE', ascending=True, kind="stable")
|
| 41 |
-
sorted_df = map_metric_to_stats(sorted_df, average=True)
|
| 42 |
-
|
| 43 |
-
# Make sure Hugging Face username exists, if not, delete the row
|
| 44 |
-
sorted_df['user_real'] = sorted_df['hf_username'].apply(validate_hf_username)
|
| 45 |
-
sorted_df_clean = sorted_df[sorted_df['user_real']].reset_index(drop=True)
|
| 46 |
-
# Add ranking column
|
| 47 |
-
sorted_df_clean['rank'] = np.arange(1, len(sorted_df_clean) + 1)
|
| 48 |
-
avg_leaderboard = sorted_df_clean.copy()
|
| 49 |
-
|
| 50 |
-
# Clean raw data as well
|
| 51 |
-
df_raw = df_results_raw[df_results_raw["Endpoint"] == ep].copy()
|
| 52 |
-
df_raw = df_raw.rename(columns={"RAE": "MA-RAE"})
|
| 53 |
-
|
| 54 |
-
df_raw['hf_username'] = df_raw['hf_username'].apply(lambda s: s.lower())
|
| 55 |
-
df_raw = df_raw.sort_values(by="submission_time", ascending=False, kind="stable")
|
| 56 |
-
df_raw = df_raw.drop_duplicates(subset=['hf_username','Sample'], keep='first')
|
| 57 |
-
|
| 58 |
-
valid_usernames = sorted_df_clean['hf_username'].unique()
|
| 59 |
-
df_raw_clean = df_raw[df_raw['hf_username'].isin(valid_usernames)].reset_index(drop=True)
|
| 60 |
-
|
| 61 |
-
# Make sure order of raw dataframe is the same as sorted dataframe
|
| 62 |
-
username_order = sorted_df['hf_username'].unique()
|
| 63 |
-
df_raw_sorted = df_raw_clean.copy()
|
| 64 |
-
df_raw_sorted['hf_username'] = pd.Categorical(
|
| 65 |
-
df_raw_sorted['hf_username'],
|
| 66 |
-
categories=username_order,
|
| 67 |
-
ordered=True
|
| 68 |
-
)
|
| 69 |
-
df_raw_sorted = df_raw_sorted.sort_values(
|
| 70 |
-
by=['hf_username', 'Sample'],
|
| 71 |
-
ascending=[True, True]
|
| 72 |
-
)
|
| 73 |
-
df_raw_sorted['hf_username'] = df_raw_sorted['hf_username'].astype(str)
|
| 74 |
-
df_raw_sorted = df_raw_sorted.reset_index(drop=True)
|
| 75 |
-
|
| 76 |
-
avg_leaderboard = add_cld_to_leaderboard(
|
| 77 |
-
sorted_df_clean,
|
| 78 |
-
df_raw_sorted,
|
| 79 |
-
"MA-RAE",
|
| 80 |
-
)
|
| 81 |
-
avg_cols = ["rank",
|
| 82 |
-
"user",
|
| 83 |
-
"CLD",
|
| 84 |
-
"MA-RAE",
|
| 85 |
-
"R2",
|
| 86 |
-
"Spearman R",
|
| 87 |
-
"Kendall's Tau",
|
| 88 |
-
"model details"]
|
| 89 |
-
|
| 90 |
-
per_ep[ep] = avg_leaderboard[avg_cols]
|
| 91 |
-
|
| 92 |
-
else:
|
| 93 |
-
if avg_only:
|
| 94 |
-
continue
|
| 95 |
-
# Delete duplicate entries before sorting (fixing case-sensitive duplicate check)
|
| 96 |
-
df['hf_username'] = df['hf_username'].apply(lambda s: s.lower())
|
| 97 |
-
df = df.sort_values(by="submission time", ascending=False, kind="stable")
|
| 98 |
-
df = df.drop_duplicates(subset=['hf_username'], keep='first')
|
| 99 |
-
sorted_df = df.sort_values(by="mean_MAE", ascending=True, kind="stable")
|
| 100 |
-
sorted_df = map_metric_to_stats(sorted_df)
|
| 101 |
-
# Make sure Hugging Face username exists, if not, delete the row
|
| 102 |
-
sorted_df['user_real'] = sorted_df['hf_username'].apply(validate_hf_username)
|
| 103 |
-
sorted_df_clean = sorted_df[sorted_df['user_real']]
|
| 104 |
-
per_ep[ep] = sorted_df_clean[LB_COLS]
|
| 105 |
-
logger.info("Finished building leaderboard data.")
|
| 106 |
-
return per_ep
|
| 107 |
-
|
| 108 |
-
def validate_hf_username(username):
|
| 109 |
-
username = str(username).strip()
|
| 110 |
-
hf_url = f"https://huggingface.co/{username}"
|
| 111 |
-
return check_page_exists(hf_url, delay=1)
|
| 112 |
-
|
| 113 |
-
def validate_model_details(tag):
|
| 114 |
-
if tag is None:
|
| 115 |
-
return "Not submitted"
|
| 116 |
-
safe_tag = str(tag).strip()
|
| 117 |
-
if not safe_tag.startswith("https://"):
|
| 118 |
-
return "Invalid link"
|
| 119 |
-
is_real_url = check_page_exists(safe_tag, delay=2)
|
| 120 |
-
if not is_real_url:
|
| 121 |
-
return "Invalid link"
|
| 122 |
-
else:
|
| 123 |
-
return safe_tag
|
| 124 |
-
|
| 125 |
-
def prepare_lb_csv(save_folder:str, avg_only:bool):
|
| 126 |
-
logger.info("Fetching data")
|
| 127 |
-
df_latest, df_latest_raw = fetch_dataset_df(
|
| 128 |
-
download_raw=True,
|
| 129 |
-
test_repo=results_repo_test
|
| 130 |
-
)
|
| 131 |
-
logger.info("Building leaderboard")
|
| 132 |
-
per_ep_df = build_leaderboard(df_latest, df_latest_raw, avg_only)
|
| 133 |
-
logger.info("Saving leaderboard")
|
| 134 |
-
for ep in ALL_EPS:
|
| 135 |
-
if ep != "Average" and avg_only:
|
| 136 |
-
continue
|
| 137 |
-
df_lb = per_ep_df[ep]
|
| 138 |
-
save_path = Path(save_folder) / f"{ep}_leaderboard.csv"
|
| 139 |
-
df_lb.to_csv(save_path, index=False)
|
| 140 |
-
return
|
| 141 |
-
|
| 142 |
-
if __name__ == "__main__":
|
| 143 |
-
prepare_lb_csv("intermediate_lbs", avg_only=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
intermediate_leaderboard.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from statsmodels.stats.multicomp import pairwise_tukeyhsd
|
| 2 |
+
from cld import cld
|
| 3 |
+
from utils import (
|
| 4 |
+
check_page_exists,
|
| 5 |
+
map_metric_to_stats,
|
| 6 |
+
fetch_dataset_df,
|
| 7 |
+
)
|
| 8 |
+
from about import ENDPOINTS, LB_COLS, results_repo_test, METRICS
|
| 9 |
+
import pandas as pd
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def validate_hf_username(username):
    """Return True when a Hugging Face profile page exists for *username*.

    The username is stripped of surrounding whitespace, turned into a
    profile URL, and probed with ``check_page_exists`` (1 s request delay).
    """
    profile_url = f"https://huggingface.co/{str(username).strip()}"
    return check_page_exists(profile_url, delay=1)
    # return True  # For testing purposes, assume all usernames are valid
|
| 18 |
+
|
| 19 |
+
def validate_model_details(tag):
    """Validate a submitted model-report link.

    Returns
    -------
    str
        "Not submitted" when *tag* is None, "Invalid link" when the tag is
        not an https URL or the page cannot be reached, otherwise the
        whitespace-stripped URL itself.
    """
    if tag is None:
        return "Not submitted"
    safe_tag = str(tag).strip()
    if not safe_tag.startswith("https://"):
        return "Invalid link"
    if not check_page_exists(safe_tag, delay=2):
        return "Invalid link"
    return safe_tag
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def make_intermediate_lb():
    """Build the intermediate leaderboard CSV with CLD significance letters.

    Pipeline: fetch results, drop rows whose HF username has no profile
    page, keep each user's latest 'Average'-endpoint submission, run
    Tukey's HSD on per-sample RAE, attach compact-letter-display (CLD)
    letters plus per-metric "mean ± std" columns, and write
    ``leaderboard_cld_results.csv``.
    """

    # NOTE(review): df_latest is fetched but never used below — confirm it
    # can be dropped from the unpacking, or whether it is needed later.
    df_latest, df_latest_raw = fetch_dataset_df(
        download_raw=True,
        test_repo=results_repo_test
    )


    # HF username validation: probe each unique username's profile page once
    # (validate_hf_username hits the network per user).
    hf_usernames = df_latest_raw["hf_username"].unique()
    valid_hf_usernames = {username: validate_hf_username(username) for username in hf_usernames}

    # print all users and their validation status
    for username, is_valid in valid_hf_usernames.items():
        print(f"Username: {username}, Valid: {is_valid}")

    df_latest_raw["hf_user_valid"] = df_latest_raw["hf_username"].map(valid_hf_usernames)
    # drop invalid usernames
    df_latest_raw = df_latest_raw[df_latest_raw["hf_user_valid"]].reset_index(drop=True)

    # make sure to only keep the latest submission per user for the 'Average' endpoint
    # NOTE(review): validation above keyed on 'hf_username' but grouping here
    # uses 'user' — confirm both columns exist and agree in the raw dataset.
    df_latest_raw["submission_time"] = pd.to_datetime(df_latest_raw["submission_time"])
    df_latest_raw = df_latest_raw.query("Endpoint == 'Average'")
    df_latest_raw['latest_time_per_user'] = df_latest_raw.groupby('user')['submission_time'].transform('max')
    latest_submissions_df = df_latest_raw[df_latest_raw['submission_time'] == df_latest_raw['latest_time_per_user']].copy()
    # Sort by RAE so users first appear in best-score order; the numeric
    # prefix assigned below therefore doubles as a rank.
    latest_submissions_df = latest_submissions_df.sort_values(
        ['RAE','user', 'Sample'], ascending=True
    ).reset_index(drop=True)

    # Get the unique users in the order of their first appearance
    unique_users_ordered = latest_submissions_df['user'].unique()

    # Create a mapping dictionary: original_user -> prefixed_user
    user_mapping = {}
    for idx, user in enumerate(unique_users_ordered):
        # The prefix is the index starting from 0, formatted to be 3 digits (001, 002, etc.)
        # We use idx + 1 to start the sequence from 001 instead of 000
        prefix = f"{idx + 1:03d}"
        prefixed_user = f"{prefix}___{user}"
        user_mapping[user] = prefixed_user

    # Apply the mapping to create a new column with prefixed usernames
    latest_submissions_df['user'] = latest_submissions_df['user'].map(user_mapping)




    # Perform Tukey's HSD test
    # NOTE(review): tukey._results_table is a private statsmodels attribute;
    # consider the public summary() API if this breaks on upgrade.
    tukey = pairwise_tukeyhsd(endog=latest_submissions_df['RAE'], groups=latest_submissions_df['user'], alpha=0.05)
    tukey_df = pd.DataFrame(data=tukey._results_table.data[1:],
                            columns=tukey._results_table.data[0])

    # add CLDs
    cld_dict = cld(tukey_df)

    cld_df = pd.DataFrame(cld_dict.items(),columns=["group","letter"]).sort_values("group")
    cld_df.letter = [",".join(x) for x in cld_df.letter]
    cld_df["user"] = cld_df.group
    # Strip the 'NNN___' rank prefix back off to recover the display name.
    cld_df["user_fixed"] = cld_df.group.str.split("___").str[1]

    # clean up CLD letters for extended alphabet (i.e with @ symbols)
    # NOTE(review): only the 2- and 4-element comma splits are rejoined;
    # confirm other lengths containing '@' cannot occur.
    def clean_up(ser):
        ser = ser.split(",")
        # rejoin for late in alphabet
        if "@" in ser and len(ser) == 2:
            let = "@" + ser[1]
        elif "@" in ser and len(ser) == 4:
            let = "@" + ser[2] + "," + "@" + ser[3]
        else:
            let = ",".join(ser)
        return let

    cld_df["fixed_letter"] = cld_df["letter"].apply(lambda x: clean_up(x))


    # gather means and stds for each metric for each user
    for metric in METRICS:
        metric_stats = latest_submissions_df.groupby('user')[metric].agg(['mean', 'std']).reset_index()
        metric_stats = metric_stats.rename(columns={'mean': f'{metric}_mean', 'std': f'{metric}_std'})
        metric_stats[f"{metric}_display"] = metric_stats.apply(
            lambda row: f"{row[f'{metric}_mean']:.4f} ± {row[f'{metric}_std']:.4f}", axis=1
        )
        cld_df = cld_df.merge(metric_stats[['user', f'{metric}_mean', f'{metric}_std', f'{metric}_display']], on='user', how='left')


    # Final leaderboard columns: display name, CLD letters, one display
    # column per metric.
    cld_subset = cld_df[['user_fixed', 'fixed_letter'] + [f'{metric}_display' for metric in METRICS]]
    cld_subset = cld_subset.rename(columns={'user_fixed': 'user', 'fixed_letter': 'CLD'})
    print(cld_subset.head())
    cld_subset.to_csv("leaderboard_cld_results.csv", index=False)

if __name__ == "__main__":
    make_intermediate_lb()
|
utils.py
CHANGED
|
@@ -9,8 +9,11 @@ from loguru import logger
|
|
| 9 |
import time
|
| 10 |
import requests
|
| 11 |
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
Parameters
|
| 16 |
----------
|
|
@@ -18,6 +21,10 @@ def check_page_exists(url: str, delay=0.2):
|
|
| 18 |
Url of the page
|
| 19 |
delay : float, optional
|
| 20 |
Seconds to wait until submitting another request, by default 0.2
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
Returns
|
| 23 |
-------
|
|
@@ -25,17 +32,26 @@ def check_page_exists(url: str, delay=0.2):
|
|
| 25 |
If the page exists
|
| 26 |
"""
|
| 27 |
safe_url = str(url).strip()
|
|
|
|
| 28 |
# Attempt to fix url
|
| 29 |
if not safe_url.startswith(('http://', 'https://')):
|
| 30 |
safe_url = f"https://{safe_url}"
|
|
|
|
| 31 |
try:
|
| 32 |
response = requests.get(safe_url, timeout=5)
|
| 33 |
|
|
|
|
| 34 |
if response.status_code == 429:
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
return response.status_code == 200
|
| 40 |
|
| 41 |
except requests.exceptions.RequestException as e:
|
|
|
|
| 9 |
# Deduplicated: this hunk previously re-imported `requests` and `time`,
# which are already imported at the top of this group.
import time

import requests
|
| 14 |
+
|
| 15 |
+
def check_page_exists(url: str, delay=0.2, max_retries=3, current_retries=0):
|
| 16 |
+
"""Checks if a web page exists at the given URL with a retry limit for 429 errors.
|
| 17 |
|
| 18 |
Parameters
|
| 19 |
----------
|
|
|
|
| 21 |
Url of the page
|
| 22 |
delay : float, optional
|
| 23 |
Seconds to wait until submitting another request, by default 0.2
|
| 24 |
+
max_retries : int, optional
|
| 25 |
+
Maximum number of times to retry on a 429 error, by default 3
|
| 26 |
+
current_retries : int, optional
|
| 27 |
+
Current number of retries performed (internal counter), by default 0
|
| 28 |
|
| 29 |
Returns
|
| 30 |
-------
|
|
|
|
| 32 |
If the page exists
|
| 33 |
"""
|
| 34 |
safe_url = str(url).strip()
|
| 35 |
+
|
| 36 |
# Attempt to fix url
|
| 37 |
if not safe_url.startswith(('http://', 'https://')):
|
| 38 |
safe_url = f"https://{safe_url}"
|
| 39 |
+
|
| 40 |
try:
|
| 41 |
response = requests.get(safe_url, timeout=5)
|
| 42 |
|
| 43 |
+
# Check for Rate Limit Error and retry if under the limit
|
| 44 |
if response.status_code == 429:
|
| 45 |
+
if current_retries < max_retries:
|
| 46 |
+
print(f"Warning: Rate limit hit on {safe_url}. Attempt {current_retries + 1}/{max_retries}. Waiting for 5 seconds...")
|
| 47 |
+
time.sleep(5)
|
| 48 |
+
# Recurse with an incremented retry counter
|
| 49 |
+
return check_page_exists(safe_url, delay=delay, max_retries=max_retries, current_retries=current_retries + 1)
|
| 50 |
+
else:
|
| 51 |
+
print(f"Error: Max retries ({max_retries}) reached for rate limit on {safe_url}.")
|
| 52 |
+
return False # Give up after max retries
|
| 53 |
+
|
| 54 |
+
# Return True only for a successful status code (200)
|
| 55 |
return response.status_code == 200
|
| 56 |
|
| 57 |
except requests.exceptions.RequestException as e:
|