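# Builds the intermediate leaderboard ("leaderboard_cld_results.csv") from raw
# submission results: validates Hugging Face usernames, keeps each user's
# latest 'Average' submission, runs Tukey's HSD on per-sample RAE, and
# attaches compact letter displays (CLDs).
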
import pandas as pd
from statsmodels.stats.multicomp import pairwise_tukeyhsd

from cld import cld
from utils import check_page_exists, fetch_dataset_df
from about import METRICS, results_repo_test


def validate_hf_username(username):
    """Return True if `username` resolves to a live Hugging Face profile page."""
    username = str(username).strip()
    hf_url = f"https://huggingface.co/{username}"
    return check_page_exists(hf_url, delay=1, max_retries=10)


def validate_model_details(tag):
    """Validate a submitted model-details link.

    Returns the link itself if it is an https URL that resolves, otherwise a
    placeholder string for the leaderboard.
    """
    if tag is None:
        return "Not submitted"
    safe_tag = str(tag).strip()
    if not safe_tag.startswith("https://"):
        return "Invalid link"
    if not check_page_exists(safe_tag, delay=2):
        return "Invalid link"
    return safe_tag
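# Illustrative behaviour, assuming check_page_exists performs a plain HTTP check:
#   validate_model_details(None)                -> "Not submitted"
#   validate_model_details("ftp://example.org") -> "Invalid link"
#   validate_model_details("https://...")       -> the link itself, if reachable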



def make_intermediate_lb():
    """Build the intermediate leaderboard CSV: validate usernames, keep each
    user's latest 'Average' submission, rank by mean RAE, run Tukey's HSD,
    and attach compact letter displays (CLDs)."""
    # Only the raw per-sample results are needed here.
    _, df_latest_raw = fetch_dataset_df(
        download_raw=True,
        test_repo=results_repo_test
    )

    # Make all usernames lowercase
    df_latest_raw["hf_username"] = df_latest_raw["hf_username"].str.lower()

    # HF username validation
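    # (each lookup issues HTTP requests, with retries/delays handled inside
    # check_page_exists, so this step scales with the number of unique users)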
    hf_usernames = df_latest_raw["hf_username"].unique()
    valid_hf_usernames = {username: validate_hf_username(username) for username in hf_usernames}

    # print all users and their validation status
    for username, is_valid in valid_hf_usernames.items():
        print(f"Username: {username}, Valid: {is_valid}")

    df_latest_raw["hf_user_valid"] = df_latest_raw["hf_username"].map(valid_hf_usernames)
    # drop invalid usernames
    df_latest_raw = df_latest_raw[df_latest_raw["hf_user_valid"]].reset_index(drop=True)

    # make sure to only keep the latest submission per user for the 'Average' endpoint
    df_latest_raw["submission_time"] = pd.to_datetime(df_latest_raw["submission_time"])
    df_latest_raw = df_latest_raw.query("Endpoint == 'Average'")
    df_latest_raw['latest_time_per_user'] = df_latest_raw.groupby('hf_username')['submission_time'].transform('max')
    latest_submissions_df = df_latest_raw[df_latest_raw['submission_time'] == df_latest_raw['latest_time_per_user']].copy()
    # Order by the per-user mean RAE rather than the per-sample RAE
    # (the two can disagree slightly for some users).
    latest_submissions_df['mean_RAE'] = latest_submissions_df.groupby('hf_username')['RAE'].transform('mean')
    latest_submissions_df = latest_submissions_df.sort_values(
        by=['mean_RAE', 'Sample'], ascending=True
    ).reset_index(drop=True)

    # Get the unique users in the order of their first appearance
    unique_users_ordered = latest_submissions_df['user'].unique()

    # Create a mapping dictionary: original_user -> prefixed_user
    user_mapping = {}
    for idx, user in enumerate(unique_users_ordered):
        # Zero-padded three-digit prefix starting at 001 (e.g. "001", "002"),
        # so the prefixed group names sort in leaderboard order.
        prefix = f"{idx + 1:03d}"
        prefixed_user = f"{prefix}___{user}"
        user_mapping[user] = prefixed_user

    # Apply the mapping to create a new column with prefixed usernames
    latest_submissions_df['user'] = latest_submissions_df['user'].map(user_mapping)

    # Perform Tukey's HSD test on per-sample RAE across users
    tukey = pairwise_tukeyhsd(endog=latest_submissions_df['RAE'], groups=latest_submissions_df['user'], alpha=0.05)
    summary = tukey.summary()
    tukey_df = pd.DataFrame(data=summary.data[1:], columns=summary.data[0])
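    # The summary table has one row per pair of users, with columns
    # group1, group2, meandiff, p-adj, lower, upper, reject.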

    # Derive compact letter displays (CLDs): users that share a letter are
    # not significantly different from one another.
    cld_dict = cld(tukey_df)

    cld_df = pd.DataFrame(cld_dict.items(), columns=["group", "letter"]).sort_values("group")
    cld_df["letter"] = [",".join(x) for x in cld_df["letter"]]
    cld_df["user"] = cld_df["group"]

    # Clean up CLD letters for the extended alphabet (i.e. letters beyond
    # 'z', which are encoded with a leading '@')
    def clean_up(ser):
        ser = ser.split(",")
        # Re-join '@'-prefixed letters that were split apart on commas
        if "@" in ser and len(ser) == 2:
            let = "@" + ser[1]
        elif "@" in ser and len(ser) == 4:
            let = "@" + ser[2] + "," + "@" + ser[3]
        else:
            let = ",".join(ser)
        return let

    cld_df["fixed_letter"] = cld_df["letter"].apply(clean_up)
    report_cols = latest_submissions_df[['user', 'model_report']].drop_duplicates(keep='first')

    # gather means and stds for each metric for each user
    for metric in METRICS:
        metric_stats = latest_submissions_df.groupby('user')[metric].agg(['mean', 'std']).reset_index()
        metric_stats = metric_stats.rename(columns={'mean': f'{metric}_mean', 'std': f'{metric}_std'})
        metric_stats[f"{metric}_display"] = metric_stats.apply(
            lambda row: f"{row[f'{metric}_mean']:.4f} ± {row[f'{metric}_std']:.4f}", axis=1
        )
        cld_df = metric_stats[['user', f'{metric}_mean', f'{metric}_std', f'{metric}_display']].merge(cld_df, on='user', how='left')

    # Re-sort by mean RAE (lowest is best), attach model reports, and strip
    # the ordering prefix from usernames
    cld_df = cld_df.sort_values(by='RAE_mean', ascending=True).reset_index(drop=True)
    cld_df = cld_df.merge(report_cols, on='user', how='inner')
    cld_df['user'] = cld_df['user'].str.split('___').str[1]

    cld_subset = cld_df[['user', 'fixed_letter'] + [f'{metric}_display' for metric in METRICS] + ['model_report']]
    cld_subset = cld_subset.rename(columns={'fixed_letter': 'CLD', 'model_report': 'model details'})
    cld_subset['model details'] = cld_subset['model details'].apply(validate_model_details).astype(str)

    print(cld_subset.head())
    cld_subset.to_csv("leaderboard_cld_results.csv", index=False)


if __name__ == "__main__":
    make_intermediate_lb()