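# Builds the intermediate leaderboard ("leaderboard_cld_results.csv") from raw
# submission results: validates Hugging Face usernames, keeps each user's
# latest 'Average' submission, runs Tukey's HSD on per-sample RAE, and
# attaches compact letter displays (CLDs).
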
import pandas as pd
from statsmodels.stats.multicomp import pairwise_tukeyhsd

from cld import cld
from utils import check_page_exists, fetch_dataset_df
from about import METRICS, results_repo_test


def validate_hf_username(username):
    """Return True if `username` resolves to a live Hugging Face profile page."""
    username = str(username).strip()
    hf_url = f"https://huggingface.co/{username}"
    return check_page_exists(hf_url, delay=1, max_retries=10)


def validate_model_details(tag):
    """Validate a submitted model-details link.

    Returns the link itself if it is an https URL that resolves, otherwise a
    placeholder string for the leaderboard.
    """
    if tag is None:
        return "Not submitted"
    safe_tag = str(tag).strip()
    if not safe_tag.startswith("https://"):
        return "Invalid link"
    if not check_page_exists(safe_tag, delay=2):
        return "Invalid link"
    return safe_tag
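# Illustrative behaviour, assuming check_page_exists performs a plain HTTP check:
#   validate_model_details(None)                -> "Not submitted"
#   validate_model_details("ftp://example.org") -> "Invalid link"
#   validate_model_details("https://...")       -> the link itself, if reachable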



def make_intermediate_lb():
    """Build the intermediate leaderboard CSV: validate usernames, keep each
    user's latest 'Average' submission, rank by mean RAE, run Tukey's HSD,
    and attach compact letter displays (CLDs)."""
    # Only the raw per-sample results are needed here.
    _, df_latest_raw = fetch_dataset_df(
        download_raw=True,
        test_repo=results_repo_test
    )

    # Make all usernames lowercase
    df_latest_raw["hf_username"] = df_latest_raw["hf_username"].str.lower()

    # HF username validation
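    # (each lookup issues HTTP requests, with retries/delays handled inside
    # check_page_exists, so this step scales with the number of unique users)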
    hf_usernames = df_latest_raw["hf_username"].unique()
    valid_hf_usernames = {username: validate_hf_username(username) for username in hf_usernames}

    # print all users and their validation status
    for username, is_valid in valid_hf_usernames.items():
        print(f"Username: {username}, Valid: {is_valid}")

    df_latest_raw["hf_user_valid"] = df_latest_raw["hf_username"].map(valid_hf_usernames)
    # drop invalid usernames
    df_latest_raw = df_latest_raw[df_latest_raw["hf_user_valid"]].reset_index(drop=True)

    # make sure to only keep the latest submission per user for the 'Average' endpoint
    df_latest_raw["submission_time"] = pd.to_datetime(df_latest_raw["submission_time"])
    df_latest_raw = df_latest_raw.query("Endpoint == 'Average'")
    df_latest_raw['latest_time_per_user'] = df_latest_raw.groupby('hf_username')['submission_time'].transform('max')
    latest_submissions_df = df_latest_raw[df_latest_raw['submission_time'] == df_latest_raw['latest_time_per_user']].copy()
    # Order by the per-user mean RAE rather than the per-sample RAE
    # (the two can disagree slightly for some users).
    latest_submissions_df['mean_RAE'] = latest_submissions_df.groupby('hf_username')['RAE'].transform('mean')
    latest_submissions_df = latest_submissions_df.sort_values(
        by=['mean_RAE', 'Sample'], ascending=True
    ).reset_index(drop=True)

    # Get the unique users in the order of their first appearance
    unique_users_ordered = latest_submissions_df['user'].unique()

    # Create a mapping dictionary: original_user -> prefixed_user
    user_mapping = {}
    for idx, user in enumerate(unique_users_ordered):
        # Zero-padded three-digit prefix starting at 001 (e.g. "001", "002"),
        # so the prefixed group names sort in leaderboard order.
        prefix = f"{idx + 1:03d}"
        prefixed_user = f"{prefix}___{user}"
        user_mapping[user] = prefixed_user

    # Apply the mapping to create a new column with prefixed usernames
    latest_submissions_df['user'] = latest_submissions_df['user'].map(user_mapping)

    # Perform Tukey's HSD test on per-sample RAE across users
    tukey = pairwise_tukeyhsd(endog=latest_submissions_df['RAE'], groups=latest_submissions_df['user'], alpha=0.05)
    summary = tukey.summary()
    tukey_df = pd.DataFrame(data=summary.data[1:], columns=summary.data[0])
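    # The summary table has one row per pair of users, with columns
    # group1, group2, meandiff, p-adj, lower, upper, reject.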

    # Derive compact letter displays (CLDs): users that share a letter are
    # not significantly different from one another.
    cld_dict = cld(tukey_df)

    cld_df = pd.DataFrame(cld_dict.items(), columns=["group", "letter"]).sort_values("group")
    cld_df["letter"] = [",".join(x) for x in cld_df["letter"]]
    cld_df["user"] = cld_df["group"]

    # Clean up CLD letters for the extended alphabet (i.e. letters beyond
    # 'z', which are encoded with a leading '@')
    def clean_up(ser):
        ser = ser.split(",")
        # Re-join '@'-prefixed letters that were split apart on commas
        if "@" in ser and len(ser) == 2:
            let = "@" + ser[1]
        elif "@" in ser and len(ser) == 4:
            let = "@" + ser[2] + "," + "@" + ser[3]
        else:
            let = ",".join(ser)
        return let

    cld_df["fixed_letter"] = cld_df["letter"].apply(clean_up)
    report_cols = latest_submissions_df[['user', 'model_report']].drop_duplicates(keep='first')

    # gather means and stds for each metric for each user
    for metric in METRICS:
        metric_stats = latest_submissions_df.groupby('user')[metric].agg(['mean', 'std']).reset_index()
        metric_stats = metric_stats.rename(columns={'mean': f'{metric}_mean', 'std': f'{metric}_std'})
        metric_stats[f"{metric}_display"] = metric_stats.apply(
            lambda row: f"{row[f'{metric}_mean']:.4f} ± {row[f'{metric}_std']:.4f}", axis=1
        )
        cld_df = metric_stats[['user', f'{metric}_mean', f'{metric}_std', f'{metric}_display']].merge(cld_df, on='user', how='left')

    # Re-sort by mean RAE (lowest is best), attach model reports, and strip
    # the ordering prefix from usernames
    cld_df = cld_df.sort_values(by='RAE_mean', ascending=True).reset_index(drop=True)
    cld_df = cld_df.merge(report_cols, on='user', how='inner')
    cld_df['user'] = cld_df['user'].str.split('___').str[1]

    cld_subset = cld_df[['user', 'fixed_letter'] + [f'{metric}_display' for metric in METRICS] + ['model_report']]
    cld_subset = cld_subset.rename(columns={'fixed_letter': 'CLD', 'model_report': 'model details'})
    cld_subset['model details'] = cld_subset['model details'].apply(validate_model_details).astype(str)

    print(cld_subset.head())
    cld_subset.to_csv("leaderboard_cld_results.csv", index=False)


if __name__ == "__main__":
    make_intermediate_lb()