File size: 1,881 Bytes
0c194f3 20ed309 0c194f3 20ed309 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
import pandas as pd
import numpy as np
from datasets import load_dataset
from about import results_repo
from about import LB_COLS0
def make_user_clickable(name):
    """Return an HTML anchor pointing at ``https://huggingface.co/<name>``.

    The anchor opens in a new tab and uses *name* as its visible text.
    *name* is interpolated as-is; no HTML or URL escaping is applied here.
    """
    profile_url = f'https://huggingface.co/{name}'
    anchor_style = "color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"
    return f'<a target="_blank" href="{profile_url}" style="{anchor_style}">{name}</a>'
def make_tag_clickable(tag):
    """Return an HTML anchor whose ``href`` is *tag* and whose text is ``link``.

    The anchor opens in a new tab. *tag* is interpolated as-is; no escaping
    is applied here.
    """
    attributes = (
        'target="_blank" '
        f'href="{tag}" '
        'style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"'
    )
    return f'<a {attributes}>link</a>'
def fetch_dataset_df():
    """Download the results dataset and keep the latest row per (endpoint, user).

    Steps:
      1. Load the ``train`` split of ``results_repo``, forcing a re-download.
      2. Verify that every column listed in ``LB_COLS0`` is present.
      3. Drop rows whose ``user`` is ``"test"``.
      4. Parse ``submission_time``; rows whose value cannot be parsed are
         dropped.
      5. Keep only the most recent row for each (endpoint, user) pair.

    Returns:
        pandas.DataFrame: one row per (endpoint, user), sorted by endpoint
        then user, with a fresh integer index and the ``submission_time``
        column renamed to ``submission time``.

    Raises:
        AssertionError: if any column in ``LB_COLS0`` is missing from the
            downloaded data.
    """
    dset = load_dataset(results_repo, split='train', download_mode="force_redownload")
    full_df = dset.to_pandas()

    # Raise explicitly rather than via an `assert` statement: asserts are
    # removed when Python runs with -O, which would silently skip this schema
    # check. AssertionError is kept so existing callers see the same type.
    missing = set(LB_COLS0) - set(full_df.columns)
    if missing:
        raise AssertionError(
            f"Expected columns {LB_COLS0} not found in {full_df.columns}. Missing columns: {missing}"
        )

    # A single copy is enough to detach the filtered frame from `full_df`.
    df = full_df[full_df["user"] != "test"].copy()
    df["submission_time"] = pd.to_datetime(df["submission_time"], errors="coerce")
    df = df.dropna(subset=["submission_time"])

    # Get the most recent submission per user & endpoint.
    # A stable sort keeps rows with identical timestamps in their original
    # order, so `keep="last"` picks the same row on every run.
    latest = (
        df.sort_values("submission_time", kind="mergesort")
        .drop_duplicates(subset=["endpoint", "user"], keep="last")
        .sort_values(["endpoint", "user"])
        .reset_index(drop=True)
    )
    return latest.rename(columns={"submission_time": "submission time"})
def metrics_per_ep(pred, true):
    """Compute regression and rank-correlation metrics for one set of predictions.

    Args:
        pred: predicted values (array-like).
        true: reference values (array-like), same length as *pred*.

    Returns:
        tuple: ``(mae, rae, r2, spr, ktau)`` where

        * ``mae``  - mean absolute error,
        * ``rae``  - ``mae`` divided by the mean absolute deviation of *true*
          from its own mean; NaN when that deviation is zero,
        * ``r2``   - coefficient of determination; NaN when *true* has zero
          standard deviation,
        * ``spr``  - Spearman rank correlation coefficient,
        * ``ktau`` - Kendall tau coefficient.
    """
    # Imported lazily so that importing this module does not require
    # scipy / scikit-learn until metrics are actually computed.
    from scipy.stats import spearmanr, kendalltau
    from sklearn.metrics import mean_absolute_error, r2_score

    mae = mean_absolute_error(true, pred)

    # Mean absolute deviation of the reference values from their mean.
    true_values = np.asarray(true)
    baseline = np.mean(np.abs(true_values - np.mean(true_values)))
    # A constant `true` gives a zero baseline; report NaN (as r2 does below)
    # instead of dividing by zero and returning inf with a RuntimeWarning.
    rae = mae / baseline if baseline > 0 else np.nan

    if np.nanstd(true) == 0:
        r2 = np.nan
    else:
        r2 = r2_score(true, pred)

    spr, _ = spearmanr(true, pred)
    ktau, _ = kendalltau(true, pred)
    return mae, rae, r2, spr, ktau