Maria Castellanos committed · Commit 24d6e19 · Parent(s): 179f265

Add code for CLD

Changed files:
- about.py +1 -1
- app.py +37 -16
- cld.py +204 -0
- evaluate.py +1 -0
- utils.py +29 -2
about.py CHANGED
@@ -15,7 +15,7 @@ STANDARD_COLS = ["Endpoint", "user", "submission_time", "model_report"]
 METRICS = ["MAE", "RAE", "R2", "Spearman R", "Kendall's Tau"]
 # Final columns
 LB_COLS = ["user", "MAE", "R2", "Spearman R", "Kendall's Tau", "submission time", "model details"]
-LB_AVG = ["user", "MA-RAE", "R2", "Spearman R", "Kendall's Tau", "submission time", "model details"]  # Delete some columns for overall LB?
+LB_AVG = ["rank", "user", "MA-RAE", "R2", "Spearman R", "Kendall's Tau", "submission time", "model details"]  # Delete some columns for overall LB?
 LB_DTYPES = ['markdown', 'number', 'number', 'number', 'number', 'str', 'markdown', 'number']
 
 # Dictionary with unit conversion multipliers for each endpoint
app.py CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
 from gradio_leaderboard import Leaderboard
 from gradio.themes.utils import sizes
 import pandas as pd
+import numpy as np
 
 from evaluate import submit_data, evaluate_data
 from utils import (
@@ -10,6 +11,7 @@ from utils import (
     fetch_dataset_df,
     map_metric_to_stats,
 )
+from cld import add_cld_to_leaderboard
 from datasets import load_dataset
 import tempfile
 from loguru import logger
@@ -21,7 +23,7 @@ import threading
 
 ALL_EPS = ['Average'] + ENDPOINTS
 
-def build_leaderboard(df_results):
+def build_leaderboard(df_results, df_results_raw):
     logger.info("Rebuilding leaderboard data...")
     per_ep = {}
     for ep in ALL_EPS:
@@ -32,10 +34,7 @@ def build_leaderboard(df_results):
             per_ep[ep] = pd.DataFrame(columns=LB_COLS)  # Empty df
             continue
 
-        # Make
-        df['user'] = df.apply(
-            lambda row: make_user_clickable(row['user']) if not row['anonymous'] else row['user'],
-            axis=1).astype(str)
+        # Make model details clickable if it's a huggingface user
         df['model details'] = df['model_report'].apply(lambda x: make_tag_clickable(x)).astype(str)
 
         if ep == "Average":
@@ -44,16 +43,38 @@ def build_leaderboard(df_results):
                                    "std_RAE": "std_MA-RAE"})
             sorted_df = df.sort_values(by='mean_MA-RAE', ascending=True, kind="stable")
             sorted_df = map_metric_to_stats(sorted_df, average=True)
-
+            # Add ranking column
+            sorted_df['rank'] = np.arange(1, len(sorted_df) + 1)
+            avg_leaderboard = sorted_df.copy()
+            avg_cols = LB_AVG
+            # Add CLD
+            if df_results_raw is not None:
+                df_raw = df_results_raw[df_results_raw["Endpoint"] == ep].copy()
+                df_raw = df_raw.rename(columns={"RAE": "MA-RAE"})
+                avg_leaderboard = add_cld_to_leaderboard(
+                    sorted_df,
+                    df_raw,
+                    "MA-RAE",
+                )
+                avg_cols = ["rank", "user", "CLD", "MA-RAE", "R2", "Spearman R", "Kendall's Tau", "submission time", "model details"]
+
+            # Make user and model details clickable if it's a huggingface user
+            avg_leaderboard['user'] = avg_leaderboard.apply(
+                lambda row: make_user_clickable(row['user']) if not row['anonymous'] else row['user'],
+                axis=1).astype(str)
+            per_ep[ep] = avg_leaderboard[avg_cols]
         else:
             sorted_df = df.sort_values(by="mean_MAE", ascending=True, kind="stable")
             sorted_df = map_metric_to_stats(sorted_df)
+            sorted_df['user'] = sorted_df.apply(
+                lambda row: make_user_clickable(row['user']) if not row['anonymous'] else row['user'],
+                axis=1).astype(str)
         per_ep[ep] = sorted_df[LB_COLS]
     logger.info("Finished rebuilding leaderboard data.")
     return per_ep
 
 # Initialize global dataframe
-current_df = fetch_dataset_df()
+current_df, current_df_raw = fetch_dataset_df()
 
 # # Initialize global counter
 # data_version_counter = 0
@@ -64,9 +85,9 @@ def update_current_dataframe():
     global current_df  # ugly but works
     while True:
         logger.info("Fetching latest dataset for leaderboard...")
-        current_df = fetch_dataset_df()
+        current_df, current_df_raw = fetch_dataset_df()
         logger.debug(f"Dataset version updated")
-        time.sleep(
+        time.sleep(300)  # Check for updates every 5 minutes
 
 threading.Thread(target=update_current_dataframe, daemon=True).start()
 
@@ -174,7 +195,7 @@ with gr.Blocks(title="OpenADMET ADMET Challenge", fill_height=False,
 
 The test set will remained blinded until the challenge submission deadline. You will be tasked with predicting the same set of ADMET endpoints for the test set molecules.
 
-The training and blinded test set will also be made available on the [CDD Vault](https://www.collaborativedrug.com/). An account to access the CDD Vault can be requested by filling out this [form](https://forms.gle/KiviZ7AaGcuqtrwH8, which can also be used to request access to some other tools.
+The training and blinded test set will also be made available on the [CDD Vault](https://www.collaborativedrug.com/). An account to access the CDD Vault can be requested by filling out this [form](https://forms.gle/KiviZ7AaGcuqtrwH8), which can also be used to request access to some other tools.
 Note that by joining the Vault, your account will be visible to other participants, so this option is **not recommended for those wishing to remain anonymous.**
 
 ## 📝 Evaluation
@@ -251,28 +272,28 @@ with gr.Blocks(title="OpenADMET ADMET Challenge", fill_height=False,
         # Aggregated leaderboard
         with gr.TabItem('OVERALL', elem_id="all_tab"):
             lboard_dict['Average'] = Leaderboard(
-                value=build_leaderboard(current_df)['Average'],
-                datatype=LB_DTYPES,
+                value=build_leaderboard(current_df, current_df_raw)['Average'],
+                datatype=['number'] + LB_DTYPES,
                 select_columns=LB_AVG,
                 search_columns=["user"],
                 render=True,
-                every=
+                every=300,
             )
         # per-endpoint leaderboard
        for endpoint in ENDPOINTS:
            with gr.TabItem(endpoint):
                lboard_dict[endpoint] = Leaderboard(
-                    value=build_leaderboard(current_df)[endpoint],
+                    value=build_leaderboard(current_df, current_df_raw)[endpoint],
                     datatype=LB_DTYPES,
                     select_columns=LB_COLS,
                     search_columns=["user"],
                     render=True,
-                    every=
+                    every=300,
                 )
        # Auto-refresh
        def refresh_if_changed():
            logger.info("Refreshing on timer tick...")
-            per_ep = build_leaderboard(current_df)
+            per_ep = build_leaderboard(current_df, current_df_raw)
            #return [gr.update(value=per_ep.get(ep, pd.DataFrame(columns=LB_COLS))) for ep in ALL_EPS]
            return [per_ep[ep] for ep in ALL_EPS]
        data_version.change(fn=refresh_if_changed, outputs=[lboard_dict[ep] for ep in ALL_EPS])
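For reference, a minimal sketch (toy data, not the app's real frames) of the ranking step the Average branch now performs before handing the frame to add_cld_to_leaderboard: rows are sorted by mean MA-RAE, so the row order itself becomes the rank column.

import numpy as np
import pandas as pd

# Toy stand-in for the aggregated results frame (real columns follow LB_AVG).
df = pd.DataFrame({
    "user": ["alice", "bob", "carol"],
    "mean_MA-RAE": [0.55, 0.42, 0.61],
})

# Sort ascending (lower MA-RAE is better), then number the rows 1..N.
sorted_df = df.sort_values(by="mean_MA-RAE", ascending=True, kind="stable")
sorted_df["rank"] = np.arange(1, len(sorted_df) + 1)

# bob is rank 1, alice rank 2, carol rank 3; the CLD column is appended
# afterwards by add_cld_to_leaderboard when raw bootstrap scores are available.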
cld.py ADDED
@@ -0,0 +1,204 @@
+from statsmodels.stats.multicomp import pairwise_tukeyhsd
+from string import ascii_lowercase, ascii_uppercase
+import tqdm
+import pandas as pd
+
+CLD_ALPHABET = list(ascii_lowercase) + list(ascii_uppercase)
+
+def asserts_non_significance(col: list[bool], i: int, j: int) -> bool:
+    """Assert whether i and j are represented as non-significant in the column
+    i.e., if the corresponding values in the column are different
+
+    Parameters
+    ----------
+    col : list[bool]
+        current column
+    i : int
+        index of first treatment
+    j : int
+        index of second treatment
+
+    Returns
+    -------
+    bool
+        If the non-significance is represented accurately
+    """
+    return col[i] and col[j]
+
+def insert(column: list[bool], i: int, j: int):
+    """Duplicates column and in one of its copies flip entry i to 0,
+    and in the other copy flip entry j to 0
+
+    Parameters
+    ----------
+    column : list[bool]
+        Original column
+    i : int
+        Index of first group
+    j : int
+        Index of second group
+
+    Returns
+    -------
+    list[bool], list[bool]
+        New columns after duplication and flip
+    """
+    col_i = column.copy()
+    col_j = column.copy()
+    col_i[i] = False
+    col_j[j] = False
+    return col_i, col_j
+
+def can_be_absorbed(new_col: list[bool], ref_col: list[bool]) -> bool:
+    """An old column absorbs the new column
+    if it has a 1 in every row in which the new column has one
+
+    Parameters
+    ----------
+    new_col : list[bool]
+        Column to add
+    ref_col : list[bool]
+        Old column we are checking if it can absorb new_col
+
+    Returns
+    -------
+    bool
+        Whether the old column can absorb new_col
+    """
+    return all(ref_col[i] for i, x in enumerate(new_col) if x)
+
+def absorb(new_column: list[bool], columns: list[list[bool]]) -> list[list[bool]]:
+    """Absorb new column into existing columns if the condition allows
+
+    Parameters
+    ----------
+    new_column : list[bool]
+        Column to add
+    columns : list[list[bool]]
+        existing columns
+
+    Returns
+    -------
+    list[list[bool]]
+        Columns after absorption
+    """
+    if any(can_be_absorbed(new_column, c) for c in columns):
+        return columns
+    return columns + [new_column]
+
+def cld(comparisons: pd.DataFrame) -> dict[str, str]:
+    """
+    Compact Letter Display
+
+    Compute the compact letter display using the insert-absorb algorithm.
+
+    See the following papers for more information:
+    (1) https://doi.org/10.1016/j.csda.2006.09.035
+    (2) https://doi.org/10.1198/1061860043515
+
+    Parameters
+    ----------
+    comparisons : pd.DataFrame
+        A DataFrame containing the pairwise comparisons produced by:
+        https://www.statsmodels.org/dev/generated/statsmodels.stats.multicomp.pairwise_tukeyhsd.html
+    """
+    unique_groups = set(comparisons["group1"].unique())
+    unique_groups = unique_groups.union(set(comparisons["group2"].unique()))
+    unique_groups = list(unique_groups)
+    unique_groups_indices = {g: i for i, g in enumerate(unique_groups)}
+
+    sig_diff = comparisons[comparisons["reject"]]
+    print(f"Found {len(sig_diff)} significantly different pairs")
+
+    # Initialize CLD matrix for all unique groups/models, with "columns" as rows
+    solution = [[True] * len(unique_groups)]
+
+    for _, row in tqdm.tqdm(sig_diff.iterrows(), total=len(sig_diff)):
+        i = unique_groups_indices[row["group1"]]
+        j = unique_groups_indices[row["group2"]]
+
+        has_changed: bool = True
+        while has_changed:
+            has_changed = False
+
+            for idx in range(len(solution)):
+                if asserts_non_significance(solution[idx], i, j):
+                    # Duplicate the column
+                    col_i, col_j = insert(solution[idx], i, j)
+
+                    # Remove the old column
+                    solution.pop(idx)
+
+                    # Try to absorb the column into an old column
+                    # Simply add it to the solution otherwise
+                    solution = absorb(col_i, solution)
+                    solution = absorb(col_j, solution)
+
+                    has_changed = True
+                    break
+
+    # Assign letters
+    letters = [""] * len(unique_groups)
+
+    for ci, col in enumerate(solution):
+        letter = CLD_ALPHABET[ci]
+        for idx, has_letter in enumerate(col):
+            if has_letter:
+                letters[idx] += letter
+
+    return {group: sorted(letter) for group, letter in zip(unique_groups, letters)}
+
+from statsmodels.stats.multicomp import pairwise_tukeyhsd
+import tqdm
+
+def add_cld_to_leaderboard(
+    leaderboard: pd.DataFrame,
+    scores: pd.DataFrame,
+    metric: str,
+):
+    """Add the compact letter display to the leaderboard.
+
+    Parameters
+    ----------
+    leaderboard : pd.DataFrame
+        The full leaderboard DataFrame
+    scores : pd.DataFrame
+        The **raw** scores DataFrame, with all replicates from bootstrapping
+    metric : str
+        The metric label to calculate CLD for.
+    """
+    ordered_methods = leaderboard["user"].values
+
+    scores = scores[["Sample", "user", metric]]
+    scores[metric] = scores[metric].astype(float)
+
+    # We compared methods using bootstrapping and the Tukey HSD test, presenting results via Compact Letter Display (CLD).
+    # While acknowledging that bootstrapping likely underestimates variance,
+    # we are not aware of better sampling techniques that fit the challenge format.
+    stats = pairwise_tukeyhsd(endog=scores[metric], groups=scores["user"])
+    # comparisons = stats.summary_frame()
+    # The installed statsmodels version is for some reason not the latest, so we need a small workaround to get the summary frame
+    summary_table = stats.summary()
+    # data attribute is a list of lists with column names as first element
+    data = summary_table.data[1:]
+    columns = summary_table.data[0]
+    comparisons = pd.DataFrame(data=data, columns=columns)
+
+    letter_mapping = {}
+    letter_code = cld(comparisons)
+
+    cld_column = [""] * len(leaderboard)
+    for idx, method in enumerate(ordered_methods):
+        try:
+            letters = letter_code[str(method)]
+
+            for letter in letters:
+                if letter not in letter_mapping:
+                    letter_mapping[letter] = CLD_ALPHABET[len(letter_mapping)]
+                cld_column[idx] += letter_mapping[letter]
+        except KeyError:  # Error with CLD for openadmet-dummy
+            cld_column[idx] = "None"
+
+    leaderboard["CLD"] = cld_column
+
+    return leaderboard
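To make the insert-absorb procedure above concrete, here is a small hypothetical usage sketch. The comparisons frame is built by hand with the same group1 / group2 / reject columns the Tukey HSD summary provides; groups that are never flagged as significantly different end up sharing at least one letter.

import pandas as pd
from cld import cld  # module added in this commit

# Hand-built pairwise comparisons: only A vs C is significantly different.
comparisons = pd.DataFrame({
    "group1": ["A", "A", "B"],
    "group2": ["B", "C", "C"],
    "reject": [False, True, False],
})

letters = cld(comparisons)
# A and C get distinct letters, while B shares a letter with each, e.g.
# {'A': ['b'], 'B': ['a', 'b'], 'C': ['a']} (exact letters depend on group order).
print(letters)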
evaluate.py CHANGED
@@ -257,6 +257,7 @@ def _evaluate_data(filename: str, test_repo: str, split_filename: str, results_r
     results_df['anonymous'] = meta.participant.anonymous
     results_df['hf_username'] = username
 
+    results_raw_df = results_raw_df[results_raw_df['Endpoint']=='Average']  # Save ONLY for average endpoint, otherwise file is too large
     results_raw_df['user'] = display_name
     results_raw_df['submission_time'] = timestamp
     results_raw_df['model_report'] = report
utils.py CHANGED
@@ -15,7 +15,7 @@ def make_tag_clickable(tag: str):
         return "Not submitted"
     return f'<a target="_blank" href="{tag}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">link</a>'
 
-def fetch_dataset_df():
+def fetch_dataset_df(download_raw=False):  # Change download_raw to True for the final leaderboard
     logger.info("Fetching latest results dataset from Hugging Face Hub...")
     # Specify feature types to load results dataset
     metric_features = {
@@ -60,7 +60,34 @@ def fetch_dataset_df():
         .reset_index(drop=True)
     )
     latest.rename(columns={"submission_time": "submission time"}, inplace=True)
-    return latest
+
+    # Also fetch raw dataset
+    metric_features = {
+        m: Value('float64') for m in METRICS
+    }
+    other_features.update({'Sample': Value("float32")})
+    feature_schema = Features(metric_features | other_features)
+
+    # We don't set download_raw for the live leaderboard, as it takes too long to load
+    latest_raw = None
+    if download_raw:
+        dset_raw = load_dataset(results_repo_validation,  # change to results_repo_test for test set
+                                name='raw',
+                                split='train',
+                                features=feature_schema,
+                                download_mode="force_redownload")
+        raw_df = dset_raw.to_pandas()
+        df_raw = raw_df.copy()
+        df_raw["submission_time"] = pd.to_datetime(df_raw["submission_time"], errors="coerce")
+        df_raw = df_raw.dropna(subset=["submission_time"])
+        latest_raw = (
+            df_raw.sort_values("submission_time")
+            .drop_duplicates(subset=["Sample", "Endpoint", "hf_username"], keep="last")
+            .sort_values(["Sample", "Endpoint", "user"])
+            .reset_index(drop=True)
+        )
+
+    return latest, latest_raw
 
 
 def clip_and_log_transform(y: np.ndarray):
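As a sanity check on the de-duplication logic above, a self-contained sketch with toy rows (column names follow the schema in the diff): only each user's most recent score per bootstrap sample and endpoint survives.

import pandas as pd

# Two submissions from "alice" for the same (Sample, Endpoint); only the newer one should remain.
df_raw = pd.DataFrame({
    "Sample": [0.0, 0.0, 0.0],
    "Endpoint": ["Average", "Average", "Average"],
    "hf_username": ["alice", "alice", "bob"],
    "user": ["alice", "alice", "bob"],
    "submission_time": ["2025-01-01 10:00", "2025-01-02 10:00", "2025-01-01 12:00"],
    "MA-RAE": [0.9, 0.7, 0.8],
})

df_raw["submission_time"] = pd.to_datetime(df_raw["submission_time"], errors="coerce")
latest_raw = (
    df_raw.sort_values("submission_time")
    .drop_duplicates(subset=["Sample", "Endpoint", "hf_username"], keep="last")
    .sort_values(["Sample", "Endpoint", "user"])
    .reset_index(drop=True)
)
# latest_raw keeps two rows: alice's 2025-01-02 score (MA-RAE 0.7) and bob's single row.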
|