Maria Castellanos committed on
Commit 24d6e19 · 1 Parent(s): 179f265

Add code for CLD

Files changed (5)
  1. about.py +1 -1
  2. app.py +37 -16
  3. cld.py +204 -0
  4. evaluate.py +1 -0
  5. utils.py +29 -2
about.py CHANGED
@@ -15,7 +15,7 @@ STANDARD_COLS = ["Endpoint", "user", "submission_time", "model_report"]
15
  METRICS = ["MAE", "RAE", "R2", "Spearman R", "Kendall's Tau"]
16
  # Final columns
17
  LB_COLS = ["user", "MAE", "R2", "Spearman R", "Kendall's Tau", "submission time", "model details"]
18
- LB_AVG = ["user", "MA-RAE", "R2", "Spearman R", "Kendall's Tau", "submission time", "model details"] # Delete some columns for overall LB?
19
  LB_DTYPES = ['markdown', 'number', 'number', 'number', 'number', 'str', 'markdown', 'number']
20
 
21
  # Dictionary with unit conversion multipliers for each endpoint
 
15
  METRICS = ["MAE", "RAE", "R2", "Spearman R", "Kendall's Tau"]
16
  # Final columns
17
  LB_COLS = ["user", "MAE", "R2", "Spearman R", "Kendall's Tau", "submission time", "model details"]
18
+ LB_AVG = ["rank", "user", "MA-RAE", "R2", "Spearman R", "Kendall's Tau", "submission time", "model details"] # Delete some columns for overall LB?
19
  LB_DTYPES = ['markdown', 'number', 'number', 'number', 'number', 'str', 'markdown', 'number']
20
 
21
  # Dictionary with unit conversion multipliers for each endpoint
app.py CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
2
  from gradio_leaderboard import Leaderboard
3
  from gradio.themes.utils import sizes
4
  import pandas as pd
 
5
 
6
  from evaluate import submit_data, evaluate_data
7
  from utils import (
@@ -10,6 +11,7 @@ from utils import (
10
  fetch_dataset_df,
11
  map_metric_to_stats,
12
  )
 
13
  from datasets import load_dataset
14
  import tempfile
15
  from loguru import logger
@@ -21,7 +23,7 @@ import threading
21
 
22
  ALL_EPS = ['Average'] + ENDPOINTS
23
 
24
- def build_leaderboard(df_results):
25
  logger.info("Rebuilding leaderboard data...")
26
  per_ep = {}
27
  for ep in ALL_EPS:
@@ -32,10 +34,7 @@ def build_leaderboard(df_results):
32
  per_ep[ep] = pd.DataFrame(columns=LB_COLS) # Empty df
33
  continue
34
 
35
- # Make user and model details clickable if it's a huggingface user
36
- df['user'] = df.apply(
37
- lambda row: make_user_clickable(row['user']) if not row['anonymous'] else row['user'],
38
- axis=1).astype(str)
39
  df['model details'] = df['model_report'].apply(lambda x: make_tag_clickable(x)).astype(str)
40
 
41
  if ep == "Average":
@@ -44,16 +43,38 @@ def build_leaderboard(df_results):
44
  "std_RAE": "std_MA-RAE"})
45
  sorted_df = df.sort_values(by='mean_MA-RAE', ascending=True, kind="stable")
46
  sorted_df = map_metric_to_stats(sorted_df, average=True)
47
- per_ep[ep] = sorted_df[LB_AVG]
48
  else:
49
  sorted_df = df.sort_values(by="mean_MAE", ascending=True, kind="stable")
50
  sorted_df = map_metric_to_stats(sorted_df)
 
 
 
51
  per_ep[ep] = sorted_df[LB_COLS]
52
  logger.info("Finished rebuilding leaderboard data.")
53
  return per_ep
54
 
55
  # Initialize global dataframe
56
- current_df = fetch_dataset_df()
57
 
58
  # # Initialize global counter
59
  # data_version_counter = 0
@@ -64,9 +85,9 @@ def update_current_dataframe():
64
  global current_df # ugly but works
65
  while True:
66
  logger.info("Fetching latest dataset for leaderboard...")
67
- current_df = fetch_dataset_df()
68
  logger.debug(f"Dataset version updated")
69
- time.sleep(60) # Check for updates every 60 seconds
70
 
71
  threading.Thread(target=update_current_dataframe, daemon=True).start()
72
 
@@ -174,7 +195,7 @@ with gr.Blocks(title="OpenADMET ADMET Challenge", fill_height=False,
174
 
175
  The test set will remain blinded until the challenge submission deadline. You will be tasked with predicting the same set of ADMET endpoints for the test set molecules.
176
 
177
- The training and blinded test set will also be made available on the [CDD Vault](https://www.collaborativedrug.com/). An account to access the CDD Vault can be requested by filling out this [form](https://forms.gle/KiviZ7AaGcuqtrwH8, which can also be used to request access to some other tools.
178
  Note that by joining the Vault, your account will be visible to other participants, so this option is **not recommended for those wishing to remain anonymous.**
179
 
180
  ## 📝 Evaluation
@@ -251,28 +272,28 @@ with gr.Blocks(title="OpenADMET ADMET Challenge", fill_height=False,
251
  # Aggregated leaderboard
252
  with gr.TabItem('OVERALL', elem_id="all_tab"):
253
  lboard_dict['Average'] = Leaderboard(
254
- value=build_leaderboard(current_df)['Average'],
255
- datatype=LB_DTYPES,
256
  select_columns=LB_AVG,
257
  search_columns=["user"],
258
  render=True,
259
- every=30,
260
  )
261
  # per-endpoint leaderboard
262
  for endpoint in ENDPOINTS:
263
  with gr.TabItem(endpoint):
264
  lboard_dict[endpoint] = Leaderboard(
265
- value=build_leaderboard(current_df)[endpoint],
266
  datatype=LB_DTYPES,
267
  select_columns=LB_COLS,
268
  search_columns=["user"],
269
  render=True,
270
- every=30,
271
  )
272
  # Auto-refresh
273
  def refresh_if_changed():
274
  logger.info("Refreshing on timer tick...")
275
- per_ep = build_leaderboard(current_df)
276
  #return [gr.update(value=per_ep.get(ep, pd.DataFrame(columns=LB_COLS))) for ep in ALL_EPS]
277
  return [per_ep[ep] for ep in ALL_EPS]
278
  data_version.change(fn=refresh_if_changed, outputs=[lboard_dict[ep] for ep in ALL_EPS])
 
2
  from gradio_leaderboard import Leaderboard
3
  from gradio.themes.utils import sizes
4
  import pandas as pd
5
+ import numpy as np
6
 
7
  from evaluate import submit_data, evaluate_data
8
  from utils import (
 
11
  fetch_dataset_df,
12
  map_metric_to_stats,
13
  )
14
+ from cld import add_cld_to_leaderboard
15
  from datasets import load_dataset
16
  import tempfile
17
  from loguru import logger
 
23
 
24
  ALL_EPS = ['Average'] + ENDPOINTS
25
 
26
+ def build_leaderboard(df_results, df_results_raw):
27
  logger.info("Rebuilding leaderboard data...")
28
  per_ep = {}
29
  for ep in ALL_EPS:
 
34
  per_ep[ep] = pd.DataFrame(columns=LB_COLS) # Empty df
35
  continue
36
 
37
+ # Make model details clickable (link to the model report)
 
 
 
38
  df['model details'] = df['model_report'].apply(lambda x: make_tag_clickable(x)).astype(str)
39
 
40
  if ep == "Average":
 
43
  "std_RAE": "std_MA-RAE"})
44
  sorted_df = df.sort_values(by='mean_MA-RAE', ascending=True, kind="stable")
45
  sorted_df = map_metric_to_stats(sorted_df, average=True)
46
+ # Add ranking column
47
+ sorted_df['rank'] = np.arange(1, len(sorted_df) + 1)
48
+ avg_leaderboard = sorted_df.copy()
49
+ avg_cols = LB_AVG
50
+ # Add CLD
51
+ if df_results_raw is not None:
52
+ df_raw = df_results_raw[df_results_raw["Endpoint"] == ep].copy()
53
+ df_raw = df_raw.rename(columns={"RAE": "MA-RAE"})
54
+ avg_leaderboard = add_cld_to_leaderboard(
55
+ sorted_df,
56
+ df_raw,
57
+ "MA-RAE",
58
+ )
59
+ avg_cols = ["rank", "user", "CLD", "MA-RAE", "R2", "Spearman R", "Kendall's Tau", "submission time", "model details"]
60
+
61
+ # Make user clickable if it's a huggingface user
62
+ avg_leaderboard['user'] = avg_leaderboard.apply(
63
+ lambda row: make_user_clickable(row['user']) if not row['anonymous'] else row['user'],
64
+ axis=1).astype(str)
65
+ per_ep[ep] = avg_leaderboard[avg_cols]
66
  else:
67
  sorted_df = df.sort_values(by="mean_MAE", ascending=True, kind="stable")
68
  sorted_df = map_metric_to_stats(sorted_df)
69
+ sorted_df['user'] = sorted_df.apply(
70
+ lambda row: make_user_clickable(row['user']) if not row['anonymous'] else row['user'],
71
+ axis=1).astype(str)
72
  per_ep[ep] = sorted_df[LB_COLS]
73
  logger.info("Finished rebuilding leaderboard data.")
74
  return per_ep
75
 
76
  # Initialize global dataframe
77
+ current_df, current_df_raw = fetch_dataset_df()
78
 
79
  # # Initialize global counter
80
  # data_version_counter = 0
 
85
  global current_df # ugly but works
86
  while True:
87
  logger.info("Fetching latest dataset for leaderboard...")
88
+ current_df, current_df_raw = fetch_dataset_df()
89
  logger.debug(f"Dataset version updated")
90
+ time.sleep(300) # Check for updates every 5 minutes
91
 
92
  threading.Thread(target=update_current_dataframe, daemon=True).start()
93
 
 
195
 
196
  The test set will remain blinded until the challenge submission deadline. You will be tasked with predicting the same set of ADMET endpoints for the test set molecules.
197
 
198
+ The training and blinded test set will also be made available on the [CDD Vault](https://www.collaborativedrug.com/). An account to access the CDD Vault can be requested by filling out this [form](https://forms.gle/KiviZ7AaGcuqtrwH8), which can also be used to request access to some other tools.
199
  Note that by joining the Vault, your account will be visible to other participants, so this option is **not recommended for those wishing to remain anonymous.**
200
 
201
  ## 📝 Evaluation
 
272
  # Aggregated leaderboard
273
  with gr.TabItem('OVERALL', elem_id="all_tab"):
274
  lboard_dict['Average'] = Leaderboard(
275
+ value=build_leaderboard(current_df, current_df_raw)['Average'],
276
+ datatype=['number'] + LB_DTYPES,
277
  select_columns=LB_AVG,
278
  search_columns=["user"],
279
  render=True,
280
+ every=300,
281
  )
282
  # per-endpoint leaderboard
283
  for endpoint in ENDPOINTS:
284
  with gr.TabItem(endpoint):
285
  lboard_dict[endpoint] = Leaderboard(
286
+ value=build_leaderboard(current_df, current_df_raw)[endpoint],
287
  datatype=LB_DTYPES,
288
  select_columns=LB_COLS,
289
  search_columns=["user"],
290
  render=True,
291
+ every=300,
292
  )
293
  # Auto-refresh
294
  def refresh_if_changed():
295
  logger.info("Refreshing on timer tick...")
296
+ per_ep = build_leaderboard(current_df, current_df_raw)
297
  #return [gr.update(value=per_ep.get(ep, pd.DataFrame(columns=LB_COLS))) for ep in ALL_EPS]
298
  return [per_ep[ep] for ep in ALL_EPS]
299
  data_version.change(fn=refresh_if_changed, outputs=[lboard_dict[ep] for ep in ALL_EPS])
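For context on how the new OVERALL columns are meant to read, here is a minimal, hedged sketch with made-up users and bootstrap scores (not challenge data): rows whose CLD codes share a letter were not significantly different under the Tukey HSD test on the raw replicates, while disjoint codes indicate a significant gap.

```python
import pandas as pd
from cld import add_cld_to_leaderboard  # added in this commit

# Hypothetical aggregated leaderboard, already sorted by mean MA-RAE and ranked.
leaderboard = pd.DataFrame({
    "rank": [1, 2, 3],
    "user": ["alice", "bob", "carol"],
    "MA-RAE": [0.41, 0.42, 0.81],
})

# Matching raw bootstrap replicates (one row per Sample x user), already filtered to
# the Average endpoint and renamed RAE -> MA-RAE, as build_leaderboard does.
raw_scores = pd.DataFrame({
    "Sample": list(range(5)) * 3,
    "user": ["alice"] * 5 + ["bob"] * 5 + ["carol"] * 5,
    "MA-RAE": [0.40, 0.41, 0.42, 0.41, 0.40,
               0.41, 0.42, 0.43, 0.42, 0.41,
               0.80, 0.82, 0.81, 0.79, 0.83],
})

print(add_cld_to_leaderboard(leaderboard, raw_scores, "MA-RAE")[["rank", "user", "CLD"]])
# Plausible outcome: alice and bob share a letter (statistically tied on MA-RAE),
# while carol gets a different letter (significantly worse).
```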
cld.py ADDED
@@ -0,0 +1,204 @@
1
+ from statsmodels.stats.multicomp import pairwise_tukeyhsd
2
+ from string import ascii_lowercase, ascii_uppercase
3
+ import tqdm
4
+ import pandas as pd
5
+
6
+ CLD_ALPHABET = list(ascii_lowercase) + list(ascii_uppercase)
7
+
8
+ def asserts_non_significance(col: list[bool], i: int, j: int) -> bool:
9
+ """Assert whether i and j are represented as non-significant in the column
10
+ i.e., if the corresponding values in the column are different
11
+
12
+ Parameters
13
+ ----------
14
+ col : list[bool]
15
+ current column
16
+ i : int
17
+ index of first treatment
18
+ j : int
19
+ index of second treatment
20
+
21
+ Returns
22
+ -------
23
+ bool
24
+ Whether the column marks both i and j as not significantly different
25
+ """
26
+ return col[i] and col[j]
27
+
28
+ def insert(column: list[bool], i: int, j: int):
29
+ """Duplicates column and in one of its copies flip entry i to 0,
30
+ and in the other copy flip entry j to 0
31
+
32
+ Parameters
33
+ ----------
34
+ column : list[bool]
35
+ Original column
36
+ i : int
37
+ Index of first group
38
+ j : int
39
+ Index of second group
40
+
41
+ Returns
42
+ -------
43
+ list[bool], list[bool]
44
+ New columns after duplication and flip
45
+ """
46
+ col_i = column.copy()
47
+ col_j = column.copy()
48
+ col_i[i] = False
49
+ col_j[j] = False
50
+ return col_i, col_j
51
+
52
+ def can_be_absorbed(new_col: list[bool], ref_col: list[bool]) -> bool:
53
+ """An old column absorbs the new column
54
+ if it has a True in every row in which the new column does
55
+
56
+ Parameters
57
+ ----------
58
+ new_col : list[bool]
59
+ Column to add
60
+ ref_col : list[bool]
61
+ Old column we are checking if it can absorb new_col
62
+
63
+ Returns
64
+ -------
65
+ bool
66
+ Whether the old column can absorb new_col
67
+ """
68
+ return all(ref_col[i] for i, x in enumerate(new_col) if x)
69
+
70
+ def absorb(new_column: list[bool], columns: list[list[bool]]) -> list[list[bool]]:
71
+ """Absorb new column into existing columns if the condition allows
72
+
73
+ Parameters
74
+ ----------
75
+ new_column : list[bool]
76
+ Column to add
77
+ columns : list[list[bool]]
78
+ existing columns
79
+
80
+ Returns
81
+ -------
82
+ list[list[bool]]
83
+ Columns after absorption
84
+ """
85
+ if any(can_be_absorbed(new_column, c) for c in columns):
86
+ return columns
87
+ return columns + [new_column]
88
+
89
+ def cld(comparisons: pd.DataFrame) -> dict[str, str]:
90
+ """
91
+ Compact Letter Display
92
+
93
+ Compute the compact letter display using the insert-absorb algorithm.
94
+
95
+ See the following papers for more information:
96
+ (1) https://doi.org/10.1016/j.csda.2006.09.035
97
+ (2) https://doi.org/10.1198/1061860043515
98
+
99
+ Parameters
100
+ ----------
101
+ comparisons : pd.DataFrame
102
+ A DataFrame containing the pairwise comparisons produced by:
103
+ https://www.statsmodels.org/dev/generated/statsmodels.stats.multicomp.pairwise_tukeyhsd.html
104
+ """
105
+ unique_groups = set(comparisons["group1"].unique())
106
+ unique_groups = unique_groups.union(set(comparisons["group2"].unique()))
107
+ unique_groups = list(unique_groups)
108
+ unique_groups_indices = {g: i for i, g in enumerate(unique_groups)}
109
+
110
+ sig_diff = comparisons[comparisons["reject"]]
111
+ print(f"Found {len(sig_diff)} significantly different pairs")
112
+
113
+ # Initialize CLD matrix for all unique groups/models, with "columns" as rows
114
+ solution = [[True] * len(unique_groups)]
115
+
116
+ for _, row in tqdm.tqdm(sig_diff.iterrows(), total=len(sig_diff)):
117
+ i = unique_groups_indices[row["group1"]]
118
+ j = unique_groups_indices[row["group2"]]
119
+
120
+ has_changed: bool = True
121
+ while has_changed:
122
+ has_changed = False
123
+
124
+ for idx in range(len(solution)):
125
+ if asserts_non_significance(solution[idx], i, j):
126
+ # Duplicate the column
127
+ col_i, col_j = insert(solution[idx], i, j)
128
+
129
+ # Remove the old column
130
+ solution.pop(idx)
131
+
132
+ # Try to absorb each new column into an existing column;
133
+ # otherwise it is simply added to the solution
134
+ solution = absorb(col_i, solution)
135
+ solution = absorb(col_j, solution)
136
+
137
+ has_changed = True
138
+ break
139
+
140
+ # Assign letters
141
+ letters = [""] * len(unique_groups)
142
+
143
+ for ci, col in enumerate(solution):
144
+ letter = CLD_ALPHABET[ci]
145
+ for idx, has_letter in enumerate(col):
146
+ if has_letter:
147
+ letters[idx] += letter
148
+
149
+ return {group: "".join(sorted(letter)) for group, letter in zip(unique_groups, letters)}
150
+
153
+
154
+ def add_cld_to_leaderboard(
155
+ leaderboard: pd.DataFrame,
156
+ scores: pd.DataFrame,
157
+ metric: str,
158
+ ):
159
+ """Add the compact letter display to the leaderboard.
160
+
161
+ Parameters
162
+ ----------
163
+ leaderboard : pd.DataFrame
164
+ The full leaderboard DataFrame
165
+ scores : pd.DataFrame
166
+ The **raw** scores DataFrame, with all replicates from bootstrapping
167
+ metric : str
168
+ The metric label to calculate CLD for.
169
+ """
170
+ ordered_methods = leaderboard["user"].values
171
+
172
+ scores = scores[["Sample", "user", metric]]
173
+ scores[metric] = scores[metric].astype(float)
174
+
175
+ # We compared methods using bootstrapping and the Tukey HSD test, presenting results via Compact Letter Display (CLD).
176
+ # While acknowledging that bootstrapping likely underestimates variance,
177
+ # we are not aware of better sampling techniques that fit the challenge format.
178
+ stats = pairwise_tukeyhsd(endog=scores[metric], groups=scores["user"])
179
+ # comparisons = stats.summary_frame()
180
+ # The installed statsmodels version is not the latest and lacks summary_frame(), so we use a small workaround via summary()
181
+ summary_table = stats.summary()
182
+ # data attribute is a list of lists with column names as first element
183
+ data = summary_table.data[1:]
184
+ columns = summary_table.data[0]
185
+ comparisons = pd.DataFrame(data=data, columns=columns)
186
+
187
+ letter_mapping = {}
188
+ letter_code = cld(comparisons)
189
+
190
+ cld_column = [""] * len(leaderboard)
191
+ for idx, method in enumerate(ordered_methods):
192
+ try:
193
+ letters = letter_code[str(method)]
194
+
195
+ for letter in letters:
196
+ if letter not in letter_mapping:
197
+ letter_mapping[letter] = CLD_ALPHABET[len(letter_mapping)]
198
+ cld_column[idx] += letter_mapping[letter]
199
+ except KeyError: # Error with CLD for openadmet-dummy
200
+ cld_column[idx] = "None"
201
+
202
+ leaderboard["CLD"] = cld_column
203
+
204
+ return leaderboard
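As a self-contained sanity check of the insert-absorb routine above, the following hypothetical snippet (hand-written comparison table, not real challenge output) feeds cld() three groups where only A vs C is flagged as significant:

```python
import pandas as pd
from cld import cld

# Hypothetical pairwise table in the shape built from pairwise_tukeyhsd's summary:
# only the A vs C comparison is rejected (significantly different).
comparisons = pd.DataFrame({
    "group1": ["A", "A", "B"],
    "group2": ["B", "C", "C"],
    "reject": [False, True, False],
})

print(cld(comparisons))
# One plausible outcome: {'A': 'a', 'B': 'ab', 'C': 'b'}; B shares a letter with both
# groups, while A and C never share one because they differ significantly.
```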
evaluate.py CHANGED
@@ -257,6 +257,7 @@ def _evaluate_data(filename: str, test_repo: str, split_filename: str, results_r
257
  results_df['anonymous'] = meta.participant.anonymous
258
  results_df['hf_username'] = username
259
 
 
260
  results_raw_df['user'] = display_name
261
  results_raw_df['submission_time'] = timestamp
262
  results_raw_df['model_report'] = report
 
257
  results_df['anonymous'] = meta.participant.anonymous
258
  results_df['hf_username'] = username
259
 
260
+ results_raw_df = results_raw_df[results_raw_df['Endpoint']=='Average'] # Save ONLY for average endpoint, otherwise file is too large
261
  results_raw_df['user'] = display_name
262
  results_raw_df['submission_time'] = timestamp
263
  results_raw_df['model_report'] = report
utils.py CHANGED
@@ -15,7 +15,7 @@ def make_tag_clickable(tag: str):
15
  return "Not submitted"
16
  return f'<a target="_blank" href="{tag}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">link</a>'
17
 
18
- def fetch_dataset_df():
19
  logger.info("Fetching latest results dataset from Hugging Face Hub...")
20
  # Specify feature types to load results dataset
21
  metric_features = {
@@ -60,7 +60,34 @@ def fetch_dataset_df():
60
  .reset_index(drop=True)
61
  )
62
  latest.rename(columns={"submission_time": "submission time"}, inplace=True)
63
- return latest
64
 
65
 
66
  def clip_and_log_transform(y: np.ndarray):
 
15
  return "Not submitted"
16
  return f'<a target="_blank" href="{tag}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">link</a>'
17
 
18
+ def fetch_dataset_df(download_raw=False): # Change download_raw to True for the final leaderboard
19
  logger.info("Fetching latest results dataset from Hugging Face Hub...")
20
  # Specify feature types to load results dataset
21
  metric_features = {
 
60
  .reset_index(drop=True)
61
  )
62
  latest.rename(columns={"submission_time": "submission time"}, inplace=True)
63
+
64
+ # Also fetch raw dataset
65
+ metric_features = {
66
+ m: Value('float64') for m in METRICS
67
+ }
68
+ other_features.update({'Sample': Value("float32")})
69
+ feature_schema = Features(metric_features | other_features)
70
+
71
+ # download_raw stays False for the live leaderboard, as the raw dataset takes too long to load
72
+ latest_raw = None
73
+ if download_raw:
74
+ dset_raw = load_dataset(results_repo_validation, # change to results_repo_test for test set
75
+ name='raw',
76
+ split='train',
77
+ features=feature_schema,
78
+ download_mode="force_redownload")
79
+ raw_df = dset_raw.to_pandas()
80
+ df_raw = raw_df.copy()
81
+ df_raw["submission_time"] = pd.to_datetime(df_raw["submission_time"], errors="coerce")
82
+ df_raw = df_raw.dropna(subset=["submission_time"])
83
+ latest_raw = (
84
+ df_raw.sort_values("submission_time")
85
+ .drop_duplicates(subset=["Sample", "Endpoint", "hf_username"], keep="last")
86
+ .sort_values(["Sample","Endpoint", "user"])
87
+ .reset_index(drop=True)
88
+ )
89
+
90
+ return latest, latest_raw
91
 
92
 
93
  def clip_and_log_transform(y: np.ndarray):
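For reference, a hedged usage sketch of the updated return signature (both values are consumed by build_leaderboard in app.py); the raw replicates are only fetched when explicitly requested, as in the final-leaderboard scenario noted in the comment above:

```python
from utils import fetch_dataset_df

# Live leaderboard: raw bootstrap scores are skipped, so no CLD column is computed.
latest, latest_raw = fetch_dataset_df()            # latest_raw is None here
# Final leaderboard: also pull the raw replicates so add_cld_to_leaderboard can run.
latest, latest_raw = fetch_dataset_df(download_raw=True)
```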