Maria Castellanos committed on
Commit
b4b15c9
·
1 Parent(s): ac084ef

Update interface

Browse files
Files changed (3) hide show
  1. about.py +20 -0
  2. app.py +179 -27
  3. evaluate.py +187 -0
about.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from huggingface_hub import HfApi
3
+
4
# Endpoint names; evaluate.py uses these as the column names it requires in a
# submission and scores against the test set.
ENDPOINTS = [
    "LogD",
    "KSol",
    "MLM CLint",
    "HLM CLint",
    "Caco-2 Permeability Efflux",
    "Caco-2 Permeability Papp A>B",
    "MPPB",
    "MBPB",
    "RLM CLint",
    "MGMB",
]

# Hub token taken from the environment; None when HF_TOKEN is not set.
TOKEN = os.getenv("HF_TOKEN")
# Falls back to the current directory when HF_HOME is not set.
CACHE_PATH = os.environ.get("HF_HOME", ".")
# Shared Hub client built with the token above.
API = HfApi(token=TOKEN)

organization = "OpenADMET"
submissions_repo = organization + "/openadmet-challenge-submissions"  # private
results_repo = organization + "/openadmet-challenge-results"  # public
test_repo = organization + "/openadmet-challenge-test-data"  # private
app.py CHANGED
@@ -2,27 +2,85 @@ import gradio as gr
2
  from gradio_leaderboard import Leaderboard, ColumnFilter
3
  import pandas as pd
4
 
5
- # dataset = load_dataset("your_dataset_name")
6
-
7
 
 
8
  from datetime import datetime
 
 
9
 
 
 
 
 
 
 
 
10
 
 
11
 
12
  def gradio_interface():
13
  with gr.Blocks(title="OpenADMET ADMET Challenge") as demo:
14
 
15
-
16
 
17
  # --- Welcome markdown message ---
18
  welcome_md = """
19
- # 🧪 OpenADMET + XXX
20
  ## Computational Blind Challenge in ADMET
21
 
22
- Welcome to the **XXX**, hosted by **OpenADMET** in collaboration with **XXX**.
 
23
 
24
  Your task is to develop and submit predictive models for key ADMET properties on a blinded test set of real world drug discovery data.
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  📅 **Timeline**:
28
  - TBD
@@ -34,35 +92,129 @@ def gradio_interface():
34
  # --- Gradio Interface ---
35
  with gr.Tabs(elem_classes="tab-buttons"):
36
 
37
- with gr.TabItem("Welcome"):
38
  gr.Markdown(welcome_md)
39
 
40
- with gr.TabItem("Submit Predictions"):
41
- gr.Markdown("Upload your prediction files here.")
42
- filename = gr.State(value=None)
43
- eval_state = gr.State(value=None)
44
- user_state = gr.State(value=None)
45
-
46
- with gr.TabItem("Leaderboard"):
47
- gr.Markdown("View the leaderboard here.")
48
- df = pd.DataFrame({
49
  "user": ["User1", "User2", "User3"],
50
- "Model": ["A", "B", "C"],
51
  "R2": [0.94, 0.92, 0.89],
52
  "Spearman R": [0.93, 0.91, 0.88],
 
53
  })
54
- Leaderboard(
55
- value=df,
56
- # Optionally configure columns:
57
- select_columns=["Model", "R2", "Spearman R"],
58
- # Additional options: search_columns, filter_columns, hide_columns, etc.
59
- search_columns=["Model", "user"],
60
- )
61
-
62
-
63
- with gr.TabItem("About"):
64
- gr.Markdown("Learn more about the challenge and the organizers.")
 
 
 
 
 
 
 
 
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  return demo
67
 
68
  if __name__ == "__main__":
 
2
  from gradio_leaderboard import Leaderboard, ColumnFilter
3
  import pandas as pd
4
 
5
+ from about import submissions_repo, results_repo
6
+ from evaluate import submit_data, evaluate_data
7
 
8
+ from datasets import load_dataset
9
  from datetime import datetime
10
+ from about import ENDPOINTS
11
+
12
 
13
def get_leaderboard(dset):
    """Return the publicly displayable leaderboard table.

    The ``dset`` argument is ignored: the 'train' split of ``results_repo`` is
    always re-downloaded (``download_mode="force_redownload"``) and used in
    its place.

    Returns:
        A DataFrame without the rows whose ``user`` is ``'test'``, limited to
        the public columns listed below.
    """
    # The columns to display publicly
    public_columns = ["user", "Model", "MAE", "R2", "Spearman R", "Kendall's Tau"]

    fresh = load_dataset(results_repo, split='train', download_mode="force_redownload")
    table = pd.DataFrame(fresh).copy(deep=True)

    without_test_rows = table[table['user'] != 'test']
    return without_test_rows[public_columns]
22
 
23
  def gradio_interface():
24
  with gr.Blocks(title="OpenADMET ADMET Challenge") as demo:
25
 
26
+ gr.Markdown("## Welcome to the OpenADMET + XXX Blind Challenge!")
27
 
28
  # --- Welcome markdown message ---
29
  welcome_md = """
30
+ # 💊 OpenADMET + XXX
31
  ## Computational Blind Challenge in ADMET
32
 
33
+ Welcome to the **XXX**, hosted by **OpenADMET** in collaboration with **XXX**.
34
+ This is a community-driven initiative to benchmark predictive models for ADMET properties in drug discovery.
35
 
36
  Your task is to develop and submit predictive models for key ADMET properties on a blinded test set of real world drug discovery data.
37
 
38
+ ## ADMET Properties:
39
+ *Absorption*, *Distribution*, *Metabolism*, *Excretion*, *Toxicology*--or **ADMET**--endpoints sit in the middle of the assay cascade and can make or break preclinical candidate molecules.
40
+ For this blind challenge we selected several crucial endpoints for the community to predict:
41
+ - LogD
42
+ - Kinetic Solubility **KSOL**: uM
43
+ - Mouse Liver Microsomal (**MLM**) *CLint*: mL/min/kg
44
+ - Human Liver Microsomal (**HLM**) *Clint*: mL/min/kg
45
+ - Caco-2 Efflux Ratio
46
+ - Caco-2 Papp A>B (10^-6 cm/s)
47
+ - Mouse Plasma Protein Binding (**MPPB**): % Unbound
48
+ - Mouse Brain Protein Binding (**MBPB**): % Unbound
49
+ - Rat Liver Microsomal (**RLM**) *Clint*: mL/min/kg
50
+ - Mouse Gastrocnemius Muscle Binding (**MGMB**): % Unbound
51
+
52
+ ## ✅ How to Participate
53
+ 1. **Register**: Create an account with Hugging Face.
54
+ 2. **Download the Public Dataset**: Clone the XXX dataset [link]
55
+ 3. **Train Your Model**: Use the provided training data for each ADMET property of your choice.
56
+ 4. **Submit Predictions**: Follow the instructions in the *Submit* tab to upload your predictions.
57
+ 5. Join the discussion on the [Challenge Discord](link)!
58
+
59
+ ## 📊 Data:
60
+
61
+ The training set will have the following variables:
62
+
63
+ | Column | Unit | data type | Description |
64
+ |:-----------------------------|-----------|-----------|:-------------|
65
+ | Molecule Name | | str | Identifier for the molecule |
66
+ | Smiles | | str | Text representation of the 2D molecular structure |
67
+ | LogD | | float | LogD calculation |
68
+ | KSol | uM | float | Kinetic Solubility |
69
+ | MLM CLint | mL/min/kg | float | Mouse Liver Microsomal |
70
+ | HLM CLint | mL/min/kg | float | Human Liver Microsomal |
71
+ | Caco-2 Permeability Efflux | | float | Caco-2 Permeability Efflux |
72
+ | Caco-2 Permeability Papp A>B | 10^-6 cm/s| float | Caco-2 Permeability Papp A>B |
73
+ | MPPB | % Unbound | float | Mouse Plasma Protein Binding |
74
+ | MBPB | % Unbound | float | Mouse Brain Protein Binding |
75
+ | RLM CLint | mL/min/kg | float | Rat Liver Microsomal Stability |
76
+ | MGMB. | % Unbound | float | Mouse Gastrocnemius Muscle Binding |
77
+
78
+ At test time, we will only provide the Molecule Name and Smiles. Make sure your submission file has the same columns!
79
+
80
+ ## 📝 Evaluation
81
+ The challenge will be judged based on the judging criteria outlined here.
82
+
83
+ - TBD
84
 
85
  📅 **Timeline**:
86
  - TBD
 
92
  # --- Gradio Interface ---
93
  with gr.Tabs(elem_classes="tab-buttons"):
94
 
95
+ with gr.TabItem("📝About"):
96
  gr.Markdown(welcome_md)
97
 
98
+ with gr.TabItem("🚀Leaderboard"):
99
+ gr.Markdown("View the leaderboard for each ADMET endpoint by selecting the appropiate tab.")
100
+ df1 = pd.DataFrame({
 
 
 
 
 
 
101
  "user": ["User1", "User2", "User3"],
102
+ "MAE": [0.1, 0.2, 0.15],
103
  "R2": [0.94, 0.92, 0.89],
104
  "Spearman R": [0.93, 0.91, 0.88],
105
+ "Kendall's Tau": [0.90, 0.89, 0.85],
106
  })
107
+ df2 = pd.DataFrame({
108
+ "user": ["User1", "User2", "User3"],
109
+ "MAE": [0.2, 0.3, 0.15],
110
+ "R2": [0.2, 0.72, 0.89],
111
+ "Spearman R": [0.91, 0.71, 0.68],
112
+ "Kendall's Tau": [0.90, 0.4, 0.7],
113
+ })
114
+ # Make separate leaderboards in separate tabs
115
+ mock_data = [df1, df1, df2, df1, df2, df1, df1, df2, df1, df2]
116
+ for i, endpoint in enumerate(ENDPOINTS):
117
+ df = mock_data[i]
118
+ with gr.TabItem(endpoint):
119
+ Leaderboard(
120
+ value=df,
121
+ datatype=['str', 'number', 'number', 'number', 'number'],
122
+ select_columns=["user", "MAE", "R2", "Spearman R", "Kendall's Tau"],
123
+ search_columns=["user"],
124
+ every=60,
125
+ )
126
 
127
+ with gr.TabItem("Submit Predictions"):
128
+ gr.Markdown(
129
+ """
130
+ # ADME Endpoints Submission
131
+ Upload your prediction files here as a csv file.
132
+ """
133
+ )
134
+ filename = gr.State(value=None)
135
+ eval_state = gr.State(value=None)
136
+ user_state = gr.State(value=None)
137
+
138
+ with gr.Row():
139
+
140
+ with gr.Column():
141
+ gr.Markdown(
142
+ """
143
+ ## Participant Information
144
+ To participate, you must enter a Hugging Face username, or alias, which will be displayed on the leaderboard.
145
+ Other information is optional but helps us track participation.
146
+ If you wish to be included in Challenge discussions, please provide your Discord username and email.
147
+ If you wish to be included in a future publication with the Challenge results, please provide your name and affiliation.
148
+ """
149
+ )
150
+ # endpoint_type = gr.CheckboxGroup(
151
+ ## ENDPOINTS,
152
+ # label="ADME Endpoints",
153
+ # info="Select the ADME endpoints you are submitting predictions for."),
154
+ # Could also allow a display name in case HF username is not necessary?
155
+ username_input = gr.Textbox(
156
+ label="Username",
157
+ placeholder="Enter your Hugging Face username",
158
+ info="This will be displayed on the leaderboard."
159
+ )
160
+ with gr.Column():
161
+ # Info to track participant, that will not be displayed publicly
162
+ participant_name = gr.Textbox(
163
+ label="Participant Name",
164
+ placeholder="Enter your name (optional)",
165
+ info="This will not be displayed on the leaderboard but will be used for tracking participation."
166
+ )
167
+ discord_username= gr.Textbox(
168
+ label="Discord Username",
169
+ placeholder="Enter your Discord username (optional)",
170
+ info="Enter the username you will use for the Discord channel (if you are planning to engage in the discussion)."
171
+ )
172
+ email = gr.Textbox(
173
+ label="Email",
174
+ placeholder="Enter your email (optional)",
175
+ )
176
+ affiliation = gr.Textbox(
177
+ label="Affiliation",
178
+ placeholder="Enter your school/company affiliation (optional)",
179
+ )
180
+
181
+ with gr.Row():
182
+ with gr.Column():
183
+ gr.Markdown(
184
+ """
185
+ ## Submission Instructions
186
+ Upload a single CSV file containing your predictions for all ligands in the test set.
187
+ You can download the ligand test set here (lik/to/download/smiles/csv).
188
+ """
189
+ )
190
+ with gr.Column():
191
+ predictions_file = gr.File(label="Single file with ADME predictions (.csv)",
192
+ file_types=[".csv"],
193
+ file_count="single",)
194
+
195
+ username_input.change(
196
+ fn=lambda x: x if x.strip() else None,
197
+ inputs=username_input,
198
+ outputs=user_state
199
+ )
200
+
201
+ submit_btn = gr.Button("Submit Predictions")
202
+ message = gr.Textbox(label="Status", lines=1, visible=False)
203
+ '''
204
+ submit_btn.click(
205
+ submit_data,
206
+ inputs=[predictions_file, user_state, participant_name, discord_username, email, affiliation],
207
+ outputs=[message],
208
+ ).then(
209
+ fn=lambda m: gr.update(value=m, visible=True),
210
+ inputs=[message],
211
+ outputs=[message],
212
+ ).then(
213
+ fn=evaluate_data,
214
+ inputs=[filename],
215
+ outputs=[eval_state]
216
+ )
217
+ '''
218
  return demo
219
 
220
  if __name__ == "__main__":
evaluate.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from pathlib import Path
4
+ from scipy.stats import spearmanr, kendalltau
5
+ from sklearn.metrics import mean_absolute_error, r2_score
6
+ from typing import List
7
+ from about import ENDPOINTS, API, submissions_repo, results_repo, test_repo
8
+ from huggingface_hub import hf_hub_download
9
+ import datetime
10
+ import io
11
+ import json, tempfile
12
+
13
+
14
def _compact_dict(d: dict) -> dict:
    """Return a copy of *d* without empty values.

    String values are stripped of surrounding whitespace first; any value that
    then equals ``None``, ``""`` or ``[]`` is left out. Key order is kept.
    """
    stripped = (
        (key, val.strip() if isinstance(val, str) else val)
        for key, val in d.items()
    )
    return {key: val for key, val in stripped if val not in (None, "", [])}
23
+
24
+
25
def submit_data(predictions_file: str,
                user_state,
                participant_name: str = "",
                discord_username: str = "",
                email: str = "",
                affiliation: str = ""
                ):
    """Validate an uploaded predictions CSV and store it in the submissions dataset.

    The CSV is uploaded to ``submissions_repo`` under ``submissions/`` together
    with a JSON metadata file of the same stem.

    The optional participant fields accept positional as well as keyword
    arguments, because Gradio event listeners pass their ``inputs`` to the
    callback positionally.

    Args:
        predictions_file: Path to the uploaded CSV file.
        user_state: Username or alias shown on the leaderboard (required).
        participant_name: Optional participant name, stored in the metadata only.
        discord_username: Optional Discord username, stored in the metadata only.
        email: Optional e-mail address, stored in the metadata only.
        affiliation: Optional affiliation, stored in the metadata only.

    Returns:
        A ``(status_message, destination_csv)`` tuple, where ``destination_csv``
        is the path of the uploaded CSV inside the submissions dataset.

    Raises:
        gr.Error: If the username or file is missing, the file cannot be read,
            is empty, or lacks a required column.
    """
    if user_state is None or not str(user_state).strip():
        raise gr.Error("Username or alias is required for submission.")
    user = str(user_state).strip()

    # No file selected: fail with a clear message instead of a TypeError in Path().
    if predictions_file is None:
        raise gr.Error("Uploaded file object does not have a valid file path.")

    file_path = Path(predictions_file).resolve()

    if not file_path.exists():
        raise gr.Error("Uploaded file object does not have a valid file path.")

    # Read results file. Raise (like every other validation failure here) so
    # the function never returns anything but the 2-tuple documented above.
    try:
        results_df = pd.read_csv(file_path)
    except Exception as e:
        raise gr.Error(f"❌ Error reading results file: {str(e)}") from e

    if results_df.empty:
        raise gr.Error("The uploaded file is empty.")

    # 'Molecule Name' is what calculate_metrics uses to line predictions up
    # with the test set, so it is required alongside the endpoint columns.
    required_columns = ["Molecule Name", *ENDPOINTS]
    missing_columns = [col for col in required_columns if col not in results_df.columns]
    if missing_columns:
        raise gr.Error(
            "The uploaded file must contain the 'Molecule Name' column and all "
            f"endpoint predictions. Missing columns: {', '.join(missing_columns)}"
        )

    # Build destination filename in the dataset
    ts = datetime.datetime.now(datetime.timezone.utc).isoformat(timespec="seconds").replace(":", "-")
    safe_user = user.replace("/", "_").replace(" ", "_")

    destination_csv = f"submissions/{safe_user}_{ts}.csv"
    destination_json = destination_csv.replace(".csv", ".json")
    # Upload the CSV file
    API.upload_file(
        path_or_fileobj=str(file_path),
        path_in_repo=destination_csv,
        repo_id=submissions_repo,
        repo_type="dataset",
        commit_message=f"Add submission for {safe_user} at {ts}"
    )

    # Optional participant record (blank fields are left out of the metadata)
    participant_fields = _compact_dict({
        "participant_name": participant_name,
        "discord_username": discord_username,
        "email": email,
        "affiliation": affiliation,
    })
    # Metadata JSON
    meta = {
        "submission_time_utc": ts,
        "user": user,
        "original_filename": file_path.name,
        "evaluated": False,
        **participant_fields,  # merged here
    }

    meta_bytes = io.BytesIO(json.dumps(meta, indent=2).encode("utf-8"))
    API.upload_file(
        path_or_fileobj=meta_bytes,
        path_in_repo=destination_json,
        repo_id=submissions_repo,
        repo_type="dataset",
        commit_message=f"Add metadata for {user} submission at {ts}"
    )

    return "✅ Your submission has been received! Your scores will appear on the leaderboard shortly.", destination_csv
94
+
95
def evaluate_data(filename: str) -> None:
    """Score a stored submission and upload the metrics to the results dataset.

    Downloads the submission CSV and its JSON metadata from
    ``submissions_repo`` and the reference data from ``test_repo``, computes
    the metrics with ``calculate_metrics`` and uploads them as a CSV under
    ``results/`` in ``results_repo``.

    Args:
        filename: Path of the submission CSV inside the submissions dataset.

    Raises:
        gr.Error: If no filename is given, a download fails, the evaluation
            fails or yields nothing, or the metadata is unusable. In all of
            these cases nothing is written to the results dataset.
    """
    if not filename:
        raise gr.Error("No submission file to evaluate.")

    # Load the submission csv
    try:
        local_path = hf_hub_download(
            repo_id=submissions_repo,
            repo_type="dataset",
            filename=filename,
        )
    except Exception as e:
        raise gr.Error(f"Failed to download submission file: {e}") from e

    # Load the test set
    try:
        test_path = hf_hub_download(
            repo_id=test_repo,
            repo_type="dataset",
            filename="data/test_dataset.csv",
        )
    except Exception as e:
        raise gr.Error(f"Failed to download test file: {e}") from e

    # Parsing is part of the evaluation, so unreadable CSVs are reported the
    # same way as metric failures.
    try:
        data_df = pd.read_csv(local_path)
        test_df = pd.read_csv(test_path)
        results_df = calculate_metrics(data_df, test_df)
    except Exception as e:
        raise gr.Error(f'Evaluation failed: {e}. No results written to results dataset.') from e

    # Checked outside the try block so this error is not caught and re-wrapped.
    if not isinstance(results_df, pd.DataFrame) or results_df.empty:
        raise gr.Error("Evaluation produced no results.")

    # Load metadata file
    meta_filename = filename.replace(".csv", ".json")
    try:
        meta_path = hf_hub_download(
            repo_id=submissions_repo,
            repo_type="dataset",
            filename=meta_filename,
        )
        with open(meta_path, "r", encoding="utf-8") as f:
            meta = json.load(f)
        username = meta.get("user")
        timestamp = meta.get("submission_time_utc")
    except Exception as e:
        raise gr.Error(f"Failed to load metadata file: {e}. No results written to results dataset.") from e

    # Without both values the results path below would contain "None".
    if username is None or timestamp is None:
        raise gr.Error(
            "Metadata file is missing 'user' or 'submission_time_utc'. "
            "No results written to results dataset."
        )

    # Write results to results dataset
    results_df['user'] = username
    safe_user = str(username).replace("/", "_").replace(" ", "_")
    destination_path = f"results/{safe_user}_{timestamp}_results.csv"

    # Upload from memory: no temporary file is left behind if the upload
    # fails, and the encoding is always UTF-8.
    csv_bytes = results_df.to_csv(index=False).encode("utf-8")
    API.upload_file(
        path_or_fileobj=csv_bytes,
        path_in_repo=destination_path,
        repo_id=results_repo,
        repo_type="dataset",
        commit_message=f"Add result data for {username}"
    )
159
+
160
+
161
def calculate_metrics(
    results_dataframe: pd.DataFrame,
    test_dataframe: pd.DataFrame
):
    """Compute per-endpoint metrics for a submission against the test set.

    For every endpoint in ``ENDPOINTS`` the rows of the test set that have a
    value are joined with the submitted predictions on ``'Molecule Name'``.
    Each prediction is therefore compared with the measurement of the same
    molecule, even when the two tables differ in row order or in which rows
    have values.

    Args:
        results_dataframe: Submitted predictions; needs a ``'Molecule Name'``
            column and one column per endpoint.
        test_dataframe: Reference values with the same columns.

    Returns:
        A DataFrame with one row per endpoint and the columns ``endpoint``,
        ``MAE``, ``R2``, ``Spearman R`` and ``Kendall's Tau``.

    Raises:
        ValueError: If a molecule with a reference value for an endpoint has
            no prediction for it.
    """
    columns = ["endpoint", "MAE", "R2", "Spearman R", "Kendall's Tau"]

    def metrics_per_ep(pred, true):
        mae = mean_absolute_error(true, pred)
        r2 = r2_score(true, pred)
        spr, _ = spearmanr(true, pred)
        ktau, _ = kendalltau(true, pred)
        return mae, r2, spr, ktau

    rows = []
    for measurement in ENDPOINTS:
        df_pred = results_dataframe[['Molecule Name', measurement]].dropna()
        df_true = test_dataframe[['Molecule Name', measurement]].dropna()

        # Left-join from the reference values so that every measured molecule
        # must be matched by a prediction of the same name.
        paired = df_true.merge(
            df_pred,
            on='Molecule Name',
            how='left',
            suffixes=('_true', '_pred'),
        )
        true = paired[f'{measurement}_true']
        pred = paired[f'{measurement}_pred']

        n_missing = int(pred.isna().sum())
        if n_missing:
            raise ValueError(
                f"{n_missing} molecule(s) in the test set have no prediction for '{measurement}'"
            )

        mae, r2, spearman, ktau = metrics_per_ep(pred, true)
        rows.append({
            "endpoint": measurement,
            "MAE": mae,
            "R2": r2,
            "Spearman R": spearman,
            "Kendall's Tau": ktau,
        })

    # Built in one go so the metric columns get a numeric dtype.
    return pd.DataFrame(rows, columns=columns)