import gc
import os
import sys
import warnings

import pandas as pd
import streamlit as st
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Make the bundled task_forward helpers importable.
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), "task_forward"))
)
from generation_utils import (
    ReactionT5Dataset,
    decode_output,
    save_multiple_predictions,
)
from train import preprocess_df
from utils import seed_everything

warnings.filterwarnings("ignore")
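# Streamlit front end: the sections below build the UI, collect generation settings
# into CFG, run beam-search generation with ReactionT5, and offer the predictions
# as a downloadable CSV.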
st.title("ReactionT5 task forward")
st.markdown("""
##### In this Space, you can predict the products of reactions from their inputs.
##### The input is a string or a CSV file that contains an "input" column.
##### Each input string (or column entry) should have the format "REACTANT:{reactants}REAGENT:{reagents}".
##### If there is no reagent, fill that part with a space. Concatenate multiple compounds with ".".
##### The output contains the SMILES of the predicted products and the sum of log-likelihoods for each prediction, ordered by log-likelihood (the 0th prediction is the most probable product).
""")
st.download_button(
    label="Download demo_reaction_data.csv",
    data=pd.read_csv("data/demo_reaction_data.csv").to_csv(index=False),
    file_name="demo_reaction_data.csv",
    mime="text/csv",
)
class CFG:
    num_beams = st.number_input(
        label="num beams", min_value=1, max_value=10, value=5, step=1
    )
    num_return_sequences = num_beams
    input_data = st.file_uploader("Choose a CSV file")
    model_name_or_path = "sagawa/ReactionT5v2-forward"
    input_column = "input"
    input_max_length = 400
    # output_min_length is used by model.generate() below but was not defined here;
    # 1 is an assumed placeholder value.
    output_min_length = 1
    output_max_length = 300
    model = "t5"
    seed = 42
    batch_size = 1
if st.button("predict"):
    with st.spinner(
        "Now processing. If num beams=5, this process takes about 15 seconds per reaction."
    ):
        # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # seed_everything(seed=CFG.seed)
        # tokenizer = AutoTokenizer.from_pretrained(CFG.model_name_or_path, return_tensors="pt")
        # model = AutoModelForSeq2SeqLM.from_pretrained(CFG.model_name_or_path).to(device)
        # model.eval()
        # if CFG.uploaded_file is None:
        #     input_compound = CFG.input_data
        #     output = predict_single_input(input_compound)
        #     sequences, scores = decode_output(output)
        #     output_df = save_single_prediction(input_compound, sequences, scores)
        # else:
        #     input_data = pd.read_csv(CFG.uploaded_file)
        #     dataset = ProductDataset(CFG, input_data)
        #     dataloader = DataLoader(
        #         dataset,
        #         batch_size=CFG.batch_size,
        #         shuffle=False,
        #         num_workers=4,
        #         pin_memory=True,
        #         drop_last=False,
        #     )
        #     all_sequences, all_scores = [], []
        #     for inputs in dataloader:
        #         inputs = {k: v[0].to(device) for k, v in inputs.items()}
        #         with torch.no_grad():
        #             output = model.generate(
        #                 **inputs,
        #                 num_beams=CFG.num_beams,
        #                 num_return_sequences=CFG.num_return_sequences,
        #                 return_dict_in_generate=True,
        #                 output_scores=True,
        #             )
        #         sequences, scores = decode_output(output)
        #         all_sequences.extend(sequences)
        #         if scores:
        #             all_scores.extend(scores)
        #         del output
        #         torch.cuda.empty_cache()
        #         gc.collect()
        #     output_df = save_multiple_predictions(input_data, all_sequences, all_scores)
        CFG.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        seed_everything(seed=CFG.seed)

        # Load the tokenizer and model, preferring a local path if it exists,
        # otherwise pulling the checkpoint from the Hugging Face Hub.
        CFG.tokenizer = AutoTokenizer.from_pretrained(
            os.path.abspath(CFG.model_name_or_path)
            if os.path.exists(CFG.model_name_or_path)
            else CFG.model_name_or_path,
            return_tensors="pt",
        )
        model = AutoModelForSeq2SeqLM.from_pretrained(
            os.path.abspath(CFG.model_name_or_path)
            if os.path.exists(CFG.model_name_or_path)
            else CFG.model_name_or_path
        ).to(CFG.device)
        model.eval()

        # Read the uploaded CSV, preprocess it, and wrap it in a DataLoader.
        input_data = pd.read_csv(CFG.input_data)
        input_data = preprocess_df(input_data, drop_duplicates=False)
        dataset = ReactionT5Dataset(CFG, input_data)
        dataloader = DataLoader(
            dataset,
            batch_size=CFG.batch_size,
            shuffle=False,
            num_workers=4,
            pin_memory=True,
            drop_last=False,
        )

        # Beam-search generation; the returned scores rank candidate products
        # by log-likelihood.
        all_sequences, all_scores = [], []
        for inputs in tqdm(dataloader, total=len(dataloader)):
            inputs = {k: v.to(CFG.device) for k, v in inputs.items()}
            with torch.no_grad():
                output = model.generate(
                    **inputs,
                    min_length=CFG.output_min_length,
                    max_length=CFG.output_max_length,
                    num_beams=CFG.num_beams,
                    num_return_sequences=CFG.num_return_sequences,
                    return_dict_in_generate=True,
                    output_scores=True,
                )
            sequences, scores = decode_output(output, CFG)
            all_sequences.extend(sequences)
            if scores:
                all_scores.extend(scores)
            # Free GPU memory between batches.
            del output
            torch.cuda.empty_cache()
            gc.collect()

        output_df = save_multiple_predictions(
            input_data, all_sequences, all_scores, CFG
        )
        # output_df.to_csv(os.path.join(CFG.output_dir, "output.csv"), index=False)
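        # Per the description above, output_df holds each input together with its
        # predicted product SMILES and their summed log-likelihoods, ordered so the
        # 0th prediction is the most probable; the exact column names come from
        # save_multiple_predictions.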
    def convert_df(df):
        return df.to_csv(index=False)

    csv = convert_df(output_df)

    st.download_button(
        label="Download data as CSV",
        data=csv,
        file_name="output.csv",
        mime="text/csv",
    )
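# To run this app locally (assuming this file is app.py at the Space root, with the
# task_forward helpers and the data/ directory alongside it):
#   pip install streamlit torch transformers pandas tqdm
#   streamlit run app.py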