Spaces:
Running
Running
| import streamlit as st | |
| import torch | |
| from datasets import combine | |
| from datasets import load_dataset | |
| from transformers import AutoTokenizer | |
| from transformers import pipeline | |
# Load the "sample" configuration of the HUPD dataset.
# The options are collected in one mapping and unpacked into load_dataset();
# the four date strings are passed as the train/validation filing-date bounds.
_hupd_load_options = {
    "name": "sample",
    "data_files": "https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    "icpr_label": None,
    "train_filing_start_date": "2016-01-01",
    "train_filing_end_date": "2016-01-21",
    "val_filing_start_date": "2016-01-22",
    "val_filing_end_date": "2016-01-31",
}
dataset_dict = load_dataset("HUPD/hupd", **_hupd_load_options)
# Build a small demo set from the validation split: keep only applications
# with a final ACCEPTED/REJECTED decision, draw five of each, and order the
# combined rows by patent number.
filtered_dataset = dataset_dict["validation"].filter(
    lambda e: e["decision"] in ("ACCEPTED", "REJECTED")
)
seed = 88


def _sample_by_decision(decision, count=5):
    """Return `count` seeded-shuffled rows of `filtered_dataset` with this decision."""
    matching = filtered_dataset.filter(lambda e: e["decision"] == decision)
    return matching.shuffle(seed).select(range(count))


accepted = _sample_by_decision("ACCEPTED")
rejected = _sample_by_decision("REJECTED")
dataset = combine.concatenate_datasets([accepted, rejected]).sort("patent_number")
# Create pipeline using model trained on Colab
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code. Only load checkpoints from a trusted source.
model = torch.load("patent_classifier_v4.pt", map_location=torch.device("cpu"))
# A pickled module keeps whatever train/eval flag it was saved with. Force
# evaluation mode so dropout is disabled and repeated scoring of the same
# text gives the same result.
model.eval()
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# Extra arguments passed to every tokenizer call: pad and truncate the input.
tokenizer_kwargs = {"padding": True, "truncation": True}
def load_data():
    """Copy the selected application's fields into Streamlit session state.

    Reads the chosen patent number from ``st.session_state.id``, looks up its
    row index in the module-level ``applications`` mapping, and stores that
    row's abstract, claims, title and decision under the session-state keys
    of the same names.
    """
    row_index = applications[st.session_state.id]
    selected_application = dataset.select([row_index])
    for field in ("abstract", "claims", "title", "decision"):
        st.session_state[field] = selected_application[field][0]
st.title("CS-GY-6613 Project Milestone 3")

# Map each patent number to its row index in `dataset`; the select box lists
# the keys. Rows that share a patent number collapse into a single entry
# pointing at the last such row.
applications = {
    example["patent_number"]: ds_index for ds_index, example in enumerate(dataset)
}
st.selectbox(
    "Select a sample patent application:",
    applications,
    on_change=load_data,
    key="id",
)
# Title and decision of the sample are shown for context only; they are not
# passed to the model.
_first_row = dataset[0]
st.text_input("Sample Title", key="title", value=_first_row["title"])
st.text_input("Sample Decision", key="decision", value=_first_row["decision"])
# Classifier input form: the user can edit the abstract and claims text and
# submit them to the model for a patentability score.
with st.form("Input Form"):
    abstract = st.text_area(
        "Abstract", key="abstract", value=dataset[0]["abstract"], height=200
    )
    # Pre-fill with the sample's claims (this field was previously seeded
    # with the abstract text by mistake).
    claims = st.text_area(
        "Claims", key="claims", value=dataset[0]["claims"], height=200
    )
    submitted = st.form_submit_button("Get Patentability Score")

    if submitted:
        # Abstract and claims are passed to the tokenizer as a text pair.
        tokens = tokenizer(abstract, claims, return_tensors="pt", **tokenizer_kwargs)
        with torch.no_grad():
            output = model(**tokens)
        logits = output.logits
        pred = torch.softmax(logits, dim=1)
        # Index 1 of the softmax output is the probability that decision = ACCEPTED.
        # .item() turns the 0-d tensor into a Python float so the message shows
        # a plain number instead of "tensor(...)".
        score = pred[0][1].item()
        st.markdown(
            "This application's patentability score is **{:.4f}**.".format(score)
        )