# Zero-shot topic classification demo (Hugging Face Spaces app).
# NOTE: page-scrape residue ("Spaces:" / "Build error" banners) removed from this header.
| import streamlit as st | |
| from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline | |
| import torch | |
| import numpy as np | |
| import contextlib | |
| import plotly.express as px | |
| import pandas as pd | |
| from PIL import Image | |
| import datetime | |
| import os | |
| import psutil | |
# Record one timestamped line in a local hit-counter log every time the
# app script (re)runs.
hit_time = datetime.datetime.now()
with open("hit_log.txt", mode='a') as file:
    file.write(f"{hit_time}\n")
# Maximum number of labels shown in the results bar chart.
MAX_GRAPH_ROWS = 10

# Sidebar description text for each selectable model (display name -> markdown).
MODEL_DESC = {
    'Bart MNLI': """Bart with a classification head trained on MNLI.\n\nSequences are posed as NLI premises and topic labels are turned into premises, i.e. `business` -> `This text is about business.`""",
    'Bart MNLI + Yahoo Answers': """Bart with a classification head trained on MNLI and then further fine-tuned on Yahoo Answers topic classification.\n\nSequences are posed as NLI premises and topic labels are turned into premises, i.e. `business` -> `This text is about business.`""",
    'XLM Roberta XNLI (cross-lingual)': """XLM Roberta, a cross-lingual model, with a classification head trained on XNLI. Supported languages include: _English, French, Spanish, German, Greek, Bulgarian, Russian, Turkish, Arabic, Vietnamese, Thai, Chinese, Hindi, Swahili, and Urdu_.
Note that this model seems to be less reliable than the English-only models when classifying longer sequences.
Examples were automatically translated and may contain grammatical mistakes.
Sequences are posed as NLI premises and topic labels are turned into premises, i.e. `business` -> `This text is about business.`""",
}

# Intro blurb shown at the top of the sidebar.
ZSL_DESC = """*Update 2024/05/02: This app demo'd the use of pre-trained NLI models for zero-shot topic classification right around the 2020 announcement of GPT-3. It was originally released as a standalone app on the Hugging Face servers before the introduction of HF spaces. It is kept here for posterity.*
Recently, the NLP science community has begun to pay increasing attention to zero-shot and few-shot applications, such as in the [paper from OpenAI](https://arxiv.org/abs/2005.14165) introducing GPT-3. This demo shows how 🤗 Transformers can be used for zero-shot topic classification, the task of predicting a topic that the model has not been trained on."""

# Markdown code-snippet template; the single '{}' is filled with the model id
# via .format(), and '{{}}' escapes braces that must survive into the snippet.
CODE_DESC = """```python
from transformers import pipeline
classifier = pipeline('zero-shot-classification',
                      model='{}')
hypothesis_template = 'This text is about {{}}.'  # the template used in this demo
classifier(sequence, labels,
           hypothesis_template=hypothesis_template,
           multi_class=multi_class)
# {{'sequence' ..., 'labels': ..., 'scores': ...}}
```"""

# Display name -> Hugging Face Hub model id.
model_ids = {
    'Bart MNLI': 'facebook/bart-large-mnli',
    'Bart MNLI + Yahoo Answers': 'joeddav/bart-large-mnli-yahoo-answers',
    'XLM Roberta XNLI (cross-lingual)': 'joeddav/xlm-roberta-large-xnli'
}
# First GPU if available, else CPU (-1 is the transformers pipeline
# convention for "run on CPU").
device = 0 if torch.cuda.is_available() else -1


def load_models():
    """Eagerly load every classification model, keyed by its Hub model id.

    Returns:
        dict mapping each model id in ``model_ids`` to its loaded
        ``AutoModelForSequenceClassification`` instance.

    NOTE(review): this loads all three large checkpoints at import time;
    presumably a Streamlit cache decorator wrapped this originally — confirm.
    """
    # Use 'model_id', not 'id', so the builtin id() is not shadowed.
    return {
        model_id: AutoModelForSequenceClassification.from_pretrained(model_id)
        for model_id in model_ids.values()
    }


# Module-level model registry shared by get_most_likely().
models = load_models()
def load_tokenizer(tok_id):
    """Load the tokenizer matching Hub model id *tok_id* (cache or download)."""
    return AutoTokenizer.from_pretrained(tok_id)
def get_most_likely(nli_model_id, sequence, labels, hypothesis_template, multi_class):
    """Classify *sequence* against *labels* with a zero-shot NLI pipeline.

    Returns a (labels, scores) pair as produced by the pipeline, ordered by
    descending confidence.
    """
    zsl = pipeline(
        'zero-shot-classification',
        model=models[nli_model_id],
        tokenizer=load_tokenizer(nli_model_id),
        device=device,
    )
    result = zsl(
        sequence,
        candidate_labels=labels,
        hypothesis_template=hypothesis_template,
        multi_label=multi_class,
    )
    return result['labels'], result['scores']
def load_examples(model_id):
    """Load example texts for *model_id* from its ``texts-<name>.json`` file.

    Args:
        model_id: Hub id such as ``'facebook/bart-large-mnli'``; only the part
            after the final ``/`` selects the JSON file.

    Returns:
        (names, mapping) where ``names`` lists example names plus a trailing
        ``'Custom'`` entry, and ``mapping`` maps each name to a
        ``(text, labels)`` tuple (``'Custom'`` maps to two empty strings).
    """
    model_id_stripped = model_id.split('/')[-1]
    df = pd.read_json(f'texts-{model_id_stripped}.json')
    names = df['name'].tolist()
    # Pair names with their (text, labels) tuples in a single pass instead of
    # per-index .iloc lookups inside range(len(...)).
    mapping = {
        name: (text, labels)
        for name, text, labels in zip(names, df['text'], df['labels'])
    }
    names.append('Custom')
    mapping['Custom'] = ('', '')
    return names, mapping
def plot_result(top_topics, scores):
    """Render a horizontal bar chart of label confidences (as percentages)."""
    topic_arr = np.array(top_topics)
    pct = np.array(scores) * 100
    fig = px.bar(
        x=pct,
        y=topic_arr,
        orientation='h',
        labels={'x': 'Confidence', 'y': 'Label'},
        text=pct,
        range_x=(0, 115),  # headroom past 100 so outside labels fit
        title='Top Predictions',
        color=np.linspace(0, 1, len(pct)),
        color_continuous_scale='GnBu',
    )
    # The color axis only encodes bar order — hide its scale.
    fig.update(layout_coloraxis_showscale=False)
    fig.update_traces(texttemplate='%{text:0.1f}%', textposition='outside')
    st.plotly_chart(fig)
def main():
    """Render the Streamlit UI: sidebar model picker, inputs, and predictions."""
    # Inject the app's custom stylesheet.
    with open("style.css") as f:
        st.markdown('<style>{}</style>'.format(f.read()), unsafe_allow_html=True)
    logo = Image.open('huggingface_logo.png')
    st.sidebar.image(logo, width=120)
    st.sidebar.markdown(ZSL_DESC)
    # Sidebar controls: model selection and optional code-snippet display.
    model_desc = st.sidebar.selectbox('Model', list(MODEL_DESC.keys()), 0)
    do_print_code = st.sidebar.checkbox('Show code snippet', False)
    st.sidebar.markdown('#### Model Description')
    st.sidebar.markdown(MODEL_DESC[model_desc])
    st.sidebar.markdown('Originally proposed by [Yin et al. (2019)](https://arxiv.org/abs/1909.00161). Read more in our [blog post](https://joeddav.github.io/blog/2020/05/29/ZSL.html).')
    model_id = model_ids[model_desc]
    ex_names, ex_map = load_examples(model_id)
    st.title('Zero Shot Topic Classification')
    example = st.selectbox('Choose an example', ex_names)
    # Scale the text-area height with the example's word count, capped at 200.
    height = min((len(ex_map[example][0].split()) + 1) * 2, 200)
    sequence = st.text_area('Text', ex_map[example][0], key='sequence', height=height)
    labels = st.text_input('Possible topics (separated by `,`)', ex_map[example][1], max_chars=1000)
    multi_class = st.checkbox('Allow multiple correct topics', value=True)
    hypothesis_template = "This text is about {}."
    # Trim and de-duplicate the comma-separated label list, dropping empties.
    labels = list(set([x.strip() for x in labels.strip().split(',') if len(x.strip()) > 0]))
    if len(labels) == 0 or len(sequence) == 0:
        st.write('Enter some text and at least one possible topic to see predictions.')
        return
    if do_print_code:
        st.markdown(CODE_DESC.format(model_id))
    with st.spinner('Classifying...'):
        top_topics, scores = get_most_likely(model_id, sequence, labels, hypothesis_template, multi_class)
    # Keep only the MAX_GRAPH_ROWS highest-confidence labels; the slices put
    # them in ascending order so the horizontal bar chart reads top-down.
    plot_result(top_topics[::-1][-MAX_GRAPH_ROWS:], scores[::-1][-MAX_GRAPH_ROWS:])


if __name__ == '__main__':
    main()