Spaces:
Runtime error
Runtime error
| import re | |
| import string | |
| import orjson | |
| import streamlit as st | |
| from annotated_text.util import get_annotated_html | |
| from pipelines.keyphrase_extraction_pipeline import KeyphraseExtractionPipeline | |
| from pipelines.keyphrase_generation_pipeline import KeyphraseGenerationPipeline | |
def load_pipeline(chosen_model):
    """Instantiate the pipeline that matches the task named in ``chosen_model``.

    Args:
        chosen_model: Model identifier. It must contain either
            ``"keyphrase-extraction"`` or ``"keyphrase-generation"``.

    Returns:
        A ``KeyphraseExtractionPipeline`` or a ``KeyphraseGenerationPipeline``
        built from ``chosen_model``. The generation pipeline is created with
        ``truncation=True``.

    Raises:
        ValueError: If ``chosen_model`` names neither task.
    """
    if "keyphrase-extraction" in chosen_model:
        return KeyphraseExtractionPipeline(chosen_model)
    if "keyphrase-generation" in chosen_model:
        return KeyphraseGenerationPipeline(chosen_model, truncation=True)
    # Fail here rather than silently returning None, which would only show up
    # later as "'NoneType' object is not callable" when the pipeline is used.
    raise ValueError(
        f"Unsupported model {chosen_model!r}: expected a name containing "
        "'keyphrase-extraction' or 'keyphrase-generation'."
    )
def generate_run_id():
    """Build the history key for the run currently being recorded.

    The key has the form ``run_<model>_<n>``, where ``<model>`` is the chosen
    model name with its ``keyphrase-extraction-`` / ``keyphrase-generation-``
    prefix removed and ``<n>`` is the current run counter. Both values are
    read from ``st.session_state``.
    """
    state = st.session_state
    short_model_name = re.sub(
        "keyphrase-extraction-|keyphrase-generation-", "", state.chosen_model
    )
    run_number = state.current_run_id
    return f"run_{short_model_name}_{run_number}"
def extract_keyphrases():
    """Run the loaded pipeline on the current input and record the run.

    Reads the input text from ``st.session_state``, stores the pipeline output
    in ``st.session_state.keyphrases``, appends a record of the run to
    ``st.session_state.history`` and finally increments the run counter.
    Relies on the module-level ``pipe`` having been assigned beforehand.
    """
    state = st.session_state
    state.keyphrases = pipe(state.input_text)

    # Snapshot everything needed to re-render this run later on.
    record = {
        "run_id": state.current_run_id,
        "model": state.chosen_model,
        "text": state.input_text,
        "keyphrases": state.keyphrases,
    }
    state.history[generate_run_id()] = record

    # Bump the counter only after the key above has been generated from it.
    state.current_run_id += 1
def get_annotated_text(text, keyphrases, color="#d294ff"):
    """Split ``text`` into the pieces that are passed on to ``get_annotated_html``.

    Every occurrence of a keyphrase (case-insensitive, and not directly
    followed by an ASCII letter or digit) is first replaced by a
    ``$K:<index>`` placeholder, so that multi-word keyphrases survive the
    split on spaces. Each space-separated token is then emitted either as an
    annotation tuple or as a space-padded plain string.

    Args:
        text: The text to annotate.
        keyphrases: Sequence of keyphrase strings to highlight. They are
            treated as literal text, not as regular expressions.
        color: Colour stored in every annotation tuple.

    Returns:
        A list with one item per token: ``(token, "KEY", color)`` for tokens
        containing a keyphrase (the placeholder is replaced by the keyphrase
        as spelled in ``keyphrases``; attached punctuation is kept), and a
        plain string otherwise. Plain strings are padded as ``" word "``,
        except the last token (``" word"``) and the first one (``"word "``).
    """
    for index, keyphrase in enumerate(keyphrases):
        if not keyphrase:
            # An empty pattern would match at every position of the text.
            continue
        text = re.sub(
            # Escape the keyphrase so characters such as "+" or "(" are taken
            # literally. The lookahead (instead of a consumed character) also
            # lets a keyphrase match at the very end of the text.
            rf"{re.escape(keyphrase)}(?![A-Za-z0-9])",
            f"$K:{index}",
            text,
            flags=re.I,
        )

    placeholder = re.compile(r"\$K:(\d+)")

    def restore(match):
        """Map a placeholder back to the keyphrase it stands for."""
        index = int(match.group(1))
        # A literal "$K:<n>" typed by the user may point past the list;
        # leave it untouched instead of raising IndexError.
        return keyphrases[index] if index < len(keyphrases) else match.group(0)

    tokens = text.split(" ")
    # Measured on the substituted text itself, not on any global state, so the
    # padding is right for every text this function is called with.
    last = len(tokens) - 1

    result = []
    for i, token in enumerate(tokens):
        restored = placeholder.sub(restore, token)
        if restored != token:
            result.append((restored, "KEY", color))
        elif i == last:
            result.append(f" {token}")
        elif i == 0:
            result.append(f"{token} ")
        else:
            result.append(f" {token} ")
    return result
def render_output(layout, runs, reverse=False):
    """Render every run in ``runs`` onto ``layout``.

    For each run this writes the run number and model, the input text with
    its keyphrases highlighted and, for models whose name contains
    ``"generation"``, the keyphrases that do not occur in the text. A
    horizontal rule closes each run.

    Args:
        layout: Object exposing a ``markdown(body, unsafe_allow_html=...)``
            method.
        runs: Mapping whose values are run records with the keys
            ``"run_id"``, ``"model"``, ``"text"`` and ``"keyphrases"``.
        reverse: When true, render the runs in reverse mapping order.
    """
    ordered_runs = list(runs.values())
    if reverse:
        ordered_runs.reverse()

    for run in ordered_runs:
        layout.markdown(
            f"""
            <p style=\"margin-bottom: 0rem\"><strong>Run:</strong> {run.get('run_id')}</p>
            <p style=\"margin-bottom: 0rem\"><strong>Model:</strong> {run.get('model')}</p>
            """,
            unsafe_allow_html=True,
        )

        result = get_annotated_text(run.get("text"), list(run.get("keyphrases")))
        layout.markdown(
            f"""
            <p style="margin-bottom: 0.5rem"><strong>Text:</strong></p>
            {get_annotated_html(*result)}
            """,
            unsafe_allow_html=True,
        )

        if "generation" in run.get("model"):
            # Lower-case the text once instead of once per keyphrase.
            lowered_text = run.get("text").lower()
            absent = [
                (keyphrase, "KEY", "#FFA500")
                for keyphrase in run.get("keyphrases")
                if keyphrase.lower() not in lowered_text
            ]

            # Put exactly one " " between neighbouring annotations. Building a
            # new list avoids inserting into the list being iterated, which
            # skipped separators once there were four or more keyphrases.
            abstractive_keyphrases = []
            for annotation in absent:
                if abstractive_keyphrases:
                    abstractive_keyphrases.append(" ")
                abstractive_keyphrases.append(annotation)

            layout.markdown(
                f"<p style=\"margin: 1rem 0 0 0\"><strong>Absent keyphrases:</strong> {get_annotated_html(*abstractive_keyphrases) if abstractive_keyphrases else 'None' }</p>",
                unsafe_allow_html=True,
            )

        layout.markdown("---")
# First script run of a session: load the app configuration from config.json
# and create the empty run history, keyphrase list and run counter.
if "config" not in st.session_state:
    with open("config.json", "r") as config_file:
        st.session_state.config = orjson.loads(config_file.read())
    st.session_state.history = {}
    st.session_state.keyphrases = []
    st.session_state.current_run_id = 1
# Page chrome: browser tab settings, the custom stylesheet and the header.
# NOTE(review): the icon glyph here and in the header looks mis-encoded
# (probably an emoji in the original file) - confirm before shipping.
st.set_page_config(
    layout="centered",
    page_title="Keyphrase extraction/generation with Transformers",
    page_icon="π",
)

# Inline the stylesheet into the page through a <style> element.
with open("css/style.css") as stylesheet:
    custom_css = stylesheet.read()
st.markdown(f"<style>{custom_css}</style>", unsafe_allow_html=True)

st.header("π Keyphrase extraction/generation with Transformers")
# Introductory text shown below the header.
# NOTE(review): a few glyphs in this text look mis-encoded (probably emoji in
# the original file); they are kept exactly as found - confirm and restore.
description = """
Keyphrase extraction is a technique in text analysis where you extract the important keyphrases from a document.
Thanks to these keyphrases humans can understand the content of a text very quickly and easily without reading
it completely. Keyphrase extraction was first done primarily by human annotators, who read the text in detail
and then wrote down the most important keyphrases. The disadvantage is that if you work with a lot of documents,
this process can take a lot of time β³.
Here is where Artificial Intelligence π€ comes in. Currently, classical machine learning methods, that use statistical
and linguistic features, are widely used for the extraction process. Now with deep learning, it is possible to capture
the semantic meaning of a text even better than these classical methods. Classical methods look at the frequency,
occurrence and order of words in the text, whereas these neural approaches can capture long-term semantic dependencies
and context of words in a text.
This space gives you the ability to extract keyphrases out of a custom text with transformer-based extraction and generation models.
Keyphrase extraction models are transformer models fine-tuned as a token classification problem where each word in the document
is classified as being part of a keyphrase or not.
The labels used during fine-tuning are B (Beginning of a keyphrase), I (Inside a keyphrase),
and O (Outside a keyphrase).
While keyphrase extraction uses encoder-only models to interpret the document, keyphrase generation models
work a bit differently. Here you use an encoder-decoder model (e.g. BART, T5) to generate keyphrases from a given text.
These models also have the ability to generate keyphrases, which are not present in the text π€―.
This can be really interesting in certain applications. For example if you want to make a news article more discoverable.
Try it out yourself! π
"""
st.write(description)
# Input form: model choice, a link to the model card, the text to analyse and
# the submit button.
with st.form("keyphrase-extraction-form"):
    model_options = st.session_state.config.get("models")
    st.session_state.chosen_model = st.selectbox("Choose your model:", model_options)

    model_author = st.session_state.get('config').get('model_author')
    st.markdown(
        f"For more information about the chosen model, please be sure to check out the [π€ Model Card](https://huggingface.co/{model_author}/{st.session_state.chosen_model})."
    )

    raw_text = st.text_area(
        "β Input",
        st.session_state.config.get("example_text"),
        height=350,
        max_chars=2500,
    )
    # Flatten the input to a single line and drop surrounding whitespace.
    st.session_state.input_text = raw_text.replace("\n", " ").strip()

    with st.spinner("Extracting keyphrases..."):
        _, button_container = st.columns([7, 1])
        pressed = button_container.form_submit_button("Extract")

# An empty input is reported whether or not the button was pressed; otherwise
# a press loads the chosen pipeline and runs it on the input.
if st.session_state.input_text == "":
    st.error("The text input is empty π Please provide a text in the input field.")
elif pressed:
    with st.spinner("Loading pipeline..."):
        # `pipe` is a module-level name read by extract_keyphrases().
        pipe = load_pipeline(
            f"{st.session_state.config.get('model_author')}/{st.session_state.chosen_model}"
        )
    with st.spinner("Extracting keyphrases"):
        extract_keyphrases()
def _format_run_label(run_id):
    """Turn a ``run_<model>_<n>`` history key into a ``Run <n>: <model>`` label."""
    # Cut the "run" prefix from the left and the counter from the right, so
    # underscores inside the model name stay part of the model name.
    model_and_number = run_id.split("_", 1)[1]
    model, number = model_and_number.rsplit("_", 1)
    return f"Run {number}: {model}"


# Show past runs: the ones picked in the multiselect, or all of them in
# reverse order when nothing is picked.
if st.session_state.history:
    options = st.multiselect(
        "Specify the runs you want to see",
        st.session_state.history.keys(),
        format_func=_format_run_label,
    )
    if options:
        render_output(
            st,
            {key: st.session_state.history[key] for key in options},
        )
    else:
        render_output(st, st.session_state.history, reverse=True)