Spaces:

ml6team
/

keyphrase-extraction

Runtime error

App Files Files Community

DeDeckerThomas committed on May 13, 2022

Commit

8339421

1 Parent(s): 55b038b

Fix last bugs with annotation system

Browse files

Files changed (2) hide show

app.py +64 -33
pipelines/__pycache__/keyphrase_generation_pipeline.cpython-39.pyc +0 -0

app.py CHANGED Viewed

@@ -7,30 +7,11 @@ import orjson
 from annotated_text.util import get_annotated_html
 from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode
 import re
 import numpy as np
-if "config" not in st.session_state:
-    with open("config.json", "r") as f:
-        content = f.read()
-    st.session_state.config = orjson.loads(content)
-    st.session_state.data_frame = pd.DataFrame(columns=["model"])
-    st.session_state.keyphrases = []
-st.set_page_config(
-    page_icon="🔑",
-    page_title="Keyphrase extraction/generation with Transformers",
-    layout="wide",
-)
-if "select_rows" not in st.session_state:
-    st.session_state.selected_rows = []
-st.header("🔑 Keyphrase extraction/generation with Transformers")
-col1, col2 = st.empty().columns(2)
-@st.cache(allow_output_mutation=True)
 def load_pipeline(chosen_model):
     if "keyphrase-extraction" in chosen_model:
         return KeyphraseExtractionPipeline(chosen_model)
@@ -67,18 +48,38 @@ def extract_keyphrases():
 def get_annotated_text(text, keyphrases):
     for keyphrase in keyphrases:
         text = re.sub(
-            f"({keyphrase})",
-            keyphrase.replace(" ", "$K"),
             text,
             flags=re.I,
         )
     result = []
     for i, word in enumerate(text.split(" ")):
-        if re.sub(r"[^\w\s]", "", word) in keyphrases:
-            result.append((word, "KEY", "#21c354"))
-        elif "$K" in word:
-            result.append((" ".join(word.split("$K")), "KEY", "#21c354"))
         else:
             if i == len(st.session_state.input_text.split(" ")) - 1:
                 result.append(f" {word}")
@@ -113,12 +114,39 @@ def rerender_output(layout):
             ],
         )
-    result = get_annotated_text(text, keyphrases)
     layout.markdown(
         get_annotated_html(*result),
         unsafe_allow_html=True,
     )
 chosen_model = col1.selectbox(
@@ -127,14 +155,17 @@ chosen_model = col1.selectbox(
 )
 st.session_state.chosen_model = chosen_model
-pipe = load_pipeline(
-    f"{st.session_state.config.get('model_author')}/{st.session_state.chosen_model}"
-)
 st.session_state.input_text = col1.text_area(
     "Input", st.session_state.config.get("example_text"), height=300
-)
-pressed = col1.button("Extract", on_click=extract_keyphrases)
 if len(st.session_state.data_frame.columns) > 0:

 from annotated_text.util import get_annotated_html
 from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode
 import re
+import string
 import numpy as np
+@st.cache(allow_output_mutation=True, show_spinner=False)
 def load_pipeline(chosen_model):
     if "keyphrase-extraction" in chosen_model:
         return KeyphraseExtractionPipeline(chosen_model)
 def get_annotated_text(text, keyphrases):
     for keyphrase in keyphrases:
         text = re.sub(
+            rf"({keyphrase})([^A-Za-z])",
+            rf"$K:{keyphrases.index(keyphrase)}\2",
             text,
             flags=re.I,
+            count=1
         )
     result = []
     for i, word in enumerate(text.split(" ")):
+        if "$K" in word and re.search(
+            "(\d+)$", word.translate(str.maketrans("", "", string.punctuation))
+        ):
+            result.append(
+                (
+                    re.sub(
+                        r"\$K:\d+",
+                        keyphrases[
+                            int(
+                                re.search(
+                                    "(\d+)$",
+                                    word.translate(
+                                        str.maketrans("", "", string.punctuation)
+                                    ),
+                                ).group(1)
+                            )
+                        ],
+                        word,
+                    ),
+                    "KEY",
+                    "#21c354",
+                )
+            )
         else:
             if i == len(st.session_state.input_text.split(" ")) - 1:
                 result.append(f" {word}")
             ],
         )
+    result = get_annotated_text(text, list(keyphrases))
     layout.markdown(
         get_annotated_html(*result),
         unsafe_allow_html=True,
     )
+    if "generation" in st.session_state.chosen_model:
+        abstractive_keyphrases = [
+            keyphrase
+            for keyphrase in keyphrases
+            if keyphrase.lower() not in text.lower()
+        ]
+        layout.write(", ".join(abstractive_keyphrases))
+if "config" not in st.session_state:
+    with open("config.json", "r") as f:
+        content = f.read()
+    st.session_state.config = orjson.loads(content)
+    st.session_state.data_frame = pd.DataFrame(columns=["model"])
+    st.session_state.keyphrases = []
+if "select_rows" not in st.session_state:
+    st.session_state.selected_rows = []
+st.set_page_config(
+    page_icon="🔑",
+    page_title="Keyphrase extraction/generation with Transformers",
+    layout="wide",
+)
+st.header("🔑 Keyphrase extraction/generation with Transformers")
+col1, col2 = st.columns(2)
 chosen_model = col1.selectbox(
 )
 st.session_state.chosen_model = chosen_model
+with st.spinner("Loading pipeline..."):
+    pipe = load_pipeline(
+        f"{st.session_state.config.get('model_author')}/{st.session_state.chosen_model}"
+    )
 st.session_state.input_text = col1.text_area(
     "Input", st.session_state.config.get("example_text"), height=300
+).replace("\n", " ")
+with st.spinner("Extracting keyphrases..."):
+    pressed = col1.button("Extract", on_click=extract_keyphrases)
 if len(st.session_state.data_frame.columns) > 0:

pipelines/__pycache__/keyphrase_generation_pipeline.cpython-39.pyc CHANGED Viewed

Binary files a/pipelines/__pycache__/keyphrase_generation_pipeline.cpython-39.pyc and b/pipelines/__pycache__/keyphrase_generation_pipeline.cpython-39.pyc differ