Paula Leonova
committed on
Commit · f60d1c6 · 1 Parent(s): 51fcc5c
Add keyBERT in order to generate top keywords
app.py
CHANGED
@@ -51,7 +51,7 @@ with st.form(key='my_form'):
 
 
 
-with st.spinner('Loading pretrained summarizer and classifier mnli model...'):
+with st.spinner('Loading pretrained models...'):
     start = time.time()
     summarizer = md.load_summary_model()
     s_time = round(time.time() - start,4)
@@ -60,13 +60,11 @@ with st.spinner('Loading pretrained summarizer and classifier mnli model...'):
     classifier = md.load_model()
     c_time = round(time.time() - start,4)
 
-
-
-
-    # start = time.time()
-    # classifier = md.load_model()
-    # st.success(f'Time taken to load classifier mnli model: {round(time.time() - start,4)} seconds')
+    start = time.time()
+    kw_model = md.load_keyword_model()
+    k_time = round(time.time() - start,4)
 
+st.success(f'Time taken to load BART summarizer mnli model: {s_time}s & BART classifier mnli model: {c_time}s & KeyBERT model: {k_time}s')
 
 if submit_button:
     if len(text_input) == 0:
@@ -80,22 +78,31 @@ if submit_button:
         for n in range(0, len(nested_sentences)):
             tc = " ".join(map(str, nested_sentences[n]))
             text_chunks.append(tc)
+
+        if gen_keywords == 'Yes':
+            st.markdown("### Top Keywords")
+            with st.spinner("Generating keywords from text..."):
+
+                kw_df = pd.DataFrame()
+                for text_chunk in text_chunks:
+                    keywords_list = md.keyword_gen(kw_model, text_chunk)
+                    kw_df = kw_df.append(pd.DataFrame(keywords_list))
+                kw_df.columns = ['keyword', 'score']
+                top_kw_df = kw_df.groupby('keyword')['score'].max().reset_index()
+
+                top_kw_df = top_kw_df.sort_values('score', ascending = False).reset_index().drop(['index'], axis=1)
+                st.dataframe(top_kw_df.head(10))
 
+        st.markdown("### Text Chunk & Summaries")
         with st.spinner('Generating summaries for text chunks...'):
 
-            my_expander = st.expander(label='Expand to see summary generation details')
+            my_expander = st.expander(label='Expand to see intermediate summary generation details')
             with my_expander:
                 summary = []
-
-
-
-
-
-                # # For each chunk of sentences (within the token max), generate a summary
-                # for n in range(0, len(nested_sentences)):
-                #     text_chunk = " ".join(map(str, nested_sentences[n]))
-                #     st.markdown(f"###### Original Text Chunk {n+1}/{len(nested_sentences)}" )
-                #     st.markdown(text_chunk)
+
+                st.markdown("_The original text is broken into chunks with complete sentences totaling \
+                fewer than 1024 tokens, a requirement for the summarizer. Each block of text is then summarized separately \
+                and then combined at the very end to generate the final summary._")
 
             for num_chunk, text_chunk in enumerate(text_chunks):
                 st.markdown(f"###### Original Text Chunk {num_chunk+1}/{len(text_chunks)}" )
@@ -108,15 +115,9 @@ if submit_button:
             # Combine all the summaries into a list and compress into one document, again
             final_summary = " \n\n".join(list(summary))
 
-            # final_summary = md.summarizer_gen(summarizer, sequence=text_input, maximum_tokens = 30, minimum_tokens = 100)
            st.markdown("### Combined Summary")
            st.markdown(final_summary)
 
-            # if gen_keywords == 'Yes':
-            #     st.markdown("### Top Keywords")
-            #     with st.spinner("Generating keywords from text...")
-            #         keywords =
-
     if len(text_input) == 0 or len(labels) == 0:
         st.write('Enter some text and at least one possible topic to see predictions.')
     else:
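One caveat in the new keyword block: it builds kw_df with DataFrame.append, which pandas deprecated in 1.4 and removed in 2.0. Below is a minimal sketch of the same per-chunk aggregation written with pd.concat instead; the chunk_keywords sample data is a hypothetical stand-in for what md.keyword_gen(kw_model, text_chunk) returns per chunk, assuming KeyBERT-style lists of (keyword, score) tuples.

import pandas as pd

# Hypothetical stand-in for one md.keyword_gen(kw_model, text_chunk)
# result per chunk: a list of (keyword, score) tuples.
chunk_keywords = [
    [('keywords', 0.62), ('summary', 0.41)],
    [('keywords', 0.55), ('classifier', 0.48)],
]

# Build one small frame per chunk, then concatenate once
kw_df = pd.concat(
    (pd.DataFrame(kws, columns=['keyword', 'score']) for kws in chunk_keywords),
    ignore_index=True)

# Keep each keyword's best score across chunks, highest first
top_kw_df = (kw_df.groupby('keyword')['score'].max()
                  .reset_index()
                  .sort_values('score', ascending=False)
                  .reset_index(drop=True))
print(top_kw_df.head(10))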
models.py
CHANGED
@@ -34,9 +34,9 @@ def create_nest_sentences(document:str, token_max_length = 1024):
 @st.cache(allow_output_mutation=True)
 def load_keyword_model():
     kw_model = KeyBERT()
-    return
+    return kw_model
 
-def keyword_gen(sequence:str):
+def keyword_gen(kw_model, sequence:str):
     keywords = kw_model.extract_keywords(sequence,
                                          keyphrase_ngram_range=(1, 1),
                                          stop_words='english',
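This change fixes two bugs at once: load_keyword_model previously returned None, and keyword_gen referenced kw_model without receiving it, which would raise a NameError at call time since kw_model was only a local inside load_keyword_model. A self-contained sketch of how the patched pair fits together outside Streamlit; the top_n value is an assumption, since the diff truncates before the remaining extract_keywords arguments.

from keybert import KeyBERT

def load_keyword_model():
    # In the app this is wrapped in @st.cache so the model loads once
    kw_model = KeyBERT()
    return kw_model

def keyword_gen(kw_model, sequence: str):
    # Returns a list of (keyword, score) tuples, highest-scoring first;
    # top_n=10 is an assumed value, not shown in the diff
    keywords = kw_model.extract_keywords(sequence,
                                         keyphrase_ngram_range=(1, 1),
                                         stop_words='english',
                                         top_n=10)
    return keywords

kw_model = load_keyword_model()
print(keyword_gen(kw_model, "KeyBERT extracts keywords using BERT embeddings."))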