✨ enable new checkpoints
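
Swaps the fixed two-model setup for a MODEL_OPTIONS list of five checkpoints selectable from a dropdown, loading the chosen model per request via a new predict() helper; environment flags and logging config now run at import time.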
Signed-off-by: peter szemraj <[email protected]>

app.py CHANGED
@@ -1,33 +1,70 @@
-import os
 import contextlib
 import logging
+import os
 import random
 import re
 import time
 from pathlib import Path
 
+os.environ["USE_TORCH"] = "1"
+os.environ[
+    "TOKENIZERS_PARALLELISM"
+] = "false"  # parallelism on tokenizers is buggy with gradio
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+
 import gradio as gr
 import nltk
+import torch
 from cleantext import clean
 from doctr.io import DocumentFile
 from doctr.models import ocr_predictor
-from pdf2text import convert_PDF_to_Text
 
+from pdf2text import convert_PDF_to_Text
 from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
-from utils import load_example_filenames,
+from utils import load_example_filenames, saves_summary, truncate_word_count
 
 _here = Path(__file__).parent
 
 nltk.download("stopwords")  # TODO=find where this requirement originates from
 
-
-
-
+
+MODEL_OPTIONS = [
+    "pszemraj/long-t5-tglobal-base-16384-book-summary",
+    "pszemraj/long-t5-tglobal-base-sci-simplify",
+    "pszemraj/long-t5-tglobal-base-sci-simplify-elife",
+    "pszemraj/long-t5-tglobal-base-16384-booksci-summary-v1",
+    "pszemraj/pegasus-x-large-book-summary",
+]
+
+
+def predict(
+    input_text: str,
+    model_name: str,
+    token_batch_length: int = 1024,
+    empty_cache: bool = True,
+    **settings,
+):
+    """helper fn to support multiple models at once"""
+    if torch.cuda.is_available() and empty_cache:
+        torch.cuda.empty_cache()
+
+    model, tokenizer = load_model_and_tokenizer(model_name)
+    summaries = summarize_via_tokenbatches(
+        input_text,
+        model,
+        tokenizer,
+        batch_length=token_batch_length,
+        **settings,
+    )
+    return summaries
 
 
 def proc_submission(
     input_text: str,
-
+    model_name: str,
     num_beams,
     token_batch_length,
     length_penalty,
@@ -40,7 +77,7 @@ def proc_submission(
 
     Args:
         input_text (str): the input text to summarize
-
+        model_name (str): the hf model tag of the model to use
        num_beams (int): the number of beams to use
        token_batch_length (int): the length of the token batches to use
        length_penalty (float): the length penalty to use
@@ -66,7 +103,7 @@ def proc_submission(
     st = time.perf_counter()
     history = {}
     clean_text = clean(input_text, lower=False)
-    max_input_length = 2048 if "base" in
+    max_input_length = 2048 if "base" in model_name.lower() else max_input_length
     processed = truncate_word_count(clean_text, max_input_length)
 
     if processed["was_truncated"]:
@@ -100,14 +137,13 @@
 
         return msg, "", []
 
-    _summaries =
-        tr_in,
-
-
-        batch_length=token_batch_length,
+    _summaries = predict(
+        input_text=tr_in,
+        model_name=model_name,
+        token_batch_length=token_batch_length,
         **settings,
     )
-    sum_text = [f"Section {i}
+    sum_text = [f"Section {i}:\n\t" + s["summary"][0] for i, s in enumerate(_summaries)]
     sum_scores = [
         f" - Section {i}: {round(s['summary_score'],4)}"
         for i, s in enumerate(_summaries)
@@ -204,18 +240,6 @@ def load_uploaded_file(file_obj, max_pages=20):
 
 if __name__ == "__main__":
     logging.info("Starting app instance")
-    os.environ[
-        "TOKENIZERS_PARALLELISM"
-    ] = "false"  # parallelism on tokenizers is buggy with gradio
-    logging.info("Loading summ models")
-    with contextlib.redirect_stdout(None):
-        model, tokenizer = load_model_and_tokenizer(
-            "pszemraj/pegasus-x-large-book-summary"
-        )
-        model_sm, tokenizer_sm = load_model_and_tokenizer(
-            "pszemraj/long-t5-tglobal-base-16384-book-summary"
-        )
-
     logging.info("Loading OCR model")
     with contextlib.redirect_stdout(None):
         ocr_model = ocr_predictor(
@@ -229,24 +253,19 @@ if __name__ == "__main__":
     demo = gr.Blocks()
     _examples = list(name_to_path.keys())
     with demo:
-
         gr.Markdown("# Document Summarization with Long-Document Transformers")
         gr.Markdown(
             "This is an example use case for fine-tuned long document transformers. The model is trained on book summaries (via the BookSum dataset). The models in this demo are [LongT5-base](https://huggingface.co/pszemraj/long-t5-tglobal-base-16384-book-summary) and [Pegasus-X-Large](https://huggingface.co/pszemraj/pegasus-x-large-book-summary)."
         )
         with gr.Column():
-
             gr.Markdown("## Load Inputs & Select Parameters")
             gr.Markdown(
                 "Enter text below in the text area. The text will be summarized [using the selected parameters](https://huggingface.co/blog/how-to-generate). Optionally load an example below or upload a file. (`.txt` or `.pdf` - _[link to guide](https://i.imgur.com/c6Cs9ly.png)_)"
             )
             with gr.Row(variant="compact"):
                 with gr.Column(scale=0.5, variant="compact"):
-
-
-                    choices=["LongT5-base", "Pegasus-X-large"],
-                    label="Model Variant",
-                    value="LongT5-base",
+                    model_name = gr.Dropdown(
+                        choices=MODEL_OPTIONS, value=MODEL_OPTIONS[0], label="Model"
                     )
                     num_beams = gr.Radio(
                         choices=[2, 3, 4],
@@ -336,7 +355,7 @@ if __name__ == "__main__":
                 value=3,
             )
         with gr.Column():
-            gr.Markdown("### About
+            gr.Markdown("### About")
            gr.Markdown(
                "These models are fine-tuned on the [BookSum dataset](https://arxiv.org/abs/2105.08209).The goal was to create a model that can generalize well and is useful in summarizing lots of text in academic and daily usage."
            )
@@ -354,7 +373,7 @@ if __name__ == "__main__":
         fn=proc_submission,
         inputs=[
             input_text,
-
+            model_name,
             num_beams,
             token_batch_length,
             length_penalty,
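
For context, here is a minimal sketch of how the new code path can be exercised outside Gradio. It is illustrative only: it assumes app.py is importable as `app`, and that `num_beams` and `length_penalty` are generation settings forwarded through `**settings` (the values below are made up):

from app import MODEL_OPTIONS, predict

summaries = predict(
    input_text="Some long report or book chapter to condense...",
    model_name=MODEL_OPTIONS[0],  # the LongT5-base BookSum checkpoint
    token_batch_length=1024,      # default batch length in predict()
    num_beams=4,                  # example value, forwarded via **settings
    length_penalty=0.8,           # example value, forwarded via **settings
)
for i, s in enumerate(summaries):
    # each result exposes "summary" (a list of strings) and "summary_score",
    # matching how sum_text and sum_scores consume it in proc_submission
    print(f"Section {i}:\n\t{s['summary'][0]}")

Loading the selected checkpoint inside predict() trades the old always-resident two-model setup for a per-request load, with torch.cuda.empty_cache() called first to release GPU memory held by the previously used model.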