✨ enable new checkpoints
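
Swaps the fixed two-model setup for a MODEL_OPTIONS list of five checkpoints selectable from a dropdown, loading the chosen model per request via a new predict() helper; environment flags and logging config now run at import time.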
Signed-off-by: peter szemraj <[email protected]>

app.py CHANGED
@@ -1,33 +1,70 @@
-import os
 import contextlib
 import logging
+import os
 import random
 import re
 import time
 from pathlib import Path
 
+os.environ["USE_TORCH"] = "1"
+os.environ[
+    "TOKENIZERS_PARALLELISM"
+] = "false"  # parallelism on tokenizers is buggy with gradio
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+
 import gradio as gr
 import nltk
+import torch
 from cleantext import clean
 from doctr.io import DocumentFile
 from doctr.models import ocr_predictor
-from pdf2text import convert_PDF_to_Text
 
+from pdf2text import convert_PDF_to_Text
 from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
-from utils import load_example_filenames,
+from utils import load_example_filenames, saves_summary, truncate_word_count
 
 _here = Path(__file__).parent
 
 nltk.download("stopwords")  # TODO=find where this requirement originates from
 
-
-
-
+
+MODEL_OPTIONS = [
+    "pszemraj/long-t5-tglobal-base-16384-book-summary",
+    "pszemraj/long-t5-tglobal-base-sci-simplify",
+    "pszemraj/long-t5-tglobal-base-sci-simplify-elife",
+    "pszemraj/long-t5-tglobal-base-16384-booksci-summary-v1",
+    "pszemraj/pegasus-x-large-book-summary",
+]
+
+
+def predict(
+    input_text: str,
+    model_name: str,
+    token_batch_length: int = 1024,
+    empty_cache: bool = True,
+    **settings,
+):
+    """helper fn to support multiple models at once"""
+    if torch.cuda.is_available() and empty_cache:
+        torch.cuda.empty_cache()
+
+    model, tokenizer = load_model_and_tokenizer(model_name)
+    summaries = summarize_via_tokenbatches(
+        input_text,
+        model,
+        tokenizer,
+        batch_length=token_batch_length,
+        **settings,
+    )
+    return summaries
 
 
 def proc_submission(
     input_text: str,
-
+    model_name: str,
     num_beams,
     token_batch_length,
     length_penalty,
@@ -40,7 +77,7 @@ def proc_submission(
 
     Args:
         input_text (str): the input text to summarize
-
+        model_name (str): the hf model tag of the model to use
        num_beams (int): the number of beams to use
        token_batch_length (int): the length of the token batches to use
        length_penalty (float): the length penalty to use
@@ -66,7 +103,7 @@ def proc_submission(
     st = time.perf_counter()
     history = {}
     clean_text = clean(input_text, lower=False)
-    max_input_length = 2048 if "base" in
+    max_input_length = 2048 if "base" in model_name.lower() else max_input_length
     processed = truncate_word_count(clean_text, max_input_length)
 
     if processed["was_truncated"]:
@@ -100,14 +137,13 @@
 
         return msg, "", []
 
-    _summaries =
-        tr_in,
-
-
-        batch_length=token_batch_length,
+    _summaries = predict(
+        input_text=tr_in,
+        model_name=model_name,
+        token_batch_length=token_batch_length,
         **settings,
     )
-    sum_text = [f"Section {i}
+    sum_text = [f"Section {i}:\n\t" + s["summary"][0] for i, s in enumerate(_summaries)]
     sum_scores = [
         f" - Section {i}: {round(s['summary_score'],4)}"
         for i, s in enumerate(_summaries)
@@ -204,18 +240,6 @@ def load_uploaded_file(file_obj, max_pages=20):
 
 if __name__ == "__main__":
     logging.info("Starting app instance")
-    os.environ[
-        "TOKENIZERS_PARALLELISM"
-    ] = "false"  # parallelism on tokenizers is buggy with gradio
-    logging.info("Loading summ models")
-    with contextlib.redirect_stdout(None):
-        model, tokenizer = load_model_and_tokenizer(
-            "pszemraj/pegasus-x-large-book-summary"
-        )
-        model_sm, tokenizer_sm = load_model_and_tokenizer(
-            "pszemraj/long-t5-tglobal-base-16384-book-summary"
-        )
-
     logging.info("Loading OCR model")
     with contextlib.redirect_stdout(None):
         ocr_model = ocr_predictor(
@@ -229,24 +253,19 @@ if __name__ == "__main__":
     demo = gr.Blocks()
     _examples = list(name_to_path.keys())
     with demo:
-
         gr.Markdown("# Document Summarization with Long-Document Transformers")
         gr.Markdown(
             "This is an example use case for fine-tuned long document transformers. The model is trained on book summaries (via the BookSum dataset). The models in this demo are [LongT5-base](https://huggingface.co/pszemraj/long-t5-tglobal-base-16384-book-summary) and [Pegasus-X-Large](https://huggingface.co/pszemraj/pegasus-x-large-book-summary)."
         )
         with gr.Column():
-
             gr.Markdown("## Load Inputs & Select Parameters")
             gr.Markdown(
                 "Enter text below in the text area. The text will be summarized [using the selected parameters](https://huggingface.co/blog/how-to-generate). Optionally load an example below or upload a file. (`.txt` or `.pdf` - _[link to guide](https://i.imgur.com/c6Cs9ly.png)_)"
             )
             with gr.Row(variant="compact"):
                 with gr.Column(scale=0.5, variant="compact"):
-
-
-                    choices=["LongT5-base", "Pegasus-X-large"],
-                    label="Model Variant",
-                    value="LongT5-base",
+                    model_name = gr.Dropdown(
+                        choices=MODEL_OPTIONS, value=MODEL_OPTIONS[0], label="Model"
                     )
                     num_beams = gr.Radio(
                         choices=[2, 3, 4],
@@ -336,7 +355,7 @@ if __name__ == "__main__":
                 value=3,
             )
         with gr.Column():
-            gr.Markdown("### About
+            gr.Markdown("### About")
            gr.Markdown(
                "These models are fine-tuned on the [BookSum dataset](https://arxiv.org/abs/2105.08209).The goal was to create a model that can generalize well and is useful in summarizing lots of text in academic and daily usage."
            )
@@ -354,7 +373,7 @@ if __name__ == "__main__":
         fn=proc_submission,
         inputs=[
             input_text,
-
+            model_name,
             num_beams,
             token_batch_length,
             length_penalty,
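
For context, here is a minimal sketch of how the new code path can be exercised outside Gradio. It is illustrative only: it assumes app.py is importable as `app`, and that `num_beams` and `length_penalty` are generation settings forwarded through `**settings` (the values below are made up):

from app import MODEL_OPTIONS, predict

summaries = predict(
    input_text="Some long report or book chapter to condense...",
    model_name=MODEL_OPTIONS[0],  # the LongT5-base BookSum checkpoint
    token_batch_length=1024,      # default batch length in predict()
    num_beams=4,                  # example value, forwarded via **settings
    length_penalty=0.8,           # example value, forwarded via **settings
)
for i, s in enumerate(summaries):
    # each result exposes "summary" (a list of strings) and "summary_score",
    # matching how sum_text and sum_scores consume it in proc_submission
    print(f"Section {i}:\n\t{s['summary'][0]}")

Loading the selected checkpoint inside predict() trades the old always-resident two-model setup for a per-request load, with torch.cuda.empty_cache() called first to release GPU memory held by the previously used model.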