Spaces:
Sleeping
Sleeping
| """from sumy.parsers.plaintext import PlaintextParser | |
| from sumy.nlp.tokenizers import Tokenizer | |
| from sumy.summarizers.lsa import LsaSummarizer | |
| from sumy.summarizers.lex_rank import LexRankSummarizer | |
| from sumy.summarizers.text_rank import TextRankSummarizer | |
| from pysummarization.nlpbase.auto_abstractor import AutoAbstractor | |
| from pysummarization.tokenizabledoc.simple_tokenizer import SimpleTokenizer | |
| from pysummarization.abstractabledoc.top_n_rank_abstractor import TopNRankAbstractor | |
| from sumy.nlp.stemmers import Stemmer | |
| from sumy.utils import get_stop_words""" | |
| import PyPDF2 | |
| from sumy.parsers.plaintext import PlaintextParser | |
| from sumy.nlp.tokenizers import Tokenizer | |
| from sumy.summarizers.text_rank import TextRankSummarizer | |
def summarize_pdf_with_textrank(pdf_path, sentences_count=10):
    """Summarize the text content of a PDF file using the TextRank algorithm.

    Args:
        pdf_path (str): Path to the PDF file.
        sentences_count (int): Number of sentences to keep in the summary.

    Returns:
        str: The summary, one sentence per line, or an error message when
        no text could be extracted from the PDF.
    """
    # Collect per-page texts in a list and join once, instead of repeated
    # string += in the loop (quadratic on large PDFs).
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # extract_text() may return None (e.g. image-only pages); use "".
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

    # Join with a newline so the last word of one page does not fuse with
    # the first word of the next, which would corrupt sentence tokenization.
    pdf_text = "\n".join(page_texts)

    # Bail out early when the PDF yielded no usable text.
    if not pdf_text.strip():
        return "Text extraction from PDF failed or PDF is empty."

    # Parse the plain text so sumy can split it into sentences.
    parser = PlaintextParser.from_string(pdf_text, Tokenizer("english"))

    # Rank sentences with TextRank and keep the top `sentences_count`.
    text_rank_summarizer = TextRankSummarizer()
    text_rank_summary = text_rank_summarizer(
        parser.document, sentences_count=sentences_count
    )

    # Compile the chosen sentences into a single newline-separated string.
    return "\n".join(str(sentence) for sentence in text_rank_summary)