Spaces:
Running
Running
File size: 1,898 Bytes
1540d77 7d35509 1540d77 7d35509 2985613 1540d77 7d35509 1540d77 7d35509 1540d77 65c99df b4b5aad 65c99df b4b5aad 65c99df 1540d77 9c242be 7d35509 90c3309 7d35509 90c3309 7d35509 90c3309 9c242be 90c3309 65c99df 9c242be 65c99df 9c242be 65c99df 9c242be 65c99df 90c828a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
"""
File manager to help with uploaded PDF files.
"""
import logging
import streamlit as st
from pypdf import PdfReader
logger = logging.getLogger(__name__)
def get_pdf_contents(
        pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
        page_range: tuple[int, None] | tuple[int, int]
) -> str:
    """
    Extract the text contents from a PDF file.

    Args:
        pdf_file: The uploaded PDF file.
        page_range: The ``(start, end)`` pages to extract, 1-based and
            inclusive. An ``end`` of ``None`` selects only the ``start`` page.

    Returns:
        The text of the selected pages, concatenated in page order. An empty
        string is returned when the range selects no existing page.
    """
    reader = PdfReader(pdf_file)
    start, end = page_range  # User-specified, 1-based page numbers

    if end is None:
        # A missing end means "just the start page"
        end = start

    pages = reader.pages

    # Clamp to the pages that actually exist. Without this, a start below 1
    # becomes a negative index (silently reading from the END of the
    # document) and an end past the last page raises IndexError.
    first_index = max(start, 1) - 1  # 1-based page number -> 0-based index
    stop_index = min(end, len(pages))  # Inclusive end == exclusive stop index

    # Join once instead of growing a string with += on every page
    return ''.join(
        pages[page_num].extract_text()
        for page_num in range(first_index, stop_index)
    )
def validate_page_range(
        pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
        start: int, end: int
) -> tuple[int, None] | tuple[int, int]:
    """
    Validate the page range for the uploaded PDF file. Adjusts start and end
    to be within the valid range of pages in the PDF.

    Args:
        pdf_file: The uploaded PDF file.
        start: The start page (1-based).
        end: The end page (1-based, inclusive).

    Returns:
        The validated page range tuple. ``(start, None)`` is returned when the
        range collapses to a single page; otherwise ``(start, end)``. For a
        PDF with no pages, ``(1, 0)`` is returned, i.e. an empty span whose
        end lies before its start.
    """
    n_pages = len(PdfReader(pdf_file).pages)

    if n_pages == 0:
        # No page can be selected at all; hand back an explicitly empty span
        # rather than pretending page 1 exists.
        return 1, 0

    # Start can never be lower than the first page
    start = max(1, start)

    # End must lie within [1, n_pages]. Clamping only from above would let an
    # end of 0 or a negative number through, yielding an out-of-range result.
    end = max(1, min(n_pages, end))

    if start > end:
        # An inverted range (or a start beyond the last page) falls back to
        # starting at the first page
        start = 1

    if start == end:
        # Single-page selection (including one-page PDFs): signal with None
        return start, None

    return start, end
|