File size: 1,898 Bytes
1540d77
7d35509
1540d77
 
 
 
 
 
 
 
 
 
 
 
7d35509
2985613
1540d77
 
 
7d35509
 
 
1540d77
7d35509
 
 
1540d77
65c99df
b4b5aad
65c99df
 
 
 
 
 
b4b5aad
65c99df
1540d77
9c242be
 
7d35509
 
 
 
90c3309
7d35509
 
 
 
 
 
 
90c3309
7d35509
 
90c3309
9c242be
90c3309
65c99df
9c242be
65c99df
9c242be
 
65c99df
9c242be
 
65c99df
 
 
 
90c828a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
"""
File manager to help with uploaded PDF files.
"""
import logging

import streamlit as st
from pypdf import PdfReader


logger = logging.getLogger(__name__)


def get_pdf_contents(
        pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
        page_range: tuple[int, None] | tuple[int, int]
) -> str:
    """
    Extract the text contents from a range of pages of a PDF file.

    Page numbers are 1-based and the range is inclusive at both ends. Bounds
    that fall outside the document are clamped, so a start below 1 cannot wrap
    around to the last pages via negative indexing and an end past the last
    page cannot raise an `IndexError`.

    Args:
        pdf_file: The uploaded PDF file.
        page_range: A `(start, end)` tuple of 1-based page numbers. An `end`
            of `None` means that only the `start` page is extracted.

    Returns:
        The concatenated text of the selected pages; an empty string if the
        range selects no pages.
    """
    reader = PdfReader(pdf_file)
    start, end = page_range  # User-specified, 1-based, inclusive bounds

    if end is None:
        # A missing end means a single page: extract only `start`
        end = start

    # Convert to a 0-based, half-open index range and keep it inside the
    # document: a negative first index would silently address pages from the
    # end, and a stop index beyond the page count would fail on lookup.
    first_index = max(start - 1, 0)
    stop_index = min(end, len(reader.pages))

    # Join once instead of growing a string with repeated concatenation
    return ''.join(
        reader.pages[page_num].extract_text()
        for page_num in range(first_index, stop_index)
    )

def validate_page_range(
        pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
        start: int, end: int
) -> tuple[int, None] | tuple[int, int]:
    """
    Validate the page range for the uploaded PDF file. Adjusts start and end
    to be within the valid range of pages in the PDF.

    Both bounds are 1-based. For a PDF with at least one page, the returned
    start and end always satisfy `1 <= start <= end <= number of pages`.

    Args:
        pdf_file: The uploaded PDF file.
        start: The start page
        end: The end page

    Returns:
        The validated page range tuple. When the range covers a single page,
        the second element is `None`. For a PDF with no pages the result is
        `(1, 0)`, i.e., an empty range.
    """
    n_pages = len(PdfReader(pdf_file).pages)

    # Set start to max of 1 or specified start (whichever's higher)
    start = max(1, start)
    # Clamp end to at least 1 and at most the PDF length. The lower bound is
    # applied first so that a zero-page PDF still ends up with end = 0
    # (an empty range) instead of pointing at a page that does not exist.
    end = min(n_pages, max(1, end))

    if start > end:  # If the start is higher than the end, make it 1
        start = 1

    if start == end:
        # If start = end (including when PDF is 1 page long), set end to None
        return start, None

    return start, end