Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -49,43 +49,32 @@ if "vector_store" not in st.session_state:
|
|
| 49 |
st.session_state.vector_store = None
|
| 50 |
|
| 51 |
# ----------------- Improved Metadata Extraction -----------------
|
| 52 |
-
def
|
| 53 |
-
"""Extracts
|
| 54 |
with pdfplumber.open(pdf_path) as pdf:
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
title
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
author = author if author else "Unknown Author"
|
| 79 |
-
|
| 80 |
-
# Extract emails
|
| 81 |
-
emails = set(re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", first_page_text))
|
| 82 |
-
email_str = ", ".join(emails) if emails else "No emails found"
|
| 83 |
-
|
| 84 |
-
# Extract affiliations
|
| 85 |
-
affiliations = set(re.findall(r"(?:Department|Faculty|Institute|University|College|School)\s+[\w\s]+", first_page_text))
|
| 86 |
-
affiliation_str = ", ".join(affiliations) if affiliations else "No affiliations found"
|
| 87 |
-
|
| 88 |
-
return title, author, email_str, affiliation_str
|
| 89 |
|
| 90 |
# ----------------- Step 1: Choose PDF Source -----------------
|
| 91 |
pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
|
|
|
|
| 49 |
st.session_state.vector_store = None
|
| 50 |
|
| 51 |
# ----------------- Improved Metadata Extraction -----------------
|
| 52 |
+
def extract_metadata_llm(pdf_path):
|
| 53 |
+
"""Extracts metadata using LLM instead of regex."""
|
| 54 |
with pdfplumber.open(pdf_path) as pdf:
|
| 55 |
+
first_page_text = pdf.pages[0].extract_text() if pdf.pages else "No text found."
|
| 56 |
+
|
| 57 |
+
# LLM prompt for extracting metadata
|
| 58 |
+
metadata_prompt = PromptTemplate(
|
| 59 |
+
input_variables=["text"],
|
| 60 |
+
template="""
|
| 61 |
+
Given the following first page of a research paper, extract:
|
| 62 |
+
- The title of the paper
|
| 63 |
+
- The authors' names
|
| 64 |
+
- Any email addresses present
|
| 65 |
+
- The affiliations of the authors
|
| 66 |
+
|
| 67 |
+
Ensure accurate extraction.
|
| 68 |
+
|
| 69 |
+
First page content:
|
| 70 |
+
{text}
|
| 71 |
+
"""
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
+
metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
|
| 75 |
+
metadata_response = metadata_chain.invoke({"text": first_page_text})
|
| 76 |
+
|
| 77 |
+
return metadata_response["metadata"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
# ----------------- Step 1: Choose PDF Source -----------------
|
| 80 |
pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
|