Spaces:

DrishtiSharma
/

docqa-with-deepseek-r1

Build error

App Files Community

DrishtiSharma committed on Feb 14

Commit

33dd4ca

verified ·

1 Parent(s): 97baa24

Update app.py

Browse files

Files changed (1) hide show

app.py +0 -97

app.py CHANGED Viewed

@@ -47,85 +47,6 @@ if "processed_chunks" not in st.session_state:
 if "vector_store" not in st.session_state:
     st.session_state.vector_store = None
-# ----------------- Text Cleaning Functions -----------------
-def clean_extracted_text(text):
-    """
-    Cleans extracted PDF text by removing excessive line breaks, fixing spacing issues, and resolving OCR artifacts.
-    """
-    text = re.sub(r'\n+', '\n', text)  # Remove excessive newlines
-    text = re.sub(r'\s{2,}', ' ', text)  # Remove extra spaces
-    text = re.sub(r'(\w)-\n(\w)', r'\1\2', text)  # Fix hyphenated words split by a newline
-    return text.strip()
-def extract_title_manually(text):
-    """
-    Attempts to find the title by checking the first few lines.
-    - Titles are usually long enough (more than 5 words).
-    - Ignores common header text like "Abstract", "Introduction".
-    """
-    lines = text.split("\n")
-    ignore_keywords = ["abstract", "introduction", "keywords", "contents", "table", "figure"]
-    for line in lines[:5]:  # Check only the first 5 lines
-        clean_line = line.strip()
-        if len(clean_line.split()) > 5 and not any(word.lower() in clean_line.lower() for word in ignore_keywords):
-            return clean_line  # Return first valid title
-    return "Unknown"
-# ----------------- Metadata Extraction -----------------
-def extract_metadata_llm(pdf_path):
-    """Extracts metadata using LLM for better accuracy."""
-    with pdfplumber.open(pdf_path) as pdf:
-        if not pdf.pages:
-            return {"Title": "Unknown", "Author": "Unknown", "Emails": "No emails found", "Affiliations": "No affiliations found"}
-        # Extract text from the first page
-        first_page_text = pdf.pages[0].extract_text()
-        if not first_page_text:
-            return {"Title": "Unknown", "Author": "Unknown", "Emails": "No emails found", "Affiliations": "No affiliations found"}
-        cleaned_text = first_page_text.strip()
-    # Define a structured prompt for the LLM
-    metadata_prompt = PromptTemplate(
-        input_variables=["text"],
-        template="""
-        Extract the following metadata from the research paper's first page:
-        - Title
-        - Authors (comma-separated)
-        - Emails (comma-separated)
-        - Affiliations
-        Ensure the output is in **valid JSON format** with keys: "Title", "Author", "Emails", "Affiliations".
-        Here is the text:
-        {text}
-        Provide the JSON output only, no extra text.
-        """
-    )
-    # Run the LLM Metadata Extraction
-    metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
-    try:
-        metadata_response = metadata_chain.invoke({"text": cleaned_text})
-        # Convert the LLM response into a dictionary
-        metadata_dict = json.loads(metadata_response["metadata"])
-    except Exception as e:
-        metadata_dict = {
-            "Title": "Unknown",
-            "Author": "Unknown",
-            "Emails": "No emails found",
-            "Affiliations": "No affiliations found"
-        }
-    return metadata_dict
 # ----------------- Step 1: Choose PDF Source -----------------
 pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
@@ -164,34 +85,16 @@ if not st.session_state.pdf_loaded and "pdf_path" in st.session_state:
     with st.spinner("🔄 Processing document... Please wait."):
         loader = PDFPlumberLoader(st.session_state.pdf_path)
         docs = loader.load()
-        st.json(docs[0].metadata)
-        # Extract metadata
-        metadata = extract_metadata_llm(st.session_state.pdf_path)
-        # Display extracted-metadata
-        if isinstance(metadata, dict):
-            st.subheader("📄 Extracted Document Metadata")
-            st.write(f"**Title:** {metadata.get('Title', 'Unknown')}")
-            st.write(f"**Author:** {metadata.get('Author', 'Unknown')}")
-            st.write(f"**Emails:** {metadata.get('Emails', 'No emails found')}")
-            st.write(f"**Affiliations:** {metadata.get('Affiliations', 'No affiliations found')}")
-        else:
-            st.error("Metadata extraction failed.")
         # Embedding Model
         model_name = "nomic-ai/modernbert-embed-base"
         embedding_model = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={"device": "cpu"}, encode_kwargs={'normalize_embeddings': False})
-        # Convert metadata into a retrievable chunk
-        metadata_doc = {"page_content": metadata, "metadata": {"source": "metadata"}}
         # Prevent unnecessary re-chunking
         if not st.session_state.chunked:
             text_splitter = SemanticChunker(embedding_model)
             document_chunks = text_splitter.split_documents(docs)
-            document_chunks.insert(0, metadata_doc)  # Insert metadata as a retrievable document
             st.session_state.processed_chunks = document_chunks
             st.session_state.chunked = True

 if "vector_store" not in st.session_state:
     st.session_state.vector_store = None
 # ----------------- Step 1: Choose PDF Source -----------------
 pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
     with st.spinner("🔄 Processing document... Please wait."):
         loader = PDFPlumberLoader(st.session_state.pdf_path)
         docs = loader.load()
         # Embedding Model
         model_name = "nomic-ai/modernbert-embed-base"
         embedding_model = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={"device": "cpu"}, encode_kwargs={'normalize_embeddings': False})
         # Prevent unnecessary re-chunking
         if not st.session_state.chunked:
             text_splitter = SemanticChunker(embedding_model)
             document_chunks = text_splitter.split_documents(docs)
             st.session_state.processed_chunks = document_chunks
             st.session_state.chunked = True