Spaces:

OrganizedProgrammers
/

arXiv

Sleeping

Omar ID EL MOUMEN committed on Mar 26

Commit

a5f46a9

1 Parent(s): 26aea4d

Add text field to get formatted text

Files changed (1) hide show

app.py CHANGED Viewed

@@ -89,6 +89,7 @@ async def extract_text_pdf(id_doc: str):
         postprocess_text = remove_in_betweens(pdf_text)
         postprocess_text = remove_punctuations(postprocess_text)
         regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
         titles = doc.get_toc()
         main_titles = []
@@ -98,12 +99,12 @@ async def extract_text_pdf(id_doc: str):
             for title in titles:
                 if title[0] == 1:
                     main_titles.append(title[1])
-        return {"message": main_titles, "pub_id": id_doc, "error": False} if len(main_titles) > 0 else {"message": f"No titles, document of {doc.page_count} pages", "pub_id": id_doc, "error": False}
     else:
         print("ID: " + id_doc)
         print("URL: " + f"http://arxiv.org/pdf/{id_doc}")
         print("Status code: " + str(pdf_req.status_code))
-        return {"error": True, "message": "Error while downloading PDF: " + str(pdf_req.status_code)}
 @app.get("/extract/random/{keyword}/{limit}")
 async def extract_random_pdf(keyword: str, limit: int):

         postprocess_text = remove_in_betweens(pdf_text)
         postprocess_text = remove_punctuations(postprocess_text)
+        postprocess_text = re.sub(r"\ +", " ", postprocess_text)
         regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
         titles = doc.get_toc()
         main_titles = []
             for title in titles:
                 if title[0] == 1:
                     main_titles.append(title[1])
+        return {"pub_id": id_doc, "titles": main_titles, "text": postprocess_text, "error": False} if len(main_titles) > 0 else {"pub_id": id_doc, "titles": "No titles found !", "text": postprocess_text, "error": False}
     else:
         print("ID: " + id_doc)
         print("URL: " + f"http://arxiv.org/pdf/{id_doc}")
         print("Status code: " + str(pdf_req.status_code))
+        return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
 @app.get("/extract/random/{keyword}/{limit}")
 async def extract_random_pdf(keyword: str, limit: int):