Spaces:
Sleeping
Sleeping
Omar ID EL MOUMEN
committed on
Commit
·
a5f46a9
1
Parent(s):
26aea4d
Add text field to get formatted text
Browse files
app.py
CHANGED
|
@@ -89,6 +89,7 @@ async def extract_text_pdf(id_doc: str):
|
|
| 89 |
|
| 90 |
postprocess_text = remove_in_betweens(pdf_text)
|
| 91 |
postprocess_text = remove_punctuations(postprocess_text)
|
|
|
|
| 92 |
regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
|
| 93 |
titles = doc.get_toc()
|
| 94 |
main_titles = []
|
|
@@ -98,12 +99,12 @@ async def extract_text_pdf(id_doc: str):
|
|
| 98 |
for title in titles:
|
| 99 |
if title[0] == 1:
|
| 100 |
main_titles.append(title[1])
|
| 101 |
-
return {"
|
| 102 |
else:
|
| 103 |
print("ID: " + id_doc)
|
| 104 |
print("URL: " + f"http://arxiv.org/pdf/{id_doc}")
|
| 105 |
print("Status code: " + str(pdf_req.status_code))
|
| 106 |
-
return {"error": True, "message": "Error while downloading PDF: " + str(pdf_req.status_code)}
|
| 107 |
|
| 108 |
@app.get("/extract/random/{keyword}/{limit}")
|
| 109 |
async def extract_random_pdf(keyword: str, limit: int):
|
|
|
|
| 89 |
|
| 90 |
postprocess_text = remove_in_betweens(pdf_text)
|
| 91 |
postprocess_text = remove_punctuations(postprocess_text)
|
| 92 |
+
postprocess_text = re.sub(r"\ +", " ", postprocess_text)
|
| 93 |
regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
|
| 94 |
titles = doc.get_toc()
|
| 95 |
main_titles = []
|
|
|
|
| 99 |
for title in titles:
|
| 100 |
if title[0] == 1:
|
| 101 |
main_titles.append(title[1])
|
| 102 |
+
return {"pub_id": id_doc, "titles": main_titles, "text": postprocess_text, "error": False} if len(main_titles) > 0 else {"pub_id": id_doc, "titles": "No titles found !", "text": postprocess_text, "error": False}
|
| 103 |
else:
|
| 104 |
print("ID: " + id_doc)
|
| 105 |
print("URL: " + f"http://arxiv.org/pdf/{id_doc}")
|
| 106 |
print("Status code: " + str(pdf_req.status_code))
|
| 107 |
+
return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
|
| 108 |
|
| 109 |
@app.get("/extract/random/{keyword}/{limit}")
|
| 110 |
async def extract_random_pdf(keyword: str, limit: int):
|