Spaces:
Running
Running
π improve formatting
Browse files
Signed-off-by: peter szemraj <[email protected]>
- pdf2text.py +8 -7
pdf2text.py
CHANGED
|
@@ -556,7 +556,7 @@ def postprocess(text: str) -> str:
|
|
| 556 |
return eval_and_replace(proc)
|
| 557 |
|
| 558 |
|
| 559 |
-
def result2text(result) -> str:
|
| 560 |
"""Convert OCR result to text"""
|
| 561 |
|
| 562 |
full_doc = []
|
|
@@ -570,8 +570,7 @@ def result2text(result) -> str:
|
|
| 570 |
text += word.value + " "
|
| 571 |
full_doc.append(text)
|
| 572 |
|
| 573 |
-
|
| 574 |
-
return full_text
|
| 575 |
|
| 576 |
|
| 577 |
import warnings
|
|
@@ -603,8 +602,10 @@ def convert_PDF_to_Text(
|
|
| 603 |
logging.info(f"running OCR on {len(doc)} pages")
|
| 604 |
result = ocr_model(doc)
|
| 605 |
raw_text = result2text(result)
|
| 606 |
-
proc_text = format_ocr_out(raw_text
|
| 607 |
-
|
|
|
|
|
|
|
| 608 |
|
| 609 |
fn_rt = time.perf_counter() - st
|
| 610 |
|
|
@@ -614,8 +615,8 @@ def convert_PDF_to_Text(
|
|
| 614 |
"num_pages": len(doc),
|
| 615 |
"runtime": round(fn_rt, 2),
|
| 616 |
"date": str(date.today()),
|
| 617 |
-
"converted_text":
|
| 618 |
-
"length": len(
|
| 619 |
}
|
| 620 |
|
| 621 |
return results_dict
|
|
|
|
| 556 |
return eval_and_replace(proc)
|
| 557 |
|
| 558 |
|
| 559 |
+
def result2text(result, as_text=False) -> str or list:
|
| 560 |
"""Convert OCR result to text"""
|
| 561 |
|
| 562 |
full_doc = []
|
|
|
|
| 570 |
text += word.value + " "
|
| 571 |
full_doc.append(text)
|
| 572 |
|
| 573 |
+
return "\n".join(full_doc) if as_text else full_doc
|
|
|
|
| 574 |
|
| 575 |
|
| 576 |
import warnings
|
|
|
|
| 602 |
logging.info(f"running OCR on {len(doc)} pages")
|
| 603 |
result = ocr_model(doc)
|
| 604 |
raw_text = result2text(result)
|
| 605 |
+
proc_text = [format_ocr_out(r) for r in raw_text]
|
| 606 |
+
fin_text = [postprocess(t) for t in proc_text]
|
| 607 |
+
|
| 608 |
+
ocr_results = "\n\n".join(fin_text)
|
| 609 |
|
| 610 |
fn_rt = time.perf_counter() - st
|
| 611 |
|
|
|
|
| 615 |
"num_pages": len(doc),
|
| 616 |
"runtime": round(fn_rt, 2),
|
| 617 |
"date": str(date.today()),
|
| 618 |
+
"converted_text": ocr_results,
|
| 619 |
+
"length": len(ocr_results),
|
| 620 |
}
|
| 621 |
|
| 622 |
return results_dict
|