Spaces:
Running
Running
workaround for spellcheck fail
Browse files- pdf2text.py +23 -20
pdf2text.py
CHANGED
|
@@ -213,26 +213,29 @@ def eval_and_replace(text: str, match_token: str = "- ") -> str:
|
|
| 213 |
str: text with replaced tokens
|
| 214 |
"""
|
| 215 |
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
|
|
|
|
|
|
|
|
|
| 236 |
|
| 237 |
|
| 238 |
def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
|
|
|
|
| 213 |
str: text with replaced tokens
|
| 214 |
"""
|
| 215 |
|
| 216 |
+
try:
|
| 217 |
+
if match_token not in text:
|
| 218 |
+
return text
|
| 219 |
+
else:
|
| 220 |
+
while True:
|
| 221 |
+
full_before_text = text.split(match_token, maxsplit=1)[0]
|
| 222 |
+
before_text = [
|
| 223 |
+
char for char in full_before_text.split()[-1] if char.isalpha()
|
| 224 |
+
]
|
| 225 |
+
before_text = "".join(before_text)
|
| 226 |
+
full_after_text = text.split(match_token, maxsplit=1)[-1]
|
| 227 |
+
after_text = [char for char in full_after_text.split()[0] if char.isalpha()]
|
| 228 |
+
after_text = "".join(after_text)
|
| 229 |
+
full_text = before_text + after_text
|
| 230 |
+
if check_word_spelling(full_text):
|
| 231 |
+
text = full_before_text + full_after_text
|
| 232 |
+
else:
|
| 233 |
+
text = full_before_text + " " + full_after_text
|
| 234 |
+
if match_token not in text:
|
| 235 |
+
break
|
| 236 |
+
except Exception as e:
|
| 237 |
+
logging.error(f"Error spell-checking OCR output, returning default text:\t{e}")
|
| 238 |
+
return text
|
| 239 |
|
| 240 |
|
| 241 |
def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
|