Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
π update logs
Browse files
Signed-off-by: peter szemraj <[email protected]>
app.py
CHANGED
|
@@ -19,7 +19,8 @@ os.environ[
|
|
| 19 |
] = "false" # parallelism on tokenizers is buggy with gradio
|
| 20 |
|
| 21 |
logging.basicConfig(
|
| 22 |
-
level=logging.INFO,
|
|
|
|
| 23 |
)
|
| 24 |
|
| 25 |
import gradio as gr
|
|
@@ -232,18 +233,20 @@ def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> st
|
|
| 232 |
:param bool lower: whether to lowercase the text
|
| 233 |
:return str: the text of the file
|
| 234 |
"""
|
|
|
|
|
|
|
| 235 |
# check if mysterious file object is a list
|
| 236 |
if isinstance(file_obj, list):
|
| 237 |
file_obj = file_obj[0]
|
| 238 |
file_path = Path(file_obj.name)
|
| 239 |
try:
|
| 240 |
-
|
| 241 |
if file_path.suffix == ".txt":
|
| 242 |
with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
|
| 243 |
raw_text = f.read()
|
| 244 |
text = clean(raw_text, lower=lower)
|
| 245 |
elif file_path.suffix == ".pdf":
|
| 246 |
-
|
| 247 |
conversion_stats = convert_PDF_to_Text(
|
| 248 |
file_path,
|
| 249 |
ocr_model=ocr_model,
|
|
@@ -251,18 +254,19 @@ def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> st
|
|
| 251 |
)
|
| 252 |
text = conversion_stats["converted_text"]
|
| 253 |
else:
|
| 254 |
-
|
| 255 |
text = "ERROR - check file - unknown file type"
|
| 256 |
|
| 257 |
return text
|
| 258 |
except Exception as e:
|
| 259 |
-
|
| 260 |
return "Error: Could not read file. Ensure that it is a valid text file with encoding UTF-8 if text, and a PDF if PDF."
|
| 261 |
|
| 262 |
|
| 263 |
if __name__ == "__main__":
|
| 264 |
-
logging.
|
| 265 |
-
|
|
|
|
| 266 |
with contextlib.redirect_stdout(None):
|
| 267 |
ocr_model = ocr_predictor(
|
| 268 |
"db_resnet50",
|
|
@@ -271,7 +275,7 @@ if __name__ == "__main__":
|
|
| 271 |
assume_straight_pages=True,
|
| 272 |
)
|
| 273 |
name_to_path = load_example_filenames(_here / "examples")
|
| 274 |
-
|
| 275 |
demo = gr.Blocks()
|
| 276 |
_examples = list(name_to_path.keys())
|
| 277 |
with demo:
|
|
@@ -355,7 +359,7 @@ if __name__ == "__main__":
|
|
| 355 |
minimum=0.5,
|
| 356 |
maximum=1.0,
|
| 357 |
label="length penalty",
|
| 358 |
-
|
| 359 |
step=0.05,
|
| 360 |
)
|
| 361 |
token_batch_length = gr.Radio(
|
|
@@ -369,7 +373,7 @@ if __name__ == "__main__":
|
|
| 369 |
minimum=1.0,
|
| 370 |
maximum=5.0,
|
| 371 |
label="repetition penalty",
|
| 372 |
-
|
| 373 |
step=0.1,
|
| 374 |
)
|
| 375 |
no_repeat_ngram_size = gr.Radio(
|
|
|
|
| 19 |
] = "false" # parallelism on tokenizers is buggy with gradio
|
| 20 |
|
| 21 |
logging.basicConfig(
|
| 22 |
+
level=logging.INFO,
|
| 23 |
+
format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
|
| 24 |
)
|
| 25 |
|
| 26 |
import gradio as gr
|
|
|
|
| 233 |
:param bool lower: whether to lowercase the text
|
| 234 |
:return str: the text of the file
|
| 235 |
"""
|
| 236 |
+
|
| 237 |
+
logger = logging.getLogger(__name__)
|
| 238 |
# check if mysterious file object is a list
|
| 239 |
if isinstance(file_obj, list):
|
| 240 |
file_obj = file_obj[0]
|
| 241 |
file_path = Path(file_obj.name)
|
| 242 |
try:
|
| 243 |
+
logger.info(f"Loading file:\t{file_path}")
|
| 244 |
if file_path.suffix == ".txt":
|
| 245 |
with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
|
| 246 |
raw_text = f.read()
|
| 247 |
text = clean(raw_text, lower=lower)
|
| 248 |
elif file_path.suffix == ".pdf":
|
| 249 |
+
logger.info(f"loading as PDF file {file_path}")
|
| 250 |
conversion_stats = convert_PDF_to_Text(
|
| 251 |
file_path,
|
| 252 |
ocr_model=ocr_model,
|
|
|
|
| 254 |
)
|
| 255 |
text = conversion_stats["converted_text"]
|
| 256 |
else:
|
| 257 |
+
logger.error(f"Unknown file type {file_path.suffix}")
|
| 258 |
text = "ERROR - check file - unknown file type"
|
| 259 |
|
| 260 |
return text
|
| 261 |
except Exception as e:
|
| 262 |
+
logger.error(f"Trying to load file:\t{file_path},\nerror:\t{e}")
|
| 263 |
return "Error: Could not read file. Ensure that it is a valid text file with encoding UTF-8 if text, and a PDF if PDF."
|
| 264 |
|
| 265 |
|
| 266 |
if __name__ == "__main__":
|
| 267 |
+
logger = logging.getLogger(__name__)
|
| 268 |
+
logger.info("Starting app instance")
|
| 269 |
+
logger.info("Loading OCR model")
|
| 270 |
with contextlib.redirect_stdout(None):
|
| 271 |
ocr_model = ocr_predictor(
|
| 272 |
"db_resnet50",
|
|
|
|
| 275 |
assume_straight_pages=True,
|
| 276 |
)
|
| 277 |
name_to_path = load_example_filenames(_here / "examples")
|
| 278 |
+
logger.info(f"Loaded {len(name_to_path)} examples")
|
| 279 |
demo = gr.Blocks()
|
| 280 |
_examples = list(name_to_path.keys())
|
| 281 |
with demo:
|
|
|
|
| 359 |
minimum=0.5,
|
| 360 |
maximum=1.0,
|
| 361 |
label="length penalty",
|
| 362 |
+
value=0.7,
|
| 363 |
step=0.05,
|
| 364 |
)
|
| 365 |
token_batch_length = gr.Radio(
|
|
|
|
| 373 |
minimum=1.0,
|
| 374 |
maximum=5.0,
|
| 375 |
label="repetition penalty",
|
| 376 |
+
value=1.5,
|
| 377 |
step=0.1,
|
| 378 |
)
|
| 379 |
no_repeat_ngram_size = gr.Radio(
|