Spaces:
Sleeping
Sleeping
Commit · ba0c780
1 Parent(s): 0a8d09f
update
Browse files
- src/demo/asg_loader.py +8 -1
- src/demo/views.py +5 -0
src/demo/asg_loader.py
CHANGED
|
@@ -161,6 +161,13 @@ class DocumentLoading:
|
|
| 161 |
return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']
|
| 162 |
|
| 163 |
def load_pdf(self, pdf_file, survey_id, mode):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
base_name = os.path.splitext(os.path.basename(pdf_file))[0]
|
| 165 |
target_dir = os.path.join(get_path('md', survey_id), base_name)
|
| 166 |
md_file_path = os.path.join(target_dir, mode, f"{base_name}.md")
|
|
@@ -170,7 +177,7 @@ class DocumentLoading:
|
|
| 170 |
print(f"Markdown file for {pdf_file} already exists at {md_file_path}. Skipping conversion.", flush=True)
|
| 171 |
return self.process_md_file(md_file_path, survey_id)
|
| 172 |
|
| 173 |
-
command = ["mineru", "-p", pdf_file, "-o", get_path('md', survey_id), "-m",
|
| 174 |
try:
|
| 175 |
subprocess.run(command, check=True)
|
| 176 |
# 检查是否生成了 Markdown 文件
|
|
|
|
| 161 |
return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']
|
| 162 |
|
| 163 |
def load_pdf(self, pdf_file, survey_id, mode):
|
| 164 |
+
# 确保 mode 合法
|
| 165 |
+
valid_modes = ['auto', 'txt', 'ocr']
|
| 166 |
+
mineru_mode = mode if mode in valid_modes else 'auto'
|
| 167 |
+
if mode not in valid_modes:
|
| 168 |
+
print(f"Warning: unsupported mineru method '{mode}', defaulting to 'auto'.")
|
| 169 |
+
# 同时修正用于生成路径的 mode
|
| 170 |
+
mode = mineru_mode
|
| 171 |
base_name = os.path.splitext(os.path.basename(pdf_file))[0]
|
| 172 |
target_dir = os.path.join(get_path('md', survey_id), base_name)
|
| 173 |
md_file_path = os.path.join(target_dir, mode, f"{base_name}.md")
|
|
|
|
| 177 |
print(f"Markdown file for {pdf_file} already exists at {md_file_path}. Skipping conversion.", flush=True)
|
| 178 |
return self.process_md_file(md_file_path, survey_id)
|
| 179 |
|
| 180 |
+
command = ["mineru", "-p", pdf_file, "-o", get_path('md', survey_id), "-m", mineru_mode]
|
| 181 |
try:
|
| 182 |
subprocess.run(command, check=True)
|
| 183 |
# 检查是否生成了 Markdown 文件
|
src/demo/views.py
CHANGED
|
@@ -313,6 +313,11 @@ def PosRank_get_top5_ngrams(input_pd):
|
|
| 313 |
|
| 314 |
def process_file(file_name, survey_id, mode):
|
| 315 |
global embedder
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
# 如果 file_name 不是绝对路径,则拼接 MEDIA_ROOT
|
| 317 |
abs_file_path = file_name if os.path.isabs(file_name) else os.path.join(settings.MEDIA_ROOT, file_name)
|
| 318 |
result = process_pdf(abs_file_path, survey_id, embedder, mode)
|
|
|
|
| 313 |
|
| 314 |
def process_file(file_name, survey_id, mode):
|
| 315 |
global embedder
|
| 316 |
+
# 校验 mode,仅允许 auto/txt/ocr
|
| 317 |
+
valid_modes = ['auto', 'txt', 'ocr']
|
| 318 |
+
if mode not in valid_modes:
|
| 319 |
+
print(f"Warning: unsupported mode '{mode}', defaulting to 'auto'.")
|
| 320 |
+
mode = 'auto'
|
| 321 |
# 如果 file_name 不是绝对路径,则拼接 MEDIA_ROOT
|
| 322 |
abs_file_path = file_name if os.path.isabs(file_name) else os.path.join(settings.MEDIA_ROOT, file_name)
|
| 323 |
result = process_pdf(abs_file_path, survey_id, embedder, mode)
|