Spaces:
Runtime error
Runtime error
| # Media_wiki_tab.py | |
| # Description: Gradio UI snippet that allows users to import a MediaWiki XML dump file into the application. | |
| # | |
| # Imports | |
| import os | |
| from threading import Thread | |
| # | |
| # 3rd-party Imports | |
| import gradio as gr | |
| # | |
| # Local Imports | |
| from App_Function_Libraries.MediaWiki.Media_Wiki import import_mediawiki_dump | |
| # | |
| ####################################################################################################################### | |
| # | |
| # Create MediaWiki Import Tab | |
def create_mediawiki_import_tab():
    """Build the "MediaWiki Import" Gradio tab and wire up its event handlers.

    The tab collects a MediaWiki XML dump file, a wiki name, an optional
    namespace filter and chunking options, then streams the progress messages
    produced by ``import_mediawiki_dump`` into a Markdown status panel.

    Must be called inside an active ``gr.Blocks`` context.

    Returns:
        tuple: ``(file_path, wiki_name, namespaces, skip_redirects,
        single_item, chunk_method, chunk_size, chunk_overlap, import_button,
        output)`` -- the Gradio components created for this tab.
    """
    with gr.Tab("MediaWiki Import"):
        gr.Markdown("# Import MediaWiki Dump")
        with gr.Row():
            with gr.Column():
                file_path = gr.File(label="MediaWiki XML Dump File")
                wiki_name = gr.Textbox(label="Wiki Name", placeholder="Enter a unique name for this wiki")
                namespaces = gr.Textbox(label="Namespaces (comma-separated integers, leave empty for all)")
                skip_redirects = gr.Checkbox(label="Skip Redirects", value=True)
                single_item = gr.Checkbox(label="Import as Single Item", value=False)
                chunk_method = gr.Dropdown(
                    choices=["sentences", "words", "paragraphs", "tokens"],
                    value="sentences",
                    label="Chunking Method"
                )
                chunk_size = gr.Slider(minimum=100, maximum=2000, value=1000, step=100, label="Chunk Size")
                chunk_overlap = gr.Slider(minimum=0, maximum=500, value=100, step=10, label="Chunk Overlap")
                import_button = gr.Button("Import MediaWiki Dump")
                # Hidden until an import is actually running (see run_import).
                cancel_button = gr.Button("Cancel Import", visible=False)
            with gr.Column():
                output = gr.Markdown(label="Import Status")

        def parse_namespaces(namespaces):
            """Turn the comma-separated namespace text into a list of ints.

            Returns ``None`` when the field is empty or whitespace-only
            (meaning "all namespaces"). Raises ``ValueError`` when any entry
            is not an integer.
            """
            if not namespaces or not namespaces.strip():
                return None
            return [int(ns.strip()) for ns in namespaces.split(',')]

        def validate_inputs(file_path, wiki_name, namespaces):
            """Return a user-facing error message, or ``None`` if inputs are valid."""
            if not file_path:
                return "Please select a MediaWiki XML dump file."
            if not wiki_name or not wiki_name.strip():
                return "Please enter a name for the wiki."
            try:
                parse_namespaces(namespaces)
            except ValueError:
                return "Invalid namespaces. Please enter comma-separated integers."
            return None

        def check_file_size(file_path):
            """Return a warning string when the file exceeds 1000 MB, else ``None``."""
            max_size_mb = 1000  # 1 GB
            file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
            if file_size_mb > max_size_mb:
                return f"Warning: The selected file is {file_size_mb:.2f} MB. Importing large files may take a long time."
            return None

        # Cooperative cancellation flag: set by cancel_import, polled by
        # run_import between progress messages.
        # NOTE(review): this flag is shared by every session that uses this
        # tab instance -- confirm that is acceptable for multi-user deployments.
        cancel_flag = False

        def run_import(file_path, wiki_name, namespaces, skip_redirects, single_item, chunk_method, chunk_size,
                       chunk_overlap, progress=gr.Progress()):
            """Run the import as a generator, streaming UI updates.

            Each yielded tuple is ``(cancel_button update, import_button
            update, status markdown)``, matching the ``outputs`` list of the
            click handler.
            """
            nonlocal cancel_flag

            validation_error = validate_inputs(file_path, wiki_name, namespaces)
            if validation_error:
                # This function is a generator, so the message must be yielded;
                # a plain ``return value`` would never reach the UI.
                yield gr.update(), gr.update(), validation_error
                return

            # Clear any cancellation request left over from a previous run.
            cancel_flag = False

            # gr.File may hand over either a path string or a tempfile-like
            # object exposing the path as ``.name``; accept both.
            dump_path = file_path if isinstance(file_path, str) else file_path.name

            status_text = "# MediaWiki Import Process\n\n## Initializing\n- Starting import process...\n"
            file_size_warning = check_file_size(dump_path)
            if file_size_warning:
                status_text += f"- {file_size_warning}\n"

            chunk_options = {
                'method': chunk_method,
                'max_size': chunk_size,
                'overlap': chunk_overlap,
                'adaptive': True,
                'language': 'en'
            }
            namespaces_list = parse_namespaces(namespaces)

            # Swap the buttons: the import is now running and can be cancelled.
            yield gr.update(visible=True), gr.update(visible=False), status_text

            pages_processed = 0
            cancelled = False
            try:
                for progress_info in import_mediawiki_dump(
                        file_path=dump_path,
                        wiki_name=wiki_name,
                        namespaces=namespaces_list,
                        skip_redirects=skip_redirects,
                        chunk_options=chunk_options,
                        single_item=single_item,
                        progress_callback=progress
                ):
                    # Cancellation is only observed between progress messages.
                    if cancel_flag:
                        cancelled = True
                        break

                    if progress_info.startswith("Found"):
                        status_text += f"\n## Parsing\n- {progress_info}\n"
                    elif progress_info.startswith("Processed page"):
                        pages_processed += 1
                        if pages_processed % 10 == 0:  # Update every 10 pages to avoid too frequent updates
                            status_text += f"- {progress_info}\n"
                    elif progress_info.startswith("Successfully imported"):
                        status_text += f"\n## Completed\n- {progress_info}\n- Total pages processed: {pages_processed}"
                    else:
                        status_text += f"- {progress_info}\n"
                    yield gr.update(), gr.update(), status_text

                if cancelled:
                    status_text += (
                        f"\n## Cancelled\n- Import cancelled by user after {pages_processed} pages were processed."
                    )
                else:
                    status_text += "\n## Import Process Completed Successfully"
            except Exception as e:
                # UI boundary: report the failure in the status panel instead
                # of letting the handler crash.
                status_text += f"\n## Error\n- An error occurred during the import process: {str(e)}"
            finally:
                cancel_flag = False

            # Restore the idle button state whatever the outcome.
            yield gr.update(visible=False), gr.update(visible=True), status_text

        def cancel_import():
            """Request cancellation of the running import and restore the buttons."""
            nonlocal cancel_flag
            cancel_flag = True
            return gr.update(visible=False), gr.update(visible=True)

        import_button.click(
            run_import,
            inputs=[file_path, wiki_name, namespaces, skip_redirects, single_item, chunk_method, chunk_size,
                    chunk_overlap],
            outputs=[cancel_button, import_button, output]
        )
        cancel_button.click(
            cancel_import,
            outputs=[cancel_button, import_button]
        )

    return file_path, wiki_name, namespaces, skip_redirects, single_item, chunk_method, chunk_size, chunk_overlap, import_button, output
| # | |
| # End of MediaWiki Import Tab | |
| ####################################################################################################################### | |