Spaces:

jbilcke-hf
/

VideoModelStudio

Paused

App Files Files Community

Julian Bilcke commited on Mar 9

Commit

dc948ae

1 Parent(s): 55eb99b

working on hf dataset downloader

Browse files

Files changed (6) hide show

app.py +1 -1
vms/services/importer/hub_dataset.py +106 -15
vms/services/importer/import_service.py +18 -5
vms/tabs/import_tab/hub_tab.py +116 -74
vms/tabs/import_tab/import_tab.py +76 -22
vms/ui/video_trainer_ui.py +26 -0

app.py CHANGED Viewed

@@ -65,7 +65,7 @@ def main():
     ]
     # Launch the Gradio app
-    app.queue(default_concurrency_limit=1).launch(
         server_name="0.0.0.0",
         allowed_paths=allowed_paths
     )

     ]
     # Launch the Gradio app
+    app.queue(default_concurrency_limit=2).launch(
         server_name="0.0.0.0",
         allowed_paths=allowed_paths
     )

vms/services/importer/hub_dataset.py CHANGED Viewed

@@ -10,7 +10,7 @@ import asyncio
 import logging
 import gradio as gr
 from pathlib import Path
-from typing import List, Dict, Optional, Tuple, Any, Union
 from huggingface_hub import (
     HfApi,
@@ -43,6 +43,7 @@ class HubDatasetBrowser:
         Returns:
             List of datasets matching the query [id, title, downloads]
         """
         try:
             # Start with some filters to find video-related datasets
@@ -126,15 +127,10 @@ class HubDatasetBrowser:
             # Add basic stats (with safer access)
             downloads = getattr(dataset_info, 'downloads', None)
-            info_text += f"**Downloads:** {downloads if downloads is not None else 'N/A'}\n"
             last_modified = getattr(dataset_info, 'last_modified', None)
-            info_text += f"**Last modified:** {last_modified if last_modified is not None else 'N/A'}\n"
-            # Show tags if available (with safer access)
-            tags = getattr(dataset_info, "tags", None) or []
-            if tags:
-                info_text += f"**Tags:** {', '.join(tags[:10])}\n\n"
             # Group files by type
             file_groups = {
@@ -168,13 +164,20 @@ class HubDatasetBrowser:
             logger.error(f"Error getting dataset info: {str(e)}", exc_info=True)
             return f"Error loading dataset information: {str(e)}", {}, {}
-    async def download_file_group(self, dataset_id: str, file_type: str, enable_splitting: bool = True) -> str:
         """Download all files of a specific type from the dataset
         Args:
             dataset_id: The dataset ID
             file_type: Either "video" or "webdataset"
             enable_splitting: Whether to enable automatic video splitting
         Returns:
             Status message
@@ -190,6 +193,11 @@ class HubDatasetBrowser:
                 return f"No {file_type} files found in the dataset"
             logger.info(f"Downloading {len(files)} {file_type} files from dataset {dataset_id}")
             # Track counts for status message
             video_count = 0
@@ -200,8 +208,16 @@ class HubDatasetBrowser:
                 temp_path = Path(temp_dir)
                 # Process all files of the requested type
-                for filename in files:
                     try:
                         # Download the file
                         file_path = hf_hub_download(
                             repo_id=dataset_id,
@@ -212,6 +228,7 @@ class HubDatasetBrowser:
                         file_path = Path(file_path)
                         logger.info(f"Downloaded file to {file_path}")
                         # Process based on file type
                         if file_type == "video":
@@ -274,9 +291,13 @@ class HubDatasetBrowser:
                     except Exception as e:
                         logger.warning(f"Error processing file {filename}: {e}")
                 # Generate status message
                 if file_type == "video":
-                    return f"Successfully imported {video_count} videos from dataset {dataset_id}"
                 elif file_type == "webdataset":
                     parts = []
                     if video_count > 0:
@@ -285,23 +306,37 @@ class HubDatasetBrowser:
                         parts.append(f"{image_count} image{'s' if image_count != 1 else ''}")
                     if parts:
-                        return f"Successfully imported {' and '.join(parts)} from WebDataset archives"
                     else:
-                        return f"No media was found in the WebDataset archives"
-                return f"Unknown file type: {file_type}"
         except Exception as e:
             error_msg = f"Error downloading {file_type} files: {str(e)}"
             logger.error(error_msg, exc_info=True)
             return error_msg
-    async def download_dataset(self, dataset_id: str, enable_splitting: bool = True) -> Tuple[str, str]:
         """Download a dataset and process its video/image content
         Args:
             dataset_id: The dataset ID to download
             enable_splitting: Whether to enable automatic video splitting
         Returns:
             Tuple of (loading_msg, status_msg)
@@ -327,9 +362,15 @@ class HubDatasetBrowser:
                 video_files = [s.rfilename for s in siblings if hasattr(s, 'rfilename') and s.rfilename.lower().endswith((".mp4", ".webm"))]
                 tar_files = [s.rfilename for s in siblings if hasattr(s, 'rfilename') and s.rfilename.lower().endswith(".tar")]
             # Create a temporary directory for downloads
             with tempfile.TemporaryDirectory() as temp_dir:
                 temp_path = Path(temp_dir)
                 # If we have video files, download them individually
                 if video_files:
@@ -337,6 +378,14 @@ class HubDatasetBrowser:
                     logger.info(f"Downloading {len(video_files)} video files from {dataset_id}")
                     for i, video_file in enumerate(video_files):
                         # Download the video file
                         try:
                             file_path = hf_hub_download(
@@ -369,6 +418,7 @@ class HubDatasetBrowser:
                             status_msg = f"Downloaded video {i+1}/{len(video_files)} from {dataset_id}"
                             logger.info(status_msg)
                         except Exception as e:
                             logger.warning(f"Error downloading {video_file}: {e}")
@@ -378,6 +428,14 @@ class HubDatasetBrowser:
                     logger.info(f"Downloading {len(tar_files)} WebDataset files from {dataset_id}")
                     for i, tar_file in enumerate(tar_files):
                         try:
                             file_path = hf_hub_download(
                                 repo_id=dataset_id,
@@ -387,6 +445,7 @@ class HubDatasetBrowser:
                             )
                             status_msg = f"Downloaded WebDataset {i+1}/{len(tar_files)} from {dataset_id}"
                             logger.info(status_msg)
                         except Exception as e:
                             logger.warning(f"Error downloading {tar_file}: {e}")
@@ -395,6 +454,9 @@ class HubDatasetBrowser:
                     loading_msg = f"{loading_msg}\n\nDownloading entire dataset repository..."
                     logger.info(f"No specific media files found, downloading entire repository for {dataset_id}")
                     try:
                         snapshot_download(
                             repo_id=dataset_id,
@@ -403,6 +465,9 @@ class HubDatasetBrowser:
                         )
                         status_msg = f"Downloaded entire repository for {dataset_id}"
                         logger.info(status_msg)
                     except Exception as e:
                         logger.error(f"Error downloading dataset snapshot: {e}", exc_info=True)
                         return loading_msg, f"Error downloading dataset: {str(e)}"
@@ -411,6 +476,9 @@ class HubDatasetBrowser:
                 loading_msg = f"{loading_msg}\n\nProcessing downloaded files..."
                 logger.info(f"Processing downloaded files from {dataset_id}")
                 # Count imported files
                 video_count = 0
                 image_count = 0
@@ -420,11 +488,28 @@ class HubDatasetBrowser:
                 async def process_files():
                     nonlocal video_count, image_count, tar_count
                     # Process all files in the temp directory
                     for root, _, files in os.walk(temp_path):
                         for file in files:
                             file_path = Path(root) / file
                             # Process videos
                             if file.lower().endswith((".mp4", ".webm")):
                                 # Choose target path based on auto-splitting setting
@@ -490,10 +575,16 @@ class HubDatasetBrowser:
                                     logger.info(f"Extracted {vid_count} videos and {img_count} images from {file}")
                                 except Exception as e:
                                     logger.error(f"Error processing WebDataset file {file_path}: {str(e)}", exc_info=True)
                 # Run the processing asynchronously
                 await process_files()
                 # Generate final status message
                 parts = []
                 if video_count > 0:

 import logging
 import gradio as gr
 from pathlib import Path
+from typing import List, Dict, Optional, Tuple, Any, Union, Callable
 from huggingface_hub import (
     HfApi,
         Returns:
             List of datasets matching the query [id, title, downloads]
+            Note: We still return all columns internally, but the UI will only display the first column
         """
         try:
             # Start with some filters to find video-related datasets
             # Add basic stats (with safer access)
             downloads = getattr(dataset_info, 'downloads', None)
+            info_text += f"## Downloads: {downloads if downloads is not None else 'N/A'}\n"
             last_modified = getattr(dataset_info, 'last_modified', None)
+            info_text += f"## Last modified: {last_modified if last_modified is not None else 'N/A'}\n"
             # Group files by type
             file_groups = {
             logger.error(f"Error getting dataset info: {str(e)}", exc_info=True)
             return f"Error loading dataset information: {str(e)}", {}, {}
+    async def download_file_group(
+        self,
+        dataset_id: str,
+        file_type: str,
+        enable_splitting: bool = True,
+        progress_callback: Optional[Callable] = None
+    ) -> str:
         """Download all files of a specific type from the dataset
         Args:
             dataset_id: The dataset ID
             file_type: Either "video" or "webdataset"
             enable_splitting: Whether to enable automatic video splitting
+            progress_callback: Optional callback for progress updates
         Returns:
             Status message
                 return f"No {file_type} files found in the dataset"
             logger.info(f"Downloading {len(files)} {file_type} files from dataset {dataset_id}")
+            gr.Info(f"Starting download of {len(files)} {file_type} files from {dataset_id}")
+            # Initialize progress if callback provided
+            if progress_callback:
+                progress_callback(0, desc=f"Starting download of {len(files)} {file_type} files", total=len(files))
             # Track counts for status message
             video_count = 0
                 temp_path = Path(temp_dir)
                 # Process all files of the requested type
+                for i, filename in enumerate(files):
                     try:
+                        # Update progress
+                        if progress_callback:
+                            progress_callback(
+                                i,
+                                desc=f"Downloading file {i+1}/{len(files)}: {Path(filename).name}",
+                                total=len(files)
+                            )
                         # Download the file
                         file_path = hf_hub_download(
                             repo_id=dataset_id,
                         file_path = Path(file_path)
                         logger.info(f"Downloaded file to {file_path}")
+                        #gr.Info(f"Downloaded {file_path.name} ({i+1}/{len(files)})")
                         # Process based on file type
                         if file_type == "video":
                     except Exception as e:
                         logger.warning(f"Error processing file {filename}: {e}")
+                # Update progress to complete
+                if progress_callback:
+                    progress_callback(len(files), desc="Download complete", total=len(files))
                 # Generate status message
                 if file_type == "video":
+                    status_msg = f"Successfully imported {video_count} videos from dataset {dataset_id}"
                 elif file_type == "webdataset":
                     parts = []
                     if video_count > 0:
                         parts.append(f"{image_count} image{'s' if image_count != 1 else ''}")
                     if parts:
+                        status_msg = f"Successfully imported {' and '.join(parts)} from WebDataset archives"
                     else:
+                        status_msg = f"No media was found in the WebDataset archives"
+                else:
+                    status_msg = f"Unknown file type: {file_type}"
+                # Final notification
+                logger.info(f"✅ Download complete! {status_msg}")
+                # This info message will appear as a toast notification
+                gr.Info(f"✅ Download complete! {status_msg}")
+                return status_msg
         except Exception as e:
             error_msg = f"Error downloading {file_type} files: {str(e)}"
             logger.error(error_msg, exc_info=True)
+            gr.Error(error_msg)
             return error_msg
+    async def download_dataset(
+        self,
+        dataset_id: str,
+        enable_splitting: bool = True,
+        progress_callback: Optional[Callable] = None
+    ) -> Tuple[str, str]:
         """Download a dataset and process its video/image content
         Args:
             dataset_id: The dataset ID to download
             enable_splitting: Whether to enable automatic video splitting
+            progress_callback: Optional callback for progress tracking
         Returns:
             Tuple of (loading_msg, status_msg)
                 video_files = [s.rfilename for s in siblings if hasattr(s, 'rfilename') and s.rfilename.lower().endswith((".mp4", ".webm"))]
                 tar_files = [s.rfilename for s in siblings if hasattr(s, 'rfilename') and s.rfilename.lower().endswith(".tar")]
+            # Initialize progress tracking
+            total_files = len(video_files) + len(tar_files)
+            if progress_callback:
+                progress_callback(0, desc=f"Starting download of dataset: {dataset_id}", total=total_files)
             # Create a temporary directory for downloads
             with tempfile.TemporaryDirectory() as temp_dir:
                 temp_path = Path(temp_dir)
+                files_processed = 0
                 # If we have video files, download them individually
                 if video_files:
                     logger.info(f"Downloading {len(video_files)} video files from {dataset_id}")
                     for i, video_file in enumerate(video_files):
+                        # Update progress
+                        if progress_callback:
+                            progress_callback(
+                                files_processed,
+                                desc=f"Downloading video {i+1}/{len(video_files)}: {Path(video_file).name}",
+                                total=total_files
+                            )
                         # Download the video file
                         try:
                             file_path = hf_hub_download(
                             status_msg = f"Downloaded video {i+1}/{len(video_files)} from {dataset_id}"
                             logger.info(status_msg)
+                            files_processed += 1
                         except Exception as e:
                             logger.warning(f"Error downloading {video_file}: {e}")
                     logger.info(f"Downloading {len(tar_files)} WebDataset files from {dataset_id}")
                     for i, tar_file in enumerate(tar_files):
+                        # Update progress
+                        if progress_callback:
+                            progress_callback(
+                                files_processed,
+                                desc=f"Downloading WebDataset {i+1}/{len(tar_files)}: {Path(tar_file).name}",
+                                total=total_files
+                            )
                         try:
                             file_path = hf_hub_download(
                                 repo_id=dataset_id,
                             )
                             status_msg = f"Downloaded WebDataset {i+1}/{len(tar_files)} from {dataset_id}"
                             logger.info(status_msg)
+                            files_processed += 1
                         except Exception as e:
                             logger.warning(f"Error downloading {tar_file}: {e}")
                     loading_msg = f"{loading_msg}\n\nDownloading entire dataset repository..."
                     logger.info(f"No specific media files found, downloading entire repository for {dataset_id}")
+                    if progress_callback:
+                        progress_callback(0, desc=f"Downloading entire repository for {dataset_id}", total=1)
                     try:
                         snapshot_download(
                             repo_id=dataset_id,
                         )
                         status_msg = f"Downloaded entire repository for {dataset_id}"
                         logger.info(status_msg)
+                        if progress_callback:
+                            progress_callback(1, desc="Repository download complete", total=1)
                     except Exception as e:
                         logger.error(f"Error downloading dataset snapshot: {e}", exc_info=True)
                         return loading_msg, f"Error downloading dataset: {str(e)}"
                 loading_msg = f"{loading_msg}\n\nProcessing downloaded files..."
                 logger.info(f"Processing downloaded files from {dataset_id}")
+                if progress_callback:
+                    progress_callback(0, desc="Processing downloaded files", total=100)
                 # Count imported files
                 video_count = 0
                 image_count = 0
                 async def process_files():
                     nonlocal video_count, image_count, tar_count
+                    # Get total number of files to process
+                    file_count = 0
+                    for root, _, files in os.walk(temp_path):
+                        file_count += len(files)
+                    processed = 0
                     # Process all files in the temp directory
                     for root, _, files in os.walk(temp_path):
                         for file in files:
                             file_path = Path(root) / file
+                            # Update progress (every 5 files to avoid too many updates)
+                            if progress_callback and processed % 5 == 0:
+                                if file_count > 0:
+                                    progress_percent = int((processed / file_count) * 100)
+                                    progress_callback(
+                                        progress_percent,
+                                        desc=f"Processing files: {processed}/{file_count}",
+                                        total=100
+                                    )
                             # Process videos
                             if file.lower().endswith((".mp4", ".webm")):
                                 # Choose target path based on auto-splitting setting
                                     logger.info(f"Extracted {vid_count} videos and {img_count} images from {file}")
                                 except Exception as e:
                                     logger.error(f"Error processing WebDataset file {file_path}: {str(e)}", exc_info=True)
+                            processed += 1
                 # Run the processing asynchronously
                 await process_files()
+                # Update progress to complete
+                if progress_callback:
+                    progress_callback(100, desc="Processing complete", total=100)
                 # Generate final status message
                 parts = []
                 if video_count > 0:

vms/services/importer/import_service.py CHANGED Viewed

@@ -4,7 +4,7 @@ Delegates to specialized handler classes for different import types.
 """
 import logging
-from typing import List, Dict, Optional, Tuple, Any, Union
 from pathlib import Path
 import gradio as gr
@@ -76,27 +76,40 @@ class ImportService:
         """
         return self.hub_browser.get_dataset_info(dataset_id)
-    async def download_dataset(self, dataset_id: str, enable_splitting: bool = True) -> Tuple[str, str]:
         """Download a dataset and process its video/image content
         Args:
             dataset_id: The dataset ID to download
             enable_splitting: Whether to enable automatic video splitting
         Returns:
             Tuple of (loading_msg, status_msg)
         """
-        return await self.hub_browser.download_dataset(dataset_id, enable_splitting)
-    async def download_file_group(self, dataset_id: str, file_type: str, enable_splitting: bool = True) -> str:
         """Download a group of files (videos or WebDatasets)
         Args:
             dataset_id: The dataset ID
             file_type: Type of file ("video" or "webdataset")
             enable_splitting: Whether to enable automatic video splitting
         Returns:
             Status message
         """
-        return await self.hub_browser.download_file_group(dataset_id, file_type, enable_splitting)

 """
 import logging
+from typing import List, Dict, Optional, Tuple, Any, Union, Callable
 from pathlib import Path
 import gradio as gr
         """
         return self.hub_browser.get_dataset_info(dataset_id)
+    async def download_dataset(
+        self,
+        dataset_id: str,
+        enable_splitting: bool = True,
+        progress_callback: Optional[Callable] = None
+    ) -> Tuple[str, str]:
         """Download a dataset and process its video/image content
         Args:
             dataset_id: The dataset ID to download
             enable_splitting: Whether to enable automatic video splitting
+            progress_callback: Optional callback for progress tracking
         Returns:
             Tuple of (loading_msg, status_msg)
         """
+        return await self.hub_browser.download_dataset(dataset_id, enable_splitting, progress_callback)
+    async def download_file_group(
+        self,
+        dataset_id: str,
+        file_type: str,
+        enable_splitting: bool = True,
+        progress_callback: Optional[Callable] = None
+    ) -> str:
         """Download a group of files (videos or WebDatasets)
         Args:
             dataset_id: The dataset ID
             file_type: Type of file ("video" or "webdataset")
             enable_splitting: Whether to enable automatic video splitting
+            progress_callback: Optional callback for progress tracking
         Returns:
             Status message
         """
+        return await self.hub_browser.download_file_group(dataset_id, file_type, enable_splitting, progress_callback)

vms/tabs/import_tab/hub_tab.py CHANGED Viewed

@@ -6,6 +6,7 @@ Handles browsing, searching, and importing datasets from the Hugging Face Hub.
 import gradio as gr
 import logging
 import asyncio
 from pathlib import Path
 from typing import Dict, Any, List, Optional, Tuple
@@ -20,6 +21,7 @@ class HubTab(BaseTab):
         super().__init__(app_state)
         self.id = "hub_tab"
         self.title = "Import from Hugging Face"
     def create(self, parent=None) -> gr.Tab:
         """Create the Hub tab UI components"""
@@ -33,8 +35,8 @@ class HubTab(BaseTab):
                 with gr.Row():
                     self.components["dataset_search"] = gr.Textbox(
-                        label="Search Hugging Face Datasets",
-                        placeholder="Search for video datasets..."
                     )
                 with gr.Row():
@@ -46,7 +48,7 @@ class HubTab(BaseTab):
                     with gr.Column(scale=3):
                         self.components["dataset_results"] = gr.Dataframe(
-                            headers=["id", "title", "downloads"],
                             interactive=False,
                             wrap=True,
                             row_count=10,
@@ -58,6 +60,7 @@ class HubTab(BaseTab):
                         self.components["dataset_info"] = gr.Markdown("Select a dataset to see details")
                         self.components["dataset_id"] = gr.State(value=None)
                         self.components["file_type"] = gr.State(value=None)
                         # Files section that appears when a dataset is selected
                         with gr.Column(visible=False) as files_section:
@@ -66,27 +69,23 @@ class HubTab(BaseTab):
                             gr.Markdown("## Files:")
                             # Video files row (appears if videos are present)
-                            with gr.Row(visible=False) as video_files_row:
                                 self.components["video_files_row"] = video_files_row
-                                with gr.Column(scale=4):
-                                    self.components["video_count_text"] = gr.Markdown("Contains 0 video files")
-                                with gr.Column(scale=1):
-                                    self.components["download_videos_btn"] = gr.Button("Download", variant="primary")
                             # WebDataset files row (appears if tar files are present)
-                            with gr.Row(visible=False) as webdataset_files_row:
                                 self.components["webdataset_files_row"] = webdataset_files_row
-                                with gr.Column(scale=4):
-                                    self.components["webdataset_count_text"] = gr.Markdown("Contains 0 WebDataset (.tar) files")
-                                with gr.Column(scale=1):
-                                    self.components["download_webdataset_btn"] = gr.Button("Download", variant="primary")
-                        # Status and loading indicators
-                        self.components["dataset_loading"] = gr.Markdown(visible=False)
             return tab
@@ -102,7 +101,7 @@ class HubTab(BaseTab):
             ]
         )
-        # Dataset selection event - FIX HERE
         self.components["dataset_results"].select(
             fn=self.display_dataset_info,
             outputs=[
@@ -112,7 +111,8 @@ class HubTab(BaseTab):
                 self.components["video_files_row"],
                 self.components["video_count_text"],
                 self.components["webdataset_files_row"],
-                self.components["webdataset_count_text"]
             ]
         )
@@ -128,20 +128,11 @@ class HubTab(BaseTab):
                 self.components["file_type"]
             ],
             outputs=[
-                self.components["dataset_loading"],
-                self.components["import_status"]
-            ]
-        ).success(
-            fn=self.app.tabs["import_tab"].on_import_success,
-            inputs=[
-                self.components["enable_automatic_video_split"],
-                self.components["enable_automatic_content_captioning"],
-                self.app.tabs["caption_tab"].components["custom_prompt_prefix"]
-            ],
-            outputs=[
-                self.app.tabs_component,
-                self.app.tabs["split_tab"].components["video_list"],
-                self.app.tabs["split_tab"].components["detect_status"]
             ]
         )
@@ -157,20 +148,11 @@ class HubTab(BaseTab):
                 self.components["file_type"]
             ],
             outputs=[
-                self.components["dataset_loading"],
-                self.components["import_status"]
-            ]
-        ).success(
-            fn=self.app.tabs["import_tab"].on_import_success,
-            inputs=[
-                self.components["enable_automatic_video_split"],
-                self.components["enable_automatic_content_captioning"],
-                self.app.tabs["caption_tab"].components["custom_prompt_prefix"]
-            ],
-            outputs=[
-                self.app.tabs_component,
-                self.app.tabs["split_tab"].components["video_list"],
-                self.app.tabs["split_tab"].components["detect_status"]
             ]
         )
@@ -186,12 +168,16 @@ class HubTab(BaseTab):
         """Search datasets on the Hub matching the query"""
         try:
             logger.info(f"Searching for datasets with query: '{query}'")
-            results = self.app.importer.search_datasets(query)
             return results, gr.update(visible=True)
         except Exception as e:
             logger.error(f"Error searching datasets: {str(e)}", exc_info=True)
-            return [[f"Error: {str(e)}", "", ""]], gr.update(visible=True)
     def display_dataset_info(self, evt: gr.SelectData):
         """Display detailed information about the selected dataset"""
         try:
@@ -204,9 +190,11 @@ class HubTab(BaseTab):
                     gr.update(visible=False), # video_files_row
                     "",                     # video_count_text
                     gr.update(visible=False), # webdataset_files_row
-                    ""                      # webdataset_count_text
                 )
             dataset_id = evt.value[0] if isinstance(evt.value, list) else evt.value
             logger.info(f"Getting dataset info for: {dataset_id}")
@@ -225,7 +213,8 @@ class HubTab(BaseTab):
                 gr.update(visible=video_count > 0),      # video_files_row
                 f"Contains {video_count} video file{'s' if video_count != 1 else ''}", # video_count_text
                 gr.update(visible=webdataset_count > 0), # webdataset_files_row
-                f"Contains {webdataset_count} WebDataset (.tar) file{'s' if webdataset_count != 1 else ''}" # webdataset_count_text
             )
         except Exception as e:
             logger.error(f"Error displaying dataset info: {str(e)}", exc_info=True)
@@ -236,38 +225,91 @@ class HubTab(BaseTab):
                 gr.update(visible=False),                      # video_files_row
                 "",                                            # video_count_text
                 gr.update(visible=False),                      # webdataset_files_row
-                ""                                             # webdataset_count_text
             )
-    def download_file_group(self, dataset_id: str, enable_splitting: bool, file_type: str) -> Tuple[gr.update, str]:
-        """Handle download of a group of files (videos or WebDatasets)"""
         try:
             if not dataset_id:
-                return gr.update(visible=False), "No dataset selected"
             logger.info(f"Starting download of {file_type} files from dataset: {dataset_id}")
-            # Show loading indicator
-            loading_msg = gr.update(
-                value=f"## Downloading {file_type} files from {dataset_id}\n\nThis may take some time...",
-                visible=True
-            )
-            status_msg = f"Downloading {file_type} files from {dataset_id}..."
-            # Use the async version in a non-blocking way
-            asyncio.create_task(self._download_file_group_bg(dataset_id, file_type, enable_splitting))
-            return loading_msg, status_msg
         except Exception as e:
-            error_msg = f"Error initiating download: {str(e)}"
             logger.error(error_msg, exc_info=True)
-            return gr.update(visible=False), error_msg
-    async def _download_file_group_bg(self, dataset_id: str, file_type: str, enable_splitting: bool):
-        """Background task for group file download"""
-        try:
-            # This will execute in the background
-            await self.app.importer.download_file_group(dataset_id, file_type, enable_splitting)
-        except Exception as e:
-            logger.error(f"Error in background file group download: {str(e)}", exc_info=True)

 import gradio as gr
 import logging
 import asyncio
+import threading
 from pathlib import Path
 from typing import Dict, Any, List, Optional, Tuple
         super().__init__(app_state)
         self.id = "hub_tab"
         self.title = "Import from Hugging Face"
+        self.is_downloading = False
     def create(self, parent=None) -> gr.Tab:
         """Create the Hub tab UI components"""
                 with gr.Row():
                     self.components["dataset_search"] = gr.Textbox(
+                        label="Search Hugging Face Datasets (eg. cakeify, disney, rickroll..)",
+                        placeholder="Search for video datasets (eg. cakeify, disney, rickroll..)"
                     )
                 with gr.Row():
                     with gr.Column(scale=3):
                         self.components["dataset_results"] = gr.Dataframe(
+                            headers=["Dataset ID"],  # Simplified to show only dataset ID
                             interactive=False,
                             wrap=True,
                             row_count=10,
                         self.components["dataset_info"] = gr.Markdown("Select a dataset to see details")
                         self.components["dataset_id"] = gr.State(value=None)
                         self.components["file_type"] = gr.State(value=None)
+                        self.components["download_in_progress"] = gr.State(value=False)
                         # Files section that appears when a dataset is selected
                         with gr.Column(visible=False) as files_section:
                             gr.Markdown("## Files:")
                             # Video files row (appears if videos are present)
+                            with gr.Row() as video_files_row:
                                 self.components["video_files_row"] = video_files_row
+                                self.components["video_count_text"] = gr.Markdown("Contains 0 video files")
+                                self.components["download_videos_btn"] = gr.Button("Download", variant="primary")
                             # WebDataset files row (appears if tar files are present)
+                            with gr.Row() as webdataset_files_row:
                                 self.components["webdataset_files_row"] = webdataset_files_row
+                                self.components["webdataset_count_text"] = gr.Markdown("Contains 0 WebDataset (.tar) files")
+                                self.components["download_webdataset_btn"] = gr.Button("Download", variant="primary")
+                        # Status indicator
+                        self.components["status_output"] = gr.Markdown("")
             return tab
             ]
         )
+        # Dataset selection event
         self.components["dataset_results"].select(
             fn=self.display_dataset_info,
             outputs=[
                 self.components["video_files_row"],
                 self.components["video_count_text"],
                 self.components["webdataset_files_row"],
+                self.components["webdataset_count_text"],
+                self.components["status_output"]  # Reset status output
             ]
         )
                 self.components["file_type"]
             ],
             outputs=[
+                self.components["status_output"],
+                self.components["import_status"],
+                self.components["download_videos_btn"],
+                self.components["download_webdataset_btn"],
+                self.components["download_in_progress"]
             ]
         )
                 self.components["file_type"]
             ],
             outputs=[
+                self.components["status_output"],
+                self.components["import_status"],
+                self.components["download_videos_btn"],
+                self.components["download_webdataset_btn"],
+                self.components["download_in_progress"]
             ]
         )
         """Search datasets on the Hub matching the query"""
         try:
             logger.info(f"Searching for datasets with query: '{query}'")
+            results_full = self.app.importer.search_datasets(query)
+            # Extract just the first column (dataset IDs) for display
+            results = [[row[0]] for row in results_full]
             return results, gr.update(visible=True)
         except Exception as e:
             logger.error(f"Error searching datasets: {str(e)}", exc_info=True)
+            return [[f"Error: {str(e)}"]], gr.update(visible=True)
     def display_dataset_info(self, evt: gr.SelectData):
         """Display detailed information about the selected dataset"""
         try:
                     gr.update(visible=False), # video_files_row
                     "",                     # video_count_text
                     gr.update(visible=False), # webdataset_files_row
+                    "",                      # webdataset_count_text
+                    ""                       # status_output
                 )
+            # Extract dataset_id from the simplified format
             dataset_id = evt.value[0] if isinstance(evt.value, list) else evt.value
             logger.info(f"Getting dataset info for: {dataset_id}")
                 gr.update(visible=video_count > 0),      # video_files_row
                 f"Contains {video_count} video file{'s' if video_count != 1 else ''}", # video_count_text
                 gr.update(visible=webdataset_count > 0), # webdataset_files_row
+                f"Contains {webdataset_count} WebDataset (.tar) file{'s' if webdataset_count != 1 else ''}", # webdataset_count_text
+                ""                                       # status_output
             )
         except Exception as e:
             logger.error(f"Error displaying dataset info: {str(e)}", exc_info=True)
                 gr.update(visible=False),                      # video_files_row
                 "",                                            # video_count_text
                 gr.update(visible=False),                      # webdataset_files_row
+                "",                                            # webdataset_count_text
+                ""                                             # status_output
             )
+    async def _download_with_progress(self, dataset_id, file_type, enable_splitting, progress_callback):
+        """Wrapper for download_file_group that integrates with progress tracking"""
+        try:
+            # Set up the progress callback adapter
+            def progress_adapter(progress_value, desc=None, total=None):
+                # For a progress bar, we need to convert the values to a 0-1 range
+                if isinstance(progress_value, (int, float)):
+                    if total is not None and total > 0:
+                        # If we have a total, calculate the fraction
+                        fraction = min(1.0, progress_value / total)
+                    else:
+                        # Otherwise, just use the value directly (assumed to be 0-1)
+                        fraction = min(1.0, progress_value)
+                    # Update the progress with the calculated fraction
+                    progress_callback(fraction, desc=desc)
+            # Call the actual download function with our adapter
+            result = await self.app.importer.download_file_group(
+                dataset_id,
+                file_type,
+                enable_splitting,
+                progress_callback=progress_adapter
+            )
+            return result
+        except Exception as e:
+            logger.error(f"Error in download with progress: {str(e)}", exc_info=True)
+            return f"Error: {str(e)}"
+    def download_file_group(self, dataset_id: str, enable_splitting: bool, file_type: str, progress=gr.Progress()) -> Tuple:
+        """Handle download of a group of files (videos or WebDatasets) with progress tracking"""
         try:
             if not dataset_id:
+                return ("No dataset selected",
+                       "No dataset selected",
+                       gr.update(),
+                       gr.update(),
+                       False)
             logger.info(f"Starting download of {file_type} files from dataset: {dataset_id}")
+            # Initialize progress tracking
+            progress(0, desc=f"Starting download of {file_type} files from {dataset_id}")
+            # Disable download buttons during the process
+            videos_btn_update = gr.update(interactive=False)
+            webdataset_btn_update = gr.update(interactive=False)
+            # Run the download function with progress tracking
+            # We need to use asyncio.run to run the coroutine in a synchronous context
+            result = asyncio.run(self._download_with_progress(
+                dataset_id,
+                file_type,
+                enable_splitting,
+                progress
+            ))
+            # When download is complete, update the UI
+            progress(1.0, desc="Download complete!")
+            # Create a success message
+            success_msg = f"✅ Download complete! {result}"
+            # Update the UI components
+            return (
+                success_msg,                 # status_output - shows the successful result
+                result,                      # import_status
+                gr.update(interactive=True), # download_videos_btn
+                gr.update(interactive=True), # download_webdataset_btn
+                False                        # download_in_progress
+            )
         except Exception as e:
+            error_msg = f"Error downloading {file_type} files: {str(e)}"
             logger.error(error_msg, exc_info=True)
+            return (
+                f"❌ Error: {error_msg}",     # status_output
+                error_msg,                   # import_status
+                gr.update(interactive=True), # download_videos_btn
+                gr.update(interactive=True), # download_webdataset_btn
+                False                        # download_in_progress
+            )

vms/tabs/import_tab/import_tab.py CHANGED Viewed

@@ -5,6 +5,7 @@ Parent import tab for Video Model Studio UI that contains sub-tabs
 import gradio as gr
 import logging
 import asyncio
 from pathlib import Path
 from typing import Dict, Any, List, Optional, Tuple
@@ -82,44 +83,97 @@ class ImportTab(BaseTab):
         self.youtube_tab.connect_events()
         self.hub_tab.connect_events()
-    async def on_import_success(self, enable_splitting, enable_automatic_content_captioning, prompt_prefix):
         """Handle successful import of files"""
         videos = self.app.tabs["split_tab"].list_unprocessed_videos()
         # If scene detection isn't already running and there are videos to process,
         # and auto-splitting is enabled, start the detection
         if videos and not self.app.splitter.is_processing() and enable_splitting:
-            await self.app.tabs["split_tab"].start_scene_detection(enable_splitting)
             msg = "Starting automatic scene detection..."
         else:
             # Just copy files without splitting if auto-split disabled
-            for video_file in VIDEOS_TO_SPLIT_PATH.glob("*.mp4"):
-                await self.app.splitter.process_video(video_file, enable_splitting=False)
             msg = "Copying videos without splitting..."
         self.app.tabs["caption_tab"].copy_files_to_training_dir(prompt_prefix)
-        # Start auto-captioning if enabled, and handle async generator properly
         if enable_automatic_content_captioning:
-            # Create a background task for captioning
-            asyncio.create_task(self.app.tabs["caption_tab"]._process_caption_generator(
-                DEFAULT_CAPTIONING_BOT_INSTRUCTIONS,
-                prompt_prefix
-            ))
-        return {
-            "tabs": gr.Tabs(selected="split_tab"),
-            "video_list": videos,
-            "detect_status": msg
-        }
     async def update_titles_after_import(self, enable_splitting, enable_automatic_content_captioning, prompt_prefix):
         """Handle post-import updates including titles"""
-        import_result = await self.on_import_success(enable_splitting, enable_automatic_content_captioning, prompt_prefix)
         titles = self.app.update_titles()
-        return (
-            import_result["tabs"],
-            import_result["video_list"],
-            import_result["detect_status"],
-            *titles
-        )

 import gradio as gr
 import logging
 import asyncio
+import threading
 from pathlib import Path
 from typing import Dict, Any, List, Optional, Tuple
         self.youtube_tab.connect_events()
         self.hub_tab.connect_events()
+    def on_import_success(self, enable_splitting, enable_automatic_content_captioning, prompt_prefix):
         """Handle successful import of files"""
         videos = self.app.tabs["split_tab"].list_unprocessed_videos()
         # If scene detection isn't already running and there are videos to process,
         # and auto-splitting is enabled, start the detection
         if videos and not self.app.splitter.is_processing() and enable_splitting:
+            # Start the scene detection in a separate thread
+            self._start_scene_detection_bg(enable_splitting)
             msg = "Starting automatic scene detection..."
         else:
             # Just copy files without splitting if auto-split disabled
+            self._start_copy_files_bg(enable_splitting)
             msg = "Copying videos without splitting..."
         self.app.tabs["caption_tab"].copy_files_to_training_dir(prompt_prefix)
+        # Start auto-captioning if enabled
         if enable_automatic_content_captioning:
+            self._start_captioning_bg(DEFAULT_CAPTIONING_BOT_INSTRUCTIONS, prompt_prefix)
+        # Return the correct tuple of values as expected by the UI
+        return gr.update(selected="split_tab"), videos, msg
+    def _start_scene_detection_bg(self, enable_splitting):
+        """Start scene detection in a background thread"""
+        def run_async_in_thread():
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+            try:
+                loop.run_until_complete(
+                    self.app.tabs["split_tab"].start_scene_detection(enable_splitting)
+                )
+            except Exception as e:
+                logger.error(f"Error in background scene detection: {str(e)}", exc_info=True)
+            finally:
+                loop.close()
+        thread = threading.Thread(target=run_async_in_thread)
+        thread.daemon = True
+        thread.start()
+    def _start_copy_files_bg(self, enable_splitting):
+        """Start copying files in a background thread"""
+        def run_async_in_thread():
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+            try:
+                async def copy_files():
+                    for video_file in VIDEOS_TO_SPLIT_PATH.glob("*.mp4"):
+                        await self.app.splitter.process_video(video_file, enable_splitting=False)
+                loop.run_until_complete(copy_files())
+            except Exception as e:
+                logger.error(f"Error in background file copying: {str(e)}", exc_info=True)
+            finally:
+                loop.close()
+        thread = threading.Thread(target=run_async_in_thread)
+        thread.daemon = True
+        thread.start()
+    def _start_captioning_bg(self, instructions, prompt_prefix):
+        """Start captioning in a background thread"""
+        def run_async_in_thread():
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+            try:
+                loop.run_until_complete(
+                    self.app.tabs["caption_tab"]._process_caption_generator(
+                        instructions, prompt_prefix
+                    )
+                )
+            except Exception as e:
+                logger.error(f"Error in background captioning: {str(e)}", exc_info=True)
+            finally:
+                loop.close()
+        thread = threading.Thread(target=run_async_in_thread)
+        thread.daemon = True
+        thread.start()
     async def update_titles_after_import(self, enable_splitting, enable_automatic_content_captioning, prompt_prefix):
         """Handle post-import updates including titles"""
+        # Call the non-async version since we need to return immediately for the UI
+        tabs, video_list, detect_status = self.on_import_success(
+            enable_splitting, enable_automatic_content_captioning, prompt_prefix
+        )
+        # Get updated titles
         titles = self.app.update_titles()
+        # Return all expected outputs
+        return tabs, video_list, detect_status, *titles

vms/ui/video_trainer_ui.py CHANGED Viewed

@@ -72,7 +72,33 @@ class VideoTrainerUI:
         # Log recovery status
         logger.info(f"Initialization complete. Recovery status: {self.recovery_status}")
     def create_ui(self):
         """Create the main Gradio UI"""
         with gr.Blocks(title="🎥 Video Model Studio") as app:

         # Log recovery status
         logger.info(f"Initialization complete. Recovery status: {self.recovery_status}")
+    def add_periodic_callback(self, callback_fn, interval=1.0):
+        """Add a periodic callback function to the UI
+        Args:
+            callback_fn: Function to call periodically
+            interval: Time in seconds between calls (default: 1.0)
+        """
+        try:
+            # Store a reference to the callback function
+            if not hasattr(self, "_periodic_callbacks"):
+                self._periodic_callbacks = []
+            self._periodic_callbacks.append(callback_fn)
+            # Add the callback to the Gradio app
+            self.app.add_callback(
+                interval,  # Interval in seconds
+                callback_fn,  # Function to call
+                inputs=None,  # No inputs needed
+                outputs=list(self.components.values())  # All components as possible outputs
+            )
+            logger.info(f"Added periodic callback {callback_fn.__name__} with interval {interval}s")
+        except Exception as e:
+            logger.error(f"Error adding periodic callback: {e}", exc_info=True)
     def create_ui(self):
         """Create the main Gradio UI"""
         with gr.Blocks(title="🎥 Video Model Studio") as app: