smolvlm2-video-highlights2

Sleeping

avinashHuggingface108 committed on Sep 19

Commit

5d1f54f

1 Parent(s): 1fd0637

Optimize: Visual-only mode for HuggingFace Spaces

- Added visual_only_mode parameter to skip Whisper loading
- Saves memory by not loading audio processing models
- Focus all resources on SmolVLM2 visual analysis
- Should improve performance on limited hardware (2 vCPU, 16GB RAM)
- Uses pure visual scoring for highlight detection

Files changed (2) hide show

audio_enhanced_highlights_final.py +35 -39
highlights_api.py +4 -3

audio_enhanced_highlights_final.py CHANGED Viewed

@@ -43,38 +43,27 @@ logger = logging.getLogger(__name__)
 class AudioVisualAnalyzer:
     """Comprehensive analyzer combining visual and audio analysis"""
-    def __init__(self, whisper_model_size="base", timeout_seconds=120, enable_visual=True):
-        """Initialize with SmolVLM2 and Whisper models"""
-        print("🔧 Initializing Audio-Visual Analyzer...")
         self.enable_visual = enable_visual
         # Initialize SmolVLM2 for visual analysis
         if self.enable_visual:
             print("🔥 Loading SmolVLM2...")
             self.vlm_handler = SmolVLM2Handler()
-            # Warm up the model with a simple test
-            print("🔧 Warming up SmolVLM2 model...")
-            try:
-                import tempfile
-                from PIL import Image
-                # Create a small test image
-                test_img = Image.new('RGB', (224, 224), color='black')
-                temp_path = tempfile.mktemp(suffix='.png')
-                test_img.save(temp_path)
-                # Quick warmup inference
-                self.vlm_handler.analyze_image(temp_path, "What do you see?", max_tokens=10)
-                print("✅ SmolVLM2 model warmed up")
-                os.remove(temp_path)
-            except Exception as e:
-                print(f"⚠️ Model warmup failed: {e}")
         else:
-            print("🔇 Visual analysis disabled - audio-only mode")
             self.vlm_handler = None
         self.timeout_seconds = timeout_seconds
-        # Initialize Whisper for audio analysis
-        if WHISPER_AVAILABLE:
             print(f"📥 Loading Whisper model ({whisper_model_size})...")
             self.whisper_model = whisper.load_model(whisper_model_size)
             print("✅ Whisper model loaded successfully")
@@ -293,24 +282,31 @@ class AudioVisualAnalyzer:
         # Visual analysis
         visual_analysis = self.analyze_visual_content(temp_frame_path)
-        # Audio analysis
-        audio_files = self.extract_audio_segments(video_path, [segment])
-        audio_analysis = {"text": "", "language": "unknown", "confidence": 0.0}
-        if audio_files and audio_files[0]:
-            audio_analysis = self.transcribe_audio_segment(audio_files[0])
-            # Cleanup temporary audio file
-            try:
-                os.unlink(audio_files[0])
-            except:
-                pass
-        # Combined scoring
-        combined_score = self.calculate_combined_score(
-            visual_analysis['score'],
-            audio_analysis['text'],
-            audio_analysis['language']
-        )
         return {
             'start_time': start_time,

 class AudioVisualAnalyzer:
     """Comprehensive analyzer combining visual and audio analysis"""
+    def __init__(self, whisper_model_size="base", timeout_seconds=90, enable_visual=True, visual_only_mode=False):
+        """Initialize with SmolVLM2 and optionally Whisper models"""
+        print("🔧 Initializing Visual Analyzer...")
         self.enable_visual = enable_visual
+        self.visual_only_mode = visual_only_mode
         # Initialize SmolVLM2 for visual analysis
         if self.enable_visual:
             print("🔥 Loading SmolVLM2...")
             self.vlm_handler = SmolVLM2Handler()
         else:
+            print("🔇 Visual analysis disabled")
             self.vlm_handler = None
         self.timeout_seconds = timeout_seconds
+        # Skip Whisper loading in visual-only mode to save memory/resources
+        if self.visual_only_mode:
+            print("👁️ Visual-only mode enabled - skipping audio processing to optimize performance")
+            self.whisper_model = None
+        elif WHISPER_AVAILABLE:
             print(f"📥 Loading Whisper model ({whisper_model_size})...")
             self.whisper_model = whisper.load_model(whisper_model_size)
             print("✅ Whisper model loaded successfully")
         # Visual analysis
         visual_analysis = self.analyze_visual_content(temp_frame_path)
+        # Skip audio analysis in visual-only mode to save resources
+        if self.visual_only_mode:
+            logger.info("👁️ Visual-only mode: skipping audio analysis")
+            audio_analysis = {"text": "", "language": "unknown", "confidence": 0.0}
+            # Use pure visual score for highlights
+            combined_score = visual_analysis['score']
+        else:
+            # Audio analysis
+            audio_files = self.extract_audio_segments(video_path, [segment])
+            audio_analysis = {"text": "", "language": "unknown", "confidence": 0.0}
+            if audio_files and audio_files[0]:
+                audio_analysis = self.transcribe_audio_segment(audio_files[0])
+                # Cleanup temporary audio file
+                try:
+                    os.unlink(audio_files[0])
+                except:
+                    pass
+            # Combined scoring
+            combined_score = self.calculate_combined_score(
+                visual_analysis['score'],
+                audio_analysis['text'],
+                audio_analysis['language']
+            )
         return {
             'start_time': start_time,

highlights_api.py CHANGED Viewed

@@ -118,7 +118,7 @@ async def upload_video(
     min_score: float = 3.0,
     max_highlights: int = 3,
     whisper_model: str = "base",
-    timeout: int = 120,
     enable_visual: bool = True
 ):
     """
@@ -240,11 +240,12 @@ async def process_video_highlights(
         active_jobs[job_id]["progress"] = 10
         active_jobs[job_id]["message"] = "Initializing AI models..."
-        # Initialize analyzer
         analyzer = AudioVisualAnalyzer(
             whisper_model_size=whisper_model,
             timeout_seconds=timeout,
-            enable_visual=enable_visual
         )
         active_jobs[job_id]["progress"] = 20

     min_score: float = 3.0,
     max_highlights: int = 3,
     whisper_model: str = "base",
+    timeout: int = 60,
     enable_visual: bool = True
 ):
     """
         active_jobs[job_id]["progress"] = 10
         active_jobs[job_id]["message"] = "Initializing AI models..."
+        # Initialize analyzer in visual-only mode for HuggingFace Spaces optimization
         analyzer = AudioVisualAnalyzer(
             whisper_model_size=whisper_model,
             timeout_seconds=timeout,
+            enable_visual=enable_visual,
+            visual_only_mode=True  # Skip audio processing to focus resources on visual analysis
         )
         active_jobs[job_id]["progress"] = 20