avinashHuggingface108 committed on
Commit 5d1f54f · 1 Parent(s): 1fd0637

Optimize: Visual-only mode for HuggingFace Spaces


- Added visual_only_mode parameter to skip Whisper loading
- Saves memory by not loading audio processing models
- Focuses all resources on SmolVLM2 visual analysis
- Should improve performance on limited hardware (2 vCPU, 16GB RAM)
- Uses pure visual scoring for highlights detection (see the usage sketch below)
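
For reference, a minimal sketch (not part of this commit) of how a caller would opt into the new mode, assuming audio_enhanced_highlights_final.py is on the import path:

from audio_enhanced_highlights_final import AudioVisualAnalyzer

analyzer = AudioVisualAnalyzer(
    whisper_model_size="base",  # ignored when visual_only_mode=True
    timeout_seconds=60,
    enable_visual=True,
    visual_only_mode=True,      # skip whisper.load_model() entirely
)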

audio_enhanced_highlights_final.py CHANGED
@@ -43,38 +43,27 @@ logger = logging.getLogger(__name__)
 class AudioVisualAnalyzer:
     """Comprehensive analyzer combining visual and audio analysis"""

-    def __init__(self, whisper_model_size="base", timeout_seconds=120, enable_visual=True):
-        """Initialize with SmolVLM2 and Whisper models"""
-        print("🔧 Initializing Audio-Visual Analyzer...")
+    def __init__(self, whisper_model_size="base", timeout_seconds=90, enable_visual=True, visual_only_mode=False):
+        """Initialize with SmolVLM2 and optionally Whisper models"""
+        print("🔧 Initializing Visual Analyzer...")

         self.enable_visual = enable_visual
+        self.visual_only_mode = visual_only_mode

         # Initialize SmolVLM2 for visual analysis
         if self.enable_visual:
             print("🔥 Loading SmolVLM2...")
             self.vlm_handler = SmolVLM2Handler()
-            # Warm up the model with a simple test
-            print("🔧 Warming up SmolVLM2 model...")
-            try:
-                import tempfile
-                from PIL import Image
-                # Create a small test image
-                test_img = Image.new('RGB', (224, 224), color='black')
-                temp_path = tempfile.mktemp(suffix='.png')
-                test_img.save(temp_path)
-                # Quick warmup inference
-                self.vlm_handler.analyze_image(temp_path, "What do you see?", max_tokens=10)
-                print("✅ SmolVLM2 model warmed up")
-                os.remove(temp_path)
-            except Exception as e:
-                print(f"⚠️ Model warmup failed: {e}")
         else:
-            print("🔇 Visual analysis disabled - audio-only mode")
+            print("🔇 Visual analysis disabled")
             self.vlm_handler = None
         self.timeout_seconds = timeout_seconds

-        # Initialize Whisper for audio analysis
-        if WHISPER_AVAILABLE:
+        # Skip Whisper loading in visual-only mode to save memory/resources
+        if self.visual_only_mode:
+            print("👁️ Visual-only mode enabled - skipping audio processing to optimize performance")
+            self.whisper_model = None
+        elif WHISPER_AVAILABLE:
             print(f"📥 Loading Whisper model ({whisper_model_size})...")
             self.whisper_model = whisper.load_model(whisper_model_size)
             print("✅ Whisper model loaded successfully")
@@ -293,24 +282,31 @@ class AudioVisualAnalyzer:
         # Visual analysis
         visual_analysis = self.analyze_visual_content(temp_frame_path)

-        # Audio analysis
-        audio_files = self.extract_audio_segments(video_path, [segment])
-        audio_analysis = {"text": "", "language": "unknown", "confidence": 0.0}
-
-        if audio_files and audio_files[0]:
-            audio_analysis = self.transcribe_audio_segment(audio_files[0])
-            # Cleanup temporary audio file
-            try:
-                os.unlink(audio_files[0])
-            except:
-                pass
-
-        # Combined scoring
-        combined_score = self.calculate_combined_score(
-            visual_analysis['score'],
-            audio_analysis['text'],
-            audio_analysis['language']
-        )
+        # Skip audio analysis in visual-only mode to save resources
+        if self.visual_only_mode:
+            logger.info("👁️ Visual-only mode: skipping audio analysis")
+            audio_analysis = {"text": "", "language": "unknown", "confidence": 0.0}
+            # Use pure visual score for highlights
+            combined_score = visual_analysis['score']
+        else:
+            # Audio analysis
+            audio_files = self.extract_audio_segments(video_path, [segment])
+            audio_analysis = {"text": "", "language": "unknown", "confidence": 0.0}
+
+            if audio_files and audio_files[0]:
+                audio_analysis = self.transcribe_audio_segment(audio_files[0])
+                # Cleanup temporary audio file
+                try:
+                    os.unlink(audio_files[0])
+                except:
+                    pass
+
+            # Combined scoring
+            combined_score = self.calculate_combined_score(
+                visual_analysis['score'],
+                audio_analysis['text'],
+                audio_analysis['language']
+            )

         return {
             'start_time': start_time,
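
The memory claim is easy to sanity-check. A rough sketch, assuming psutil is installed (it is not a dependency of this repo and is shown for illustration only):

import os
import psutil
from audio_enhanced_highlights_final import AudioVisualAnalyzer

def rss_mb() -> float:
    # Resident set size of the current process, in MiB
    return psutil.Process(os.getpid()).memory_info().rss / (1024 ** 2)

before = rss_mb()
analyzer = AudioVisualAnalyzer(visual_only_mode=True)  # whisper.load_model() is skipped
print(f"RSS grew by ~{rss_mb() - before:.0f} MiB with Whisper skipped")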
highlights_api.py CHANGED
@@ -118,7 +118,7 @@ async def upload_video(
     min_score: float = 3.0,
     max_highlights: int = 3,
     whisper_model: str = "base",
-    timeout: int = 120,
+    timeout: int = 60,
     enable_visual: bool = True
 ):
     """
@@ -240,11 +240,12 @@ async def process_video_highlights(
     active_jobs[job_id]["progress"] = 10
     active_jobs[job_id]["message"] = "Initializing AI models..."

-    # Initialize analyzer
+    # Initialize analyzer in visual-only mode for HuggingFace Spaces optimization
     analyzer = AudioVisualAnalyzer(
         whisper_model_size=whisper_model,
         timeout_seconds=timeout,
-        enable_visual=enable_visual
+        enable_visual=enable_visual,
+        visual_only_mode=True  # Skip audio processing to focus resources on visual analysis
     )

     active_jobs[job_id]["progress"] = 20
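
For completeness, a hypothetical client call against the updated endpoint; the route path and upload field name are not shown in this commit, so both are assumptions:

import requests

with open("clip.mp4", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/upload",  # assumed mount point for upload_video
        files={"file": f},               # assumed UploadFile field name
        params={"min_score": 3.0, "max_highlights": 3, "timeout": 60},
    )
print(resp.json())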