avinashHuggingface108 committed
Commit 7ef6739 · 1 Parent(s): 56d688e

Update deployment to use SmolVLM2-256M-Video-Instruct model


- Switch default model from 2.2B-Instruct to 256M-Video-Instruct for faster processing
- Update all model references in app.py, README.md, and processing scripts
- Fix with_effects parameter handling in huggingface_exact_approach.py
- Add effects support with fade transitions and a fallback to basic concatenation when ffmpeg fails
- Update documentation to reflect faster processing with video-specialized model
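
For reference, a minimal client sketch against the `/health` and `/upload-video` endpoints that appear in app.py and the README below; the base URL is a placeholder, and the `requests` calls are an illustration rather than part of this commit:

```python
# Minimal client sketch -- assumes the /health and /upload-video endpoints
# shown in app.py and the README; the Space URL is a placeholder.
import requests

BASE_URL = "https://your-space-url.hf.space"

# The health check reports the active model
print(requests.get(f"{BASE_URL}/health").json())
# -> {"status": "healthy", "model": "SmolVLM2-256M-Video-Instruct"}

# Upload a video; model_name can be omitted now that 256M-Video-Instruct is the default
with open("your_video.mp4", "rb") as f:
    resp = requests.post(
        f"{BASE_URL}/upload-video",
        files={"video": f},
        data={
            "segment_length": "5.0",
            "model_name": "HuggingFaceTB/SmolVLM2-256M-Video-Instruct",
            "with_effects": "true",
        },
    )
print(resp.json())
```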

README.md CHANGED
@@ -19,7 +19,7 @@ This is a FastAPI service that uses HuggingFace's proven segment-based classific
 
 - **Segment-Based Analysis**: Processes videos in fixed 5-second segments for consistent AI classification
 - **Dual Criteria Generation**: Creates two different highlight criteria sets and selects the most selective one
-- **SmolVLM2-2.2B-Instruct**: Better reasoning capabilities for reliable segment classification
+- **SmolVLM2-256M-Video-Instruct**: Faster processing with specialized video understanding
 - **Visual Effects**: Optional fade transitions between segments for professional-quality output
 - **REST API**: Upload videos and download processed highlights with job tracking
 - **Background Processing**: Non-blocking video processing with real-time status updates
@@ -39,7 +39,7 @@ This is a FastAPI service that uses HuggingFace's proven segment-based classific
 curl -X POST \
   -F "video=@your_video.mp4" \
   -F "segment_length=5.0" \
-  -F "model_name=HuggingFaceTB/SmolVLM2-2.2B-Instruct" \
+  -F "model_name=HuggingFaceTB/SmolVLM2-256M-Video-Instruct" \
   -F "with_effects=true" \
   https://your-space-url.hf.space/upload-video
 
@@ -58,13 +58,13 @@ Use the provided Android client code to integrate with your mobile app.
 
 Default settings:
 - **Segment Length**: 5 seconds (fixed segments for consistent classification)
-- **Model**: SmolVLM2-2.2B-Instruct (better reasoning capabilities)
+- **Model**: SmolVLM2-256M-Video-Instruct (faster processing)
 - **Effects**: Enabled (fade transitions between segments)
 - **Dual Criteria**: Two prompt variations for robust selection
 
 ## 🛠️ Technology Stack
 
-- **SmolVLM2-2.2B-Instruct**: Advanced vision-language model with superior reasoning for complex tasks
+- **SmolVLM2-256M-Video-Instruct**: Efficient vision-language model optimized for video understanding
 - **HuggingFace Transformers**: Latest transformer models and inference
 - **FastAPI**: Modern web framework for APIs
 - **FFmpeg**: Video processing with advanced filter support
app.py CHANGED
@@ -64,7 +64,7 @@ app.add_middleware(
 # Request/Response models
 class AnalysisRequest(BaseModel):
     segment_length: float = 5.0
-    model_name: str = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+    model_name: str = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
     with_effects: bool = True
 
 class AnalysisResponse(BaseModel):
@@ -102,7 +102,7 @@ async def read_root():
         "message": "SmolVLM2 Optimized HuggingFace Video Highlights API",
         "version": "3.0.0",
         "approach": "Optimized HuggingFace exact approach with STRICT prompting",
-        "model": "SmolVLM2-2.2B-Instruct (67% success rate)",
+        "model": "SmolVLM2-256M-Video-Instruct (faster processing)",
         "improvements": [
             "STRICT system prompting for selectivity",
             "Structured YES/NO user prompts",
@@ -120,7 +120,7 @@ async def read_root():
 @app.get("/health")
 async def health_check():
     """Health check endpoint"""
-    return {"status": "healthy", "model": "SmolVLM2-2.2B-Instruct"}
+    return {"status": "healthy", "model": "SmolVLM2-256M-Video-Instruct"}
 
 async def process_video_background(job_id: str, video_path: str, output_path: str,
                                    segment_length: float, model_name: str, with_effects: bool):
@@ -191,7 +191,7 @@ async def upload_video(
     background_tasks: BackgroundTasks,
     video: UploadFile = File(...),
     segment_length: float = 5.0,
-    model_name: str = "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
+    model_name: str = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct",
     with_effects: bool = True
 ):
     """
huggingface_exact_approach.py CHANGED
@@ -203,13 +203,21 @@ class VideoHighlightDetector:
         self,
         video_path: str,
         scene_times: list,
-        output_path: str
+        output_path: str,
+        with_effects: bool = True
     ):
-        """Concatenate selected scenes into final video."""
+        """Concatenate selected scenes into final video with optional effects."""
         if not scene_times:
             logger.warning("No scenes to concatenate, skipping.")
             return
-
+
+        if with_effects:
+            self._concatenate_with_effects(video_path, scene_times, output_path)
+        else:
+            self._concatenate_basic(video_path, scene_times, output_path)
+
+    def _concatenate_basic(self, video_path: str, scene_times: list, output_path: str):
+        """Basic concatenation without effects."""
         filter_complex_parts = []
         concat_inputs = []
         for i, (start_sec, end_sec) in enumerate(scene_times):
@@ -239,14 +247,82 @@ class VideoHighlightDetector:
         ]
 
         logger.info(f"Running ffmpeg command: {' '.join(cmd)}")
-        subprocess.run(cmd, check=True)
+        subprocess.run(cmd, check=True, capture_output=True, text=True)
+
+    def _concatenate_with_effects(self, video_path: str, scene_times: list, output_path: str):
+        """Concatenate with fade effects between segments."""
+        if len(scene_times) == 1:
+            # Single segment - just extract with fade in/out
+            start_sec, end_sec = scene_times[0]
+            duration = end_sec - start_sec
+            fade_duration = min(0.5, duration / 4)  # 0.5s or 25% of duration, whichever is shorter
+
+            cmd = [
+                "ffmpeg", "-y",
+                "-i", video_path,
+                "-ss", str(start_sec),
+                "-t", str(duration),
+                "-vf", f"fade=in:0:{int(fade_duration*30)},fade=out:{int((duration-fade_duration)*30)}:{int(fade_duration*30)}",
+                "-af", f"afade=in:st=0:d={fade_duration},afade=out:st={duration-fade_duration}:d={fade_duration}",
+                "-c:v", "libx264", "-c:a", "aac",
+                output_path
+            ]
+        else:
+            # Multiple segments - create with crossfade transitions
+            filter_parts = []
+            audio_parts = []
+
+            for i, (start_sec, end_sec) in enumerate(scene_times):
+                duration = end_sec - start_sec
+                fade_duration = min(0.3, duration / 6)  # Shorter fades for multiple segments
+
+                # Video with fade
+                filter_parts.append(
+                    f"[0:v]trim=start={start_sec}:end={end_sec},setpts=PTS-STARTPTS,"
+                    f"fade=in:0:{int(fade_duration*30)},fade=out:{int((duration-fade_duration)*30)}:{int(fade_duration*30)}[v{i}]"
+                )
+
+                # Audio with fade
+                audio_parts.append(
+                    f"[0:a]atrim=start={start_sec}:end={end_sec},asetpts=PTS-STARTPTS,"
+                    f"afade=in:st=0:d={fade_duration},afade=out:st={duration-fade_duration}:d={fade_duration}[a{i}]"
+                )
+
+            # Concatenate all segments
+            video_concat = "".join([f"[v{i}]" for i in range(len(scene_times))])
+            audio_concat = "".join([f"[a{i}]" for i in range(len(scene_times))])
+
+            filter_complex = (
+                ";".join(filter_parts) + ";" +
+                ";".join(audio_parts) + ";" +
+                f"{video_concat}concat=n={len(scene_times)}:v=1:a=0[outv];" +
+                f"{audio_concat}concat=n={len(scene_times)}:v=0:a=1[outa]"
+            )
+
+            cmd = [
+                "ffmpeg", "-y",
+                "-i", video_path,
+                "-filter_complex", filter_complex,
+                "-map", "[outv]", "-map", "[outa]",
+                "-c:v", "libx264", "-c:a", "aac",
+                output_path
+            ]
+
+        logger.info(f"Running ffmpeg command with effects: {' '.join(cmd)}")
+        result = subprocess.run(cmd, capture_output=True, text=True)
+        if result.returncode != 0:
+            logger.error(f"FFmpeg error: {result.stderr}")
+            # Fall back to basic concatenation
+            logger.info("Falling back to basic concatenation...")
+            self._concatenate_basic(video_path, scene_times, output_path)
 
-    def process_video(self, video_path: str, output_path: str, segment_length: float = 10.0) -> Dict:
+    def process_video(self, video_path: str, output_path: str, segment_length: float = 10.0, with_effects: bool = True) -> Dict:
         """Process video using exact HuggingFace approach."""
         print("🚀 Starting HuggingFace Exact Video Highlight Detection")
         print(f"📁 Input: {video_path}")
         print(f"📁 Output: {output_path}")
         print(f"⏱️ Segment Length: {segment_length}s")
+        print(f"🎨 With Effects: {with_effects}")
         print()
 
         # Get video duration
@@ -358,7 +434,7 @@ class VideoHighlightDetector:
 
         # Step 4: Create final video
         print(f"🎬 Step 4: Creating final highlights video...")
-        self._concatenate_scenes(video_path, final_segments, output_path)
+        self._concatenate_scenes(video_path, final_segments, output_path, with_effects)
 
         print("✅ Highlights video created successfully!")
         print(f"🎉 SUCCESS! Created highlights with {len(final_segments)} segments")
@@ -387,7 +463,7 @@ def main():
     parser.add_argument('--output', required=True, help='Path to output highlights video')
     parser.add_argument('--save-analysis', action='store_true', help='Save analysis results to JSON')
    parser.add_argument('--segment-length', type=float, default=10.0, help='Length of each segment in seconds (default: 10.0)')
-    parser.add_argument('--model', default='HuggingFaceTB/SmolVLM2-2.2B-Instruct', help='SmolVLM2 model to use')
+    parser.add_argument('--model', default='HuggingFaceTB/SmolVLM2-256M-Video-Instruct', help='SmolVLM2 model to use')
 
     args = parser.parse_args()
 
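Note that the fade offsets in the new `_concatenate_with_effects` are computed as `int(seconds * 30)`, so the frame-based video fades assume roughly 30 fps input (the audio fades use seconds and are frame-rate independent). A standalone sketch that rebuilds the same multi-segment filtergraph so it can be inspected without running ffmpeg:

```python
# Standalone sketch mirroring the filtergraph built in _concatenate_with_effects
# above; useful for eyeballing the filter_complex string before ffmpeg runs.
def build_effects_filter(scene_times: list) -> str:
    filter_parts, audio_parts = [], []
    for i, (start_sec, end_sec) in enumerate(scene_times):
        duration = end_sec - start_sec
        fade_duration = min(0.3, duration / 6)  # shorter fades for multiple segments
        # Frame-based video fade: int(seconds * 30) assumes ~30 fps input
        filter_parts.append(
            f"[0:v]trim=start={start_sec}:end={end_sec},setpts=PTS-STARTPTS,"
            f"fade=in:0:{int(fade_duration*30)},fade=out:{int((duration-fade_duration)*30)}:{int(fade_duration*30)}[v{i}]"
        )
        # Second-based audio fade
        audio_parts.append(
            f"[0:a]atrim=start={start_sec}:end={end_sec},asetpts=PTS-STARTPTS,"
            f"afade=in:st=0:d={fade_duration},afade=out:st={duration-fade_duration}:d={fade_duration}[a{i}]"
        )
    n = len(scene_times)
    video_concat = "".join(f"[v{i}]" for i in range(n))
    audio_concat = "".join(f"[a{i}]" for i in range(n))
    return (
        ";".join(filter_parts) + ";" + ";".join(audio_parts) + ";"
        + f"{video_concat}concat=n={n}:v=1:a=0[outv];"
        + f"{audio_concat}concat=n={n}:v=0:a=1[outa]"
    )

print(build_effects_filter([(0.0, 5.0), (12.0, 17.0)]))
```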
huggingface_segment_highlights.py CHANGED
@@ -34,7 +34,7 @@ class HuggingFaceVideoHighlightDetector:
     Uses fixed-length segments for consistent AI classification
     """
 
-    def __init__(self, model_name: str = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"):
-        """Initialize with SmolVLM2 model - 2.2B provides much better reasoning than 256M"""
+    def __init__(self, model_name: str = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"):
+        """Initialize with SmolVLM2 model (default: 256M-Video-Instruct for faster processing)"""
         print(f"🔥 Loading {model_name} for HuggingFace Segment-Based Analysis...")
         self.vlm_handler = SmolVLM2Handler(model_name=model_name)
@@ -501,7 +501,7 @@ def main():
     parser.add_argument('--output', required=True, help='Path to output highlights video')
     parser.add_argument('--save-analysis', action='store_true', help='Save analysis results to JSON')
     parser.add_argument('--segment-length', type=float, default=5.0, help='Length of each segment in seconds (default: 5.0)')
-    parser.add_argument('--model', default='HuggingFaceTB/SmolVLM2-2.2B-Instruct', help='SmolVLM2 model to use')
+    parser.add_argument('--model', default='HuggingFaceTB/SmolVLM2-256M-Video-Instruct', help='SmolVLM2 model to use')
     parser.add_argument('--effects', action='store_true', default=True, help='Enable beautiful effects & transitions (default: True)')
     parser.add_argument('--no-effects', action='store_true', help='Disable effects - basic concatenation only')
 
src/smolvlm2_handler.py CHANGED
@@ -39,7 +39,7 @@ warnings.filterwarnings("ignore", category=UserWarning)
 class SmolVLM2Handler:
     """Handler for SmolVLM2 model operations"""
 
-    def __init__(self, model_name: str = "HuggingFaceTB/SmolVLM2-2.2B-Instruct", device: str = "auto"):
+    def __init__(self, model_name: str = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct", device: str = "auto"):
         """
-        Initialize SmolVLM2 model (2.2B version - better reasoning capabilities)
+        Initialize SmolVLM2 model (256M-Video-Instruct version - faster, video-specialized)
 
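
With the new default in place, existing call sites pick up the 256M model automatically. A minimal sketch, assuming `SmolVLM2Handler` is importable from src/smolvlm2_handler.py:

```python
# Minimal sketch -- assumes SmolVLM2Handler is importable from src/smolvlm2_handler.py
from src.smolvlm2_handler import SmolVLM2Handler

handler = SmolVLM2Handler()  # now defaults to SmolVLM2-256M-Video-Instruct

# The 2.2B model remains available when reasoning quality matters more than speed
handler_2b = SmolVLM2Handler(model_name="HuggingFaceTB/SmolVLM2-2.2B-Instruct")
```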