Commit 7ef6739 · Parent: 56d688e

Update deployment to use SmolVLM2-256M-Video-Instruct model

- Switch default model from 2.2B-Instruct to 256M-Video-Instruct for faster processing
- Update all model references in app.py, README.md, and processing scripts
- Fix with_effects parameter handling in huggingface_exact_approach.py
- Add effects support with fade transitions and fallback to basic concatenation
- Update documentation to reflect faster processing with video-specialized model

Files changed:
- README.md +4 -4
- app.py +4 -4
- huggingface_exact_approach.py +83 -7
- huggingface_segment_highlights.py +2 -2
- src/smolvlm2_handler.py +1 -1
README.md CHANGED

@@ -19,7 +19,7 @@ This is a FastAPI service that uses HuggingFace's proven segment-based classific
 
 - **Segment-Based Analysis**: Processes videos in fixed 5-second segments for consistent AI classification
 - **Dual Criteria Generation**: Creates two different highlight criteria sets and selects the most selective one
-- **SmolVLM2-2.2B-Instruct**: …
+- **SmolVLM2-256M-Video-Instruct**: Faster processing with specialized video understanding
 - **Visual Effects**: Optional fade transitions between segments for professional-quality output
 - **REST API**: Upload videos and download processed highlights with job tracking
 - **Background Processing**: Non-blocking video processing with real-time status updates

@@ -39,7 +39,7 @@ This is a FastAPI service that uses HuggingFace's proven segment-based classific
 curl -X POST \
   -F "video=@your_video.mp4" \
   -F "segment_length=5.0" \
-  -F "model_name=HuggingFaceTB/SmolVLM2-2.2B-Instruct" \
+  -F "model_name=HuggingFaceTB/SmolVLM2-256M-Video-Instruct" \
   -F "with_effects=true" \
   https://your-space-url.hf.space/upload-video

@@ -58,13 +58,13 @@ Use the provided Android client code to integrate with your mobile app.
 
 Default settings:
 - **Segment Length**: 5 seconds (fixed segments for consistent classification)
-- **Model**: SmolVLM2-2.2B-Instruct (…)
+- **Model**: SmolVLM2-256M-Video-Instruct (faster processing)
 - **Effects**: Enabled (fade transitions between segments)
 - **Dual Criteria**: Two prompt variations for robust selection
 
 ## 🛠️ Technology Stack
 
-- **SmolVLM2-2.2B-Instruct**: …
+- **SmolVLM2-256M-Video-Instruct**: Efficient vision-language model optimized for video understanding
 - **HuggingFace Transformers**: Latest transformer models and inference
 - **FastAPI**: Modern web framework for APIs
 - **FFmpeg**: Video processing with advanced filter support
app.py CHANGED

@@ -64,7 +64,7 @@ app.add_middleware(
 # Request/Response models
 class AnalysisRequest(BaseModel):
     segment_length: float = 5.0
-    model_name: str = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+    model_name: str = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
     with_effects: bool = True
 
 class AnalysisResponse(BaseModel):

@@ -102,7 +102,7 @@ async def read_root():
         "message": "SmolVLM2 Optimized HuggingFace Video Highlights API",
         "version": "3.0.0",
         "approach": "Optimized HuggingFace exact approach with STRICT prompting",
-        "model": "SmolVLM2-2.2B-Instruct…",
+        "model": "SmolVLM2-256M-Video-Instruct (faster processing)",
         "improvements": [
             "STRICT system prompting for selectivity",
             "Structured YES/NO user prompts",

@@ -120,7 +120,7 @@ async def read_root():
 @app.get("/health")
 async def health_check():
     """Health check endpoint"""
-    return {"status": "healthy", "model": "SmolVLM2-2.2B-Instruct"}
+    return {"status": "healthy", "model": "SmolVLM2-256M-Video-Instruct"}
 
 async def process_video_background(job_id: str, video_path: str, output_path: str,
                                    segment_length: float, model_name: str, with_effects: bool):

@@ -191,7 +191,7 @@ async def upload_video(
     background_tasks: BackgroundTasks,
     video: UploadFile = File(...),
     segment_length: float = 5.0,
-    model_name: str = "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
+    model_name: str = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct",
     with_effects: bool = True
 ):
     """
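For a quick smoke test of the endpoints this commit touches, here is a minimal Python client sketch. The Space URL is a placeholder and the upload response schema is not shown in this diff, so treat it as an assumption-laden example rather than part of the repository:

import requests

BASE = "https://your-space-url.hf.space"  # placeholder Space URL

# /health now reports the 256M video model
print(requests.get(f"{BASE}/health").json())

# Upload a video, spelling out the new default model explicitly
# (mirrors the curl example in README.md)
with open("your_video.mp4", "rb") as f:
    resp = requests.post(
        f"{BASE}/upload-video",
        files={"video": f},
        data={
            "segment_length": "5.0",
            "model_name": "HuggingFaceTB/SmolVLM2-256M-Video-Instruct",
            "with_effects": "true",
        },
    )
print(resp.status_code, resp.json())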
huggingface_exact_approach.py CHANGED

@@ -203,13 +203,21 @@ class VideoHighlightDetector:
         self,
         video_path: str,
         scene_times: list,
-        output_path: str
+        output_path: str,
+        with_effects: bool = True
     ):
-        """Concatenate selected scenes into final video."""
+        """Concatenate selected scenes into final video with optional effects."""
         if not scene_times:
             logger.warning("No scenes to concatenate, skipping.")
             return
-
+
+        if with_effects:
+            self._concatenate_with_effects(video_path, scene_times, output_path)
+        else:
+            self._concatenate_basic(video_path, scene_times, output_path)
+
+    def _concatenate_basic(self, video_path: str, scene_times: list, output_path: str):
+        """Basic concatenation without effects."""
         filter_complex_parts = []
         concat_inputs = []
         for i, (start_sec, end_sec) in enumerate(scene_times):

@@ -239,14 +247,82 @@ class VideoHighlightDetector:
         ]
 
         logger.info(f"Running ffmpeg command: {' '.join(cmd)}")
-        subprocess.run(cmd, check=True)
+        subprocess.run(cmd, check=True, capture_output=True, text=True)
+
+    def _concatenate_with_effects(self, video_path: str, scene_times: list, output_path: str):
+        """Concatenate with fade effects between segments."""
+        if len(scene_times) == 1:
+            # Single segment - just extract with fade in/out
+            start_sec, end_sec = scene_times[0]
+            duration = end_sec - start_sec
+            fade_duration = min(0.5, duration / 4)  # 0.5s or 25% of duration, whichever is shorter
+
+            cmd = [
+                "ffmpeg", "-y",
+                "-i", video_path,
+                "-ss", str(start_sec),
+                "-t", str(duration),
+                "-vf", f"fade=in:0:{int(fade_duration*30)},fade=out:{int((duration-fade_duration)*30)}:{int(fade_duration*30)}",
+                "-af", f"afade=in:st=0:d={fade_duration},afade=out:st={duration-fade_duration}:d={fade_duration}",
+                "-c:v", "libx264", "-c:a", "aac",
+                output_path
+            ]
+        else:
+            # Multiple segments - create with crossfade transitions
+            filter_parts = []
+            audio_parts = []
+
+            for i, (start_sec, end_sec) in enumerate(scene_times):
+                duration = end_sec - start_sec
+                fade_duration = min(0.3, duration / 6)  # Shorter fades for multiple segments
+
+                # Video with fade
+                filter_parts.append(
+                    f"[0:v]trim=start={start_sec}:end={end_sec},setpts=PTS-STARTPTS,"
+                    f"fade=in:0:{int(fade_duration*30)},fade=out:{int((duration-fade_duration)*30)}:{int(fade_duration*30)}[v{i}]"
+                )
+
+                # Audio with fade
+                audio_parts.append(
+                    f"[0:a]atrim=start={start_sec}:end={end_sec},asetpts=PTS-STARTPTS,"
+                    f"afade=in:st=0:d={fade_duration},afade=out:st={duration-fade_duration}:d={fade_duration}[a{i}]"
+                )
+
+            # Concatenate all segments
+            video_concat = "".join([f"[v{i}]" for i in range(len(scene_times))])
+            audio_concat = "".join([f"[a{i}]" for i in range(len(scene_times))])
+
+            filter_complex = (
+                ";".join(filter_parts) + ";" +
+                ";".join(audio_parts) + ";" +
+                f"{video_concat}concat=n={len(scene_times)}:v=1:a=0[outv];" +
+                f"{audio_concat}concat=n={len(scene_times)}:v=0:a=1[outa]"
+            )
+
+            cmd = [
+                "ffmpeg", "-y",
+                "-i", video_path,
+                "-filter_complex", filter_complex,
+                "-map", "[outv]", "-map", "[outa]",
+                "-c:v", "libx264", "-c:a", "aac",
+                output_path
+            ]
+
+        logger.info(f"Running ffmpeg command with effects: {' '.join(cmd)}")
+        result = subprocess.run(cmd, capture_output=True, text=True)
+        if result.returncode != 0:
+            logger.error(f"FFmpeg error: {result.stderr}")
+            # Fall back to basic concatenation
+            logger.info("Falling back to basic concatenation...")
+            self._concatenate_basic(video_path, scene_times, output_path)
 
-    def process_video(self, video_path: str, output_path: str, segment_length: float = 10.0) -> Dict:
+    def process_video(self, video_path: str, output_path: str, segment_length: float = 10.0, with_effects: bool = True) -> Dict:
         """Process video using exact HuggingFace approach."""
         print("🚀 Starting HuggingFace Exact Video Highlight Detection")
         print(f"📁 Input: {video_path}")
         print(f"📁 Output: {output_path}")
         print(f"⏱️ Segment Length: {segment_length}s")
+        print(f"🎨 With Effects: {with_effects}")
         print()
 
         # Get video duration

@@ -358,7 +434,7 @@ class VideoHighlightDetector:
 
         # Step 4: Create final video
         print(f"🎬 Step 4: Creating final highlights video...")
-        self._concatenate_scenes(video_path, final_segments, output_path)
+        self._concatenate_scenes(video_path, final_segments, output_path, with_effects)
 
         print("✅ Highlights video created successfully!")
         print(f"🎉 SUCCESS! Created highlights with {len(final_segments)} segments")

@@ -387,7 +463,7 @@ def main():
     parser.add_argument('--output', required=True, help='Path to output highlights video')
     parser.add_argument('--save-analysis', action='store_true', help='Save analysis results to JSON')
     parser.add_argument('--segment-length', type=float, default=10.0, help='Length of each segment in seconds (default: 10.0)')
-    parser.add_argument('--model', default='HuggingFaceTB/SmolVLM2-2.2B-Instruct', help='SmolVLM2 model to use')
+    parser.add_argument('--model', default='HuggingFaceTB/SmolVLM2-256M-Video-Instruct', help='SmolVLM2 model to use')
 
     args = parser.parse_args()
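One caveat in the new _concatenate_with_effects: the video fades convert seconds to frames with int(fade_duration*30), a hard-coded 30 fps assumption, so fade lengths drift on footage at other frame rates. Here is a standalone sketch that rebuilds the same filter_complex string for two made-up segments:

# Rebuilds the filter_complex exactly as the new method does,
# for two hypothetical segments; note the hard-coded 30 fps in the frame math.
scene_times = [(10.0, 15.0), (42.0, 47.0)]
filter_parts, audio_parts = [], []
for i, (start_sec, end_sec) in enumerate(scene_times):
    duration = end_sec - start_sec
    fade_duration = min(0.3, duration / 6)
    filter_parts.append(
        f"[0:v]trim=start={start_sec}:end={end_sec},setpts=PTS-STARTPTS,"
        f"fade=in:0:{int(fade_duration*30)},fade=out:{int((duration-fade_duration)*30)}:{int(fade_duration*30)}[v{i}]"
    )
    audio_parts.append(
        f"[0:a]atrim=start={start_sec}:end={end_sec},asetpts=PTS-STARTPTS,"
        f"afade=in:st=0:d={fade_duration},afade=out:st={duration-fade_duration}:d={fade_duration}[a{i}]"
    )
video_concat = "".join(f"[v{i}]" for i in range(len(scene_times)))
audio_concat = "".join(f"[a{i}]" for i in range(len(scene_times)))
filter_complex = (
    ";".join(filter_parts) + ";" + ";".join(audio_parts) + ";" +
    f"{video_concat}concat=n={len(scene_times)}:v=1:a=0[outv];" +
    f"{audio_concat}concat=n={len(scene_times)}:v=0:a=1[outa]"
)
print(filter_complex)

Probing the real frame rate (for example with ffprobe) before computing the frame counts would make the video fades exact; the audio fades already use seconds (st=/d=) and are unaffected.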
huggingface_segment_highlights.py CHANGED

@@ -34,7 +34,7 @@ class HuggingFaceVideoHighlightDetector:
     Uses fixed-length segments for consistent AI classification
     """
 
-    def __init__(self, model_name: str = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"):
+    def __init__(self, model_name: str = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"):
         """Initialize with SmolVLM2 model - 2.2B provides much better reasoning than 256M"""
         print(f"🔥 Loading {model_name} for HuggingFace Segment-Based Analysis...")
         self.vlm_handler = SmolVLM2Handler(model_name=model_name)

@@ -501,7 +501,7 @@ def main():
     parser.add_argument('--output', required=True, help='Path to output highlights video')
     parser.add_argument('--save-analysis', action='store_true', help='Save analysis results to JSON')
     parser.add_argument('--segment-length', type=float, default=5.0, help='Length of each segment in seconds (default: 5.0)')
-    parser.add_argument('--model', default='HuggingFaceTB/SmolVLM2-2.2B-Instruct', help='SmolVLM2 model to use')
+    parser.add_argument('--model', default='HuggingFaceTB/SmolVLM2-256M-Video-Instruct', help='SmolVLM2 model to use')
     parser.add_argument('--effects', action='store_true', default=True, help='Enable beautiful effects & transitions (default: True)')
     parser.add_argument('--no-effects', action='store_true', help='Disable effects - basic concatenation only')
 
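A note on the unchanged flag pair at the bottom of this hunk: because --effects is declared with action='store_true' and default=True, it is always True and can never be switched off by itself; --no-effects is the only operative toggle. A tiny argparse sketch demonstrating the behavior:

import argparse

# Mirrors the two flags from main(): --effects is a no-op,
# so only --no-effects actually changes the parsed result.
parser = argparse.ArgumentParser()
parser.add_argument('--effects', action='store_true', default=True)
parser.add_argument('--no-effects', action='store_true')

print(parser.parse_args([]))                # Namespace(effects=True, no_effects=False)
print(parser.parse_args(['--no-effects']))  # Namespace(effects=True, no_effects=True)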
src/smolvlm2_handler.py CHANGED

@@ -39,7 +39,7 @@ warnings.filterwarnings("ignore", category=UserWarning)
 class SmolVLM2Handler:
     """Handler for SmolVLM2 model operations"""
 
-    def __init__(self, model_name: str = "HuggingFaceTB/SmolVLM2-2.2B-Instruct", device: str = "auto"):
+    def __init__(self, model_name: str = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct", device: str = "auto"):
         """
         Initialize SmolVLM2 model (2.2B version - better reasoning capabilities)
 
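With this default in place, constructing the handler with no arguments now selects the 256M video model. A minimal usage sketch, assuming the package layout makes src.smolvlm2_handler importable:

from src.smolvlm2_handler import SmolVLM2Handler

# Default now resolves to the 256M video model
handler = SmolVLM2Handler()

# Or pin the model and device explicitly, matching the new signature
handler = SmolVLM2Handler(
    model_name="HuggingFaceTB/SmolVLM2-256M-Video-Instruct",
    device="auto",
)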