Commit 7ef6739 · Parent: 56d688e

Update deployment to use SmolVLM2-256M-Video-Instruct model

- Switch default model from 2.2B-Instruct to 256M-Video-Instruct for faster processing
- Update all model references in app.py, README.md, and processing scripts
- Fix with_effects parameter handling in huggingface_exact_approach.py
- Add effects support with fade transitions and fallback to basic concatenation
- Update documentation to reflect faster processing with video-specialized model

Files changed:
- README.md +4 -4
- app.py +4 -4
- huggingface_exact_approach.py +83 -7
- huggingface_segment_highlights.py +2 -2
- src/smolvlm2_handler.py +1 -1
README.md CHANGED

@@ -19,7 +19,7 @@ This is a FastAPI service that uses HuggingFace's proven segment-based classific
 
 - **Segment-Based Analysis**: Processes videos in fixed 5-second segments for consistent AI classification
 - **Dual Criteria Generation**: Creates two different highlight criteria sets and selects the most selective one
-- **SmolVLM2-2.2B-Instruct**: …
+- **SmolVLM2-256M-Video-Instruct**: Faster processing with specialized video understanding
 - **Visual Effects**: Optional fade transitions between segments for professional-quality output
 - **REST API**: Upload videos and download processed highlights with job tracking
 - **Background Processing**: Non-blocking video processing with real-time status updates

@@ -39,7 +39,7 @@ This is a FastAPI service that uses HuggingFace's proven segment-based classific
 curl -X POST \
   -F "video=@your_video.mp4" \
   -F "segment_length=5.0" \
-  -F "model_name=HuggingFaceTB/SmolVLM2-2.2B-Instruct" \
+  -F "model_name=HuggingFaceTB/SmolVLM2-256M-Video-Instruct" \
   -F "with_effects=true" \
   https://your-space-url.hf.space/upload-video

@@ -58,13 +58,13 @@ Use the provided Android client code to integrate with your mobile app.
 
 Default settings:
 - **Segment Length**: 5 seconds (fixed segments for consistent classification)
-- **Model**: SmolVLM2-2.2B-Instruct (…)
+- **Model**: SmolVLM2-256M-Video-Instruct (faster processing)
 - **Effects**: Enabled (fade transitions between segments)
 - **Dual Criteria**: Two prompt variations for robust selection
 
 ## 🛠️ Technology Stack
 
-- **SmolVLM2-2.2B-Instruct**: …
+- **SmolVLM2-256M-Video-Instruct**: Efficient vision-language model optimized for video understanding
 - **HuggingFace Transformers**: Latest transformer models and inference
 - **FastAPI**: Modern web framework for APIs
 - **FFmpeg**: Video processing with advanced filter support
app.py CHANGED

@@ -64,7 +64,7 @@ app.add_middleware(
 # Request/Response models
 class AnalysisRequest(BaseModel):
     segment_length: float = 5.0
-    model_name: str = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+    model_name: str = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
     with_effects: bool = True
 
 class AnalysisResponse(BaseModel):

@@ -102,7 +102,7 @@ async def read_root():
         "message": "SmolVLM2 Optimized HuggingFace Video Highlights API",
         "version": "3.0.0",
         "approach": "Optimized HuggingFace exact approach with STRICT prompting",
-        "model": "SmolVLM2-2.2B-Instruct…",
+        "model": "SmolVLM2-256M-Video-Instruct (faster processing)",
         "improvements": [
             "STRICT system prompting for selectivity",
             "Structured YES/NO user prompts",

@@ -120,7 +120,7 @@ async def read_root():
 @app.get("/health")
 async def health_check():
     """Health check endpoint"""
-    return {"status": "healthy", "model": "SmolVLM2-2.2B-Instruct"}
+    return {"status": "healthy", "model": "SmolVLM2-256M-Video-Instruct"}
 
 async def process_video_background(job_id: str, video_path: str, output_path: str,
                                    segment_length: float, model_name: str, with_effects: bool):

@@ -191,7 +191,7 @@ async def upload_video(
     background_tasks: BackgroundTasks,
     video: UploadFile = File(...),
     segment_length: float = 5.0,
-    model_name: str = "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
+    model_name: str = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct",
     with_effects: bool = True
 ):
     """
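For a quick smoke test of the endpoints this commit touches, here is a minimal Python client sketch. The Space URL is a placeholder and the upload response schema is not shown in this diff, so treat it as an assumption-laden example rather than part of the repository:

import requests

BASE = "https://your-space-url.hf.space"  # placeholder Space URL

# /health now reports the 256M video model
print(requests.get(f"{BASE}/health").json())

# Upload a video, spelling out the new default model explicitly
# (mirrors the curl example in README.md)
with open("your_video.mp4", "rb") as f:
    resp = requests.post(
        f"{BASE}/upload-video",
        files={"video": f},
        data={
            "segment_length": "5.0",
            "model_name": "HuggingFaceTB/SmolVLM2-256M-Video-Instruct",
            "with_effects": "true",
        },
    )
print(resp.status_code, resp.json())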
huggingface_exact_approach.py CHANGED

@@ -203,13 +203,21 @@ class VideoHighlightDetector:
         self,
         video_path: str,
         scene_times: list,
-        output_path: str
+        output_path: str,
+        with_effects: bool = True
     ):
-        """Concatenate selected scenes into final video."""
+        """Concatenate selected scenes into final video with optional effects."""
         if not scene_times:
             logger.warning("No scenes to concatenate, skipping.")
             return
-
+
+        if with_effects:
+            self._concatenate_with_effects(video_path, scene_times, output_path)
+        else:
+            self._concatenate_basic(video_path, scene_times, output_path)
+
+    def _concatenate_basic(self, video_path: str, scene_times: list, output_path: str):
+        """Basic concatenation without effects."""
         filter_complex_parts = []
         concat_inputs = []
         for i, (start_sec, end_sec) in enumerate(scene_times):

@@ -239,14 +247,82 @@ class VideoHighlightDetector:
         ]
 
         logger.info(f"Running ffmpeg command: {' '.join(cmd)}")
-        subprocess.run(cmd, check=True)
+        subprocess.run(cmd, check=True, capture_output=True, text=True)
+
+    def _concatenate_with_effects(self, video_path: str, scene_times: list, output_path: str):
+        """Concatenate with fade effects between segments."""
+        if len(scene_times) == 1:
+            # Single segment - just extract with fade in/out
+            start_sec, end_sec = scene_times[0]
+            duration = end_sec - start_sec
+            fade_duration = min(0.5, duration / 4)  # 0.5s or 25% of duration, whichever is shorter
+
+            cmd = [
+                "ffmpeg", "-y",
+                "-i", video_path,
+                "-ss", str(start_sec),
+                "-t", str(duration),
+                "-vf", f"fade=in:0:{int(fade_duration*30)},fade=out:{int((duration-fade_duration)*30)}:{int(fade_duration*30)}",
+                "-af", f"afade=in:st=0:d={fade_duration},afade=out:st={duration-fade_duration}:d={fade_duration}",
+                "-c:v", "libx264", "-c:a", "aac",
+                output_path
+            ]
+        else:
+            # Multiple segments - create with crossfade transitions
+            filter_parts = []
+            audio_parts = []
+
+            for i, (start_sec, end_sec) in enumerate(scene_times):
+                duration = end_sec - start_sec
+                fade_duration = min(0.3, duration / 6)  # Shorter fades for multiple segments
+
+                # Video with fade
+                filter_parts.append(
+                    f"[0:v]trim=start={start_sec}:end={end_sec},setpts=PTS-STARTPTS,"
+                    f"fade=in:0:{int(fade_duration*30)},fade=out:{int((duration-fade_duration)*30)}:{int(fade_duration*30)}[v{i}]"
+                )
+
+                # Audio with fade
+                audio_parts.append(
+                    f"[0:a]atrim=start={start_sec}:end={end_sec},asetpts=PTS-STARTPTS,"
+                    f"afade=in:st=0:d={fade_duration},afade=out:st={duration-fade_duration}:d={fade_duration}[a{i}]"
+                )
+
+            # Concatenate all segments
+            video_concat = "".join([f"[v{i}]" for i in range(len(scene_times))])
+            audio_concat = "".join([f"[a{i}]" for i in range(len(scene_times))])
+
+            filter_complex = (
+                ";".join(filter_parts) + ";" +
+                ";".join(audio_parts) + ";" +
+                f"{video_concat}concat=n={len(scene_times)}:v=1:a=0[outv];" +
+                f"{audio_concat}concat=n={len(scene_times)}:v=0:a=1[outa]"
+            )
+
+            cmd = [
+                "ffmpeg", "-y",
+                "-i", video_path,
+                "-filter_complex", filter_complex,
+                "-map", "[outv]", "-map", "[outa]",
+                "-c:v", "libx264", "-c:a", "aac",
+                output_path
+            ]
+
+        logger.info(f"Running ffmpeg command with effects: {' '.join(cmd)}")
+        result = subprocess.run(cmd, capture_output=True, text=True)
+        if result.returncode != 0:
+            logger.error(f"FFmpeg error: {result.stderr}")
+            # Fall back to basic concatenation
+            logger.info("Falling back to basic concatenation...")
+            self._concatenate_basic(video_path, scene_times, output_path)
 
-    def process_video(self, video_path: str, output_path: str, segment_length: float = 10.0) -> Dict:
+    def process_video(self, video_path: str, output_path: str, segment_length: float = 10.0, with_effects: bool = True) -> Dict:
         """Process video using exact HuggingFace approach."""
         print("🚀 Starting HuggingFace Exact Video Highlight Detection")
         print(f"📁 Input: {video_path}")
         print(f"📁 Output: {output_path}")
         print(f"⏱️ Segment Length: {segment_length}s")
+        print(f"🎨 With Effects: {with_effects}")
         print()
 
         # Get video duration

@@ -358,7 +434,7 @@ class VideoHighlightDetector:
 
         # Step 4: Create final video
         print(f"🎬 Step 4: Creating final highlights video...")
-        self._concatenate_scenes(video_path, final_segments, output_path)
+        self._concatenate_scenes(video_path, final_segments, output_path, with_effects)
 
         print("✅ Highlights video created successfully!")
         print(f"🎉 SUCCESS! Created highlights with {len(final_segments)} segments")

@@ -387,7 +463,7 @@ def main():
     parser.add_argument('--output', required=True, help='Path to output highlights video')
     parser.add_argument('--save-analysis', action='store_true', help='Save analysis results to JSON')
     parser.add_argument('--segment-length', type=float, default=10.0, help='Length of each segment in seconds (default: 10.0)')
-    parser.add_argument('--model', default='HuggingFaceTB/SmolVLM2-2.2B-Instruct', help='SmolVLM2 model to use')
+    parser.add_argument('--model', default='HuggingFaceTB/SmolVLM2-256M-Video-Instruct', help='SmolVLM2 model to use')
 
     args = parser.parse_args()
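One caveat in the new _concatenate_with_effects: the video fades convert seconds to frames with int(fade_duration*30), a hard-coded 30 fps assumption, so fade lengths drift on footage at other frame rates. Here is a standalone sketch that rebuilds the same filter_complex string for two made-up segments:

# Rebuilds the filter_complex exactly as the new method does,
# for two hypothetical segments; note the hard-coded 30 fps in the frame math.
scene_times = [(10.0, 15.0), (42.0, 47.0)]
filter_parts, audio_parts = [], []
for i, (start_sec, end_sec) in enumerate(scene_times):
    duration = end_sec - start_sec
    fade_duration = min(0.3, duration / 6)
    filter_parts.append(
        f"[0:v]trim=start={start_sec}:end={end_sec},setpts=PTS-STARTPTS,"
        f"fade=in:0:{int(fade_duration*30)},fade=out:{int((duration-fade_duration)*30)}:{int(fade_duration*30)}[v{i}]"
    )
    audio_parts.append(
        f"[0:a]atrim=start={start_sec}:end={end_sec},asetpts=PTS-STARTPTS,"
        f"afade=in:st=0:d={fade_duration},afade=out:st={duration-fade_duration}:d={fade_duration}[a{i}]"
    )
video_concat = "".join(f"[v{i}]" for i in range(len(scene_times)))
audio_concat = "".join(f"[a{i}]" for i in range(len(scene_times)))
filter_complex = (
    ";".join(filter_parts) + ";" + ";".join(audio_parts) + ";" +
    f"{video_concat}concat=n={len(scene_times)}:v=1:a=0[outv];" +
    f"{audio_concat}concat=n={len(scene_times)}:v=0:a=1[outa]"
)
print(filter_complex)

Probing the real frame rate (for example with ffprobe) before computing the frame counts would make the video fades exact; the audio fades already use seconds (st=/d=) and are unaffected.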
huggingface_segment_highlights.py CHANGED

@@ -34,7 +34,7 @@ class HuggingFaceVideoHighlightDetector:
     Uses fixed-length segments for consistent AI classification
     """
 
-    def __init__(self, model_name: str = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"):
+    def __init__(self, model_name: str = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"):
         """Initialize with SmolVLM2 model - 2.2B provides much better reasoning than 256M"""
         print(f"🔥 Loading {model_name} for HuggingFace Segment-Based Analysis...")
         self.vlm_handler = SmolVLM2Handler(model_name=model_name)

@@ -501,7 +501,7 @@ def main():
     parser.add_argument('--output', required=True, help='Path to output highlights video')
     parser.add_argument('--save-analysis', action='store_true', help='Save analysis results to JSON')
     parser.add_argument('--segment-length', type=float, default=5.0, help='Length of each segment in seconds (default: 5.0)')
-    parser.add_argument('--model', default='HuggingFaceTB/SmolVLM2-2.2B-Instruct', help='SmolVLM2 model to use')
+    parser.add_argument('--model', default='HuggingFaceTB/SmolVLM2-256M-Video-Instruct', help='SmolVLM2 model to use')
     parser.add_argument('--effects', action='store_true', default=True, help='Enable beautiful effects & transitions (default: True)')
     parser.add_argument('--no-effects', action='store_true', help='Disable effects - basic concatenation only')
 
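A note on the unchanged flag pair at the bottom of this hunk: because --effects is declared with action='store_true' and default=True, it is always True and can never be switched off by itself; --no-effects is the only operative toggle. A tiny argparse sketch demonstrating the behavior:

import argparse

# Mirrors the two flags from main(): --effects is a no-op,
# so only --no-effects actually changes the parsed result.
parser = argparse.ArgumentParser()
parser.add_argument('--effects', action='store_true', default=True)
parser.add_argument('--no-effects', action='store_true')

print(parser.parse_args([]))                # Namespace(effects=True, no_effects=False)
print(parser.parse_args(['--no-effects']))  # Namespace(effects=True, no_effects=True)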
src/smolvlm2_handler.py CHANGED

@@ -39,7 +39,7 @@ warnings.filterwarnings("ignore", category=UserWarning)
 class SmolVLM2Handler:
     """Handler for SmolVLM2 model operations"""
 
-    def __init__(self, model_name: str = "HuggingFaceTB/SmolVLM2-2.2B-Instruct", device: str = "auto"):
+    def __init__(self, model_name: str = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct", device: str = "auto"):
         """
         Initialize SmolVLM2 model (2.2B version - better reasoning capabilities)
 
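With this default in place, constructing the handler with no arguments now selects the 256M video model. A minimal usage sketch, assuming the package layout makes src.smolvlm2_handler importable:

from src.smolvlm2_handler import SmolVLM2Handler

# Default now resolves to the 256M video model
handler = SmolVLM2Handler()

# Or pin the model and device explicitly, matching the new signature
handler = SmolVLM2Handler(
    model_name="HuggingFaceTB/SmolVLM2-256M-Video-Instruct",
    device="auto",
)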