Spaces:

chakkale
/

minicpm-video-analyzer

Paused

App Files Community

chakkale committed on Jul 9

Commit

a4609f7

verified ·

1 Parent(s): 60736be

Upload app.py

Browse files

Files changed (1) hide show

app.py +461 -131

app.py CHANGED Viewed

@@ -1,31 +1,41 @@
 #!/usr/bin/env python3
 """
-MiniCPM-o 2.6 Multimodal Video Analyzer - Hugging Face Spaces Version
-A Gradio interface for comprehensive video + audio analysis using MiniCPM-o 2.6
 MULTIMODAL CAPABILITIES:
 - Video Analysis: Visual content, scenes, objects, actions, composition
 - Audio Analysis: Speech, music, sound effects, ambient audio, transcription
 - Combined Analysis: Synchronized audiovisual understanding and insights
-SHAPE MISMATCH ERROR HANDLING:
-This version includes robust handling for the common shape mismatch error:
-"RuntimeError: shape mismatch: value tensor of shape [1080] cannot be broadcast to indexing result of shape [1044]"
-The error occurs in the vision processing pipeline when there are inconsistencies between:
-- Calculated position embeddings (e.g., 1080 positions)
-- Attention mask dimensions (e.g., 1044 valid positions)
-IMPLEMENTED SOLUTIONS:
-1. Fallback Strategy 1: Reduces max_slice_nums to 1 for simpler processing
-2. Fallback Strategy 2: Re-processes with fewer frames (16 max)
-3. Enhanced Error Messages: Provides actionable troubleshooting advice
-4. Video Diagnostics: Logs resolution and format information
-5. Audio Extraction: Librosa-based audio processing with error handling
 VIDEO COMPATIBILITY:
 - Preserves original video resolution and quality
-- Format: MP4, AVI, MOV, WebM supported
 - Duration: Any length (frames are sampled automatically)
 - Audio: Automatically extracted and analyzed when available
 """
@@ -195,14 +205,106 @@ def load_model():
         print(f"❌ Error loading model: {e}")
         raise e
-def analyze_video(video_file, prompt, max_frames):
-    """Analyze video with audio using MiniCPM-o 2.6 multimodal capabilities"""
     if video_file is None:
         return "❌ Please upload a video file"
-    if not prompt.strip():
-        prompt = "Describe this video in detail, including both visual content and audio"
     try:
         # Load model
@@ -210,7 +312,7 @@ def analyze_video(video_file, prompt, max_frames):
         model, tokenizer = load_model()
         # Process video
-        print(f"Processing video: {video_file}")
         # Add video diagnostics to help identify potential issues
         try:
@@ -221,10 +323,11 @@ def analyze_video(video_file, prompt, max_frames):
                 height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                 fps = cap.get(cv2.CAP_PROP_FPS)
                 frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-                print(f"📹 Video info: {width}x{height}, {fps:.1f}fps, {frame_count} frames")
                 cap.release()
         except:
-            print("📹 Video info: Could not read video metadata")
         # Extract video frames
         frames = encode_video(video_file, max_num_frames=max_frames)
@@ -239,16 +342,206 @@ def analyze_video(video_file, prompt, max_frames):
         # Prepare multimodal content
         content = frames.copy()  # Start with video frames
-        # Add audio description to prompt if audio was found
         if audio_data is not None:
-            enhanced_prompt = f"{prompt}\n\nPlease also analyze the audio content including any speech, music, sound effects, or ambient sounds in the video."
             print(f"🎵 Audio analysis enabled - {len(audio_data)/sample_rate:.1f}s of audio")
         else:
-            enhanced_prompt = f"{prompt}\n\nNote: No audio content detected in this video."
-            print("🔇 Video analysis only - no audio content")
-        content.append(enhanced_prompt)
         # Prepare messages exactly as in sample code
         msgs = [
@@ -260,7 +553,7 @@ def analyze_video(video_file, prompt, max_frames):
         params["use_image_id"] = False
         params["max_slice_nums"] = 1  # Reduced for Spaces memory limits
-        print("🧠 Analyzing video and audio with MiniCPM-o 2.6...")
         # Clear GPU cache before inference
         if torch.cuda.is_available():
@@ -307,12 +600,9 @@ def analyze_video(video_file, prompt, max_frames):
                             frames_reduced = encode_video(video_file, max_num_frames=16)
                             if frames_reduced:
-                                # Prepare reduced content with audio info
                                 content_reduced = frames_reduced.copy()
-                                if audio_data is not None:
-                                    content_reduced.append(f"{prompt}\n\nPlease analyze both video and audio content (audio: {len(audio_data)/sample_rate:.1f}s)")
-                                else:
-                                    content_reduced.append(f"{prompt}\n\nVideo-only analysis (no audio detected)")
                                 msgs_reduced = [
                                     {'role': 'user', 'content': content_reduced},
@@ -336,21 +626,22 @@ def analyze_video(video_file, prompt, max_frames):
                         except Exception as final_error:
                             print(f"❌ All fallback strategies failed: {final_error}")
-                            # Provide helpful error message
                             error_details = f"""
-Shape mismatch error detected. This can happen due to:
 1. Unusual video resolution/aspect ratio
-2. Video compression artifacts
 3. Frame dimension inconsistencies
-Suggested solutions:
-- Try a different video file
-- Ensure video resolution is standard (e.g., 1920x1080, 1280x720)
-- Convert video to a standard format (MP4 with H.264)
 Technical details: {str(inference_error)}
 """
-                            return f"❌ Processing failed after multiple attempts:\n{error_details}"
                 # Try to clear cache and retry once for other errors
                 if torch.cuda.is_available():
@@ -362,27 +653,29 @@ Technical details: {str(inference_error)}
         # Check which attention implementation was actually used
         attention_type = "Flash Attention 2 (Optimized)" if hasattr(model.config, 'attn_implementation') and model.config.attn_implementation == 'flash_attention_2' else "SDPA (Optimized)"
-        # Prepare analysis type info
         if audio_data is not None:
-            analysis_type = f"Video + Audio Analysis ({len(audio_data)/sample_rate:.1f}s audio)"
             media_info = f"**Frames Analyzed:** {len(frames)}  \n**Audio Duration:** {len(audio_data)/sample_rate:.1f} seconds  \n**Sample Rate:** {sample_rate} Hz"
         else:
-            analysis_type = "Video-Only Analysis (no audio detected)"
-            media_info = f"**Frames Analyzed:** {len(frames)}  \n**Audio:** Not detected or unavailable"
-        result = f"""## 🎬 Multimodal Video Analysis Results
 **Processing Time:** {processing_time:.2f} seconds
 {media_info}
-**Model:** MiniCPM-o 2.6
 **Attention:** {attention_type}
 **Analysis Type:** {analysis_type}
-### Analysis:
 {answer}
 ---
-*Powered by MiniCPM-o 2.6 Multimodal AI on Hugging Face Spaces*
 """
         return result
@@ -416,14 +709,14 @@ def get_example_prompts():
 # Create Gradio interface
 def create_interface():
-    """Create the Gradio interface"""
     with gr.Blocks(
-        title="MiniCPM-o 2.6 Video Analyzer",
         theme=gr.themes.Soft(),
         css="""
         .gradio-container {
-            max-width: 900px !important;
         }
         .example-prompt {
             cursor: pointer;
@@ -436,104 +729,139 @@ def create_interface():
         .example-prompt:hover {
             background: #e0e0e0;
         }
         """
     ) as demo:
         gr.Markdown("""
-        # 🎬 MiniCPM-o 2.6 Multimodal Video Analyzer
-        Upload a video and get comprehensive AI-powered analysis using MiniCPM-o 2.6's multimodal capabilities.
-        **Features:**
-        - 🎥 **Video content analysis** - visual scenes, objects, actions
-        - 🎵 **Audio analysis** - speech, music, sound effects, ambient audio
-        - 🖼️ **Frame-by-frame understanding** with temporal context
-        - 📝 **Detailed multimodal descriptions** combining visual and audio elements
-        - 🎨 **Creative and marketing insights** from complete audiovisual content
-        - ⚡ **Flash Attention 2 optimized** for maximum performance
-        - 🔧 **Robust error handling** with automatic fallback strategies
-        **Supported formats:** MP4, AVI, MOV, WebM
-        **Analysis includes:** Visual content + Audio content + Speech transcription
-        **Original quality preserved** - no resizing or compression
-        ⚠️ **Note:** Audio extraction works best with standard video formats. Some videos may require fallback processing.
         """)
         with gr.Row():
-            with gr.Column(scale=1):
-                # Video input
                 video_input = gr.Video(
-                    label="📹 Upload Video",
-                    elem_id="video_input"
                 )
-                # Prompt input
-                prompt_input = gr.Textbox(
-                    label="💬 Analysis Prompt",
-                    placeholder="Describe this video in detail...",
-                    value="Describe this video in detail",
-                    lines=3
-                )
-                # Max frames slider
-                max_frames_slider = gr.Slider(
-                    minimum=8,
-                    maximum=64,
-                    value=32,
-                    step=8,
-                    label="🎞️ Max Frames to Analyze",
-                    info="More frames = more detail but slower processing"
-                )
-                # Analyze button
                 analyze_btn = gr.Button(
-                    "🚀 Analyze Video",
                     variant="primary",
-                    size="lg"
                 )
-                # Example prompts
-                gr.Markdown("### 💡 Example Prompts:")
-                example_prompts = get_example_prompts()
-                with gr.Row():
-                    for i in range(0, len(example_prompts), 2):
-                        with gr.Column():
-                            if i < len(example_prompts):
-                                ex_btn1 = gr.Button(
-                                    example_prompts[i],
-                                    size="sm",
-                                    elem_classes=["example-prompt"]
-                                )
-                                ex_btn1.click(
-                                    lambda x=example_prompts[i]: x,
-                                    outputs=prompt_input
-                                )
-                            if i + 1 < len(example_prompts):
-                                ex_btn2 = gr.Button(
-                                    example_prompts[i + 1],
-                                    size="sm",
-                                    elem_classes=["example-prompt"]
-                                )
-                                ex_btn2.click(
-                                    lambda x=example_prompts[i + 1]: x,
-                                    outputs=prompt_input
-                                )
-            with gr.Column(scale=1):
-                # Results output
                 output_text = gr.Markdown(
                     label="📊 Analysis Results",
-                    value="Upload a video and click 'Analyze Video' to get started!",
                     elem_id="output"
                 )
         # Event handlers
         analyze_btn.click(
-            fn=analyze_video,
-            inputs=[video_input, prompt_input, max_frames_slider],
             outputs=output_text,
             show_progress=True
         )
@@ -541,15 +869,17 @@ def create_interface():
         # Footer
         gr.Markdown("""
         ---
-        ### ℹ️ About
-        This app uses **MiniCPM-o 2.6**, a state-of-the-art multimodal AI model for comprehensive video and audio understanding.
-        - **Model:** [openbmb/MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6)
-        - **Capabilities:** Video analysis + Audio processing + Speech transcription
-        - **Audio Processing:** Powered by librosa for high-quality audio extraction
-        - **GPU:** Optimized for Hugging Face Spaces with SDPA/Flash Attention
-        **Processing includes:** Visual content analysis, audio content analysis, speech-to-text, music/sound identification, and synchronized audiovisual understanding.
         """)
     return demo

 #!/usr/bin/env python3
 """
+Mobile Creative Ad Analyzer - Powered by MiniCPM-o 2.6
+Professional mobile advertising creative analysis with comprehensive 12-point evaluation framework
+COMPREHENSIVE ANALYSIS FRAMEWORK:
+1. Hook Analysis (0-5 second critical window)
+2. Opening Sequence & First Impression
+3. Audiovisual Synchronization
+4. Text & UI Element Analysis
+5. Mobile-First Design Optimization
+6. Target Audience & Demographic Appeal
+7. Game/Product Appeal & Positioning
+8. Conversion & Retention Elements
+9. Creative Production Quality
+10. Performance Optimization Recommendations
+11. Platform-Specific Analysis (iOS/Android/Social)
+12. Competitive & Market Context
 MULTIMODAL CAPABILITIES:
 - Video Analysis: Visual content, scenes, objects, actions, composition
 - Audio Analysis: Speech, music, sound effects, ambient audio, transcription
 - Combined Analysis: Synchronized audiovisual understanding and insights
+MOBILE AD FOCUS:
+- Optimized for mobile game ads, app install campaigns, social media creatives
+- Hook effectiveness assessment for social feeds (Instagram, TikTok, Facebook)
+- Mobile-first design evaluation and thumb-stopping power analysis
+- Conversion optimization and performance improvement recommendations
+SHAPE MISMATCH ERROR HANDLING:
+Robust handling for the common shape mismatch error with fallback strategies
+and specialized error messages for mobile creative formats.
 VIDEO COMPATIBILITY:
 - Preserves original video resolution and quality
+- Format: MP4, AVI, MOV, WebM supported
+- Optimized for mobile creative formats (9:16, 16:9, 1:1)
 - Duration: Any length (frames are sampled automatically)
 - Audio: Automatically extracted and analyzed when available
 """
         print(f"❌ Error loading model: {e}")
         raise e
+def get_comprehensive_mobile_ad_prompt():
+    """Get the comprehensive mobile ad analysis prompt"""
+    return """🎮 COMPREHENSIVE MOBILE CREATIVE AD ANALYSIS
+Please provide a detailed analysis of this mobile advertising creative, examining both visual and audio elements across the following dimensions:
+**1. HOOK ANALYSIS (0-5 Seconds - Critical Window)**
+- Identify the specific hook mechanism used in the opening moments
+- Analyze attention-grabbing visual elements (motion, colors, characters, UI elements)
+- Evaluate audio hooks (music intro, sound effects, voiceover opening, silence/contrast)
+- Assess scroll-stopping power and thumb-stopping effectiveness
+- Rate the hook strength for mobile social feeds (Instagram, TikTok, Facebook)
+**2. OPENING SEQUENCE & FIRST IMPRESSION**
+- Analyze the first-play experience and immediate visual impact
+- Examine character introductions, gameplay preview, or problem presentation
+- Evaluate motion graphics, transitions, and visual flow
+- Assess brand logo placement and timing
+- Review pacing and information hierarchy in opening seconds
+**3. AUDIOVISUAL SYNCHRONIZATION**
+- Analyze how visuals and audio work together to create impact
+- Evaluate music style, tempo, and genre appropriateness for target audience
+- Assess sound effects quality and timing with visual cues
+- Review voiceover clarity, tone, and message delivery
+- Examine audio branding and brand voice consistency
+**4. TEXT & UI ELEMENT ANALYSIS**
+- Evaluate all written text elements (headlines, UI text, CTAs, game titles)
+- Assess readability and legibility on mobile screens
+- Analyze urgency triggers, emotional appeals, and persuasion techniques
+- Review call-to-action button design, placement, and messaging
+- Examine subtitle usage and accessibility considerations
+**5. MOBILE-FIRST DESIGN OPTIMIZATION**
+- Assess effectiveness on small screens (5-7 inch displays)
+- Evaluate thumb accessibility and touch target sizing
+- Analyze social media feed optimization (square, vertical, horizontal formats)
+- Review sound-off viewing compatibility and visual storytelling clarity
+- Assess loading speed implications and file size considerations
+**6. TARGET AUDIENCE & DEMOGRAPHIC APPEAL**
+- Identify primary and secondary target demographics
+- Analyze age group appeal through visual style, music, and messaging
+- Evaluate gender targeting through character design and content themes
+- Assess cultural relevance and localization effectiveness
+- Review psychographic targeting (interests, behaviors, values)
+**7. GAME/PRODUCT APPEAL & POSITIONING**
+- Identify genre and gameplay mechanics showcased
+- Analyze competitive advantages highlighted in the creative
+- Evaluate product demonstration and feature communication
+- Assess value proposition clarity and uniqueness
+- Review progression systems, rewards, or benefits shown
+**8. CONVERSION & RETENTION ELEMENTS**
+- Identify download triggers and install prompts
+- Analyze gameplay teasers and engagement hooks
+- Evaluate social proof elements (ratings, download numbers, testimonials)
+- Assess limited-time offers, bonuses, or incentive messaging
+- Review onboarding hints and tutorial elements
+**9. CREATIVE PRODUCTION QUALITY**
+- Evaluate overall production value and professional polish
+- Analyze animation quality, visual effects, and technical execution
+- Assess audio production quality and mixing
+- Review brand consistency across all creative elements
+- Evaluate creative concept originality and memorability
+**10. PERFORMANCE OPTIMIZATION RECOMMENDATIONS**
+- Provide specific suggestions for improving the first 5-second hook
+- Recommend audiovisual enhancements for better engagement
+- Suggest mobile optimization improvements
+- Propose A/B testing opportunities for key elements
+- Offer conversion rate optimization strategies
+**11. PLATFORM-SPECIFIC ANALYSIS**
+- Evaluate effectiveness for iOS vs Android audiences
+- Analyze App Store vs Google Play creative requirements compliance
+- Assess social platform compatibility (Instagram Stories, TikTok, Facebook Feed)
+- Review programmatic advertising network optimization
+- Consider influencer/creator content adaptation potential
+**12. COMPETITIVE & MARKET CONTEXT**
+- Compare creative approach to category standards and competitors
+- Identify unique differentiation points and market positioning
+- Assess trend alignment and contemporary relevance
+- Evaluate seasonal or cultural timing appropriateness
+- Suggest competitive advantages to emphasize
+Please provide specific, actionable insights for each section with concrete examples from the creative. Include both strengths to leverage and weaknesses to address, with prioritized recommendations for maximum impact improvement."""
+def analyze_video(video_file, max_frames=32):
+    """Analyze mobile creative ad using comprehensive analysis prompt"""
     if video_file is None:
         return "❌ Please upload a video file"
+    # Use the comprehensive mobile ad analysis prompt
+    prompt = get_comprehensive_mobile_ad_prompt()
     try:
         # Load model
         model, tokenizer = load_model()
         # Process video
+        print(f"Processing mobile creative ad: {video_file}")
         # Add video diagnostics to help identify potential issues
         try:
                 height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                 fps = cap.get(cv2.CAP_PROP_FPS)
                 frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+                duration = frame_count / fps if fps > 0 else 0
+                print(f"📹 Creative specs: {width}x{height}, {fps:.1f}fps, {duration:.1f}s duration")
                 cap.release()
         except:
+            print("📹 Creative info: Could not read video metadata")
         # Extract video frames
         frames = encode_video(video_file, max_num_frames=max_frames)
         # Prepare multimodal content
         content = frames.copy()  # Start with video frames
+        content.append(prompt)  # Add the comprehensive analysis prompt
         if audio_data is not None:
             print(f"🎵 Audio analysis enabled - {len(audio_data)/sample_rate:.1f}s of audio")
         else:
+            print("🔇 Visual analysis only - no audio content detected")
+        # Prepare messages exactly as in sample code
+        msgs = [
+            {'role': 'user', 'content': content},
+        ]
+        # Set decode params for video exactly as in sample code
+        params = {}
+        params["use_image_id"] = False
+        params["max_slice_nums"] = 1  # Reduced for Spaces memory limits
+        print("🧠 Analyzing mobile creative ad with MiniCPM-o 2.6...")
+        # Clear GPU cache before inference
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        start_time = time.time()
+        # Generate response using exact method from sample code
+        with torch.no_grad():  # Save memory
+            try:
+                answer = model.chat(
+                    msgs=msgs,
+                    tokenizer=tokenizer,
+                    **params
+                )
+            except Exception as inference_error:
+                print(f"Inference error: {inference_error}")
+                # Check if it's the known shape mismatch error
+                if "shape mismatch" in str(inference_error) and "cannot be broadcast" in str(inference_error):
+                    print("🔧 Detected shape mismatch error - applying fallback strategy...")
+                    try:
+                        # Fallback Strategy 1: Reduce max_slice_nums to 1 for simpler processing
+                        params["max_slice_nums"] = 1
+                        print("📝 Trying with reduced max_slice_nums=1...")
+                        if torch.cuda.is_available():
+                            torch.cuda.empty_cache()
+                        answer = model.chat(
+                            msgs=msgs,
+                            tokenizer=tokenizer,
+                            **params
+                        )
+                        print("✅ Fallback strategy 1 successful!")
+                    except Exception as fallback_error:
+                        print(f"❌ Fallback strategy 1 failed: {fallback_error}")
+                        try:
+                            # Fallback Strategy 2: Re-process video with fewer frames
+                            print("📝 Trying with fewer frames (16 max)...")
+                            frames_reduced = encode_video(video_file, max_num_frames=16)
+                            if frames_reduced:
+                                # Prepare reduced content for mobile ad analysis
+                                content_reduced = frames_reduced.copy()
+                                content_reduced.append(prompt)  # Use the same comprehensive prompt
+                                msgs_reduced = [
+                                    {'role': 'user', 'content': content_reduced},
+                                ]
+                                params["max_slice_nums"] = 1
+                                params["use_image_id"] = False
+                                if torch.cuda.is_available():
+                                    torch.cuda.empty_cache()
+                                answer = model.chat(
+                                    msgs=msgs_reduced,
+                                    tokenizer=tokenizer,
+                                    **params
+                                )
+                                print("✅ Fallback strategy 2 successful with reduced frames!")
+                            else:
+                                raise Exception("Could not process video with reduced frames")
+                        except Exception as final_error:
+                            print(f"❌ All fallback strategies failed: {final_error}")
+                            # Provide helpful error message for mobile ad analysis
+                            error_details = f"""
+Shape mismatch error detected during mobile creative analysis. This can happen due to:
+1. Unusual video resolution/aspect ratio
+2. Video compression artifacts
+3. Frame dimension inconsistencies
+Suggested solutions for mobile creatives:
+- Ensure video is in standard mobile format (9:16, 16:9, 1:1)
+- Use common resolutions (1080x1920, 1920x1080, 1080x1080)
+- Convert to MP4 with H.264 encoding
+- Check if video is corrupted or has unusual codec
+Technical details: {str(inference_error)}
+"""
+                            return f"❌ Mobile creative analysis failed after multiple attempts:\n{error_details}"
+                # Try to clear cache and retry once for other errors
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+                raise inference_error
+        processing_time = time.time() - start_time
+        # Check which attention implementation was actually used
+        attention_type = "Flash Attention 2 (Optimized)" if hasattr(model.config, 'attn_implementation') and model.config.attn_implementation == 'flash_attention_2' else "SDPA (Optimized)"
+        # Prepare analysis type info for mobile ad focus
+        if audio_data is not None:
+            analysis_type = f"Comprehensive Mobile Ad Analysis (Visual + Audio)"
+            media_info = f"**Frames Analyzed:** {len(frames)}  \n**Audio Duration:** {len(audio_data)/sample_rate:.1f} seconds  \n**Sample Rate:** {sample_rate} Hz"
+        else:
+            analysis_type = "Mobile Ad Analysis (Visual Only)"
+            media_info = f"**Frames Analyzed:** {len(frames)}  \n**Audio:** Not detected in creative"
+        result = f"""## 📱 Mobile Creative Ad Analysis Results
+**Processing Time:** {processing_time:.2f} seconds
+{media_info}
+**Model:** MiniCPM-o 2.6 Multimodal AI
+**Attention:** {attention_type}
+**Analysis Type:** {analysis_type}
+---
+### 🎯 Creative Analysis Report:
+{answer}
+---
+*Powered by MiniCPM-o 2.6 - Professional Mobile Creative Analysis on Hugging Face Spaces*
+"""
+        return result
+    except Exception as e:
+        error_msg = f"❌ Error processing video: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
+        print(error_msg)
+        return error_msg
+def analyze_video_with_custom_prompt(video_file, custom_prompt, max_frames=32):
+    """Analyze mobile creative ad using a custom analysis prompt"""
+    if video_file is None:
+        return "❌ Please upload a video file"
+    # Use the provided custom prompt
+    prompt = custom_prompt
+    try:
+        # Load model
+        print("Loading model...")
+        model, tokenizer = load_model()
+        # Process video
+        print(f"Processing mobile creative ad: {video_file}")
+        # Add video diagnostics to help identify potential issues
+        try:
+            import cv2
+            cap = cv2.VideoCapture(video_file)
+            if cap.isOpened():
+                width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+                height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+                fps = cap.get(cv2.CAP_PROP_FPS)
+                frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+                duration = frame_count / fps if fps > 0 else 0
+                print(f"📹 Creative specs: {width}x{height}, {fps:.1f}fps, {duration:.1f}s duration")
+                cap.release()
+        except:
+            print("📹 Creative info: Could not read video metadata")
+        # Extract video frames
+        frames = encode_video(video_file, max_num_frames=max_frames)
+        if not frames:
+            return "❌ Could not extract frames from video"
+        print(f"📸 Extracted {len(frames)} frames")
+        # Extract audio from video
+        audio_data, sample_rate = extract_audio_from_video(video_file)
+        # Prepare multimodal content
+        content = frames.copy()  # Start with video frames
+        content.append(prompt)  # Add the comprehensive analysis prompt
+        if audio_data is not None:
+            print(f"🎵 Audio analysis enabled - {len(audio_data)/sample_rate:.1f}s of audio")
+        else:
+            print("🔇 Visual analysis only - no audio content detected")
         # Prepare messages exactly as in sample code
         msgs = [
         params["use_image_id"] = False
         params["max_slice_nums"] = 1  # Reduced for Spaces memory limits
+        print("🧠 Analyzing mobile creative ad with MiniCPM-o 2.6...")
         # Clear GPU cache before inference
         if torch.cuda.is_available():
                             frames_reduced = encode_video(video_file, max_num_frames=16)
                             if frames_reduced:
+                                # Prepare reduced content for mobile ad analysis
                                 content_reduced = frames_reduced.copy()
+                                content_reduced.append(prompt)  # Use the same comprehensive prompt
                                 msgs_reduced = [
                                     {'role': 'user', 'content': content_reduced},
                         except Exception as final_error:
                             print(f"❌ All fallback strategies failed: {final_error}")
+                            # Provide helpful error message for mobile ad analysis
                             error_details = f"""
+Shape mismatch error detected during mobile creative analysis. This can happen due to:
 1. Unusual video resolution/aspect ratio
+2. Video compression artifacts
 3. Frame dimension inconsistencies
+Suggested solutions for mobile creatives:
+- Ensure video is in standard mobile format (9:16, 16:9, 1:1)
+- Use common resolutions (1080x1920, 1920x1080, 1080x1080)
+- Convert to MP4 with H.264 encoding
+- Check if video is corrupted or has unusual codec
 Technical details: {str(inference_error)}
 """
+                            return f"❌ Mobile creative analysis failed after multiple attempts:\n{error_details}"
                 # Try to clear cache and retry once for other errors
                 if torch.cuda.is_available():
         # Check which attention implementation was actually used
         attention_type = "Flash Attention 2 (Optimized)" if hasattr(model.config, 'attn_implementation') and model.config.attn_implementation == 'flash_attention_2' else "SDPA (Optimized)"
+        # Prepare analysis type info for mobile ad focus
         if audio_data is not None:
+            analysis_type = f"Comprehensive Mobile Ad Analysis (Visual + Audio)"
             media_info = f"**Frames Analyzed:** {len(frames)}  \n**Audio Duration:** {len(audio_data)/sample_rate:.1f} seconds  \n**Sample Rate:** {sample_rate} Hz"
         else:
+            analysis_type = "Mobile Ad Analysis (Visual Only)"
+            media_info = f"**Frames Analyzed:** {len(frames)}  \n**Audio:** Not detected in creative"
+        result = f"""## 📱 Mobile Creative Ad Analysis Results
 **Processing Time:** {processing_time:.2f} seconds
 {media_info}
+**Model:** MiniCPM-o 2.6 Multimodal AI
 **Attention:** {attention_type}
 **Analysis Type:** {analysis_type}
+---
+### 🎯 Creative Analysis Report:
 {answer}
 ---
+*Powered by MiniCPM-o 2.6 - Professional Mobile Creative Analysis on Hugging Face Spaces*
 """
         return result
 # Create Gradio interface
 # NOTE(review): this listing is a diff rendering, not plain Python: the first
 # column of every line is a diff marker ("+" = added, " " = unchanged), and
 # unchanged lines between hunks appear to be elided. Confirm against the full
 # file before editing.
 def create_interface():
+    """Create the mobile creative ad analysis interface"""
     # Builds a gr.Blocks layout with one row of two columns: the first holds the
     # video input, the analyze button and a collapsed "Advanced Settings"
     # accordion; the second holds the Markdown results pane. The analyze button
     # is wired to analyze_with_options (defined below) and the Blocks object is
     # returned to the caller; nothing in this function launches it.
     with gr.Blocks(
+        title="Mobile Creative Ad Analyzer - MiniCPM-o 2.6",
         theme=gr.themes.Soft(),
         # Custom CSS. The .example-prompt and .advanced-section classes are the
         # ones attached further down via elem_classes.
         # NOTE(review): as shown here the .example-prompt rule has no closing
         # brace before .example-prompt:hover - probably lines elided by the diff
         # view rather than a real CSS error; confirm against the full file.
         css="""
         .gradio-container {
+            max-width: 1000px !important;
         }
         .example-prompt {
             cursor: pointer;
         .example-prompt:hover {
             background: #e0e0e0;
         }
+        .advanced-section {
+            background: #f8f9fa;
+            padding: 15px;
+            border-radius: 8px;
+            margin-top: 10px;
+        }
         """
     ) as demo:
         # Header / introduction text shown at the top of the page.
         gr.Markdown("""
+        # 📱 Mobile Creative Ad Analyzer
+        ### Powered by MiniCPM-o 2.6 Multimodal AI
+        **Professional mobile advertising creative analysis** with comprehensive insights across 12 key dimensions including hook analysis, audiovisual sync, mobile optimization, and conversion elements.
+        **🎯 Analysis includes:**
+        - **Hook Analysis** (0-5 second critical window)
+        - **Mobile-First Design** optimization assessment
+        - **Audiovisual Synchronization** evaluation
+        - **Target Audience** demographic analysis
+        - **Conversion Elements** and performance optimization
+        - **Platform-Specific** recommendations (iOS/Android/Social)
+        **📱 Optimized for:** Mobile game ads, app install campaigns, social media creatives, programmatic display ads
+        **🎵 Supports:** Video + Audio analysis for complete creative assessment
         """)
         # Input column uses scale=2 and the results column scale=3, so the
         # results pane is given the larger share of the row.
         with gr.Row():
+            with gr.Column(scale=2):
+                # Video input - main focus
                 video_input = gr.Video(
+                    label="📱 Upload Mobile Creative Ad",
+                    elem_id="video_input",
+                    height=400
                 )
+                # Main analyze button
                 analyze_btn = gr.Button(
+                    "🚀 Analyze Mobile Creative",
                     variant="primary",
+                    size="lg",
+                    scale=2
                 )
+                # Advanced Settings Section
                 # Collapsed by default (open=False); holds the frame-count
                 # slider, the optional custom prompt and the example prompts.
+                with gr.Accordion("⚙️ Advanced Settings", open=False):
+                    with gr.Group(elem_classes=["advanced-section"]):
+                        gr.Markdown("### 🔧 Processing Options")
+                        # Max frames slider (moved from main interface)
                         # Its value is the max_frames argument handed to
                         # analyze_with_options via analyze_btn.click below.
+                        max_frames_slider = gr.Slider(
+                            minimum=8,
+                            maximum=64,
+                            value=32,
+                            step=8,
+                            label="🎞️ Max Frames to Analyze",
+                            info="More frames = more detail but slower processing. 32 is optimal for most mobile ads."
+                        )
+                        # Custom prompt option (for advanced users)
                         # An empty / whitespace-only value makes
                         # analyze_with_options fall back to analyze_video.
+                        with gr.Accordion("📝 Custom Analysis Prompt (Advanced)", open=False):
+                            custom_prompt_input = gr.Textbox(
+                                label="Custom Analysis Prompt",
+                                placeholder="Enter custom analysis instructions (leave empty to use comprehensive mobile ad analysis)",
+                                lines=3,
+                                value=""
+                            )
+                            gr.Markdown("*Leave empty to use the comprehensive 12-point mobile creative analysis prompt*")
+                        # Example prompts (moved from main interface)
+                        with gr.Accordion("💡 Alternative Analysis Prompts", open=False):
+                            gr.Markdown("### Example Analysis Focuses:")
                             # get_example_prompts() is defined outside this
                             # function. The code below takes len(), indexes and
                             # slices its items, so it presumably returns a
                             # sequence of strings - TODO confirm.
+                            example_prompts = get_example_prompts()
+                            with gr.Row():
                                 # i steps by 2 over at most the first 6 prompts:
                                 # one column per pair, up to two buttons each.
+                                for i in range(0, min(6, len(example_prompts)), 2):
+                                    with gr.Column():
                                         # This check is always true given the
                                         # range bound above.
+                                        if i < len(example_prompts):
                                             # Button label is the prompt cut to
                                             # 50 characters plus "..." when the
                                             # prompt is longer than that.
+                                            ex_btn1 = gr.Button(
+                                                example_prompts[i][:50] + "..." if len(example_prompts[i]) > 50 else example_prompts[i],
+                                                size="sm",
+                                                elem_classes=["example-prompt"]
+                                            )
                                             # Clicking writes the full
                                             # (untruncated) prompt into the
                                             # custom-prompt textbox. The default
                                             # argument binds the prompt when the
                                             # lambda is created, so each button
                                             # keeps its own prompt despite the
                                             # shared loop variable.
+                                            ex_btn1.click(
+                                                lambda x=example_prompts[i]: x,
+                                                outputs=custom_prompt_input
+                                            )
                                         # Second button of the pair; skipped
                                         # when the list has an odd length and
                                         # i is its last index.
+                                        if i + 1 < len(example_prompts):
+                                            ex_btn2 = gr.Button(
+                                                example_prompts[i + 1][:50] + "..." if len(example_prompts[i + 1]) > 50 else example_prompts[i + 1],
+                                                size="sm",
+                                                elem_classes=["example-prompt"]
+                                            )
+                                            ex_btn2.click(
+                                                lambda x=example_prompts[i + 1]: x,
+                                                outputs=custom_prompt_input
+                                            )
+            with gr.Column(scale=3):
+                # Results output - larger space for comprehensive analysis
                 # The initial value is placeholder help text; it is the output
                 # component of analyze_btn.click below.
                 output_text = gr.Markdown(
                     label="📊 Analysis Results",
+                    value="""### 🎯 Mobile Creative Analysis Ready!
+Upload your mobile advertising creative (video ad) and click **"Analyze Mobile Creative"** to receive a comprehensive professional analysis covering:
+✅ **Hook effectiveness** (critical first 5 seconds)
+✅ **Mobile optimization** for small screens
+✅ **Audio-visual impact** and synchronization
+✅ **Target audience appeal** assessment
+✅ **Conversion optimization** recommendations
+✅ **Platform-specific** insights (iOS/Android/Social)
+**Supports:** MP4, AVI, MOV, WebM formats with automatic audio extraction for complete analysis.
+                    """,
                     elem_id="output"
                 )
+        # Modified event handler to use custom prompt if provided
         # Parameter order matches the inputs list given to analyze_btn.click.
         # Both analysis functions called here are defined outside this function.
         # NOTE(review): assumes the updated analyze_video accepts
         # (video_file, max_frames) - confirm against its current signature.
+        def analyze_with_options(video_file, max_frames, custom_prompt):
+            if custom_prompt and custom_prompt.strip():
+                # Use custom prompt if provided
+                return analyze_video_with_custom_prompt(video_file, custom_prompt.strip(), max_frames)
+            else:
+                # Use default comprehensive mobile ad analysis
+                return analyze_video(video_file, max_frames)
         # Event handlers
         # The handler's return value is rendered in output_text.
         analyze_btn.click(
+            fn=analyze_with_options,
+            inputs=[video_input, max_frames_slider, custom_prompt_input],
             outputs=output_text,
             show_progress=True
         )
         # Footer
         gr.Markdown("""
         ---
+        ### ℹ️ About Mobile Creative Ad Analyzer
+        This professional mobile advertising analysis tool uses **MiniCPM-o 2.6**, a state-of-the-art multimodal AI model for comprehensive creative assessment.
+        - **Model:** [openbmb/MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6) - Advanced multimodal AI
+        - **Analysis Framework:** 12-point comprehensive mobile creative evaluation
+        - **Capabilities:** Visual analysis + Audio processing + Performance optimization insights
+        - **Optimization:** Flash Attention 2 / SDPA for maximum processing efficiency
+        **🎯 Perfect for:** Mobile game ads, app install campaigns, social media creatives, programmatic display, influencer content, and any mobile-first advertising creative.
+        **📊 Analysis Dimensions:** Hook effectiveness, mobile optimization, target audience appeal, audiovisual sync, conversion elements, platform compliance, competitive positioning, and actionable optimization recommendations.
         """)
     # Hand the assembled (not yet launched) Blocks app back to the caller.
     return demo