# Spaces:
#
# AverageAiLiker
# /
#
# bot-tks1p3jy
#
# Running
#
# File size: 12,313 Bytes
#
# 3ab16a2

import base64
import io
import os
import time
import uuid

import gradio as gr
import numpy as np
import spaces
import torch
from diffusers import DiffusionPipeline
from PIL import Image

# Model configuration: the repository id that load_model() hands to
# DiffusionPipeline.from_pretrained().
MODEL_ID = "hpcai-tech/Open-Sora-v2"

# Initialize the pipeline
@spaces.GPU(duration=1500)
def load_model():
    """Build the Open-Sora-v2 diffusion pipeline and move it to CUDA.

    Returns:
        The configured pipeline, or ``None`` if any step of loading raised
        (the error is printed instead of propagated).
    """
    try:
        load_options = {
            "torch_dtype": torch.float16,
            "variant": "fp16",
            "use_safetensors": True,
        }
        pipeline = DiffusionPipeline.from_pretrained(MODEL_ID, **load_options)
        pipeline.to("cuda")
        # Turn on attention slicing (the memory-efficient attention mode).
        pipeline.enable_attention_slicing()
    except Exception as err:
        print(f"Error loading model: {err}")
        return None
    return pipeline

# Global model variable: starts as None, is filled in by initialize_model()
# with whatever load_model() returns, and is called by generate_video().
model = None

def initialize_model():
    """Load the pipeline into the global ``model`` if it is not there yet.

    Returns:
        ``True`` when ``model`` holds a pipeline after this call, ``False``
        when loading failed and ``model`` is still ``None`` (in which case
        the next call tries again).
    """
    global model
    if model is not None:
        return True
    model = load_model()
    return model is not None

@spaces.GPU(duration=120)
def generate_video(
    prompt: str,
    duration: int = 4,
    height: int = 720,
    width: int = 1280,
    num_inference_steps: int = 50,
    guidance_scale: float = 7.5,
    progress=gr.Progress()
) -> str:
    """
    Generate a video from a text prompt using Open-Sora-v2.

    Args:
        prompt: Text description of the video; must not be blank.
        duration: Duration in seconds (frames are produced at 30 fps).
        height: Video height
        width: Video width
        num_inference_steps: Number of denoising steps
        guidance_scale: Guidance scale for generation
        progress: Gradio progress tracker, updated at each stage.

    Returns:
        Path to the file that was actually written: normally the ``.mp4``,
        or the sibling ``.gif`` when that is what ``save_video`` produced.

    Raises:
        gr.Error: If the prompt is blank, the model cannot be initialised,
            generation fails, or no output file was written.
    """
    if not prompt or not prompt.strip():
        raise gr.Error("Video generation failed: prompt must not be empty")

    # Single source of truth for the frame rate used both to size the clip
    # and to encode it.
    fps = 30

    try:
        # Initialize model if not already done
        if not initialize_model():
            raise RuntimeError("Failed to initialize model")

        progress(0.1, desc="Initializing generation...")

        # UI widgets may deliver numbers as floats; the frame count and the
        # frame size have to be whole numbers.
        num_frames = int(duration * fps)

        progress(0.2, desc="Starting video generation...")

        # The generator is seeded with a constant, so the random stream is
        # the same on every call.
        result = model(
            prompt=prompt,
            num_frames=num_frames,
            height=int(height),
            width=int(width),
            num_inference_steps=int(num_inference_steps),
            guidance_scale=float(guidance_scale),
            generator=torch.Generator().manual_seed(42)
        )

        progress(0.8, desc="Processing frames...")

        # Accept either a `.videos` or a `.frames` attribute, or the raw
        # result itself when neither is present.
        if hasattr(result, 'videos'):
            video_frames = result.videos[0]
        elif hasattr(result, 'frames'):
            video_frames = result.frames[0]
        else:
            video_frames = result

        # A random suffix keeps requests that finish within the same second
        # from overwriting each other's file.
        output_path = (
            f"generated_video_{int(time.time())}_{uuid.uuid4().hex[:8]}.mp4"
        )

        # save_video may return None and may have written a GIF next to the
        # requested path instead of the MP4, so resolve and verify the file
        # here rather than returning a path that does not exist.
        saved_path = save_video(video_frames, output_path, fps=fps) or output_path
        if not os.path.isfile(saved_path):
            saved_path = output_path.replace('.mp4', '.gif')
        if not os.path.isfile(saved_path) or os.path.getsize(saved_path) == 0:
            raise RuntimeError("No video file was written")

        progress(1.0, desc="Video generation complete!")

        return saved_path

    except Exception as e:
        print(f"Error generating video: {e}")
        # Chain the cause so the original traceback is kept in the logs.
        raise gr.Error(f"Video generation failed: {str(e)}") from e

def save_video(frames, output_path, fps=30):
    """Save video frames to an MP4 file, or to a GIF when OpenCV is missing.

    Args:
        frames: A torch tensor, a NumPy array, or a sequence of per-frame
            arrays / PIL images. 4-D input may be (T, C, H, W) or
            (T, H, W, C); 3-D input is treated as grayscale (T, H, W).
            ``uint8`` data is used as-is; floating-point data is assumed to
            lie in [-1, 1] and is mapped to 0-255 with clipping.
        output_path: Destination path for the MP4.
        fps: Frames per second of the written file.

    Returns:
        The path of the file that was written: ``output_path`` for MP4, or
        the same path with a ``.gif`` extension when the GIF fallback ran.

    Raises:
        ValueError: If ``fps`` is not positive or ``frames`` is empty or has
            an unsupported number of dimensions.
        RuntimeError: If OpenCV cannot open a writer for ``output_path``.
    """
    if fps <= 0:
        raise ValueError(f"fps must be positive, got {fps}")

    # Validate and convert before touching any encoder so bad input fails
    # the same way on both code paths.
    pixels = _frames_to_uint8(frames)

    # Only the import decides which encoder is used; errors raised while
    # encoding must not be mistaken for "OpenCV is not installed".
    try:
        import cv2
    except ImportError:
        return _write_gif(pixels, output_path, fps)
    return _write_mp4(cv2, pixels, output_path, fps)


def _frames_to_uint8(frames):
    """Return frames as a contiguous uint8 array shaped (T, H, W[, C])."""
    if torch.is_tensor(frames):
        frames = frames.detach().cpu()
        if frames.is_floating_point():
            # bfloat16 has no NumPy equivalent; float32 holds every case.
            frames = frames.float()
        frames = frames.numpy()
    elif not isinstance(frames, np.ndarray):
        frame_list = [np.asarray(frame) for frame in frames]
        if not frame_list:
            raise ValueError("frames must contain at least one frame")
        frames = np.stack(frame_list)

    if frames.ndim not in (3, 4) or 0 in frames.shape:
        raise ValueError(
            f"frames must be a non-empty 3-D or 4-D array, got shape {frames.shape}"
        )

    if frames.ndim == 4:
        channel_counts = (1, 3, 4)
        already_channels_last = (
            frames.shape[-1] in channel_counts
            and frames.shape[1] not in channel_counts
        )
        # Keep the historical TCHW assumption unless the layout is
        # unmistakably THWC already.
        if not already_channels_last:
            frames = np.transpose(frames, (0, 2, 3, 1))  # TCHW -> THWC

    if frames.dtype == np.uint8:
        pixels = frames
    elif np.issubdtype(frames.dtype, np.integer):
        # Integer data is taken to be pixel values already.
        pixels = np.clip(frames, 0, 255).astype(np.uint8)
    else:
        # Map [-1, 1] to [0, 255]; clip so overshoot cannot wrap around.
        scaled = (frames.astype(np.float32) + 1.0) * 127.5
        pixels = np.clip(scaled, 0.0, 255.0).astype(np.uint8)

    return np.ascontiguousarray(pixels)


def _write_mp4(cv2, pixels, output_path, fps):
    """Encode uint8 RGB/RGBA/grayscale frames to ``output_path`` with OpenCV."""
    height, width = pixels.shape[1:3]
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
    if not writer.isOpened():
        writer.release()
        raise RuntimeError(f"Could not open video writer for {output_path}")

    try:
        for frame in pixels:
            channels = 1 if frame.ndim == 2 else frame.shape[2]
            if channels == 1:
                conversion = cv2.COLOR_GRAY2BGR
            elif channels == 4:
                conversion = cv2.COLOR_RGBA2BGR
            else:
                conversion = cv2.COLOR_RGB2BGR
            writer.write(cv2.cvtColor(frame, conversion))
    finally:
        # Release even if a frame fails so the file handle is not leaked.
        writer.release()
    return output_path


def _write_gif(pixels, output_path, fps):
    """Fallback encoder: write the frames as an animated GIF with Pillow."""
    from PIL import Image

    gif_path = os.path.splitext(output_path)[0] + '.gif'
    images = [
        # Pillow expects 2-D arrays for single-channel images.
        Image.fromarray(
            frame[..., 0] if frame.ndim == 3 and frame.shape[2] == 1 else frame
        )
        for frame in pixels
    ]
    images[0].save(
        gif_path,
        save_all=True,
        append_images=images[1:],
        duration=max(1, int(1000 / fps)),  # per-frame delay in ms (33 at 30 fps)
        loop=0
    )
    return gif_path

def create_interface():
    """Assemble the Gradio Blocks app for the text-to-video demo.

    The layout is: a header, a row holding the input controls next to a
    column of tips, a row holding the (initially hidden) video player next
    to a placeholder message, and a set of clickable examples. The generate
    button runs ``generate_video``; a page-load event calls
    ``initialize_model``.

    Returns:
        The configured ``gr.Blocks`` instance (not yet launched).
    """
    # Heights offered in the quality dropdown, mapped to their 16:9 widths.
    widths_16_9 = {720: 1280, 1080: 1920, 2160: 3840}

    page_css = """
        .header-text {
            text-align: center;
            font-size: 2em;
            margin-bottom: 0.5em;
            background: linear-gradient(45deg, #667eea 0%, #764ba2 100%);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
        }
        .subheader-text {
            text-align: center;
            color: #666;
            margin-bottom: 2em;
        }
        .generate-btn {
            background: linear-gradient(45deg, #667eea 0%, #764ba2 100%);
            border: none;
            color: white;
            font-weight: bold;
        }
        .generate-btn:hover {
            background: linear-gradient(45deg, #764ba2 0%, #667eea 100%);
        }
        """

    header_html = """
        <div class="header-text">🎬 Text to Video Generator</div>
        <div class="subheader-text">Powered by Open-Sora-v2 - Transform your ideas into stunning videos</div>
        <div style="text-align: center; margin-bottom: 1em;">
            <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #667eea; text-decoration: none;">
                Built with anycoder
            </a>
        </div>
        """

    tips_markdown = """
                ### 💡 Example Prompts
                
                - 🌅 "A serene mountain landscape at sunrise with golden light filtering through misty valleys"
                - 🏙️ "A futuristic cyberpunk city at night with neon signs reflecting on wet streets"
                - 🌊 "Underwater coral reef with colorful tropical fish swimming in crystal clear water"
                - 🌳 "A magical enchanted forest with glowing mushrooms and fireflies at twilight"
                
                ### ⚡ Tips for Best Results
                
                - Be descriptive and specific
                - Include visual style (cinematic, realistic, anime, etc.)
                - Mention lighting and atmosphere
                - Specify camera angles if desired
                """

    # Every example uses the same settings: 4 s, 720p, 50 steps, guidance 7.5.
    example_texts = (
        "A beautiful sunset over the ocean with waves gently crashing on the shore, cinematic quality, warm golden lighting",
        "A serene mountain landscape at sunrise with mist rolling over the valleys, golden light filtering through the clouds",
        "A bustling city street at night with neon signs reflecting on wet pavement, cyberpunk aesthetic, blade runner style",
        "Underwater coral reef with colorful fish swimming, sun rays penetrating through the water, national geographic documentary style",
    )
    example_rows = [[text, 4, 720, 50, 7.5] for text in example_texts]

    with gr.Blocks(
        title="Text to Video - Open-Sora-v2",
        theme=gr.themes.Soft(),
        css=page_css,
    ) as demo:

        gr.Markdown(header_html)

        with gr.Row():
            # Left column: everything the user fills in.
            with gr.Column(scale=2):
                prompt_box = gr.Textbox(
                    lines=4,
                    max_lines=6,
                    label="📝 Describe your video",
                    placeholder="A beautiful sunset over the ocean with waves gently crashing on the shore, cinematic quality, 4K resolution...",
                )

                with gr.Row():
                    duration_slider = gr.Slider(
                        label="⏱️ Duration (seconds)",
                        minimum=2, maximum=16, step=2, value=4,
                    )
                    quality_dropdown = gr.Dropdown(
                        label="🎥 Quality",
                        value=720,
                        choices=[
                            ("720p HD", 720),
                            ("1080p Full HD", 1080),
                            ("4K Ultra HD", 2160),
                        ],
                    )

                with gr.Accordion("⚙️ Advanced Settings", open=False):
                    with gr.Row():
                        steps_slider = gr.Slider(
                            label="🔢 Inference Steps",
                            minimum=20, maximum=100, step=5, value=50,
                        )
                        guidance_slider = gr.Slider(
                            label="🎯 Guidance Scale",
                            minimum=1.0, maximum=20.0, step=0.5, value=7.5,
                        )

                run_button = gr.Button(
                    "🚀 Generate Video",
                    elem_classes=["generate-btn"],
                    size="lg",
                    variant="primary",
                )

            # Right column: static guidance text.
            with gr.Column(scale=1):
                gr.Markdown(tips_markdown)

        with gr.Row():
            video_player = gr.Video(label="🎬 Generated Video", visible=False)
            status_note = gr.Markdown(
                "✨ Your video will appear here after generation",
                visible=True,
            )

        # Example prompts
        gr.Examples(
            label="🎯 Try these examples",
            examples=example_rows,
            inputs=[prompt_box, duration_slider, quality_dropdown, steps_slider, guidance_slider],
            cache_examples=False,
        )

        def generate_and_display(prompt, duration, quality, steps, guidance, progress=gr.Progress()):
            """Run one generation and swap the placeholder for the result.

            On success the video player is shown and the message hidden; on
            any exception the player is hidden and the message shows the
            error text.
            """
            try:
                # The selected quality is the height; the width comes from
                # the 16:9 table and defaults to 1280 for unknown values.
                video_path = generate_video(
                    prompt=prompt,
                    duration=duration,
                    height=quality,
                    width=widths_16_9.get(quality, 1280),
                    num_inference_steps=steps,
                    guidance_scale=guidance,
                    progress=progress,
                )
                shown_video = gr.Video(value=video_path, visible=True)
                hidden_note = gr.Markdown(visible=False)
                return {video_player: shown_video, status_note: hidden_note}
            except Exception as err:
                hidden_video = gr.Video(visible=False)
                error_note = gr.Markdown(f"❌ Error: {str(err)}", visible=True)
                return {video_player: hidden_video, status_note: error_note}

        run_button.click(
            fn=generate_and_display,
            inputs=[prompt_box, duration_slider, quality_dropdown, steps_slider, guidance_slider],
            outputs=[video_player, status_note],
            show_progress=True,
        )

        # Initialize model on page load
        demo.load(fn=initialize_model, inputs=[], outputs=[], queue=False)

    return demo

if __name__ == "__main__":
    demo = create_interface()
    # Queueing is switched on through Blocks.queue(); launch() has no
    # `queue` keyword, and `show_tips` is not accepted by current Gradio
    # releases, so passing either one makes launch() raise a TypeError
    # before the server starts.
    demo.queue()
    demo.launch(
        share=True,
        show_error=True
    )