multimodal-video-audio

Runtime error

File size: 2,281 Bytes

import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
from moviepy.editor import VideoClip, AudioFileClip
from PIL import Image, ImageDraw, ImageFont
from transformers import pipeline

# ===== Build the TTS pipeline =====
# The previous checkpoint id ("espnet/kan-bayashi_ljspeech_tts_train_tacotron2")
# points at an ESPnet model, which is not stored in the transformers format and
# therefore cannot be loaded by transformers.pipeline(). MMS-TTS (VITS) is a
# natively supported English text-to-speech checkpoint.
tts_model = pipeline(
    "text-to-speech",
    model="facebook/mms-tts-eng",
)

def _normalizar_audio(audio):
    """Return *audio* as float32 samples laid out the way ``soundfile.write`` expects.

    The TTS pipeline may hand back a waveform shaped ``(n,)``, ``(1, n)`` or
    ``(channels, n)``; soundfile wants ``(n,)`` or ``(n, channels)``.
    """
    amostras = np.squeeze(np.asarray(audio, dtype=np.float32))
    # A 2-D array that is wider than it is tall is channels-first: flip it.
    if amostras.ndim == 2 and amostras.shape[0] < amostras.shape[1]:
        amostras = amostras.T
    return amostras


def _cor_de_fundo(t):
    """Return the ``(r, g, b)`` background colour for time *t* (seconds).

    Each channel follows a sine wave with its own period (5 s, 3 s, 4 s) and
    phase offset, mapped from [-1, 1] to the integer range [0, 254].
    """
    r = int((np.sin(t * 2 * np.pi / 5) + 1) * 127)
    g = int((np.sin(t * 2 * np.pi / 3 + 1) + 1) * 127)
    b = int((np.sin(t * 2 * np.pi / 4 + 2) + 1) * 127)
    return r, g, b


def gerar_video_premium(prompt_texto):
    """Synthesize speech for *prompt_texto* and render it into an animated video.

    Steps:
    1. Generate the audio with the module-level ``tts_model`` pipeline.
    2. Build a 1280x720 clip whose background colour oscillates over time,
       with the prompt drawn centred in white.
    3. Attach the audio and encode the result as H.264/AAC at 24 fps.

    Args:
        prompt_texto: Text to speak and to overlay on the video.

    Returns:
        Path of the generated ``.mp4`` file. Every call writes into its own
        temporary directory so concurrent requests do not overwrite each other.

    Raises:
        gr.Error: If the prompt is empty or contains only whitespace.
    """
    if not prompt_texto or not prompt_texto.strip():
        raise gr.Error("Digite um texto para gerar o vídeo.")

    pasta_saida = tempfile.mkdtemp(prefix="video_tts_")
    caminho_audio = os.path.join(pasta_saida, "audio.wav")
    caminho_video = os.path.join(pasta_saida, "video_final.mp4")

    # ===== Generate audio =====
    # The text-to-speech pipeline returns {"audio": ndarray, "sampling_rate": int}.
    tts_output = tts_model(prompt_texto)
    sf.write(
        caminho_audio,
        _normalizar_audio(tts_output["audio"]),
        samplerate=tts_output["sampling_rate"],
    )

    # ===== Text layout (identical for every frame, so computed once) =====
    largura, altura = 1280, 720
    font = ImageFont.load_default()
    # textbbox replaces ImageDraw.textsize, which was removed in Pillow 10.
    medidor = ImageDraw.Draw(Image.new("RGB", (1, 1)))
    esquerda, topo, direita, base = medidor.textbbox((0, 0), prompt_texto, font=font)
    # Subtract the bbox origin so the visible glyphs, not the anchor, are centred.
    posicao_texto = (
        (largura - (direita - esquerda)) / 2 - esquerda,
        (altura - (base - topo)) / 2 - topo,
    )

    def make_frame(t):
        """Render the RGB frame (height x width x 3, uint8) shown at time *t*."""
        imagem = Image.new("RGB", (largura, altura), _cor_de_fundo(t))
        ImageDraw.Draw(imagem).text(
            posicao_texto, prompt_texto, fill=(255, 255, 255), font=font
        )
        return np.array(imagem)

    # ===== Build and save the video =====
    audio_clip = AudioFileClip(caminho_audio)
    try:
        video_clip = VideoClip(make_frame, duration=audio_clip.duration).set_audio(
            audio_clip
        )
        video_clip.write_videofile(
            caminho_video,
            fps=24,
            codec="libx264",
            audio_codec="aac",
            # Keep moviepy's intermediate audio file out of the shared cwd.
            temp_audiofile=os.path.join(pasta_saida, "temp_audio.m4a"),
        )
    finally:
        # Release the ffmpeg reader before deleting the intermediate WAV.
        audio_clip.close()
        os.remove(caminho_audio)

    return caminho_video

# ===== Gradio interface =====
# One text prompt goes in; the rendered video comes out.
caixa_prompt = gr.Textbox(label="Digite o prompt para gerar vídeo e áudio")
saida_video = gr.Video(label="Vídeo gerado")

iface = gr.Interface(
    fn=gerar_video_premium,
    inputs=caixa_prompt,
    outputs=saida_video,
    title="Gerador Premium de Vídeo + Áudio",
    description="Digite qualquer prompt. O Space gera um vídeo animado profissional com TTS.",
)

# Start the web server as soon as the module is executed.
iface.launch()