import gradio as gr
import torch
from nemo.collections.tts.models import FastPitchModel
from nemo.collections.tts.models import HifiGanModel

# 🔹 Load pretrained models from NeMo
# Both networks are used purely for inference below, so switch them to
# evaluation mode explicitly (disables training-only behaviour such as
# dropout). NOTE(review): from_pretrained may already do this; calling
# eval() again is harmless and returns the same model object.
fastpitch = FastPitchModel.from_pretrained("nvidia/tts_en_fastpitch").eval()
hifigan = HifiGanModel.from_pretrained("nvidia/tts_hifigan").eval()

# 🔹 TTS function
def tts(text):
    """Synthesize speech for ``text`` and return it in Gradio's audio format.

    Uses the module-level ``fastpitch`` model to turn the text into a mel
    spectrogram and the module-level ``hifigan`` vocoder to turn that
    spectrogram into a waveform.

    Args:
        text: The text to speak, as entered in the Gradio textbox.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 22050 and
        ``samples`` is a NumPy array with the batch axis removed, or ``None``
        when ``text`` is empty or whitespace-only (nothing to synthesize).
    """
    # Guard: skip the models entirely rather than feed them an empty sequence.
    if not text or not text.strip():
        return None

    with torch.no_grad():
        # parse() only tokenizes the text; the mel spectrogram is produced by
        # generate_spectrogram(), which the vocoder then converts to audio.
        tokens = fastpitch.parse(text)
        spectrogram = fastpitch.generate_spectrogram(tokens=tokens)
        audio = hifigan.convert_spectrogram_to_audio(spec=spectrogram)

    # The vocoder output carries a leading batch axis (one row per utterance,
    # as in the published model-card example); take row 0 so Gradio receives
    # a 1-D mono signal instead of reading the samples as channels.
    waveform = audio.detach().cpu().numpy()[0]
    return (22050, waveform)

# 🔹 Gradio UI
# Build the input and output widgets up front so the Interface call stays short.
text_input = gr.Textbox(label="Enter text")
speech_output = gr.Audio(label="Generated Speech")

# Wire the tts() callback to the widgets: one textbox in, one audio player out.
iface = gr.Interface(
    fn=tts,
    inputs=text_input,
    outputs=speech_output,
    title="FastPitch + HiFiGAN (NeMo TTS)",
    description="Enter text and get speech synthesized using NVIDIA NeMo FastPitch and HiFiGAN.",
)

# Start the Gradio app with its default launch settings.
iface.launch()