Spaces:

Agents-MCP-Hackathon
/

SlideDeck-AI

Running

App Files Files Community

Revanth-ml committed on Jun 10

Commit

4be7077

verified ·

1 Parent(s): 7da4c38

Create deploy_kokora_app_cpu_modal_labs.py

Browse files

Files changed (1) hide show

deploy_kokora_app_cpu_modal_labs.py +104 -0

deploy_kokora_app_cpu_modal_labs.py ADDED Viewed

	@@ -0,0 +1,104 @@

+import io
+import modal
+from fastapi import FastAPI, Request, status
+from fastapi.responses import Response, JSONResponse
# Single source of truth for the cache location: it is used both as the
# HF_HOME value baked into the image and as the mount point of the volume,
# so the two cannot drift apart.
CACHE_PATH = "/cache"

app = modal.App("kokoro-tts-api-cpu")

# Container image: Debian slim + Python 3.11 with the system packages and
# Python dependencies installed below.
image = (
    modal.Image.debian_slim(python_version="3.11")
    .apt_install("git", "libsndfile1", "espeak-ng")
    .pip_install(
        "torch==2.3.0",
        "soundfile",
        "kokoro>=0.9.4",
        "fastapi",
        "numpy",
    )
    # Reinstall soundfile from source instead of using a prebuilt wheel.
    .run_commands(
        "pip install --force-reinstall --no-binary soundfile soundfile",
    )
    .env({"HF_HOME": CACHE_PATH})
)

# Named volume mounted at CACHE_PATH by the function below; created on first
# deploy if it does not exist yet.
hf_cache = modal.Volume.from_name("kokoro-hf-cache", create_if_missing=True)

# FastAPI application object; the route is registered inside fastapi_app().
web_app = FastAPI(
    title="Kokoro TTS API",
    description="A serverless API for generating speech from text using the Kokoro model.",
    version="1.0.0",
)
# Country/language aliases that callers may use instead of a Kokoro code.
VOICE_PREFIX_MAP = {"en": "a", "us": "a", "gb": "b", "uk": "b", "es": "e", "fr": "f"}

# Single-letter language codes this service maps voices onto.
_LANG_CODES = frozenset("abefhijpz")


def voice_to_lang(voice: str) -> str:
    """Return the single-letter language code implied by a voice ID.

    The part of ``voice`` before the first underscore is inspected
    (case-insensitively):

    * an alias listed in ``VOICE_PREFIX_MAP`` (e.g. ``"en"``, ``"uk"``) maps
      to its configured code;
    * otherwise a one- or two-character prefix whose first character is a
      known language code (e.g. ``"a"``, ``"af"``, ``"bf"``) maps to that
      first character;
    * anything else falls back to ``"a"``.

    Args:
        voice: Voice identifier such as ``"af_heart"`` or ``"b_female"``.

    Returns:
        A one-character language code.
    """
    prefix = voice.split("_", 1)[0].lower()
    # Aliases win over the first-letter rule: "en" must map to "a", not "e".
    if prefix in VOICE_PREFIX_MAP:
        return VOICE_PREFIX_MAP[prefix]
    # Membership is tested per character against a set; a substring test
    # against a string would also accept "" and multi-letter runs like "ab".
    if 1 <= len(prefix) <= 2 and prefix[0] in _LANG_CODES:
        return prefix[0]
    return "a"
def _parse_tts_request(body: object) -> tuple[str, str]:
    """Validate a decoded JSON request body and return ``(text, voice_id)``.

    Args:
        body: The value produced by decoding the request's JSON payload.

    Returns:
        The text to synthesize and the voice ID (``"af_heart"`` when the
        ``voice`` key is absent or null).

    Raises:
        ValueError: If ``body`` is not a JSON object, ``text`` is missing,
            blank or not a string, or ``voice`` is not a non-empty string.
    """
    if not isinstance(body, dict):
        raise ValueError("body must be a JSON object")
    text = body.get("text")
    if not isinstance(text, str) or not text.strip():
        raise ValueError("'text' must be a non-empty string")
    voice = body.get("voice")
    if voice is None:
        voice = "af_heart"
    elif not isinstance(voice, str) or not voice:
        raise ValueError("'voice' must be a non-empty string")
    return text, voice


@app.function(
    image=image,
    volumes={CACHE_PATH: hf_cache},
    cpu=4,
    timeout=180,
    # NOTE(review): recent Modal releases appear to rename this option to
    # `scaledown_window`; confirm against the Modal version in use.
    container_idle_timeout=300,
)
@modal.asgi_app()
def fastapi_app():
    """Register the TTS route on ``web_app`` and return it for Modal to serve.

    The endpoint accepts ``POST /`` with a JSON object containing ``text``
    and an optional ``voice``, and responds with 16-bit PCM WAV audio at
    24 kHz. Malformed requests get a 400 JSON error; a synthesis run that
    yields no audio gets a 500 JSON error.

    Returns:
        The module-level FastAPI application.
    """
    print("🚀 Kokoro TTS API container is starting up...")

    # Per-container caches. Building a KPipeline is costly, so one is kept
    # per language code instead of being rebuilt on every request.
    pipelines = {}
    # Voices whose downloaded assets have already been committed to the
    # volume from this container.
    persisted_voices = set()

    def get_pipeline(lang_code: str):
        """Return the cached pipeline for ``lang_code``, creating it on first use."""
        pipe = pipelines.get(lang_code)
        if pipe is None:
            # Heavy imports stay lazy so they are only paid when needed.
            from kokoro import KPipeline
            import torch

            torch.hub.set_dir(CACHE_PATH)
            pipe = pipelines[lang_code] = KPipeline(lang_code=lang_code)
        return pipe

    @web_app.post("/",
        summary="Synthesize Speech",
        description="""
        Converts text to speech.
        - **text**: The string of text to synthesize.
        - **voice**: (Optional) The voice ID to use (e.g., "af_heart", "bf_emma", "ef_dora"). Defaults to "af_heart".
        """
    )
    async def tts_endpoint(request: Request):
        try:
            # JSON decoding errors are ValueError subclasses, so one handler
            # covers both undecodable and structurally invalid bodies.
            body = await request.json()
            text_to_synthesize, voice_id = _parse_tts_request(body)
        except ValueError as exc:
            return JSONResponse(
                status_code=status.HTTP_400_BAD_REQUEST,
                content={
                    "error": "Invalid request. Body must be JSON with a 'text' key.",
                    "detail": str(exc),
                },
            )
        print(f"Synthesizing text: '{text_to_synthesize[:50]}...' with voice: {voice_id}")

        import numpy as np
        import soundfile as sf

        pipe = get_pipeline(voice_to_lang(voice_id))
        # Each item yielded by the pipeline unpacks to three values; only
        # the third (the audio chunk) is used. Empty slots are skipped.
        all_chunks = [
            chunk
            for _, _, chunk in pipe(text_to_synthesize, voice=voice_id)
            if chunk is not None
        ]
        if not all_chunks:
            return JSONResponse(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                content={"error": "TTS generation failed to produce audio."},
            )

        full_audio = np.concatenate(all_chunks)
        buffer = io.BytesIO()
        sf.write(buffer, full_audio, 24_000, format="WAV", subtype="PCM_16")

        # Commit the cache volume only the first time a voice succeeds in
        # this container, rather than on every request.
        if voice_id not in persisted_voices:
            hf_cache.commit()
            persisted_voices.add(voice_id)

        print("Synthesis complete. Returning audio file.")
        return Response(content=buffer.getvalue(), media_type="audio/wav")

    return web_app