Spaces:

awacke1
/

ASRGenerateStoryandVideo

Build error

App Files Community

awacke1 committed on Jun 14, 2022

Commit

dee841c

1 Parent(s): 84d8598

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -20

app.py CHANGED Viewed

@@ -1,29 +1,27 @@
 import gradio as gr
-import tensorflow as tf
-import transformers
 from transformers import pipeline
-#import streamlit as st
 import firebase_admin
 from firebase_admin import credentials
 from firebase_admin import firestore
 import datetime
 import tempfile
 from typing import Optional
 import numpy as np
 from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer
-import io, base64
-import mediapy
-import os
-import sys
-from PIL import Image
-from huggingface_hub import snapshot_download
 # firestore singleton is a cached multiuser instance to persist shared crowdsource memory
@@ -39,7 +37,31 @@ db = get_db_firestore()
 # create ASR ML pipeline
 asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")
-#asr = pipeline("automatic-speech-recognition", "snakers4/silero-models")
 # create Text Classification pipeline
 classifier = pipeline("text-classification")
@@ -145,24 +167,27 @@ def generate_interpolation(gallery):
 demo = gr.Blocks()
 with demo:
-    #with gr.Row():
-        # Left column (inputs)
-    #    with gr.Column():
     audio_file = gr.inputs.Audio(source="microphone", type="filepath")
     text = gr.Textbox()
     label = gr.Label()
     saved = gr.Textbox()
-    savedAll = gr.Textbox()
-    #    with gr.Column():
     b1 = gr.Button("Recognize Speech")
     b2 = gr.Button("Classify Sentiment")
     b3 = gr.Button("Save Speech to Text")
     b4 = gr.Button("Retrieve All")
     b1.click(speech_to_text, inputs=audio_file, outputs=text)
     b2.click(text_to_sentiment, inputs=text, outputs=label)
     b3.click(upsert, inputs=text, outputs=saved)
     b4.click(selectall, inputs=text, outputs=savedAll)
     with gr.Row():
         # Left column (inputs)

 import gradio as gr
 from transformers import pipeline
+import io, base64
+from PIL import Image
+import numpy as np
+import tensorflow as tf
+import mediapy
+import os
+import sys
+from huggingface_hub import snapshot_download
+import streamlit as st
 import firebase_admin
 from firebase_admin import credentials
 from firebase_admin import firestore
 import datetime
+from transformers import pipeline
+import gradio as gr
 import tempfile
 from typing import Optional
 import numpy as np
 from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer
 # firestore singleton is a cached multiuser instance to persist shared crowdsource memory
 # create ASR ML pipeline
 asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")
+MODEL_NAMES = [
+    "en/ljspeech/tacotron2-DDC",
+    "en/ljspeech/glow-tts",
+    "en/ljspeech/speedy-speech-wn",
+    "en/ljspeech/vits",
+    "en/sam/tacotron-DDC",
+    "fr/mai/tacotron2-DDC",
+    "de/thorsten/tacotron2-DCA",
+]
+MODELS = {}
+manager = ModelManager()
+for MODEL_NAME in MODEL_NAMES:
+    print(f"downloading {MODEL_NAME}")
+    model_path, config_path, model_item = manager.download_model(f"tts_models/{MODEL_NAME}")
+    vocoder_name: Optional[str] = model_item["default_vocoder"]
+    vocoder_path = None
+    vocoder_config_path = None
+    if vocoder_name is not None:
+        vocoder_path, vocoder_config_path, _ = manager.download_model(vocoder_name)
+    synthesizer = Synthesizer(
+        model_path, config_path, None, vocoder_path, vocoder_config_path,
+    )
+    MODELS[MODEL_NAME] = synthesizer
 # create Text Classification pipeline
 classifier = pipeline("text-classification")
 demo = gr.Blocks()
 with demo:
     audio_file = gr.inputs.Audio(source="microphone", type="filepath")
     text = gr.Textbox()
     label = gr.Label()
     saved = gr.Textbox()
+    savedAll = gr.Textbox()
+    TTSchoice = gr.inputs.Radio( label="Pick a TTS Model", choices=MODEL_NAMES,   )
+    audio = gr.Audio(label="Output", interactive=False)
     b1 = gr.Button("Recognize Speech")
     b2 = gr.Button("Classify Sentiment")
     b3 = gr.Button("Save Speech to Text")
     b4 = gr.Button("Retrieve All")
+    b5 = gr.Button("Read It Back Aloud")
     b1.click(speech_to_text, inputs=audio_file, outputs=text)
     b2.click(text_to_sentiment, inputs=text, outputs=label)
     b3.click(upsert, inputs=text, outputs=saved)
     b4.click(selectall, inputs=text, outputs=savedAll)
+    b5.click(tts,  inputs=[text,TTSchoice], outputs=audio)
     with gr.Row():
         # Left column (inputs)