Spaces:

fffiloni
/

instant-TTS-Bark-cloning

Paused

App Files Files Community

fffiloni committed on Sep 4, 2023

Commit

274d8f8

1 Parent(s): e5daf13

Update app.py

Browse files

Files changed (1) hide show

app.py +196 -53

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import gradio as gr
 import os
 import shutil
@@ -9,6 +10,18 @@ from pydub import AudioSegment
 file_upload_available = os.environ.get("ALLOW_FILE_UPLOAD")
 """
 model_ids = [
     'suno/bark',
@@ -52,6 +65,14 @@ def cut_wav(input_path, max_duration):
     cut_audio.export(output_path, format="wav")
     return output_path
 def infer(prompt, input_wav_file):
@@ -72,36 +93,6 @@ def infer(prompt, input_wav_file):
     # Move the WAV file to the new directory
     shutil.move(source_path, os.path.join(destination_path, f"{file_name}.wav"))
-    """
-    text = prompt
-    print("SYNTHETIZING...")
-    # with random speaker
-    #output_dict = model.synthesize(text, config, speaker_id="random", voice_dirs=None)
-    # cloning a speaker.
-    # It assumes that you have a speaker file in `bark_voices/speaker_n/speaker.wav` or `bark_voices/speaker_n/speaker.npz`
-    output_dict = model.synthesize(
-        text,
-        config,
-        speaker_id=f"{file_name}",
-        voice_dirs="bark_voices/",
-        gpu=True
-    )
-    print(output_dict)
-    sample_rate = 24000  # Replace with the actual sample rate
-    print("WRITING WAVE FILE")
-    wavfile.write(
-        'output.wav',
-        sample_rate,
-        output_dict['wav']
-    )
-    """
     tts.tts_to_file(text=prompt,
                 file_path="output.wav",
@@ -117,11 +108,77 @@ def infer(prompt, input_wav_file):
     tts_video = gr.make_waveform(audio="output.wav")
-    return "output.wav", tts_video, gr.update(value=f"bark_voices/{file_name}/{contents[1]}", visible=True)
 css = """
 #col-container {max-width: 780px; margin-left: auto; margin-right: auto;}
 img[src*='#center'] {
     display: block;
     margin: auto;
@@ -171,39 +228,95 @@ with gr.Blocks(css=css) as demo:
         with gr.Row():
             with gr.Column():
                 prompt = gr.Textbox(
-                    label="Text to speech prompt"
                 )
-                if file_upload_available == "True":
-                    audio_in = gr.Audio(
-                        label="WAV voice to clone",
-                        type="filepath",
-                        source="upload"
-                    )
-                else:
-                    audio_in = gr.Audio(
-                        label="WAV voice to clone",
-                        type="filepath",
-                        source="upload",
-                        interactive = False
-                    )
-                submit_btn = gr.Button("Submit")
             with gr.Column():
                 cloned_out = gr.Audio(
-                    label="Text to speech output"
                 )
                 video_out = gr.Video(
-                    label = "Waveform video"
                 )
                 npz_file = gr.File(
                     label = ".npz file",
                     visible = False
                 )
@@ -226,9 +339,10 @@ with gr.Blocks(css=css) as demo:
             outputs = [
                 cloned_out,
                 video_out,
-                npz_file
             ],
-            cache_examples = True
         )
         gr.HTML("""
@@ -256,8 +370,37 @@ with gr.Blocks(css=css) as demo:
         outputs = [
             cloned_out,
             video_out,
-            npz_file
         ]
     )
-demo.queue(api_open=False, max_size=20).launch()

 import gradio as gr
+from share_btn import community_icon_html, loading_icon_html, share_js
 import os
 import shutil
 file_upload_available = os.environ.get("ALLOW_FILE_UPLOAD")
+import json
+with open("characters.json", "r") as file:  # Runs at import time; no encoding is passed, so the platform default is used.
+    data = json.load(file)  # Iterated below, so the JSON root is expected to be a list of objects.
+    characters = [  # Module-level list indexed by update_selection and used to fill the gallery.
+        {
+            "image": item["image"],  # A missing key on any entry raises KeyError here.
+            "title": item["title"],
+            "speaker": item["speaker"]
+        }
+        for item in data
+    ]
 """
 model_ids = [
     'suno/bark',
     cut_audio.export(output_path, format="wav")
     return output_path
+def update_selection(selected_state: gr.SelectData):  # Gallery select handler: looks up the clicked entry in `characters` by index.
+    c_image = characters[selected_state.index]["image"]  # NOTE(review): c_image is assigned but never used.
+    c_title = characters[selected_state.index]["title"]  # The only looked-up value that is returned.
+    c_speaker = characters[selected_state.index]["speaker"]  # NOTE(review): c_speaker is never used; the title is what reaches infer_from_c - confirm this is intended.
+    return c_title, selected_state  # Wired to outputs=[character_name, selected_state] by gallery_in.select.
 def infer(prompt, input_wav_file):
     # Move the WAV file to the new directory
     shutil.move(source_path, os.path.join(destination_path, f"{file_name}.wav"))
     tts.tts_to_file(text=prompt,
                 file_path="output.wav",
     tts_video = gr.make_waveform(audio="output.wav")
+    return "output.wav", tts_video, gr.update(value=f"bark_voices/{file_name}/{contents[1]}", visible=True), gr.Group.update(visible=True)
+def infer_from_c(prompt, c_name):  # Synthesize `prompt` using the library voice named `c_name`; `tts` is defined outside this excerpt.
+    tts.tts_to_file(text=prompt,
+                file_path="output.wav",  # NOTE(review): fixed path, also written by infer(); confirm concurrent requests cannot overwrite each other.
+                voice_dir="examples/library/",
+                speaker=f"{c_name}")  # NOTE(review): c_name comes from the character_name textbox - confirm it cannot be free-typed into this lookup.
+    tts_video = gr.make_waveform(audio="output.wav")  # Build the waveform video from the file just written.
+    return "output.wav", tts_video, gr.update(value=f"examples/library/{c_name}/{c_name}.npz", visible=True), gr.Group.update(visible=True)  # Order matches outputs: cloned_out, video_out, npz_file, share_group.
 css = """
 #col-container {max-width: 780px; margin-left: auto; margin-right: auto;}
+a {text-decoration-line: underline; font-weight: 600;}
+.animate-spin {
+  animation: spin 1s linear infinite;
+}
+@keyframes spin {
+  from {
+      transform: rotate(0deg);
+  }
+  to {
+      transform: rotate(360deg);
+  }
+}
+#share-btn-container {
+  display: flex;
+  padding-left: 0.5rem !important;
+  padding-right: 0.5rem !important;
+  background-color: #000000;
+  justify-content: center;
+  align-items: center;
+  border-radius: 9999px !important;
+  max-width: 15rem;
+  height: 36px;
+}
+div#share-btn-container > div {
+    flex-direction: row;
+    background: black;
+    align-items: center;
+}
+#share-btn-container:hover {
+  background-color: #060606;
+}
+#share-btn {
+  all: initial;
+  color: #ffffff;
+  font-weight: 600;
+  cursor:pointer;
+  font-family: 'IBM Plex Sans', sans-serif;
+  margin-left: 0.5rem !important;
+  padding-top: 0.5rem !important;
+  padding-bottom: 0.5rem !important;
+  right:0;
+}
+#share-btn * {
+  all: unset;
+}
+#share-btn-container div:nth-child(-n+2){
+  width: auto !important;
+  min-height: 0px !important;
+}
+#share-btn-container .wrap {
+  display: none !important;
+}
+#share-btn-container.hidden {
+  display: none!important;
+}
 img[src*='#center'] {
     display: block;
     margin: auto;
         with gr.Row():
             with gr.Column():
                 prompt = gr.Textbox(
+                    label="Text to speech prompt",
+                    elem_id = "tts-prompt"
                 )
+                with gr.Tab("File upload"):
+                    with gr.Column():
+                        if file_upload_available == "True":
+                            audio_in = gr.Audio(
+                                label="WAV voice to clone",
+                                type="filepath",
+                                source="upload"
+                            )
+                        else:
+                            audio_in = gr.Audio(
+                                label="WAV voice to clone",
+                                type="filepath",
+                                source="upload",
+                                interactive = False
+                            )
+                        submit_btn = gr.Button("Submit")
+                with gr.Tab("Microphone"):
+                    micro_in = gr.Audio(
+                                label="Record voice to clone",
+                                type="filepath",
+                                source="microphone",
+                                interactive = True
+                            )
+                    micro_submit_btn = gr.Button("Submit")
+                with gr.Tab("Voices Characters"):
+                    selected_state = gr.State()
+                    gallery_in = gr.Gallery(
+                                label="Character Gallery",
+                                value=[(item["image"], item["title"]) for item in characters],
+                                interactive = True,
+                                allow_preview=False,
+                                columns=2,
+                                elem_id="gallery",
+                                show_share_button=False
+                            )
+                    c_submit_btn = gr.Button("Submit")
             with gr.Column():
                 cloned_out = gr.Audio(
+                    label="Text to speech output",
+                    visible = False
                 )
                 video_out = gr.Video(
+                    label = "Waveform video",
+                    elem_id = "voice-video-out"
                 )
                 npz_file = gr.File(
                     label = ".npz file",
                     visible = False
                 )
+                character_name = gr.Textbox(
+                    label="Character Name",
+                    placeholder="Name that voice character",
+                    elem_id = "character-name"
+                )
+                voice_description = gr.Textbox(
+                    label="description",
+                    placeholder="How would you describe that voice ? ",
+                    elem_id = "voice-description"
+                )
+                with gr.Group(elem_id="share-btn-container", visible=False) as share_group:
+                    community_icon = gr.HTML(community_icon_html)
+                    loading_icon = gr.HTML(loading_icon_html)
+                    share_button = gr.Button("Share with Community", elem_id="share-btn")
+        share_button.click(None, [], [], _js=share_js)
+        gallery_in.select(
+            update_selection,
+            outputs=[character_name, selected_state],
+            queue=False,
+            show_progress=False,
+        )
             outputs = [
                 cloned_out,
                 video_out,
+                npz_file,
+                share_group
             ],
+            cache_examples = False
         )
         gr.HTML("""
         outputs = [
             cloned_out,
             video_out,
+            npz_file,
+            share_group
+        ]
+    )
+    micro_submit_btn.click(
+        fn = infer,
+        inputs = [
+            prompt,
+            micro_in
+        ],
+        outputs = [
+            cloned_out,
+            video_out,
+            npz_file,
+            share_group
+        ]
+    )
+    c_submit_btn.click(
+        fn = infer_from_c,
+        inputs = [
+            prompt,
+            character_name
+        ],
+        outputs = [
+            cloned_out,
+            video_out,
+            npz_file,
+            share_group
         ]
     )
+demo.queue(api_open=False, max_size=10).launch()