Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import torch | |
| import soundfile as sf | |
| import spaces | |
| import os | |
| import numpy as np | |
| import re | |
| from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan | |
| from speechbrain.pretrained import EncoderClassifier | |
| from datasets import load_dataset | |
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
def load_models_and_data():
    """Load every pretrained component the demo needs, plus one reference sample.

    Returns:
        A 5-tuple ``(model, processor, vocoder, speaker_model, example)``:
        the fine-tuned SpeechT5 text-to-speech model, the SpeechT5 processor,
        the HiFi-GAN vocoder, the SpeechBrain speaker encoder, and dataset
        row 304, which is used later to derive the default speaker embedding.
    """
    # The processor comes from the base checkpoint, the weights from the
    # fine-tuned one.
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained(
        "emirhanbilgic/speecht5_finetuned_emirhan_tr"
    ).to(device)
    hifigan = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

    # Speaker encoder; its files are cached under /tmp/<source name>.
    encoder_source = "speechbrain/spkrec-xvect-voxceleb"
    speaker_encoder = EncoderClassifier.from_hparams(
        source=encoder_source,
        run_opts={"device": device},
        savedir=os.path.join("/tmp", encoder_source),
    )

    # A fixed row of this dataset supplies the voice for the default embedding.
    samples = load_dataset("ylacombe/english_dialects", "southern_female", split="train")
    reference_example = samples[304]

    return tts_model, processor, hifigan, speaker_encoder, reference_example
# Load everything once at import time so each request reuses the same objects.
(
    model,
    processor,
    vocoder,
    speaker_model,
    default_example,
) = load_models_and_data()
def create_speaker_embedding(waveform):
    """Compute a normalized speaker embedding for one audio waveform.

    Args:
        waveform: Audio samples accepted by ``torch.tensor``; a leading batch
            axis of size 1 is added before encoding.

    Returns:
        The output of ``speaker_model.encode_batch``, L2-normalized along
        dimension 2 and with all singleton dimensions squeezed away.
    """
    batch = torch.tensor(waveform).unsqueeze(0).to(device)
    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        embedding = speaker_model.encode_batch(batch)
        embedding = torch.nn.functional.normalize(embedding, dim=2)
    return embedding.squeeze()
def prepare_default_embedding(example):
    """Return the speaker embedding for a dataset row.

    Args:
        example: Mapping whose ``"audio"`` entry holds the raw samples under
            the ``"array"`` key.

    Returns:
        Whatever ``create_speaker_embedding`` returns for those samples.
    """
    samples = example["audio"]["array"]
    return create_speaker_embedding(samples)
# Speaker embedding used for every request; computed once at start-up.
default_embedding = prepare_default_embedding(example=default_example)
# Ordered (character, ASCII spelling) pairs applied to the lowercased text.
# Built from a mapping so each source character can only appear once; the
# resulting value is still a list of 2-tuples in the order written here.
replacements = list(
    {
        "â": "a",   # long a
        "ç": "ch",  # ch as in "chair"
        "ğ": "gh",  # silent g / slight lengthening of the preceding vowel
        "ı": "i",   # dotless i
        "î": "i",   # long i
        "ö": "oe",  # similar to German ö
        "ş": "sh",  # sh as in "shoe"
        "ü": "ue",  # similar to German ü
        "û": "u",   # long u
    }.items()
)
# Turkish words for the numbers that are not composed from smaller parts.
number_words = {
    0: "sıfır", 1: "bir", 2: "iki", 3: "üç", 4: "dört", 5: "beş",
    6: "altı", 7: "yedi", 8: "sekiz", 9: "dokuz",
    10: "on", 11: "on bir", 12: "on iki", 13: "on üç", 14: "on dört",
    15: "on beş", 16: "on altı", 17: "on yedi", 18: "on sekiz", 19: "on dokuz",
    20: "yirmi", 30: "otuz", 40: "kırk", 50: "elli", 60: "altmış",
    70: "yetmiş", 80: "seksen", 90: "doksan", 100: "yüz", 1000: "bin",
}

# Scales from one million upwards, largest first. Unlike "yüz" and "bin",
# these always take an explicit count ("bir milyon", not "milyon").
_LARGE_SCALES = (
    (10**12, "trilyon"),
    (10**9, "milyar"),
    (10**6, "milyon"),
)

# Smallest value with no word form here; it is returned as digits.
_SPELL_LIMIT = 10**15


def number_to_words(number):
    """Spell out an integer in Turkish.

    Args:
        number: The integer to convert. Negative values are spelled with a
            leading "eksi".

    Returns:
        The Turkish wording, e.g. ``"bin dokuz yüz doksan dokuz"`` for 1999.
        Values whose magnitude is 10**15 or more are returned unchanged as a
        decimal string (with the "eksi" prefix when negative).
    """
    if number < 0:
        # Previously a negative value fell into the lookup table and raised
        # KeyError.
        return "eksi " + number_to_words(-number)

    if number < 20:
        return number_words[number]

    if number < 100:
        tens, unit = divmod(number, 10)
        return number_words[tens * 10] + (" " + number_words[unit] if unit else "")

    if number < 1000:
        hundreds, remainder = divmod(number, 100)
        # Exactly one hundred is just "yüz", never "bir yüz".
        head = "yüz" if hundreds == 1 else number_words[hundreds] + " yüz"
        return head + (" " + number_to_words(remainder) if remainder else "")

    if number < 1000000:
        thousands, remainder = divmod(number, 1000)
        # Exactly one thousand is just "bin", never "bir bin".
        head = "bin" if thousands == 1 else number_to_words(thousands) + " bin"
        return head + (" " + number_to_words(remainder) if remainder else "")

    if number >= _SPELL_LIMIT:
        return str(number)

    for scale, scale_name in _LARGE_SCALES:
        if number >= scale:
            count, remainder = divmod(number, scale)
            head = number_to_words(count) + " " + scale_name
            return head + (" " + number_to_words(remainder) if remainder else "")

    # Unreachable for integers: every value >= 10**6 matches a scale above.
    return str(number)
def replace_numbers_with_words(text):
    """Replace each standalone run of digits in ``text`` with its spelled-out form.

    Only digit runs delimited by word boundaries are converted, so digits
    glued to letters or underscores (e.g. ``"a1b"``) are left as they are.

    Args:
        text: The string to scan.

    Returns:
        A copy of ``text`` with every matched number passed through
        ``number_to_words``.
    """
    return re.sub(
        r'\b\d+\b',
        lambda found: number_to_words(int(found.group())),
        text,
    )
def normalize_text(text):
    """Prepare raw user text for the tokenizer.

    The steps run in this order: lowercase, spell out standalone numbers,
    apply the ``replacements`` character substitutions, then delete every
    character that is neither a word character nor whitespace.

    Args:
        text: The raw input string.

    Returns:
        The normalized string.
    """
    # Numbers are expanded before the substitutions so that the special
    # characters inside the number words get rewritten too.
    cleaned = replace_numbers_with_words(text.lower())

    for source_char, ascii_form in replacements:
        cleaned = cleaned.replace(source_char, ascii_form)

    # Punctuation is removed outright, not replaced by a space.
    return re.sub(r'[^\w\s]', '', cleaned)
def text_to_speech(text, audio_file=None):
    """Synthesize speech for ``text`` using the default speaker embedding.

    Args:
        text: The text to speak. ``None`` is treated as an empty string.
        audio_file: Unused; kept so existing callers that pass it still work.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 16000 and
        ``samples`` is the generated waveform as a NumPy array.

    Raises:
        gr.Error: If nothing is left to speak after normalization (empty,
            whitespace-only or punctuation-only input).
    """
    normalized_text = normalize_text(text or "")

    # Reject input with no speakable content up front rather than running the
    # model on it; gr.Error is shown to the user as a message in the UI.
    if not normalized_text.strip():
        raise gr.Error("Please enter some text to convert to speech.")

    # Tokenize and move the tensors to the same device as the model.
    inputs = processor(text=normalized_text, return_tensors="pt").to(device)

    # Every request uses the embedding computed once at start-up.
    speaker_embeddings = default_embedding

    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        speech = model.generate_speech(
            inputs["input_ids"],
            speaker_embeddings.unsqueeze(0),
            vocoder=vocoder,
        )

    speech_np = speech.cpu().numpy()
    return (16000, speech_np)
# Web UI: one text box in, one audio player out.
# The labels say "Turkish" because the loaded checkpoint is a Turkish
# fine-tune and the normalizer spells numbers out in Turkish; the previous
# "English" wording told users to enter the wrong language.
iface = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Enter Turkish text to convert to speech")
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy")
    ],
    title="Turkish SpeechT5 Text-to-Speech Demo",
    description="Enter Turkish text, and listen to the generated speech."
)

# Start the server as soon as this file is executed.
iface.launch(share=True)