import gradio as gr
import torch
import soundfile as sf
import os
import numpy as np
import noisereduce as nr
from typing import Optional, Iterator
import torch.nn as nn
from transformers import AutoTokenizer, VitsModel  # make sure these are imported
from concurrent.futures import ThreadPoolExecutor, as_completed
# Select the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("✅ Running on:", device)
token = os.environ.get("key_")
models = {}
# Noise reduction filter
def remove_noise_nr(audio_data, sr=16000):
    reduced_noise = nr.reduce_noise(y=audio_data, hop_length=256, sr=sr)
    return reduced_noise
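# Hypothetical standalone example (not used by the app itself): denoise an
# existing 16 kHz mono recording with remove_noise_nr; the file name is only
# illustrative.
#     data, sr = sf.read("some_recording.wav")
#     clean = remove_noise_nr(data, sr=sr)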
def _inference_forward_stream(
    self,
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    speaker_embeddings: Optional[torch.Tensor] = None,
    chunk_size: int = 32,
    is_streaming: bool = True,
) -> Iterator[torch.Tensor]:
    """Streaming VITS forward pass: yields the spectrogram in chunks instead of all at once."""
    padding_mask = (
        attention_mask.unsqueeze(-1).float()
        if attention_mask is not None
        else torch.ones_like(input_ids).unsqueeze(-1).float()
    )
    text_encoder_output = self.text_encoder(
        input_ids=input_ids,
        padding_mask=padding_mask,
        attention_mask=attention_mask,
    )
    hidden_states = text_encoder_output[0]
    hidden_states = hidden_states.transpose(1, 2)
    input_padding_mask = padding_mask.transpose(1, 2)
    prior_means = text_encoder_output[1]
    prior_log_variances = text_encoder_output[2]

    # Duration prediction
    if self.config.use_stochastic_duration_prediction:
        log_duration = self.duration_predictor(
            hidden_states,
            input_padding_mask,
            speaker_embeddings,
            reverse=True,
            noise_scale=self.noise_scale_duration,
        )
    else:
        log_duration = self.duration_predictor(hidden_states, input_padding_mask, speaker_embeddings)
    length_scale = 1.0 / self.speaking_rate
    duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale)
    predicted_lengths = torch.clamp_min(torch.sum(duration, [1, 2]), 1).long()

    # Build the monotonic alignment between input tokens and output frames
    indices = torch.arange(predicted_lengths.max(), dtype=predicted_lengths.dtype, device=predicted_lengths.device)
    output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
    output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype)
    attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1)
    batch_size, _, output_length, input_length = attn_mask.shape
    cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1)
    indices = torch.arange(output_length, dtype=duration.dtype, device=duration.device)
    valid_indices = indices.unsqueeze(0) < cum_duration
    valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length)
    padded_indices = valid_indices - nn.functional.pad(valid_indices, [0, 0, 1, 0, 0, 0])[:, :-1]
    attn = padded_indices.unsqueeze(1).transpose(2, 3) * attn_mask

    # Expand the prior to frame level, sample latents, and run the flow in reverse
    prior_means = torch.matmul(attn.squeeze(1), prior_means).transpose(1, 2)
    prior_log_variances = torch.matmul(attn.squeeze(1), prior_log_variances).transpose(1, 2)
    prior_latents = prior_means + torch.randn_like(prior_means) * torch.exp(prior_log_variances) * self.noise_scale
    latents = self.flow(prior_latents, output_padding_mask, speaker_embeddings, reverse=True)
    spectrogram = latents * output_padding_mask

    if is_streaming:
        for i in range(0, spectrogram.size(-1), chunk_size):
            with torch.no_grad():
                yield spectrogram[:, :, i : i + chunk_size]
    else:
        yield spectrogram
def get_model(name_model):
    """Load a VitsModel once per repo id and cache it; the tokenizer is reloaded on each call."""
    global models
    if name_model in models:
        tokenizer = AutoTokenizer.from_pretrained(name_model, token=token)
        return models[name_model], tokenizer
    models[name_model] = VitsModel.from_pretrained(name_model, token=token).to(device)
    models[name_model].decoder.apply_weight_norm()
    for flow in models[name_model].flow.flows:
        torch.nn.utils.weight_norm(flow.conv_pre)
        torch.nn.utils.weight_norm(flow.conv_post)
    tokenizer = AutoTokenizer.from_pretrained(name_model, token=token)
    return models[name_model], tokenizer
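# Note: get_model caches the model per repo id but reloads the tokenizer on every
# call. A hypothetical standalone use (outside the Gradio callback) could look like:
#     model, tokenizer = get_model("wasmdashai/vits-ar-sa-huba-v2")  # downloads and caches
#     model, tokenizer = get_model("wasmdashai/vits-ar-sa-huba-v2")  # reuses the cached model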
TXT = """السلام عليكم ورحمة الله وبركاته يا هلا وسهلا ومراحب بالغالي اخباركم طيبين ان شاء الله ارحبوا على العين والراس"""
def process_chunk(chunk_id, spectrogram_chunk, speaker_embeddings, decoder):
    """Decode one spectrogram chunk to a waveform and write it to its own WAV file."""
    with torch.no_grad():
        wav = decoder(spectrogram_chunk, speaker_embeddings)
        wav = wav.squeeze().cpu().numpy()
    file_path = f"audio_chunks/chunk_{chunk_id}.wav"
    sf.write(file_path, wav, samplerate=16000)
    return file_path
def modelspeech(text=TXT, name_model="wasmdashai/vits-ar-sa-huba-v2", speaking_rate=0.9):
    os.makedirs("audio_chunks", exist_ok=True)
    model, tokenizer = get_model(name_model)
    model.config.sampling_rate = 16000
    # text = ask_ai(text)
    inputs = tokenizer(text, return_tensors="pt").to(device)
    model.speaking_rate = speaking_rate
    chunk_files = []
    # Stream spectrogram chunks from the model and decode them in parallel worker threads
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = []
        chunk_id = 0
        for spectrogram_chunk in _inference_forward_stream(
            model,
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            speaker_embeddings=None,
            is_streaming=True,
            chunk_size=32,
        ):
            futures.append(executor.submit(process_chunk, chunk_id, spectrogram_chunk, None, model.decoder))
            chunk_id += 1
        for future in as_completed(futures):
            chunk_files.append(future.result())
    # Reassemble the chunks in order, concatenate, and denoise the full waveform
    chunk_files.sort(key=lambda x: int(x.split("_")[-1].split(".")[0]))
    all_audio = np.concatenate([sf.read(f)[0] for f in chunk_files])
    return (model.config.sampling_rate, remove_noise_nr(all_audio))
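# Hypothetical local test of modelspeech without the Gradio UI (assumes the "key_"
# environment variable grants access to the model repos); the output file name is
# only illustrative.
#     sr, audio = modelspeech(TXT, "wasmdashai/vits-ar-sa-huba-v2", speaking_rate=0.9)
#     sf.write("full_output.wav", audio, samplerate=sr)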
model_choices = gr.Dropdown(
    choices=[
        "wasmdashai/vits-ar-sa-huba-v1",
        "wasmdashai/vits-ar-sa-huba-v2",
        "wasmdashai/vits-ar-sa-A",
        "wasmdashai/vits-ar-ye-sa",
        "wasmdashai/vits-ar-sa-M-v2",
        "wasmdashai/vits-en-v1",
    ],
    label="اختر النموذج",  # "Choose the model"
    value="wasmdashai/vits-ar-sa-huba-v2",
)
demo = gr.Interface(
    fn=modelspeech,
    inputs=["text", model_choices, gr.Slider(0.1, 1, step=0.1, value=0.8)],
    outputs=[gr.Audio(autoplay=True)],
)
demo.queue()
demo.launch(debug=True)