Spaces:

wasmdashai
/

DemoLahja

Sleeping

App Files Files Community

wasmdashai committed on Aug 20

Commit

407171e

verified ·

1 Parent(s): 1037683

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -15

app.py CHANGED Viewed

@@ -8,17 +8,23 @@ import numpy as np
 import noisereduce as nr
 import torch.nn as nn
 from typing import Optional, Iterator
-f=""
-token= os.getenv("acees-token")
-# token ="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"
 models = {}
-# دالة إز
 def remove_noise_nr(audio_data, sr=16000):
     return nr.reduce_noise(y=audio_data, hop_length=256, sr=sr)
-# دالة inference
 def _inference_forward_stream(
         self,
         input_ids: Optional[torch.Tensor] = None,
@@ -27,8 +33,8 @@ def _inference_forward_stream(
         chunk_size: int = 32,
         is_streaming: bool = True
     ) -> Iterator[torch.Tensor]:
-    padding_mask = attention_mask.unsqueeze(-1).float() if attention_mask is not None else torch.ones_like(input_ids).unsqueeze(-1).float()
     text_encoder_output = self.text_encoder(input_ids=input_ids, padding_mask=padding_mask, attention_mask=attention_mask)
     hidden_states = text_encoder_output[0].transpose(1, 2)
     input_padding_mask = padding_mask.transpose(1, 2)
@@ -38,7 +44,6 @@ def _inference_forward_stream(
     duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale)
     predicted_lengths = torch.clamp_min(torch.sum(duration, [1,2]), 1).long()
-    # إنشاء attention mask
     indices = torch.arange(predicted_lengths.max(), device=predicted_lengths.device)
     output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
     output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype)
@@ -65,17 +70,18 @@ def _inference_forward_stream(
             yield wav.squeeze().cpu().numpy()
     else:
         with torch.no_grad():
-            print("fff")
             wav = self.decoder(spectrogram, speaker_embeddings)
         yield wav.squeeze().cpu().numpy()
 def get_model(name_model):
     global models
     if name_model in models:
         tokenizer = AutoTokenizer.from_pretrained(name_model, token=token)
         return models[name_model], tokenizer
-    models[name_model] = VitsModel.from_pretrained(name_model, token=token)
     models[name_model].decoder.apply_weight_norm()
     for flow in models[name_model].flow.flows:
         torch.nn.utils.weight_norm(flow.conv_pre)
@@ -84,18 +90,23 @@ def get_model(name_model):
     tokenizer = AutoTokenizer.from_pretrained(name_model, token=token)
     return models[name_model], tokenizer
 TXT = "السلام عليكم ورحمة الله وبركاته يا هلا وسهلا ومراحب بالغالي"
 def modelspeech(text=TXT, name_model="wasmdashai/vits-ar-sa-huba-v2", speaking_rate=16000):
     model, tokenizer = get_model(name_model)
-    inputs = tokenizer(text, return_tensors="pt").to("cuda")
     model.speaking_rate = speaking_rate
     with torch.no_grad():
-      outputs = model(**inputs)
-      waveform = outputs.waveform[0].cpu().numpy()
-     #wav = list(_inference_forward_stream(model, input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, speaker_embeddings=None, is_streaming=False))[0]
     return model.config.sampling_rate, remove_noise_nr(waveform)
 model_choices = gr.Dropdown(
     choices=[
         "wasmdashai/vits-ar-sa-huba-v1",
@@ -109,6 +120,11 @@ model_choices = gr.Dropdown(
     value="wasmdashai/vits-ar-sa-huba-v2"
 )
-demo = gr.Interface(fn=modelspeech, inputs=["text", model_choices, gr.Slider(0.1, 1, step=0.1, value=0.8)], outputs=["audio"])
 demo.queue()
-demo.launch()

 import noisereduce as nr
 import torch.nn as nn
 from typing import Optional, Iterator
+# Read the access token from the Space secrets.
+token = os.getenv("acees-token")  # the secret must be named exactly this under Settings → Repository secrets
+# Dictionary that stores the loaded models.
 models = {}
+# Pick the device: CUDA when available, otherwise CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Noise-removal helper
 def remove_noise_nr(audio_data, sr=16000):
     """Run ``noisereduce`` over ``audio_data`` and return its output.

     Args:
         audio_data: Audio samples handed to ``nr.reduce_noise`` as ``y``.
         sr: Sample rate forwarded to the reducer (defaults to 16000).
     """
     reduce_kwargs = {"y": audio_data, "hop_length": 256, "sr": sr}
     return nr.reduce_noise(**reduce_kwargs)
+# Inference function (streaming / non-streaming)
 def _inference_forward_stream(
         self,
         input_ids: Optional[torch.Tensor] = None,
         chunk_size: int = 32,
         is_streaming: bool = True
     ) -> Iterator[torch.Tensor]:
     # Generator: encodes the text, predicts durations and yields decoded audio.
     # NOTE(review): the annotation says torch.Tensor, but every visible yield
     # converts the waveform with .cpu().numpy(), i.e. NumPy arrays are yielded.
     # Use the attention mask as the padding mask when one is given; otherwise
     # build an all-ones mask shaped like input_ids.
+    padding_mask = attention_mask.unsqueeze(-1).float() if attention_mask is not None else torch.ones_like(input_ids).unsqueeze(-1).float()
     text_encoder_output = self.text_encoder(input_ids=input_ids, padding_mask=padding_mask, attention_mask=attention_mask)
     # Swap dims 1 and 2 of the encoder output and of the mask.
     hidden_states = text_encoder_output[0].transpose(1, 2)
     input_padding_mask = padding_mask.transpose(1, 2)
     # Durations: exp(log_duration), masked, scaled by length_scale, rounded up.
     duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale)
     # Sum the durations over dims 1 and 2 and clamp the total to at least 1.
     predicted_lengths = torch.clamp_min(torch.sum(duration, [1,2]), 1).long()
     # Output padding mask: position i is valid where i < predicted length.
     indices = torch.arange(predicted_lengths.max(), device=predicted_lengths.device)
     output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
     output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype)
             yield wav.squeeze().cpu().numpy()
     # NOTE(review): presumably the counterpart of an `if is_streaming:` branch;
     # decodes the whole spectrogram once without gradients and yields one array.
     else:
         with torch.no_grad():
             wav = self.decoder(spectrogram, speaker_embeddings)
         yield wav.squeeze().cpu().numpy()
+# Load the model + the tokenizer
 def get_model(name_model):
     # Return (model, tokenizer) for name_model. The model is loaded once and
     # kept in the module-level `models` dict; the tokenizer is created on
     # every call.
     global models
     if name_model in models:
         # Cache hit: reuse the stored model, only the tokenizer is re-created.
         tokenizer = AutoTokenizer.from_pretrained(name_model, token=token)
         return models[name_model], tokenizer
     # Cache miss: load the checkpoint with the access token and store it.
+    models[name_model] = VitsModel.from_pretrained(name_model, token=token)
     # Apply weight normalization to the decoder and to each flow's conv_pre.
     models[name_model].decoder.apply_weight_norm()
     for flow in models[name_model].flow.flows:
         torch.nn.utils.weight_norm(flow.conv_pre)
     tokenizer = AutoTokenizer.from_pretrained(name_model, token=token)
     return models[name_model], tokenizer
+# Default text
 TXT = "السلام عليكم ورحمة الله وبركاته يا هلا وسهلا ومراحب بالغالي"
+# Text-to-speech function
 def modelspeech(text=TXT, name_model="wasmdashai/vits-ar-sa-huba-v2", speaking_rate=16000):
     """Synthesize speech for ``text`` and return ``(sampling_rate, waveform)``.

     Args:
         text: Text to synthesize; defaults to the module-level ``TXT`` sample.
         name_model: Model id resolved through ``get_model``.
         speaking_rate: Value assigned to ``model.speaking_rate`` before
             inference. NOTE(review): the UI slider supplies 0.1-1 here, so the
             16000 default looks unintended; it is kept for compatibility.

     Returns:
         Tuple of ``model.config.sampling_rate`` and the denoised waveform
         returned by ``remove_noise_nr``.
     """
     model, tokenizer = get_model(name_model)
     # Keep the weights on the same device as the inputs. Previously only the
     # inputs were moved, which raises a device-mismatch error whenever the
     # model lives elsewhere. ``Module.to`` is a no-op if it is already there.
     model = model.to(device)
     inputs = tokenizer(text, return_tensors="pt").to(device)  # runs on CPU or GPU, whichever is available
     model.speaking_rate = speaking_rate
     with torch.no_grad():
         outputs = model(**inputs)
         waveform = outputs.waveform[0].cpu().numpy()
     sampling_rate = model.config.sampling_rate
     # Denoise at the model's real sample rate instead of the helper's
     # hard-coded 16000 default, so both returned values agree.
     return sampling_rate, remove_noise_nr(waveform, sr=sampling_rate)
+# Gradio interface
 # Dropdown used to pick which model id is passed to modelspeech.
 model_choices = gr.Dropdown(
     choices=[
         "wasmdashai/vits-ar-sa-huba-v1",
     value="wasmdashai/vits-ar-sa-huba-v2"
 )
+# The three inputs map positionally onto modelspeech(text, name_model, speaking_rate).
+demo = gr.Interface(
+    fn=modelspeech,
+    inputs=["text", model_choices, gr.Slider(0.1, 1, step=0.1, value=0.8)],
+    outputs=["audio"]
+)
 demo.queue()
+# Serve on host 0.0.0.0 (all network interfaces), port 7860.
+demo.launch(server_name="0.0.0.0", server_port=7860)