wasmdashai committed · verified · Commit 407171e · 1 Parent(s): 1037683

Update app.py

Files changed (1):
  1. app.py +31 -15
app.py CHANGED
@@ -8,17 +8,23 @@ import numpy as np
  import noisereduce as nr
  import torch.nn as nn
  from typing import Optional, Iterator
- f=""
- token= os.getenv("acees-token")
- # token ="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"
 
+ # Read the token from Secrets
+ token = os.getenv("acees-token")  # make sure it uses the same name as in Settings → Repository secrets
+
+ # Container for storing the loaded models
  models = {}
 
- # Noise removal function
+ # Choose the device (CUDA if available, otherwise CPU)
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+ # Noise removal function
  def remove_noise_nr(audio_data, sr=16000):
      return nr.reduce_noise(y=audio_data, hop_length=256, sr=sr)
 
- # inference function
+
+ # Inference function (streaming / non-streaming)
  def _inference_forward_stream(
      self,
      input_ids: Optional[torch.Tensor] = None,
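
The hunk above replaces the hard-coded token with a Space secret and adds device selection. A minimal sketch of that setup in isolation, assuming the secret is named exactly "acees-token"; the explicit failure is illustrative and not part of app.py:

    import os
    import torch

    # The name passed to os.getenv must match the entry under Settings → Repository secrets.
    token = os.getenv("acees-token")
    if token is None:
        # Illustrative guard: gated model downloads would fail without the token.
        raise RuntimeError("Space secret 'acees-token' is not set")

    # Same device selection as the commit: CUDA when available, otherwise CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"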
@@ -27,8 +33,8 @@ def _inference_forward_stream(
      chunk_size: int = 32,
      is_streaming: bool = True
  ) -> Iterator[torch.Tensor]:
-     padding_mask = attention_mask.unsqueeze(-1).float() if attention_mask is not None else torch.ones_like(input_ids).unsqueeze(-1).float()
 
+     padding_mask = attention_mask.unsqueeze(-1).float() if attention_mask is not None else torch.ones_like(input_ids).unsqueeze(-1).float()
      text_encoder_output = self.text_encoder(input_ids=input_ids, padding_mask=padding_mask, attention_mask=attention_mask)
      hidden_states = text_encoder_output[0].transpose(1, 2)
      input_padding_mask = padding_mask.transpose(1, 2)
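
The `padding_mask` line moved in this hunk turns the 2-D attention mask into a float mask with a trailing channel dimension before it reaches the text encoder. A toy illustration of the shapes, with made-up values:

    import torch

    input_ids = torch.tensor([[5, 17, 42, 0]])      # (batch, seq_len); 0 is padding
    attention_mask = torch.tensor([[1, 1, 1, 0]])   # 1 = real token, 0 = padding

    # Same expression as in the hunk above.
    padding_mask = attention_mask.unsqueeze(-1).float()
    print(padding_mask.shape)  # torch.Size([1, 4, 1])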
@@ -38,7 +44,6 @@ def _inference_forward_stream(
      duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale)
      predicted_lengths = torch.clamp_min(torch.sum(duration, [1,2]), 1).long()
 
-     # Create the attention mask
      indices = torch.arange(predicted_lengths.max(), device=predicted_lengths.device)
      output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
      output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype)
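
The lines after the removed comment build the output-side mask from the predicted frame counts. A toy example with made-up lengths shows the broadcast at work:

    import torch

    predicted_lengths = torch.tensor([3, 5])         # frames per batch item (toy values)
    indices = torch.arange(predicted_lengths.max())  # tensor([0, 1, 2, 3, 4])

    # Same comparison as in the hunk: True while the frame index is below the predicted length.
    output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
    print(output_padding_mask.int())
    # tensor([[1, 1, 1, 0, 0],
    #         [1, 1, 1, 1, 1]])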
@@ -65,17 +70,18 @@ def _inference_forward_stream(
              yield wav.squeeze().cpu().numpy()
      else:
          with torch.no_grad():
-             print("fff")
              wav = self.decoder(spectrogram, speaker_embeddings)
              yield wav.squeeze().cpu().numpy()
 
+
+ # Load the model + token
  def get_model(name_model):
      global models
      if name_model in models:
          tokenizer = AutoTokenizer.from_pretrained(name_model, token=token)
          return models[name_model], tokenizer
 
-     models[name_model] = VitsModel.from_pretrained(name_model, token=token)
+     models[name_model] = VitsModel.from_pretrained(name_model, token=token)
      models[name_model].decoder.apply_weight_norm()
      for flow in models[name_model].flow.flows:
          torch.nn.utils.weight_norm(flow.conv_pre)
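
Because `_inference_forward_stream` is a generator, it can be consumed chunk by chunk outside Gradio. A hedged sketch of direct use, assuming `get_model`, `TXT`, and `device` as defined in this file and that each yielded chunk is a NumPy array:

    import numpy as np

    model, tokenizer = get_model("wasmdashai/vits-ar-sa-huba-v2")
    inputs = tokenizer(TXT, return_tensors="pt").to(device)

    chunks = _inference_forward_stream(
        model,
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        speaker_embeddings=None,
        is_streaming=True,
    )
    waveform = np.concatenate([np.atleast_1d(chunk) for chunk in chunks])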
@@ -84,18 +90,23 @@ def get_model(name_model):
      tokenizer = AutoTokenizer.from_pretrained(name_model, token=token)
      return models[name_model], tokenizer
 
+
+ # Default text
  TXT = "السلام عليكم ورحمة الله وبركاته يا هلا وسهلا ومراحب بالغالي"
+
+
+ # Text-to-speech function
  def modelspeech(text=TXT, name_model="wasmdashai/vits-ar-sa-huba-v2", speaking_rate=16000):
      model, tokenizer = get_model(name_model)
-     inputs = tokenizer(text, return_tensors="pt").to("cuda")
+     inputs = tokenizer(text, return_tensors="pt").to(device)  # runs on CPU or GPU, whichever is available
      model.speaking_rate = speaking_rate
      with torch.no_grad():
-         outputs = model(**inputs)
-         waveform = outputs.waveform[0].cpu().numpy()
-         #wav = list(_inference_forward_stream(model, input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, speaker_embeddings=None, is_streaming=False))[0]
+         outputs = model(**inputs)
+         waveform = outputs.waveform[0].cpu().numpy()
      return model.config.sampling_rate, remove_noise_nr(waveform)
 
 
+ # Gradio interface
  model_choices = gr.Dropdown(
      choices=[
          "wasmdashai/vits-ar-sa-huba-v1",
@@ -109,6 +120,11 @@ model_choices = gr.Dropdown(
      value="wasmdashai/vits-ar-sa-huba-v2"
  )
 
- demo = gr.Interface(fn=modelspeech, inputs=["text", model_choices, gr.Slider(0.1, 1, step=0.1, value=0.8)], outputs=["audio"])
+ demo = gr.Interface(
+     fn=modelspeech,
+     inputs=["text", model_choices, gr.Slider(0.1, 1, step=0.1, value=0.8)],
+     outputs=["audio"]
+ )
+
  demo.queue()
- demo.launch()
+ demo.launch(server_name="0.0.0.0", server_port=7860)
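
The three Gradio `inputs` map positionally onto `modelspeech`'s parameters (textbox → `text`, dropdown → `name_model`, slider → `speaking_rate`), and the explicit `server_name`/`server_port` pin the app to 0.0.0.0:7860, which is where Hugging Face Spaces expects it. A hedged sketch of the equivalent direct call, using the slider's default value:

    sr, audio = modelspeech(
        text=TXT,                                     # Textbox -> text
        name_model="wasmdashai/vits-ar-sa-huba-v2",   # Dropdown -> name_model
        speaking_rate=0.8,                            # Slider (0.1–1) -> speaking_rate
    )
    # sr is model.config.sampling_rate; audio is the denoised NumPy waveform.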
 