Update app.py
app.py CHANGED
@@ -15,17 +15,26 @@ from collections import Counter
from scipy.stats import kurtosis
from huggingface_hub import InferenceClient
import os
+import time

+'''
+Predictor
+'''
+
+# Get the token used to download the model:
access_token_mod_1 = os.getenv('HF_Access_Personal')

-#
+# Load the processor and the model:
processor = ASTFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
model = AutoModelForAudioClassification.from_pretrained("Robertomarting/tmp_trainer",token=access_token_mod_1)

+# Define a function to discard audio segments that are essentially white noise:
def is_white_noise(audio, threshold=0.75):
    kurt = kurtosis(audio)
    return np.abs(kurt) < 0.1 and np.mean(np.abs(audio)) < threshold

+# Audio processing function: splits the clip into 1-second segments, trims it, converts stereo to mono,
+# resamples it to the sampling rate the model accepts, etc.
def process_audio(audio_tuple, target_sr=16000, target_duration=1.0):
    data = []
    target_length = int(target_sr * target_duration)
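
The body of process_audio is unchanged context, so the diff elides it; only the new comment above describes what it does (1-second segments, trimming, mono conversion, resampling). A minimal sketch of a helper with that behaviour, reusing the is_white_noise filter from this file; the name process_audio_sketch, the scipy resampling, and the padding choice are illustrative assumptions, not the author's code:

import numpy as np
from scipy.signal import resample

def process_audio_sketch(audio_tuple, target_sr=16000, target_duration=1.0):
    # audio_tuple is the (sample_rate, samples) pair that gr.Audio returns
    sr, y = audio_tuple
    y = np.asarray(y, dtype=np.float32)
    if y.ndim > 1:                                    # stereo -> mono
        y = y.mean(axis=1)
    if sr != target_sr:                               # resample to the model's rate
        y = resample(y, int(len(y) * target_sr / sr))
    target_length = int(target_sr * target_duration)
    segments = []
    for start in range(0, len(y), target_length):     # 1-second chunks
        chunk = y[start:start + target_length]
        if len(chunk) < target_length:                 # pad the trailing chunk
            chunk = np.pad(chunk, (0, target_length - len(chunk)))
        if not is_white_noise(chunk):                  # drop mostly-noise segments
            segments.append(chunk)
    return segments
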
@@ -57,6 +66,7 @@ def process_audio(audio_tuple, target_sr=16000, target_duration=1.0):

    return data

+# Feed the segments to the model's feature extractor:
def preprocess_audio(audio_segments):
    inputs = processor(
        audio_segments,
@@ -64,32 +74,26 @@ def preprocess_audio(audio_segments):
        sampling_rate=processor.sampling_rate,
        max_length=int(processor.sampling_rate * 1),
        truncation=True,
-        return_tensors="pt"
+        return_tensors="pt"
    )
    return inputs

+# Run the prediction for each audio clip:
def predict_audio(audio):
-    # Process the audio and get the numpy lists
-    audio_segments = process_audio(audio)

-
+    audio_segments = process_audio(audio)
+
    inputs = preprocess_audio(audio_segments)

-    # Run the predictions
    with torch.no_grad():
        outputs = model(**inputs)

-    # Get the logits of the predictions
    logits = outputs.logits
-
-    # Convert the logits to probabilities
    probabilities = torch.nn.functional.softmax(logits, dim=-1).numpy()
    predicted_classes = probabilities.argmax(axis=1)

-    # Get the most common label
    most_common_predicted_label = Counter(predicted_classes).most_common(1)[0][0]
-
-    # Map numeric labels to text labels
+
    replace_dict = {0: 'Hambre', 1: 'Problemas para respirar', 2: 'Dolor', 3: 'Cansancio/Incomodidad'}
    most_common_predicted_label = replace_dict[most_common_predicted_label]
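
predict_audio classifies every 1-second segment and then keeps the most frequent class before mapping it through replace_dict. A tiny standalone illustration of that majority-vote step (the class ids below are made up for the example):

from collections import Counter
import numpy as np

predicted_classes = np.array([2, 0, 2, 2, 1])              # per-segment class ids (illustrative)
most_common = Counter(predicted_classes).most_common(1)[0][0]
print(most_common)                                          # 2, i.e. 'Dolor' after the replace_dict mapping
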
@@ -98,10 +102,93 @@ def predict_audio(audio):
def clear_audio_input(audio):
    return ""

+'''
+Monitor
+'''
+
+# Get the feature extractor:
+FEATURE_EXTRACTOR = Wav2Vec2FeatureExtractor.from_pretrained("ntu-spml/distilhubert")
+# And our model:
+model_monitor = HubertForSequenceClassification.from_pretrained("A-POR-LOS-8000/distilhubert-finetuned-cry-detector")
+
+# Compute the decibels of the audio that reaches the Gradio app:
+def compute_db(audio_data):
+    rms = np.sqrt(np.mean(np.square(audio_data)))
+    db = 20 * np.log10(rms + 1e-6)
+    return db
+
+# Feature-extraction function for the monitor:
+def preprocess_audio_monitor(audio_segments):
+    inputs = FEATURE_EXTRACTOR(
+        audio_segments,
+        padding=True,
+        sampling_rate=16000,
+        max_length=int(16000*1),
+        return_tensors="pt"
+    )
+    return inputs
+
+# Streaming prediction function:
+def predict_audio_stream(audio_data, sample_rate):
+
+    audio_segments = process_audio(audio_data)
+    inputs = preprocess_audio_monitor(audio_segments)
+
+    with torch.no_grad():
+        outputs = model_monitor(**inputs)
+
+    logits = outputs.logits
+    probabilities = torch.nn.functional.softmax(logits, dim=-1).numpy()
+    crying_probabilities = probabilities[:, 1]
+    avg_crying_probability = crying_probabilities.mean()
+
+    if avg_crying_probability < 0.15:
+        return "Está llorando", avg_crying_probability
+    else:
+        return "No está llorando", avg_crying_probability
+
+# Function that runs the prediction
+def continuous_prediction_with_status(audio, sample_rate=16000,duration=3):
+    audio_segments = []
+    start_time = time.time()
+
+    max_samples = sample_rate * duration
+    audio_data = audio[:max_samples]
+
+    result = predict_audio_stream(audio_data, sample_rate)
+
+    return result
+
+def capture_and_predict(audio, sample_rate=16000, duration=5):
+    max_samples = sample_rate * duration
+    audio_data = audio[:max_samples]
+
+    result, probabilidad = predict_audio_stream(audio_data, sample_rate)
+    return f"Predicción: {result}, Probabilidad: {probabilidad:.2f}", probabilidad
+
+# Function that tells the user whether the threshold has been passed:
+def update_status_to_predicting(audio, visual_threshold):
+    sample_rate, audio_data = audio
+    audio_data = np.array(audio_data, dtype=np.float32)
+
+    db_level = compute_db(audio_data)
+
+    if db_level < visual_threshold:
+        return f"Esperando... Decibelios: {db_level}"
+    else:
+        return f"Prediciendo... Decibelios: {db_level}"
+
+'''
+Asistente
+'''
+
+# Get the token:
access_token = os.getenv('HF_ACCESS_TOKEN')

+# Create the client:
client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407", token=access_token)

+# Define the response function:
def respond(
    message,
    history: list[tuple[str, str]],
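
compute_db reduces an audio chunk to its RMS level and converts it to decibels with 20 * log10(rms + 1e-6). A quick offline check of that formula with a synthetic tone (the test values are illustrative; the formula follows the diff):

import numpy as np

def compute_db(audio_data):
    rms = np.sqrt(np.mean(np.square(audio_data)))
    return 20 * np.log10(rms + 1e-6)

t = np.linspace(0, 1, 16000, endpoint=False)
tone = 0.5 * np.sin(2 * np.pi * 440 * t)    # half-scale 440 Hz tone, RMS = 0.5 / sqrt(2)
print(round(compute_db(tone), 1))            # about -9.0 (dB relative to full scale)
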
@@ -133,6 +220,11 @@ def respond(
        response += token
        yield response

+'''
+Interfaz
+'''
+
+# Create a theme with custom parameters:
my_theme = gr.themes.Soft(
    primary_hue="emerald",
    secondary_hue="green",
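
The middle of respond() is unchanged context and does not appear in the diff; only its opening signature and the trailing "response += token / yield response" lines are shown. Given those lines and the InferenceClient created above, the body very likely follows the standard huggingface_hub streaming-chat pattern. This is a sketch under that assumption: respond_sketch is an illustrative name, and the extra parameters (system_message, max_tokens, temperature, top_p) are inferred from the values set later in the interface, not confirmed by the diff:

def respond_sketch(message, history, system_message, max_tokens, temperature, top_p):
    # Build the conversation in the message format chat_completion expects
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    response = ""
    # Stream tokens from the endpoint and yield the growing partial answer
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        response += token
        yield response
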
@@ -150,54 +242,19 @@ my_theme = gr.themes.Soft(
    shadow_spread='*button_shadow_active'
)

-#
+# Function that shows the Predictor page
def mostrar_pagina_1():
    return gr.update(visible=False), gr.update(visible=True)

-#
+# Function that shows the Monitor page
def mostrar_pagina_2():
    return gr.update(visible=False), gr.update(visible=True)

-#
+# Function that returns to the start screen
def redirigir_a_pantalla_inicial():
    return gr.update(visible=True), gr.update(visible=False)

-
-
-#processor = Wav2Vec2FeatureExtractor.from_pretrained("ntu-spml/distilhubert")
-
-#monitor_model = HubertForSequenceClassification.from_pretrained("A-POR-LOS-8000/distilhubert-finetuned-cry-detector",use_auth_token=access_token_mod_1)
-
-pipeline_monitor = pipeline(model="Robertomarting/tmp_trainer",token=access_token_mod_1,feature_extractor=processor)
-
-def predict_monitor(stream, new_chunk):
-    sr, y = new_chunk
-    y = y.astype(np.float32)
-    y /= np.max(np.abs(y))
-
-    if stream is not None:
-        stream = np.concatenate([stream, y])
-    else:
-        stream = y
-    return stream, pipeline_monitor(stream)
-
-my_theme = gr.themes.Soft(
-    primary_hue="emerald",
-    secondary_hue="green",
-    neutral_hue="slate",
-    text_size="sm",
-    spacing_size="sm",
-    font=[gr.themes.GoogleFont('Nunito'), 'ui-sans-serif', 'system-ui', 'sans-serif'],
-    font_mono=[gr.themes.GoogleFont('Nunito'), 'ui-monospace', 'Consolas', 'monospace'],
-).set(
-    body_background_fill='*neutral_50',
-    body_text_color='*neutral_600',
-    body_text_size='*text_sm',
-    embed_radius='*radius_md',
-    shadow_drop='*shadow_spread',
-    shadow_spread='*button_shadow_active'
-)
-
+# Build the Gradio app:
with gr.Blocks(theme = my_theme) as demo:

    with gr.Column() as pantalla_inicial:
@@ -261,7 +318,7 @@ with gr.Blocks(theme = my_theme) as demo:

        with gr.Column():
            gr.Markdown("<h2>Assistant</h2>")
-            system_message = "
+            system_message = "Eres un chatbot especializado en el cuidado y la salud de los bebés. Estás dispuesto a ayudar amablemente a cualquier padre que tenga dudas o preocupaciones sobre su hijo o hija."
            max_tokens = 512
            temperature = 0.7
            top_p = 0.95
@@ -282,23 +339,30 @@ with gr.Blocks(theme = my_theme) as demo:
        boton_volver_inicio_1.click(redirigir_a_pantalla_inicial, inputs=None, outputs=[pantalla_inicial, pagina_1])

    with gr.Column(visible=False) as pagina_2:
+
        gr.Markdown("<h2>Monitor</h2>")
-        gr.Markdown("
+        gr.Markdown("<h4 style='text-align: center; font-size: 1.5em'>Detección en tiempo real del llanto del bebé</h4>")

-
-        audio_input = gr.Audio(sources=["microphone"], streaming=True, label="Habla cerca del micrófono")
-
-        # Text output where the prediction is shown
-        output_text = gr.Textbox(label="Resultado de la predicción")
-
-        # Hook the streaming prediction to the audio
-        audio_input.stream(fn=lambda audio: predict_monitor(audio, audio_classifier),
-                           inputs=audio_input,
-                           outputs=output_text)
+        audio_stream = gr.Audio(sources=["microphone"], streaming=True)

-
-        boton_volver_inicio_2.click(redirigir_a_pantalla_inicial, inputs=None, outputs=[pantalla_inicial, pagina_2])
+        threshold_db = gr.Slider(minimum=0, maximum=200, step=1, value=20, label="Umbral de dB para activar la predicción")

+        status_label = gr.Textbox(value="Esperando...", label="Estado")
+        prediction_label = gr.Textbox(label="Predicción")
+
+        audio_stream.stream(
+            fn=update_status_to_predicting,
+            inputs=[audio_stream, threshold_db],
+            outputs=status_label
+        )
+
+        # Capture the audio and run the prediction if the threshold is exceeded
+        audio_stream.stream(
+            fn=capture_and_predict,
+            inputs=audio_stream,
+            outputs=prediction_label
+        )
+
    boton_pagina_1.click(mostrar_pagina_1, inputs=None, outputs=[pantalla_inicial, pagina_1])
    boton_pagina_2.click(mostrar_pagina_2, inputs=None, outputs=[pantalla_inicial, pagina_2])
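
Both .stream() bindings receive the same microphone chunks. With gr.Audio(streaming=True), Gradio typically hands each callback a (sample_rate, numpy_array) tuple per chunk, which is why update_status_to_predicting unpacks the tuple and casts to float32 before computing decibels. A minimal standalone sketch of that wiring, under that assumption; demo_sketch and the show_level callback are illustrative, not part of the Space:

import gradio as gr
import numpy as np

def show_level(chunk):
    sample_rate, samples = chunk                      # streaming chunks arrive as (sr, ndarray)
    samples = samples.astype(np.float32)
    rms = np.sqrt(np.mean(np.square(samples)))
    return f"chunk @ {sample_rate} Hz, RMS={rms:.4f}"

with gr.Blocks() as demo_sketch:
    mic = gr.Audio(sources=["microphone"], streaming=True)
    level = gr.Textbox(label="Nivel")
    mic.stream(fn=show_level, inputs=mic, outputs=level)

# demo_sketch.launch()  # uncomment to try the wiring locally
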