import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import librosa
from panns_inference import SoundEventDetection, labels
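
# NOTE (assumption, not in the original app): on a headless server, matplotlib
# may need a non-interactive backend before pyplot draws anything; if saving
# the figure fails, add the following above the pyplot import:
#
#   import matplotlib
#   matplotlib.use("Agg")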

def plot_sound_event_detection_result(framewise_output):
    """Plot the top sound event classes over time and return the figure as an image.

    Args:
        framewise_output: (time_steps, classes_num) array of per-frame class probabilities.
    """
    out_fig_path = 'sed.png'

    # Rank classes by their peak framewise probability and keep the top 5.
    classwise_output = np.max(framewise_output, axis=0)  # (classes_num,)
    idxes = np.argsort(classwise_output)[::-1][:5]
    ix_to_lb = {i: label for i, label in enumerate(labels)}

    lines = []
    for idx in idxes:
        line, = plt.plot(framewise_output[:, idx], label=ix_to_lb[idx])
        lines.append(line)

    plt.legend(handles=lines)
    plt.xlabel('Frames')
    plt.ylabel('Probability')
    plt.ylim(0, 1.)
    plt.savefig(out_fig_path)
    plt.close()
    print('Saved figure to {}'.format(out_fig_path))

    # Read the saved plot back as an array so Gradio can display it as an image.
    return plt.imread(out_fig_path)

def pred(audio):
    """Run PANNs sound event detection on an uploaded clip and return the plot."""
    rate, y = audio  # gr.Audio yields a (sample_rate, int16 samples) tuple
    device = 'cpu'  # 'cuda' | 'cpu'

    # Convert to float mono and resample to the 32 kHz rate PANNs was trained on.
    y = y.astype(np.float32)
    y = librosa.to_mono(y.T)
    y = librosa.resample(y, orig_sr=rate, target_sr=32000)

    # Peak-normalize to [-1, 1]. Using the absolute peak (rather than y.max())
    # also handles signals whose largest excursion is negative.
    y = y / np.max(np.abs(y))

    y = y[None, :]  # (batch_size, segment_samples)

    # Clip-level audio tagging is also available via panns_inference.AudioTagging,
    # but this demo only runs framewise sound event detection.
    print('------ Sound event detection ------')
    # With checkpoint_path=None, panns_inference downloads its pretrained SED
    # checkpoint on first use (a one-time download of a few hundred MB).
    sed = SoundEventDetection(checkpoint_path=None, device=device)
    framewise_output = sed.inference(y)  # (batch_size, time_steps, classes_num)

    return plot_sound_event_detection_result(framewise_output[0])
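
# Minimal local smoke test (a sketch; it assumes one of the example files below,
# e.g. telephone_speech.wav, is present in the working directory). It feeds pred()
# the same (sample_rate, samples) tuple that gr.Audio produces:
#
#   import soundfile as sf
#   samples, sr = sf.read("telephone_speech.wav", dtype="int16")
#   img = pred((sr, samples))  # numpy image of the saved detection plot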

demo = gr.Interface(
    pred,
    gr.Audio(source="upload"),
    "image",
    examples=[
        "telephone_speech.wav",
        "ringtone.wav",
        "animals.wav",
    ],
    title="Sound Event Detection",
    description="A demo Hugging Face Space for PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition. See the README for details.",
)

demo.launch()
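
# To run locally (assumed dependencies; the original does not pin versions):
#   pip install gradio librosa matplotlib panns_inference
#   python app.py
# Note: gr.Audio(source="upload") is the Gradio 3.x signature; newer Gradio
# releases renamed the argument to `sources=["upload"]`.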