import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import librosa
from panns_inference import SoundEventDetection, labels
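
# NOTE (assumption, not in the original app): on a headless server, matplotlib
# may need a non-interactive backend before pyplot draws anything; if saving
# the figure fails, add the following above the pyplot import:
#
#   import matplotlib
#   matplotlib.use("Agg")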

def plot_sound_event_detection_result(framewise_output):
    """Plot the top sound event classes over time and return the figure as an image.

    Args:
        framewise_output: (time_steps, classes_num) array of per-frame class probabilities.
    """
    out_fig_path = 'sed.png'

    # Rank classes by their peak framewise probability and keep the top 5.
    classwise_output = np.max(framewise_output, axis=0)  # (classes_num,)
    idxes = np.argsort(classwise_output)[::-1][:5]
    ix_to_lb = {i: label for i, label in enumerate(labels)}

    lines = []
    for idx in idxes:
        line, = plt.plot(framewise_output[:, idx], label=ix_to_lb[idx])
        lines.append(line)

    plt.legend(handles=lines)
    plt.xlabel('Frames')
    plt.ylabel('Probability')
    plt.ylim(0, 1.)
    plt.savefig(out_fig_path)
    plt.close()
    print('Saved figure to {}'.format(out_fig_path))

    # Read the saved plot back as an array so Gradio can display it as an image.
    return plt.imread(out_fig_path)

def pred(audio):
    """Run PANNs sound event detection on an uploaded clip and return the plot."""
    rate, y = audio  # gr.Audio yields a (sample_rate, int16 samples) tuple
    device = 'cpu'  # 'cuda' | 'cpu'

    # Convert to float mono and resample to the 32 kHz rate PANNs was trained on.
    y = y.astype(np.float32)
    y = librosa.to_mono(y.T)
    y = librosa.resample(y, orig_sr=rate, target_sr=32000)

    # Peak-normalize to [-1, 1]. Using the absolute peak (rather than y.max())
    # also handles signals whose largest excursion is negative.
    y = y / np.max(np.abs(y))

    y = y[None, :]  # (batch_size, segment_samples)

    # Clip-level audio tagging is also available via panns_inference.AudioTagging,
    # but this demo only runs framewise sound event detection.
    print('------ Sound event detection ------')
    # With checkpoint_path=None, panns_inference downloads its pretrained SED
    # checkpoint on first use (a one-time download of a few hundred MB).
    sed = SoundEventDetection(checkpoint_path=None, device=device)
    framewise_output = sed.inference(y)  # (batch_size, time_steps, classes_num)

    return plot_sound_event_detection_result(framewise_output[0])
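
# Minimal local smoke test (a sketch; it assumes one of the example files below,
# e.g. telephone_speech.wav, is present in the working directory). It feeds pred()
# the same (sample_rate, samples) tuple that gr.Audio produces:
#
#   import soundfile as sf
#   samples, sr = sf.read("telephone_speech.wav", dtype="int16")
#   img = pred((sr, samples))  # numpy image of the saved detection plot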

demo = gr.Interface(
    pred,
    gr.Audio(source="upload"),
    "image",
    examples=[
        "telephone_speech.wav",
        "ringtone.wav",
        "animals.wav",
    ],
    title="Sound Event Detection",
    description="A demo Hugging Face Space for PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition. See the README for details.",
)

demo.launch()
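
# To run locally (assumed dependencies; the original does not pin versions):
#   pip install gradio librosa matplotlib panns_inference
#   python app.py
# Note: gr.Audio(source="upload") is the Gradio 3.x signature; newer Gradio
# releases renamed the argument to `sources=["upload"]`.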