Spaces:

pratikshahp
/

audio-to-text-conversion

Runtime error

App Files Files Community

audio-to-text-conversion / app.py

pratikshahp

Update app.py

8b331ad verified over 1 year ago

raw

history blame contribute delete

3.44 kB

	# Audio-to-text transcription script with language detection.
	# Author: Pratiksha Patel
	# Description: This script records audio, transcribes it to text, detects the language of the recording and saves the result to a txt file.
	# import required modules
	#import torch
	#import streamlit as st
	#from audio_recorder_streamlit import audio_recorder
	#import numpy as np
	#from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

	#def transcribe_audio(audio_bytes):
	# processor = AutoProcessor.from_pretrained("openai/whisper-large")
	# model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large")

	# Convert audio bytes to numpy array
	# audio_array = np.frombuffer(audio_bytes, dtype=np.int16)

	# Normalize audio array
	#audio_tensor = torch.tensor(audio_array, dtype=torch.float64) / 32768.0

	# Provide inputs to the processor
	##inputs = processor(audio=audio_tensor, sampling_rate=16000, return_tensors="pt")
	#input_features = processor(audio_tensor, sampling_rate=16000, return_tensors="pt").input_features

	# generate token ids
	#predicted_ids = model.generate(input_features)
	# decode token ids to text
	#transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)

	#transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
	#return transcription
	# Streamlit app
	#st.title("Audio to Text Transcription..")

	#audio_bytes = audio_recorder(pause_threshold=3.0, sample_rate=16_000)

	#if audio_bytes:
	# st.audio(audio_bytes, format="audio/wav")

	# transcription = transcribe_audio(audio_bytes)

	# if transcription:
	# st.write("Transcription:")
	# st.write(transcription)
	#else:
	# st.write("Error: Failed to transcribe audio.")
	#else:
	# st.write("No audio recorded.")
	import torch
	import streamlit as st
	from audio_recorder_streamlit import audio_recorder
	import numpy as np
	#from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
	# Load model directly
	from transformers import AutoProcessor, AutoModelForPreTraining

	def _read_pcm16_mono(audio_bytes):
	    """Decode recorder output into a mono float32 waveform in [-1.0, 1.0).

	    Args:
	        audio_bytes: Either a complete RIFF/WAV file (16-bit PCM) or a bare
	            buffer of little-endian 16-bit PCM samples.

	    Returns:
	        A ``(samples, sample_rate)`` tuple where ``samples`` is a 1-D
	        ``numpy.float32`` array and ``sample_rate`` is in Hz.

	    Raises:
	        ValueError: If the WAV payload is not 16-bit PCM.
	        wave.Error: If the buffer starts like a WAV file but is malformed.
	    """
	    import io
	    import wave

	    if audio_bytes[:4] == b"RIFF":
	        # Parse the container so the header bytes are never mistaken for
	        # audio samples and the real channel count / rate are honoured.
	        with wave.open(io.BytesIO(audio_bytes), "rb") as wav_file:
	            channels = wav_file.getnchannels()
	            sample_width = wav_file.getsampwidth()
	            sample_rate = wav_file.getframerate()
	            frames = wav_file.readframes(wav_file.getnframes())
	        if sample_width != 2:
	            raise ValueError(
	                f"Expected 16-bit PCM audio, got {8 * sample_width}-bit samples"
	            )
	    else:
	        # Backward compatible path: a header-less buffer is treated as raw
	        # mono 16-bit PCM at 16 kHz, as the original implementation did.
	        channels, sample_rate = 1, 16000
	        frames = audio_bytes

	    # Drop a trailing partial frame so frombuffer/reshape cannot fail.
	    frame_size = 2 * channels
	    frames = frames[: len(frames) - len(frames) % frame_size]

	    samples = np.frombuffer(frames, dtype="<i2").astype(np.float32) / 32768.0
	    if channels > 1:
	        # Average interleaved channels down to mono.
	        samples = samples.reshape(-1, channels).mean(axis=1)
	    return samples, sample_rate


	def transcribe_audio(audio_bytes):
	    """Transcribe recorded speech to text with a Wav2Vec2 CTC model.

	    Args:
	        audio_bytes: Audio as returned by the recorder widget (a WAV file),
	            or raw little-endian 16-bit mono PCM sampled at 16 kHz.

	    Returns:
	        A list of transcription strings (one per input batch item), or an
	        empty list when there is no audio to transcribe.

	    Raises:
	        ValueError: If the audio is not 16-bit PCM, or its sampling rate
	            does not match what the feature extractor expects.
	    """
	    if not audio_bytes:
	        return []

	    waveform, sample_rate = _read_pcm16_mono(audio_bytes)
	    if waveform.size == 0:
	        return []

	    # Imported here so this block does not depend on edits to the
	    # file-level import list.
	    from transformers import AutoModelForCTC

	    # "facebook/wav2vec2-base" is a pretraining-only checkpoint (no
	    # tokenizer, no CTC head), so it cannot produce text. The "-960h"
	    # checkpoint is the same architecture fine-tuned for English ASR.
	    # NOTE: the model is re-loaded on every call; cache it if latency matters.
	    checkpoint = "facebook/wav2vec2-base-960h"
	    processor = AutoProcessor.from_pretrained(checkpoint)
	    model = AutoModelForCTC.from_pretrained(checkpoint)

	    # Wav2Vec2 processors return `input_values` (not `input_features`).
	    # Passing the true sampling rate lets the processor reject mismatches.
	    inputs = processor(waveform, sampling_rate=sample_rate, return_tensors="pt")

	    # CTC models have no `generate`; take the arg-max token per frame.
	    with torch.no_grad():
	        logits = model(inputs.input_values).logits
	    predicted_ids = torch.argmax(logits, dim=-1)

	    # Collapse repeated tokens / blanks and map ids back to text.
	    return processor.batch_decode(predicted_ids)

	# Streamlit app: record audio in the browser, play it back, and show the
	# transcription returned by transcribe_audio().
	st.title("Audio to Text Transcription..")

	# The recorder is asked for 16 kHz audio and stops after 3 s of silence.
	audio_bytes = audio_recorder(pause_threshold=3.0, sample_rate=16_000)
	if audio_bytes:
	    st.audio(audio_bytes, format="audio/wav")

	    try:
	        transcription = transcribe_audio(audio_bytes)
	    except Exception as exc:  # UI boundary: report the failure, don't crash
	        st.write("Error: Failed to transcribe audio.")
	        st.exception(exc)
	    else:
	        if transcription:
	            st.write("Transcription:")
	            st.write(transcription)
	        else:
	            st.write("Error: Failed to transcribe audio.")
	else:
	    st.write("No audio recorded.")