Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import torch | |
| from transformers import BitsAndBytesConfig, HqqConfig | |
| from whisperplus import ( | |
| SpeechToTextPipeline, | |
| download_youtube_to_mp3, | |
| download_youtube_to_mp4, | |
| format_speech_to_dialogue, | |
| ) | |
| from whisperplus.pipelines.long_text_summarization import LongTextSummarizationPipeline | |
| from whisperplus.pipelines.summarization import TextSummarizationPipeline | |
| from whisperplus.pipelines.text2speech import TextToSpeechPipeline | |
| from whisperplus.pipelines.whisper_autocaption import WhisperAutoCaptionPipeline | |
| from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline | |
import os
import subprocess

# Install flash-attn at startup (Hugging Face Spaces pattern); skipping the
# CUDA build avoids compiling kernels inside the Space container.
# FIX: merge with the current environment — passing ONLY the extra variable
# as `env` would wipe PATH (and everything else), breaking the pip call.
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)
def youtube_url_to_text(url, model_id, language_choice):
    """
    Download a YouTube video's audio as MP3 and transcribe it.

    Args:
        url (str): YouTube URL to download.
        model_id (str): Hugging Face ID of the speech-to-text model.
        language_choice (str): Language to use for transcription.

    Returns:
        str: Transcript produced by the speech-to-text pipeline.
    """
    mp3_path = download_youtube_to_mp3(url, output_dir="downloads", filename="test")

    # 4-bit HQQ quantization to reduce memory use; axis=0 is the default.
    quant_cfg = HqqConfig(
        nbits=4,
        group_size=64,
        quant_zero=False,
        quant_scale=False,
        axis=0,
        offload_meta=False,
    )

    stt = SpeechToTextPipeline(
        model_id=model_id,
        quant_config=quant_cfg,
        flash_attention_2=True,
    )
    return stt(
        audio_path=mp3_path,
        chunk_length_s=30,
        stride_length_s=5,
        max_new_tokens=128,
        batch_size=100,
        language=language_choice,
        return_timestamps=False,
    )
def summarization(text, model_id="facebook/bart-large-cnn"):
    """
    Summarize *text* with a Hugging Face summarization model.

    Args:
        text (str): Text to summarize.
        model_id (str): Hugging Face ID of the summarization model.

    Returns:
        str: The generated summary text.
    """
    pipeline = TextSummarizationPipeline(model_id=model_id)
    result = pipeline.summarize(text)
    # The pipeline returns a list of dicts; take the first summary.
    return result[0]["summary_text"]
def long_text_summarization(text, model_id="facebook/bart-large-cnn"):
    """
    Summarize long-form *text* using the chunking long-text pipeline.

    Args:
        text (str): Text to summarize.
        model_id (str): Hugging Face ID of the summarization model.

    Returns:
        str: The generated summary.
    """
    pipeline = LongTextSummarizationPipeline(model_id=model_id)
    return pipeline.summarize(text)
def speaker_diarization(url, model_id, device, num_speakers, min_speaker, max_speaker):
    """
    Download a YouTube video's audio and run speaker-diarized transcription.

    Args:
        url (str): YouTube URL to download.
        model_id (str): Hugging Face ID of the ASR model.
        device (str): Device to run on, e.g. "cpu" or "cuda".
        num_speakers (int): Expected number of speakers.
        min_speaker (int): Lower bound on the number of speakers.
        max_speaker (int): Upper bound on the number of speakers.

    Returns:
        tuple: ``(dialogue, audio_path)`` — the formatted speaker-labelled
        transcript and the path of the downloaded audio file.
    """
    pipeline = ASRDiarizationPipeline.from_pretrained(
        asr_model=model_id,
        diarizer_model="pyannote/speaker-diarization",
        use_auth_token=False,
        chunk_length_s=30,
        device=device,
    )
    # NOTE(review): unlike youtube_url_to_text, no output_dir is given here,
    # so the download goes to the helper's default location — kept as-is.
    audio_path = download_youtube_to_mp3(url)
    output_text = pipeline(
        audio_path,
        num_speakers=num_speakers,
        min_speaker=min_speaker,
        max_speaker=max_speaker,
    )
    dialogue = format_speech_to_dialogue(output_text)
    return dialogue, audio_path
def text2spech_bark(text, model_id="suno/bark", voice_preset="v2/en_speaker_6"):
    """
    Synthesize speech from *text* with a Bark text-to-speech model.

    Args:
        text (str): Text to synthesize.
        model_id (str): Hugging Face ID of the TTS model.
        voice_preset (str): Bark voice preset to use.

    Returns:
        The audio produced by the TTS pipeline.
    """
    pipeline = TextToSpeechPipeline(model_id=model_id)
    return pipeline(text=text, voice_preset=voice_preset)
def whisper_autocaption(url, language, model_id="openai/whisper-large-v3"):
    """
    Download a YouTube video and burn Whisper-generated captions into it.

    Args:
        url (str): YouTube URL to download.
        language (str): Caption language.
        model_id (str): Hugging Face ID of the Whisper model.

    Returns:
        The captioned video produced by the autocaption pipeline.
    """
    source_video = download_youtube_to_mp4(url)
    pipeline = WhisperAutoCaptionPipeline(model_id=model_id)
    return pipeline(video_path=source_video, output_path="output.mp4", language=language)
# Gradio UI: one tab per whisperplus feature, each wiring its inputs to the
# corresponding handler defined above.
with gr.Blocks() as demo:
    with gr.Tab("YouTube URL to Text"):
        with gr.Row():
            with gr.Column():
                url_input = gr.Textbox(label="Enter YouTube URL")
                model_id_input = gr.Textbox(label="Enter Model ID", value="openai/whisper-medium")
                language_input = gr.Textbox(label="Enter Language", value="en")
                submit_btn1 = gr.Button("Submit")
            with gr.Column():
                output1 = gr.Textbox(label="Transcript")
        submit_btn1.click(
            youtube_url_to_text, inputs=[url_input, model_id_input, language_input], outputs=output1)

    with gr.Tab("Text Summarization"):
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(label="Enter Text", lines=5)
                model_id_input2 = gr.Textbox(label="Enter Model ID", value="facebook/bart-large-cnn")
                submit_btn2 = gr.Button("Summarize")
            with gr.Column():
                output2 = gr.Textbox(label="Summary")
        submit_btn2.click(summarization, inputs=[text_input, model_id_input2], outputs=output2)

    with gr.Tab("Long Text Summarization"):
        with gr.Row():
            with gr.Column():
                long_text_input = gr.Textbox(label="Enter Long Text", lines=10)
                model_id_input3 = gr.Textbox(label="Enter Model ID", value="facebook/bart-large-cnn")
                submit_btn3 = gr.Button("Summarize Long Text")
            with gr.Column():
                output3 = gr.Textbox(label="Long Text Summary")
        submit_btn3.click(long_text_summarization, inputs=[long_text_input, model_id_input3], outputs=output3)

    with gr.Tab("Speaker Diarization"):
        with gr.Row():
            with gr.Column():
                url_input2 = gr.Textbox(label="Enter YouTube URL")
                model_id_input4 = gr.Textbox(label="Enter Model ID")
                num_speakers = gr.Number(label="Number of Speakers", value=2)
                min_speakers = gr.Number(label="Min Speakers", value=1)
                max_speakers = gr.Number(label="Max Speakers", value=4)
                device = gr.Textbox(label="Device", value="cpu")
                submit_btn4 = gr.Button("Diarize")
            with gr.Column():
                output4 = gr.DataFrame(headers=["Speaker", "Text"], datatype=["str", "str"])
                # FIX: speaker_diarization returns (dialogue, audio_path); the
                # original wired only one output, which errors at runtime.
                output4_audio = gr.Audio(label="Downloaded Audio")
        submit_btn4.click(
            speaker_diarization,
            inputs=[url_input2, model_id_input4, device, num_speakers, min_speakers, max_speakers],
            outputs=[output4, output4_audio])

    with gr.Tab("Text to Speech"):
        with gr.Row():
            with gr.Column():
                text_input2 = gr.Textbox(label="Enter Text", lines=3)
                model_id_input5 = gr.Textbox(label="Enter Model ID", value="suno/bark")
                voice_preset = gr.Textbox(label="Voice Preset", value="v2/en_speaker_6")
                submit_btn5 = gr.Button("Generate Audio")
            with gr.Column():
                output5 = gr.Audio(label="Generated Audio")
        submit_btn5.click(
            text2spech_bark, inputs=[text_input2, model_id_input5, voice_preset], outputs=output5)

    with gr.Tab("Whisper Autocaption"):
        with gr.Row():
            with gr.Column():
                url_input3 = gr.Textbox(label="Enter YouTube URL")
                language = gr.Textbox(label="Language", value="en")
                # Default aligned with whisper_autocaption's own default
                # (was "openai/whisper-large-v2", inconsistent with the handler).
                model_id_input6 = gr.Textbox(label="Enter Model ID", value="openai/whisper-large-v3")
                submit_btn6 = gr.Button("Generate Captions")
            with gr.Column():
                output6 = gr.Video(label="Captioned Video")
        submit_btn6.click(
            whisper_autocaption, inputs=[url_input3, language, model_id_input6], outputs=output6)

demo.launch()