import gradio as gr
from fastrtc import (
    ReplyOnPause,
    AlgoOptions,
    SileroVadOptions,
    AdditionalOutputs,
    WebRTC,
    get_cloudflare_turn_credentials_async,
    get_cloudflare_turn_credentials,
)
import os
from dotenv import load_dotenv
import time
import numpy as np
import sys
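# Make the repository root importable so the backend package resolves when
# this file is run from frontend/.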
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from backend.tts import synthesize_text
from backend.asr import transcribe_audio
from backend.utils import preprocess_audio, is_valid_turn
from backend.main import stream_chat_response
from pydub import AudioSegment
load_dotenv(override=True)
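# Load ~1 s of a phone-ringing MP3, downmix to mono if needed, and normalize
# to float32 so it can be streamed as the "call connecting" sound.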
phone_waiting_sound = AudioSegment.from_mp3("frontend/phone-ringing-382734.mp3")[:1000]
sound_samples = np.array(phone_waiting_sound.get_array_of_samples(), dtype=np.int16)
if phone_waiting_sound.channels > 1:
sound_samples = sound_samples.reshape((-1, phone_waiting_sound.channels)).mean(axis=1)
sound_samples = sound_samples.astype(np.float32) / 32768.0  # Normalize int16 range to [-1, 1]
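# Startup hook for ReplyOnPause: play the ring tone, speak the Thai greeting,
# then push the greeting into the visible chat history.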
def startup(_):
yield (phone_waiting_sound.frame_rate, sound_samples)
    STARTUP_MESSAGE = "สวัสดีค่ะ มีข้อมูลสอบถามด้านใดคะ?"  # "Hello, what information would you like to ask about?"
yield from synthesize_text(STARTUP_MESSAGE)
time.sleep(2)
yield AdditionalOutputs([{"role": "assistant", "content": STARTUP_MESSAGE}])
custom_css = """
/* Overall Gradio page styling; the hot pink body background is currently disabled */
body {
    /* background-color: #ff69b4;  hot pink, disabled */
    margin: 0;
    padding: 0;
    font-family: sans-serif;
}
/* Title styling */
h1 {
color: #fff;
text-shadow: 1px 1px 2px #ff85a2;
font-size: 2.5em;
margin-bottom: 20px;
text-align: center;
}
/* Style the column holding the telephone interface */
.phone-column {
max-width: 350px !important; /* Limit the width of the phone column */
margin: 0 auto; /* Center the column */
border-radius: 20px;
background-color: #ff69b4; /* Hot pink for the telephone interface */
box-shadow: 0 0 15px rgba(0, 0, 0, 0.2);
padding: 20px;
}
/* Conversation history box styling */
#conversation-history-chatbot {
background-color: #ffc0cb; /* Lighter pink for conversation history */
border: 1px solid #ccc;
border-radius: 10px;
padding: 10px;
box-shadow: 0 0 15px rgba(0, 0, 0, 0.2);
}
"""
def response(audio: tuple[int, np.ndarray] | None, conversation_history):
"""
Handles user audio input, transcribes it, streams LLM text via backend.main,
and synthesizes chunks to audio while updating the conversation history.
"""
print(f"--- Latency Breakdown ---")
start_time = time.time()
if conversation_history is None:
conversation_history = []
previous_history = list(conversation_history)
    if not audio or audio[1] is None or not np.any(audio[1]):
        print("No audio input detected; skipping response generation.")
        print("------------------------")
        return
sample_rate, audio_array = audio
try:
processed_audio = preprocess_audio((sample_rate, audio_array), target_frame_rate=16000)
except Exception as audio_err:
print(f"Audio preprocessing failed: {audio_err}")
print(f"------------------------")
return
t0 = time.time()
    transcription = transcribe_audio(processed_audio)
t_asr = time.time() - t0
print(f"ASR: {t_asr:.4f}s")
if not transcription.strip():
print("No valid transcription; skipping response generation.")
print(f"------------------------")
return
user_turn = {"role": "user", "content": transcription}
print(f"User: {transcription}")
if is_valid_turn(user_turn):
conversation_history.append(user_turn)
yield AdditionalOutputs(conversation_history)
print("Conversation history:", conversation_history)
assistant_turn = {"role": "assistant", "content": ""}
conversation_history.append(assistant_turn)
history_for_stream = [dict(turn) for turn in previous_history if is_valid_turn(turn)]
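    # Incremental TTS state: characters are buffered until enough space/newline
    # delimiters arrive. The first chunk flushes at the first delimiter to cut
    # time-to-first-audio; afterwards the threshold grows from 3 toward 5
    # delimiters so later chunks are longer and sound smoother.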
text_buffer = ""
full_response = ""
delimiter_count = 0
n_threshold = 3
max_n_threshold = 5
lang = "th"
chunk_count = 0
first_chunk_sent = False
start_llm_stream = time.time()
try:
for text_chunk in stream_chat_response(history_for_stream, transcription):
if not isinstance(text_chunk, str):
text_chunk = str(text_chunk)
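            # Walk the chunk character by character so a send can trigger at any
            # delimiter boundary. assistant_turn is mutated in place, so the dict
            # already appended to conversation_history always holds the latest text.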
i = 0
while i < len(text_chunk):
char = text_chunk[i]
text_buffer += char
full_response += char
assistant_turn["content"] = full_response.strip()
is_delimiter = False
if char in {' ', '\n'}:
is_delimiter = True
delimiter_count += 1
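                    # The Thai repetition mark 'ๆ' usually follows a space; pull it
                    # into the current buffer so it is not orphaned in the next chunk.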
if i + 1 < len(text_chunk) and text_chunk[i + 1] == 'ๆ':
text_buffer += text_chunk[i + 1]
full_response += text_chunk[i + 1]
i += 1
send_now = False
if not first_chunk_sent:
if is_delimiter and text_buffer.strip():
send_now = True
else:
if delimiter_count >= n_threshold and text_buffer.strip():
send_now = True
if n_threshold < max_n_threshold:
n_threshold += 1
if send_now:
buffer_to_send = text_buffer.strip()
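                    # Heuristic trims: avoid ending a spoken chunk on 'วันที่'
                    # ("date"; the actual date usually follows in the next chunk)
                    # and drop a repeated polite particle 'ค่ะ' after the first
                    # chunk. Trimmed text stays in full_response but is not spoken.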
try:
if buffer_to_send and buffer_to_send.endswith('วันที่'):
buffer_to_send = buffer_to_send[:-len('วันที่')]
if buffer_to_send and first_chunk_sent and buffer_to_send.endswith('ค่ะ'):
buffer_to_send = buffer_to_send[:-len('ค่ะ')]
except Exception:
buffer_to_send = buffer_to_send.replace('ค่ะ', '')
if buffer_to_send:
chunk_count += 1
if chunk_count == 1:
first_llm_chunk_time = time.time()
t_llm_first_token = first_llm_chunk_time - start_llm_stream
print(f"LLM TTFC: {t_llm_first_token:.4f}s (Time To First Chunk)")
yield from synthesize_text(buffer_to_send, lang=lang)
first_chunk_sent = True
text_buffer = ""
delimiter_count = 0
yield AdditionalOutputs(conversation_history)
i += 1
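        # The LLM stream has ended: flush whatever is left in the buffer,
        # mirroring the in-loop send above.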
if text_buffer.strip():
buffer_to_send = text_buffer.strip()
try:
if buffer_to_send and buffer_to_send.endswith('วันที่'):
buffer_to_send = buffer_to_send[:-len('วันที่')]
if buffer_to_send and first_chunk_sent and buffer_to_send.endswith('ค่ะ'):
buffer_to_send = buffer_to_send[:-len('ค่ะ')]
except Exception:
buffer_to_send = buffer_to_send.replace('ค่ะ', '')
if buffer_to_send:
chunk_count += 1
if chunk_count == 1:
first_llm_chunk_time = time.time()
t_llm_first_token = first_llm_chunk_time - start_llm_stream
print(f"LLM TTFC: {t_llm_first_token:.4f}s (Time To First Chunk)")
yield from synthesize_text(buffer_to_send, lang=lang)
first_chunk_sent = True
text_buffer = ""
delimiter_count = 0
yield AdditionalOutputs(conversation_history)
except Exception as e:
print(f"An error occurred during response generation or synthesis: {e}")
        error_message = "ขออภัยค่ะ เกิดข้อผิดพลาดบางอย่าง"  # "Sorry, something went wrong."
try:
yield from synthesize_text(error_message, lang=lang)
except Exception as synth_error:
print(f"Could not synthesize error message: {synth_error}")
assistant_turn["content"] = (assistant_turn.get("content", "") + f" [Error: {e}]").strip()
yield AdditionalOutputs(conversation_history)
total_latency = time.time() - start_time
print(f"Total: {total_latency:.4f}s")
print(f"------------------------")
async def get_credentials():
return await get_cloudflare_turn_credentials_async(hf_token=os.getenv('HF_TOKEN'))
with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="pink", secondary_hue="pink")) as demo:
gr.HTML("""<h1 style='text-align: center'>1157 Voicebot Demo</h1>""")
with gr.Row():
with gr.Column(scale=1, elem_classes=["phone-column"]):
audio = WebRTC(
mode="send-receive",
modality="audio",
track_constraints={
"echoCancellation": True,
"noiseSuppression": {"exact": True},
"autoGainControl": {"exact": True}
},
rtc_configuration=get_credentials,
server_rtc_configuration=get_cloudflare_turn_credentials(ttl=360_000),
icon="https://i.pinimg.com/originals/0c/67/5a/0c675a8e1061478d2b7b21b330093444.gif",
icon_button_color="#17dbaa",
pulse_color="#b0f83b",
button_labels={"start": "Call", "stop": "Hang up", "waiting": "Connecting…"},
icon_radius=45,
height="650px",
width="100%",
container=False,
elem_id="phone-call-webrtc"
)
with gr.Column():
conversation_history = gr.Chatbot(
label="Conversation History",
type="messages",
value=[],
height="675px",
resizable=True,
avatar_images=(None, "https://i.pinimg.com/originals/0c/67/5a/0c675a8e1061478d2b7b21b330093444.gif"),
)
gr.DeepLinkButton()
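    # ReplyOnPause handles turn-taking: Silero VAD waits for >=1.2 s of silence
    # before treating the user's turn as finished and invoking `response`;
    # `startup` runs once when the call connects, and can_interrupt=True lets
    # the caller barge in over the bot's reply.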
audio.stream(
fn=ReplyOnPause(
response,
algo_options=AlgoOptions(
audio_chunk_duration=0.6,
started_talking_threshold=0.3,
speech_threshold=0.6
),
model_options=SileroVadOptions(
threshold=0.8,
min_speech_duration_ms=300,
max_speech_duration_s=float("inf"),
min_silence_duration_ms=1200,
),
can_interrupt=True,
startup_fn=startup,
),
inputs=[audio, conversation_history],
outputs=[audio],
concurrency_limit=1000,
time_limit=8192
)
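    # Mirror every AdditionalOutputs payload (the updated history) into the
    # Chatbot component.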
audio.on_additional_outputs(
lambda history: history,
outputs=[conversation_history],
queue=True,
show_progress="hidden"
)
demo.queue(default_concurrency_limit=1000)
demo.launch(debug=True, show_error=True, share=True)