import gradio as gr
from fastrtc import (
    ReplyOnPause,
    AlgoOptions,
    SileroVadOptions,
    AdditionalOutputs,
    WebRTC,
    get_cloudflare_turn_credentials_async,
    get_cloudflare_turn_credentials,
)
import os
import sys
import time

import numpy as np
from dotenv import load_dotenv

# Make the repo root importable so the backend package resolves.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from backend.tts import synthesize_text
from backend.asr import transcribe_audio
from backend.utils import preprocess_audio, is_valid_turn
from backend.main import stream_chat_response
from pydub import AudioSegment

load_dotenv(override=True)
# Load a 1-second ring tone to play while the pipeline warms up, converted to
# mono float32 in [-1, 1], the sample format FastRTC expects.
phone_waiting_sound = AudioSegment.from_mp3("frontend/phone-ringing-382734.mp3")[:1000]
sound_samples = np.array(phone_waiting_sound.get_array_of_samples(), dtype=np.int16)
if phone_waiting_sound.channels > 1:
    # Downmix interleaved multi-channel audio to mono by averaging channels.
    sound_samples = sound_samples.reshape((-1, phone_waiting_sound.channels)).mean(axis=1)
sound_samples = sound_samples.astype(np.float32) / 32768.0  # Normalize to [-1, 1]
def startup(_):
    """Greet the caller: play the ring tone, then a synthesized Thai greeting."""
    yield (phone_waiting_sound.frame_rate, sound_samples)
    # Thai: "Hello, what would you like to ask about?"
    STARTUP_MESSAGE = "สวัสดีค่ะ มีข้อมูลสอบถามด้านใดคะ?"
    yield from synthesize_text(STARTUP_MESSAGE)
    time.sleep(2)
    yield AdditionalOutputs([{"role": "assistant", "content": STARTUP_MESSAGE}])
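# Handler contract: FastRTC handlers are generators. Yielding a
# (sample_rate, np.ndarray) tuple streams audio back to the caller, while
# yielding AdditionalOutputs(...) routes data to whatever component is wired
# up via audio.on_additional_outputs() at the bottom of this file.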
custom_css = """
/* Overall Gradio page styling */
body {
    /* background-color: #ff69b4; */ /* Hot pink (currently disabled) */
    margin: 0;
    padding: 0;
    font-family: sans-serif;
}

/* Title styling */
h1 {
    color: #fff;
    text-shadow: 1px 1px 2px #ff85a2;
    font-size: 2.5em;
    margin-bottom: 20px;
    text-align: center;
}

/* Style the column holding the telephone interface */
.phone-column {
    max-width: 350px !important; /* Limit the width of the phone column */
    margin: 0 auto;              /* Center the column */
    border-radius: 20px;
    background-color: #ff69b4;   /* Hot pink for the telephone interface */
    box-shadow: 0 0 15px rgba(0, 0, 0, 0.2);
    padding: 20px;
}

/* Conversation history box styling */
#conversation-history-chatbot {
    background-color: #ffc0cb; /* Lighter pink for conversation history */
    border: 1px solid #ccc;
    border-radius: 10px;
    padding: 10px;
    box-shadow: 0 0 15px rgba(0, 0, 0, 0.2);
}
"""
def response(audio: tuple[int, np.ndarray] | None, conversation_history):
    """
    Handles user audio input, transcribes it, streams LLM text via
    backend.main, and synthesizes chunks to audio while updating the
    conversation history.
    """
    print("--- Latency Breakdown ---")
    start_time = time.time()

    if conversation_history is None:
        conversation_history = []
    previous_history = list(conversation_history)

    if not audio or audio[1] is None or not np.any(audio[1]):
        print("No audio input detected; skipping response generation.")
        print("------------------------")
        return

    sample_rate, audio_array = audio
    try:
        processed_audio = preprocess_audio((sample_rate, audio_array), target_frame_rate=16000)
    except Exception as audio_err:
        print(f"Audio preprocessing failed: {audio_err}")
        print("------------------------")
        return

    t0 = time.time()
    transcription = transcribe_audio(processed_audio)
    t_asr = time.time() - t0
    print(f"ASR: {t_asr:.4f}s")

    if not transcription.strip():
        print("No valid transcription; skipping response generation.")
        print("------------------------")
        return

    user_turn = {"role": "user", "content": transcription}
    print(f"User: {transcription}")
    if is_valid_turn(user_turn):
        conversation_history.append(user_turn)
    yield AdditionalOutputs(conversation_history)
    print("Conversation history:", conversation_history)

    assistant_turn = {"role": "assistant", "content": ""}
    conversation_history.append(assistant_turn)
    # The LLM receives only the turns prior to this utterance; the new user
    # message is passed separately as `transcription`.
    history_for_stream = [dict(turn) for turn in previous_history if is_valid_turn(turn)]

    text_buffer = ""
    full_response = ""
    delimiter_count = 0
    n_threshold = 3      # Delimiters to accumulate before flushing to TTS...
    max_n_threshold = 5  # ...growing up to this cap.
    lang = "th"
    chunk_count = 0
    first_chunk_sent = False
    start_llm_stream = time.time()
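    # Chunking strategy: buffer streamed characters and flush to TTS at word
    # delimiters. The first chunk flushes at the first delimiter to cut
    # time-to-first-audio; afterwards the delimiter threshold grows from 3
    # toward 5 so later chunks give the synthesizer more context per call.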
    try:
        for text_chunk in stream_chat_response(history_for_stream, transcription):
            if not isinstance(text_chunk, str):
                text_chunk = str(text_chunk)
            i = 0
            while i < len(text_chunk):
                char = text_chunk[i]
                text_buffer += char
                full_response += char
                assistant_turn["content"] = full_response.strip()

                is_delimiter = False
                if char in {' ', '\n'}:
                    is_delimiter = True
                    delimiter_count += 1
                    # If the delimiter is followed by the Thai repetition mark
                    # 'ๆ', pull it into the current buffer so it is never
                    # synthesized at the start of a chunk.
                    if i + 1 < len(text_chunk) and text_chunk[i + 1] == 'ๆ':
                        text_buffer += text_chunk[i + 1]
                        full_response += text_chunk[i + 1]
                        i += 1

                send_now = False
                if not first_chunk_sent:
                    if is_delimiter and text_buffer.strip():
                        send_now = True
                else:
                    if delimiter_count >= n_threshold and text_buffer.strip():
                        send_now = True
                        if n_threshold < max_n_threshold:
                            n_threshold += 1

                if send_now:
                    buffer_to_send = text_buffer.strip()
                    try:
                        # Avoid ending a TTS chunk on 'วันที่' ("date"), which
                        # would be read detached from the date that follows.
                        if buffer_to_send and buffer_to_send.endswith('วันที่'):
                            buffer_to_send = buffer_to_send[:-len('วันที่')]
                        # After the first chunk, strip a trailing polite
                        # particle 'ค่ะ' so it is not repeated at every
                        # chunk boundary.
                        if buffer_to_send and first_chunk_sent and buffer_to_send.endswith('ค่ะ'):
                            buffer_to_send = buffer_to_send[:-len('ค่ะ')]
                    except Exception:
                        buffer_to_send = buffer_to_send.replace('ค่ะ', '')
                    if buffer_to_send:
                        chunk_count += 1
                        if chunk_count == 1:
                            first_llm_chunk_time = time.time()
                            t_llm_first_token = first_llm_chunk_time - start_llm_stream
                            print(f"LLM TTFC: {t_llm_first_token:.4f}s (Time To First Chunk)")
                        yield from synthesize_text(buffer_to_send, lang=lang)
                        first_chunk_sent = True
                    text_buffer = ""
                    delimiter_count = 0
                    yield AdditionalOutputs(conversation_history)
                i += 1
        # Flush whatever remains in the buffer once the stream ends; this
        # mirrors the in-loop flush above.
        if text_buffer.strip():
            buffer_to_send = text_buffer.strip()
            try:
                if buffer_to_send and buffer_to_send.endswith('วันที่'):
                    buffer_to_send = buffer_to_send[:-len('วันที่')]
                if buffer_to_send and first_chunk_sent and buffer_to_send.endswith('ค่ะ'):
                    buffer_to_send = buffer_to_send[:-len('ค่ะ')]
            except Exception:
                buffer_to_send = buffer_to_send.replace('ค่ะ', '')
            if buffer_to_send:
                chunk_count += 1
                if chunk_count == 1:
                    first_llm_chunk_time = time.time()
                    t_llm_first_token = first_llm_chunk_time - start_llm_stream
                    print(f"LLM TTFC: {t_llm_first_token:.4f}s (Time To First Chunk)")
                yield from synthesize_text(buffer_to_send, lang=lang)
                first_chunk_sent = True
            text_buffer = ""
            delimiter_count = 0
            yield AdditionalOutputs(conversation_history)
    except Exception as e:
        print(f"An error occurred during response generation or synthesis: {e}")
        # Thai: "Sorry, something went wrong."
        error_message = "ขออภัยค่ะ เกิดข้อผิดพลาดบางอย่าง"
        try:
            yield from synthesize_text(error_message, lang=lang)
        except Exception as synth_error:
            print(f"Could not synthesize error message: {synth_error}")
        assistant_turn["content"] = (assistant_turn.get("content", "") + f" [Error: {e}]").strip()
        yield AdditionalOutputs(conversation_history)

    total_latency = time.time() - start_time
    print(f"Total: {total_latency:.4f}s")
    print("------------------------")
async def get_credentials():
    return await get_cloudflare_turn_credentials_async(hf_token=os.getenv('HF_TOKEN'))
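# TURN credentials: the client fetches short-lived Cloudflare credentials per
# connection via `get_credentials` (using the HF token), while the server side
# is configured once below with a 100-hour TTL (360_000 s), so WebRTC traffic
# can traverse NATs and firewalls.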
with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="pink", secondary_hue="pink")) as demo:
    gr.HTML("""<h1 style='text-align: center'>1157 Voicebot Demo</h1>""")
    with gr.Row():
        with gr.Column(scale=1, elem_classes=["phone-column"]):
            audio = WebRTC(
                mode="send-receive",
                modality="audio",
                track_constraints={
                    "echoCancellation": True,
                    "noiseSuppression": {"exact": True},
                    "autoGainControl": {"exact": True},
                },
                rtc_configuration=get_credentials,
                server_rtc_configuration=get_cloudflare_turn_credentials(ttl=360_000),
                icon="https://i.pinimg.com/originals/0c/67/5a/0c675a8e1061478d2b7b21b330093444.gif",
                icon_button_color="#17dbaa",
                pulse_color="#b0f83b",
                button_labels={"start": "Call", "stop": "Hang up", "waiting": "Connecting…"},
                icon_radius=45,
                height="650px",
                width="100%",
                container=False,
                elem_id="phone-call-webrtc",
            )
        with gr.Column():
            conversation_history = gr.Chatbot(
                label="Conversation History",
                type="messages",
                value=[],
                height="675px",
                resizable=True,
                avatar_images=(None, "https://i.pinimg.com/originals/0c/67/5a/0c675a8e1061478d2b7b21b330093444.gif"),
                elem_id="conversation-history-chatbot",  # Matches the CSS selector above.
            )
    gr.DeepLinkButton()
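    # Turn-taking: ReplyOnPause runs Silero VAD over the incoming stream and
    # calls `response` once the caller pauses. With the options below, a turn
    # ends after ~1.2 s of silence, utterances shorter than 300 ms are
    # ignored, and can_interrupt lets the caller barge in over playback.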
    audio.stream(
        fn=ReplyOnPause(
            response,
            algo_options=AlgoOptions(
                audio_chunk_duration=0.6,
                started_talking_threshold=0.3,
                speech_threshold=0.6,
            ),
            model_options=SileroVadOptions(
                threshold=0.8,
                min_speech_duration_ms=300,
                max_speech_duration_s=float("inf"),
                min_silence_duration_ms=1200,
            ),
            can_interrupt=True,
            startup_fn=startup,
        ),
        inputs=[audio, conversation_history],
        outputs=[audio],
        concurrency_limit=1000,
        time_limit=8192,
    )
    # Mirror AdditionalOutputs(...) yields from the handlers into the Chatbot.
    audio.on_additional_outputs(
        lambda history: history,
        outputs=[conversation_history],
        queue=True,
        show_progress="hidden",
    )

demo.queue(default_concurrency_limit=1000)

if __name__ == "__main__":
    demo.launch(debug=True, show_error=True, share=True)