import gradio as gr
from fastrtc import (
    ReplyOnPause,
    AlgoOptions,
    SileroVadOptions,
    AdditionalOutputs,
    WebRTC,
    get_cloudflare_turn_credentials_async,
    get_cloudflare_turn_credentials,
)
import os
from dotenv import load_dotenv
import time
import numpy as np
import sys
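# Make the repository root importable so the backend package resolves when
# this file is run from frontend/.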
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from backend.tts import synthesize_text
from backend.asr import transcribe_audio
from backend.utils import preprocess_audio, is_valid_turn
from backend.main import stream_chat_response
from pydub import AudioSegment
load_dotenv(override=True)
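# Load ~1 s of a phone-ringing MP3, downmix to mono if needed, and normalize
# to float32 so it can be streamed as the "call connecting" sound.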
phone_waiting_sound = AudioSegment.from_mp3("frontend/phone-ringing-382734.mp3")[:1000]
sound_samples = np.array(phone_waiting_sound.get_array_of_samples(), dtype=np.int16)
if phone_waiting_sound.channels > 1:
sound_samples = sound_samples.reshape((-1, phone_waiting_sound.channels)).mean(axis=1)
sound_samples = sound_samples.astype(np.float32) / 32768.0  # Normalize int16 range to [-1, 1]
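# Startup hook for ReplyOnPause: play the ring tone, speak the Thai greeting,
# then push the greeting into the visible chat history.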
def startup(_):
yield (phone_waiting_sound.frame_rate, sound_samples)
    STARTUP_MESSAGE = "สวัสดีค่ะ มีข้อมูลสอบถามด้านใดคะ?"  # "Hello, what information would you like to ask about?"
yield from synthesize_text(STARTUP_MESSAGE)
time.sleep(2)
yield AdditionalOutputs([{"role": "assistant", "content": STARTUP_MESSAGE}])
custom_css = """
/* Overall Gradio page styling; the hot pink body background is currently disabled */
body {
    /* background-color: #ff69b4;  hot pink, disabled */
    margin: 0;
    padding: 0;
    font-family: sans-serif;
}
/* Title styling */
h1 {
color: #fff;
text-shadow: 1px 1px 2px #ff85a2;
font-size: 2.5em;
margin-bottom: 20px;
text-align: center;
}
/* Style the column holding the telephone interface */
.phone-column {
max-width: 350px !important; /* Limit the width of the phone column */
margin: 0 auto; /* Center the column */
border-radius: 20px;
background-color: #ff69b4; /* Hot pink for the telephone interface */
box-shadow: 0 0 15px rgba(0, 0, 0, 0.2);
padding: 20px;
}
/* Conversation history box styling */
#conversation-history-chatbot {
background-color: #ffc0cb; /* Lighter pink for conversation history */
border: 1px solid #ccc;
border-radius: 10px;
padding: 10px;
box-shadow: 0 0 15px rgba(0, 0, 0, 0.2);
}
"""
def response(audio: tuple[int, np.ndarray] | None, conversation_history):
"""
Handles user audio input, transcribes it, streams LLM text via backend.main,
and synthesizes chunks to audio while updating the conversation history.
"""
print(f"--- Latency Breakdown ---")
start_time = time.time()
if conversation_history is None:
conversation_history = []
previous_history = list(conversation_history)
    if not audio or audio[1] is None or not np.any(audio[1]):
        print("No audio input detected; skipping response generation.")
        print("------------------------")
        return
sample_rate, audio_array = audio
try:
processed_audio = preprocess_audio((sample_rate, audio_array), target_frame_rate=16000)
except Exception as audio_err:
print(f"Audio preprocessing failed: {audio_err}")
print(f"------------------------")
return
t0 = time.time()
    transcription = transcribe_audio(processed_audio)
t_asr = time.time() - t0
print(f"ASR: {t_asr:.4f}s")
if not transcription.strip():
print("No valid transcription; skipping response generation.")
print(f"------------------------")
return
user_turn = {"role": "user", "content": transcription}
print(f"User: {transcription}")
if is_valid_turn(user_turn):
conversation_history.append(user_turn)
yield AdditionalOutputs(conversation_history)
print("Conversation history:", conversation_history)
assistant_turn = {"role": "assistant", "content": ""}
conversation_history.append(assistant_turn)
history_for_stream = [dict(turn) for turn in previous_history if is_valid_turn(turn)]
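    # Incremental TTS state: characters are buffered until enough space/newline
    # delimiters arrive. The first chunk flushes at the first delimiter to cut
    # time-to-first-audio; afterwards the threshold grows from 3 toward 5
    # delimiters so later chunks are longer and sound smoother.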
text_buffer = ""
full_response = ""
delimiter_count = 0
n_threshold = 3
max_n_threshold = 5
lang = "th"
chunk_count = 0
first_chunk_sent = False
start_llm_stream = time.time()
try:
for text_chunk in stream_chat_response(history_for_stream, transcription):
if not isinstance(text_chunk, str):
text_chunk = str(text_chunk)
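            # Walk the chunk character by character so a send can trigger at any
            # delimiter boundary. assistant_turn is mutated in place, so the dict
            # already appended to conversation_history always holds the latest text.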
i = 0
while i < len(text_chunk):
char = text_chunk[i]
text_buffer += char
full_response += char
assistant_turn["content"] = full_response.strip()
is_delimiter = False
if char in {' ', '\n'}:
is_delimiter = True
delimiter_count += 1
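                    # The Thai repetition mark 'ๆ' usually follows a space; pull it
                    # into the current buffer so it is not orphaned in the next chunk.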
if i + 1 < len(text_chunk) and text_chunk[i + 1] == 'ๆ':
text_buffer += text_chunk[i + 1]
full_response += text_chunk[i + 1]
i += 1
send_now = False
if not first_chunk_sent:
if is_delimiter and text_buffer.strip():
send_now = True
else:
if delimiter_count >= n_threshold and text_buffer.strip():
send_now = True
if n_threshold < max_n_threshold:
n_threshold += 1
if send_now:
buffer_to_send = text_buffer.strip()
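                    # Heuristic trims: avoid ending a spoken chunk on 'วันที่'
                    # ("date"; the actual date usually follows in the next chunk)
                    # and drop a repeated polite particle 'ค่ะ' after the first
                    # chunk. Trimmed text stays in full_response but is not spoken.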
try:
if buffer_to_send and buffer_to_send.endswith('วันที่'):
buffer_to_send = buffer_to_send[:-len('วันที่')]
if buffer_to_send and first_chunk_sent and buffer_to_send.endswith('ค่ะ'):
buffer_to_send = buffer_to_send[:-len('ค่ะ')]
except Exception:
buffer_to_send = buffer_to_send.replace('ค่ะ', '')
if buffer_to_send:
chunk_count += 1
if chunk_count == 1:
first_llm_chunk_time = time.time()
t_llm_first_token = first_llm_chunk_time - start_llm_stream
print(f"LLM TTFC: {t_llm_first_token:.4f}s (Time To First Chunk)")
yield from synthesize_text(buffer_to_send, lang=lang)
first_chunk_sent = True
text_buffer = ""
delimiter_count = 0
yield AdditionalOutputs(conversation_history)
i += 1
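        # The LLM stream has ended: flush whatever is left in the buffer,
        # mirroring the in-loop send above.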
if text_buffer.strip():
buffer_to_send = text_buffer.strip()
try:
if buffer_to_send and buffer_to_send.endswith('วันที่'):
buffer_to_send = buffer_to_send[:-len('วันที่')]
if buffer_to_send and first_chunk_sent and buffer_to_send.endswith('ค่ะ'):
buffer_to_send = buffer_to_send[:-len('ค่ะ')]
except Exception:
buffer_to_send = buffer_to_send.replace('ค่ะ', '')
if buffer_to_send:
chunk_count += 1
if chunk_count == 1:
first_llm_chunk_time = time.time()
t_llm_first_token = first_llm_chunk_time - start_llm_stream
print(f"LLM TTFC: {t_llm_first_token:.4f}s (Time To First Chunk)")
yield from synthesize_text(buffer_to_send, lang=lang)
first_chunk_sent = True
text_buffer = ""
delimiter_count = 0
yield AdditionalOutputs(conversation_history)
except Exception as e:
print(f"An error occurred during response generation or synthesis: {e}")
        error_message = "ขออภัยค่ะ เกิดข้อผิดพลาดบางอย่าง"  # "Sorry, something went wrong."
try:
yield from synthesize_text(error_message, lang=lang)
except Exception as synth_error:
print(f"Could not synthesize error message: {synth_error}")
assistant_turn["content"] = (assistant_turn.get("content", "") + f" [Error: {e}]").strip()
yield AdditionalOutputs(conversation_history)
total_latency = time.time() - start_time
print(f"Total: {total_latency:.4f}s")
print(f"------------------------")
async def get_credentials():
return await get_cloudflare_turn_credentials_async(hf_token=os.getenv('HF_TOKEN'))
with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="pink", secondary_hue="pink")) as demo:
gr.HTML("""<h1 style='text-align: center'>1157 Voicebot Demo</h1>""")
with gr.Row():
with gr.Column(scale=1, elem_classes=["phone-column"]):
audio = WebRTC(
mode="send-receive",
modality="audio",
track_constraints={
"echoCancellation": True,
"noiseSuppression": {"exact": True},
"autoGainControl": {"exact": True}
},
rtc_configuration=get_credentials,
server_rtc_configuration=get_cloudflare_turn_credentials(ttl=360_000),
icon="https://i.pinimg.com/originals/0c/67/5a/0c675a8e1061478d2b7b21b330093444.gif",
icon_button_color="#17dbaa",
pulse_color="#b0f83b",
button_labels={"start": "Call", "stop": "Hang up", "waiting": "Connecting…"},
icon_radius=45,
height="650px",
width="100%",
container=False,
elem_id="phone-call-webrtc"
)
with gr.Column():
conversation_history = gr.Chatbot(
label="Conversation History",
type="messages",
value=[],
height="675px",
resizable=True,
avatar_images=(None, "https://i.pinimg.com/originals/0c/67/5a/0c675a8e1061478d2b7b21b330093444.gif"),
)
gr.DeepLinkButton()
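    # ReplyOnPause handles turn-taking: Silero VAD waits for >=1.2 s of silence
    # before treating the user's turn as finished and invoking `response`;
    # `startup` runs once when the call connects, and can_interrupt=True lets
    # the caller barge in over the bot's reply.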
audio.stream(
fn=ReplyOnPause(
response,
algo_options=AlgoOptions(
audio_chunk_duration=0.6,
started_talking_threshold=0.3,
speech_threshold=0.6
),
model_options=SileroVadOptions(
threshold=0.8,
min_speech_duration_ms=300,
max_speech_duration_s=float("inf"),
min_silence_duration_ms=1200,
),
can_interrupt=True,
startup_fn=startup,
),
inputs=[audio, conversation_history],
outputs=[audio],
concurrency_limit=1000,
time_limit=8192
)
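    # Mirror every AdditionalOutputs payload (the updated history) into the
    # Chatbot component.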
audio.on_additional_outputs(
lambda history: history,
outputs=[conversation_history],
queue=True,
show_progress="hidden"
)
demo.queue(default_concurrency_limit=1000)
demo.launch(debug=True, show_error=True, share=True)