Commit b9a6dd9 · committed by Siddhant
Parent(s): 58f82d5

Update demo

Files changed:
- app.py +856 -492
- pyscripts/utils/dialog_eval/ASR_WER.py +165 -0
- pyscripts/utils/dialog_eval/LLM_Metrics.py +245 -0
- pyscripts/utils/dialog_eval/TTS_intelligibility.py +169 -0
- pyscripts/utils/dialog_eval/TTS_speech_quality.py +98 -0
- pyscripts/utils/dialog_eval/__pycache__/ASR_WER.cpython-39.pyc +0 -0
- pyscripts/utils/dialog_eval/__pycache__/LLM_Metrics.cpython-39.pyc +0 -0
- pyscripts/utils/dialog_eval/__pycache__/TTS_intelligibility.cpython-39.pyc +0 -0
- pyscripts/utils/dialog_eval/__pycache__/TTS_speech_quality.cpython-39.pyc +0 -0
- pyscripts/utils/dialog_eval/__pycache__/human_feedback.cpython-39.pyc +0 -0
- pyscripts/utils/dialog_eval/__pycache__/vert.cpython-39.pyc +0 -0
- pyscripts/utils/dialog_eval/human_feedback.py +242 -0
- pyscripts/utils/dialog_eval/vert.py +299 -0
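
The diff below rewrites app.py to pull its evaluation helpers from the new pyscripts/utils/dialog_eval modules instead of espnet2.sds.eval. As orientation, here is a minimal usage sketch of two of those helpers in isolation; the function names and argument shapes are taken from the diff, while the dummy audio, an installed Versa backend with its judge ASR models, and the modules being importable from the Space's working directory are assumptions made for illustration only.

import numpy as np

from pyscripts.utils.dialog_eval.ASR_WER import handle_espnet_ASR_WER
from pyscripts.utils.dialog_eval.LLM_Metrics import perplexity, vert

# Assumed stand-in inputs: one second of silent 16 kHz audio and short texts.
sr = 16000
dummy_audio = np.zeros(sr, dtype=np.int16)
llm_output = "Hello, how can I help you today?"
asr_transcript = "hello how can i help you today"

# WER/CER of the cascaded pipeline's ASR transcript against judge ASR systems
# (requires Versa, as the new ASR_WER.py documents).
print(handle_espnet_ASR_WER((sr, dummy_audio), asr_transcript))

# Text dialog metrics: app.py concatenates the per-metric report strings.
print(perplexity(llm_output) + vert([llm_output]))
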
app.py CHANGED
|
@@ -5,347 +5,382 @@ except ImportError:
|
|
| 5 |
with open('versa.sh', 'rb') as file:
|
| 6 |
script = file.read()
|
| 7 |
rc = call(script, shell=True)
|
|
|
|
| 8 |
import os
|
| 9 |
import shutil
|
| 10 |
-
|
| 11 |
-
from
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
from espnet2.sds.llm.hugging_face_llm import HuggingFaceLLM
|
| 17 |
-
from espnet2.sds.vad.webrtc_vad import WebrtcVADModel
|
| 18 |
-
from espnet2.sds.eval.TTS_intelligibility import handle_espnet_TTS_intelligibility
|
| 19 |
-
from espnet2.sds.eval.ASR_WER import handle_espnet_ASR_WER
|
| 20 |
-
from espnet2.sds.eval.TTS_speech_quality import TTS_psuedomos
|
| 21 |
-
from espnet2.sds.eval.LLM_Metrics import perplexity, vert, bert_score, DialoGPT_perplexity
|
| 22 |
-
from espnet2.sds.utils.chat import Chat
|
| 23 |
-
from espnet2.sds.end_to_end.mini_omni_e2e import MiniOmniE2EModel
|
| 24 |
-
import argparse
|
| 25 |
import torch
|
|
| 26 |
|
| 27 |
access_token = os.environ.get("HF_TOKEN")
|
| 28 |
ASR_name="pyf98/owsm_ctc_v3.1_1B"
|
| 29 |
LLM_name="meta-llama/Llama-3.2-1B-Instruct"
|
| 30 |
TTS_name="kan-bayashi/ljspeech_vits"
|
| 31 |
-
ASR_options="pyf98/owsm_ctc_v3.1_1B,espnet/owsm_ctc_v3.2_ft_1B,espnet/owsm_v3.1_ebf,librispeech_asr,whisper".split(",")
|
| 32 |
LLM_options="meta-llama/Llama-3.2-1B-Instruct,HuggingFaceTB/SmolLM2-1.7B-Instruct".split(",")
|
| 33 |
TTS_options="kan-bayashi/ljspeech_vits,kan-bayashi/libritts_xvector_vits,kan-bayashi/vctk_multi_spk_vits,ChatTTS".split(",")
|
| 34 |
Eval_options="Latency,TTS Intelligibility,TTS Speech Quality,ASR WER,Text Dialog Metrics"
|
| 35 |
upload_to_hub=None
|
|
| 36 |
ASR_curr_name=None
|
| 37 |
LLM_curr_name=None
|
| 38 |
TTS_curr_name=None
|
| 39 |
-
# def read_args():
|
| 40 |
-
# global access_token
|
| 41 |
-
# global ASR_name
|
| 42 |
-
# global LLM_name
|
| 43 |
-
# global TTS_name
|
| 44 |
-
# global ASR_options
|
| 45 |
-
# global LLM_options
|
| 46 |
-
# global TTS_options
|
| 47 |
-
# global Eval_options
|
| 48 |
-
# global upload_to_hub
|
| 49 |
-
# parser = argparse.ArgumentParser(description="Run the app with HF_TOKEN as a command-line argument.")
|
| 50 |
-
# parser.add_argument("--HF_TOKEN", required=True, help="Provide the Hugging Face token.")
|
| 51 |
-
# parser.add_argument("--asr_options", required=True, help="Provide the possible ASR options available to user.")
|
| 52 |
-
# parser.add_argument("--llm_options", required=True, help="Provide the possible LLM options available to user.")
|
| 53 |
-
# parser.add_argument("--tts_options", required=True, help="Provide the possible TTS options available to user.")
|
| 54 |
-
# parser.add_argument("--eval_options", required=True, help="Provide the possible automatic evaluation metrics available to user.")
|
| 55 |
-
# parser.add_argument("--default_asr_model", required=False, default="pyf98/owsm_ctc_v3.1_1B", help="Provide the default ASR model.")
|
| 56 |
-
# parser.add_argument("--default_llm_model", required=False, default="meta-llama/Llama-3.2-1B-Instruct", help="Provide the default ASR model.")
|
| 57 |
-
# parser.add_argument("--default_tts_model", required=False, default="kan-bayashi/ljspeech_vits", help="Provide the default ASR model.")
|
| 58 |
-
# parser.add_argument("--upload_to_hub", required=False, default=None, help="Hugging Face dataset to upload user data")
|
| 59 |
-
# args = parser.parse_args()
|
| 60 |
-
# access_token=args.HF_TOKEN
|
| 61 |
-
# ASR_name=args.default_asr_model
|
| 62 |
-
# LLM_name=args.default_llm_model
|
| 63 |
-
# TTS_name=args.default_tts_model
|
| 64 |
-
# ASR_options=args.asr_options.split(",")
|
| 65 |
-
# LLM_options=args.llm_options.split(",")
|
| 66 |
-
# TTS_options=args.tts_options.split(",")
|
| 67 |
-
# Eval_options=args.eval_options.split(",")
|
| 68 |
-
# upload_to_hub=args.upload_to_hub
|
| 69 |
-
|
| 70 |
-
# read_args()
|
| 71 |
-
from huggingface_hub import HfApi
|
| 72 |
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
import gradio as gr
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
import numpy as np
|
| 80 |
-
|
| 81 |
-
chat = Chat(2)
|
| 82 |
-
chat.init_chat({"role": "system", "content": "You are a helpful and friendly AI assistant. The user is talking to you with their voice and you should respond in a conversational style. You are polite, respectful, and aim to provide concise and complete responses of less than 15 words."})
|
| 83 |
-
user_role = "user"
|
| 84 |
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
client=None
|
| 89 |
-
|
| 90 |
-
latency_ASR=0.0
|
| 91 |
-
latency_LM=0.0
|
| 92 |
-
latency_TTS=0.0
|
| 93 |
-
|
| 94 |
-
text_str=""
|
| 95 |
-
asr_output_str=""
|
| 96 |
-
vad_output=None
|
| 97 |
audio_output = None
|
| 98 |
audio_output1 = None
|
| 99 |
-
LLM_response_arr=[]
|
| 100 |
-
total_response_arr=[]
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
if TTS_curr_name is not None:
|
| 105 |
-
if option==TTS_curr_name:
|
| 106 |
-
return
|
| 107 |
-
yield gr.Textbox(visible=False),gr.Textbox(visible=False),gr.Audio(visible=False)
|
| 108 |
-
global text2speech
|
| 109 |
-
TTS_curr_name=option
|
| 110 |
-
tag = option
|
| 111 |
-
if tag=="ChatTTS":
|
| 112 |
-
text2speech = ChatTTSModel()
|
| 113 |
-
else:
|
| 114 |
-
text2speech = ESPnetTTSModel(tag)
|
| 115 |
-
text2speech.warmup()
|
| 116 |
-
yield gr.Textbox(visible=True),gr.Textbox(visible=True),gr.Audio(visible=True)
|
| 117 |
-
|
| 118 |
-
def handle_LLM_selection(option):
|
| 119 |
-
global LLM_curr_name
|
| 120 |
-
if LLM_curr_name is not None:
|
| 121 |
-
if option==LLM_curr_name:
|
| 122 |
-
return
|
| 123 |
-
yield gr.Textbox(visible=False),gr.Textbox(visible=False),gr.Audio(visible=False)
|
| 124 |
-
global LM_pipe
|
| 125 |
-
LLM_curr_name=option
|
| 126 |
-
LM_pipe = HuggingFaceLLM(access_token=access_token,tag = option)
|
| 127 |
-
LM_pipe.warmup()
|
| 128 |
-
yield gr.Textbox(visible=True),gr.Textbox(visible=True),gr.Audio(visible=True)
|
| 129 |
-
|
| 130 |
-
def handle_ASR_selection(option):
|
| 131 |
-
global ASR_curr_name
|
| 132 |
-
if option=="librispeech_asr":
|
| 133 |
-
option="espnet/simpleoier_librispeech_asr_train_asr_conformer7_wavlm_large_raw_en_bpe5000_sp"
|
| 134 |
-
if ASR_curr_name is not None:
|
| 135 |
-
if option==ASR_curr_name:
|
| 136 |
-
return
|
| 137 |
-
yield gr.Textbox(visible=False),gr.Textbox(visible=False),gr.Audio(visible=False)
|
| 138 |
-
global s2t
|
| 139 |
-
ASR_curr_name=option
|
| 140 |
-
if option=="espnet/owsm_v3.1_ebf":
|
| 141 |
-
s2t = OWSMModel()
|
| 142 |
-
elif option=="espnet/simpleoier_librispeech_asr_train_asr_conformer7_wavlm_large_raw_en_bpe5000_sp":
|
| 143 |
-
s2t = ESPnetASRModel(tag=option)
|
| 144 |
-
elif option=="whisper":
|
| 145 |
-
s2t = WhisperASRModel()
|
| 146 |
-
else:
|
| 147 |
-
s2t = OWSMCTCModel(tag=option)
|
| 148 |
|
| 149 |
-
|
| 150 |
-
|
|
|
|
| 151 |
|
| 152 |
-
def handle_eval_selection(
|
|
|
| 153 |
global LLM_response_arr
|
| 154 |
global total_response_arr
|
| 155 |
-
yield (option,gr.Textbox(visible=True))
|
| 156 |
-
if option=="Latency":
|
| 157 |
-
text=
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
yield (None,
|
| 163 |
-
elif option=="
|
| 164 |
-
yield (None,
|
| 165 |
-
elif option=="
|
| 166 |
-
yield (None,
|
| 167 |
-
|
| 168 |
-
|
|
|
| 169 |
global LLM_response_arr
|
| 170 |
global total_response_arr
|
| 171 |
-
yield (option,gr.Textbox(visible=True))
|
| 172 |
-
if option=="Latency":
|
| 173 |
-
text=f"Total Latency: {latency_TTS:.2f}"
|
| 174 |
-
yield (None,text)
|
| 175 |
-
elif option=="TTS Intelligibility":
|
| 176 |
-
yield (None,handle_espnet_TTS_intelligibility(TTS_audio_output,LLM_Output))
|
| 177 |
-
elif option=="TTS Speech Quality":
|
| 178 |
-
yield (None,TTS_psuedomos(TTS_audio_output))
|
| 179 |
-
elif option=="Text Dialog Metrics":
|
| 180 |
-
yield (None,perplexity(LLM_Output.replace("\n"," "))+vert(LLM_response_arr))
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
global client
|
| 184 |
-
global LM_pipe
|
| 185 |
-
global s2t
|
| 186 |
-
global text2speech
|
| 187 |
-
yield (gr.Radio(visible=False),gr.Radio(visible=False),gr.Radio(visible=False),gr.Radio(visible=False), gr.Textbox(visible=False),gr.Textbox(visible=False),gr.Audio(visible=False),gr.Radio(visible=False),gr.Radio(visible=False))
|
| 188 |
-
if option=="Cascaded":
|
| 189 |
-
client=None
|
| 190 |
-
for _ in handle_selection(TTS_radio):
|
| 191 |
-
continue
|
| 192 |
-
for _ in handle_ASR_selection(ASR_radio):
|
| 193 |
-
continue
|
| 194 |
-
for _ in handle_LLM_selection(LLM_radio):
|
| 195 |
-
continue
|
| 196 |
-
yield (gr.Radio(visible=True),gr.Radio(visible=True),gr.Radio(visible=True),gr.Radio(visible=False),gr.Textbox(visible=True),gr.Textbox(visible=True),gr.Audio(visible=True),gr.Radio(visible=True, interactive=True),gr.Radio(visible=False))
|
| 197 |
else:
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
LM_pipe=None
|
| 201 |
-
global ASR_curr_name
|
| 202 |
-
global LLM_curr_name
|
| 203 |
-
global TTS_curr_name
|
| 204 |
-
ASR_curr_name=None
|
| 205 |
-
LLM_curr_name=None
|
| 206 |
-
TTS_curr_name=None
|
| 207 |
-
handle_E2E_selection()
|
| 208 |
-
yield (gr.Radio(visible=False),gr.Radio(visible=False),gr.Radio(visible=False),gr.Radio(visible=True),gr.Textbox(visible=True),gr.Textbox(visible=True),gr.Audio(visible=True),gr.Radio(visible=False),gr.Radio(visible=True, interactive=True))
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
def handle_E2E_selection():
|
| 212 |
-
global client
|
| 213 |
-
if client is None:
|
| 214 |
-
client = MiniOmniE2EModel()
|
| 215 |
-
client.warmup()
|
| 216 |
|
| 217 |
def start_warmup():
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
|
|
| 240 |
continue
|
| 241 |
-
for _ in handle_ASR_selection(ASR_name):
|
| 242 |
continue
|
| 243 |
-
for _ in handle_LLM_selection(LLM_name):
|
| 244 |
continue
|
| 245 |
-
dummy_input =
|
|
|
|
| 246 |
(3000),
|
| 247 |
dtype=getattr(torch, "float16"),
|
| 248 |
device="cpu",
|
| 249 |
-
|
| 250 |
-
|
|
|
|
|
|
|
|
|
|
| 251 |
for opt in Eval_options:
|
| 252 |
handle_eval_selection(opt, dummy_input, dummy_text, dummy_input, dummy_text)
|
| 253 |
|
| 254 |
-
start_warmup()
|
| 255 |
-
vad_model=WebrtcVADModel()
|
| 256 |
|
| 257 |
-
callback = gr.CSVLogger()
|
| 258 |
-
start_record_time=None
|
| 259 |
-
enable_btn = gr.Button(interactive=True, visible=True)
|
| 260 |
-
disable_btn = gr.Button(interactive=False, visible=False)
|
| 261 |
def flash_buttons():
|
|
|
|
|
|
|
|
|
|
| 262 |
btn_updates = (enable_btn,) * 8
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
return ip
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
def vote_last_response(vote_type, request: gr.Request):
|
| 280 |
-
with open("save_dict.json", "a") as fout:
|
| 281 |
-
data = {
|
| 282 |
-
"tstamp": round(time.time(), 4),
|
| 283 |
-
"type": vote_type,
|
| 284 |
-
"ip": get_ip(request),
|
| 285 |
-
}
|
| 286 |
-
fout.write(json.dumps(data) + "\n")
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
def natural_vote1_last_response(
|
| 290 |
-
request: gr.Request
|
| 291 |
):
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
ip_address1=get_ip(request)
|
| 328 |
-
print(f"Partially Relevant (voted). ip: {ip_address1}")
|
| 329 |
-
return ("Partially Relevant",ip_address1,)+(disable_btn,) * 4
|
| 330 |
-
|
| 331 |
-
def relevant_vote3_last_response(
|
| 332 |
-
request: gr.Request
|
| 333 |
-
):
|
| 334 |
-
ip_address1=get_ip(request)
|
| 335 |
-
print(f"Slightly Irrelevant (voted). ip: {ip_address1}")
|
| 336 |
-
return ("Slightly Irrelevant",ip_address1,)+(disable_btn,) * 4
|
| 337 |
-
|
| 338 |
-
def relevant_vote4_last_response(
|
| 339 |
-
request: gr.Request
|
| 340 |
-
):
|
| 341 |
-
ip_address1=get_ip(request)
|
| 342 |
-
print(f"Completely Irrelevant (voted). ip: {ip_address1}")
|
| 343 |
-
return ("Completely Irrelevant",ip_address1,)+(disable_btn,) * 4
|
| 344 |
-
|
| 345 |
-
import json
|
| 346 |
-
import time
|
| 347 |
-
|
| 348 |
-
def transcribe(stream, new_chunk, TTS_option, ASR_option, LLM_option, type_option):
|
| 349 |
sr, y = new_chunk
|
| 350 |
global text_str
|
| 351 |
global chat
|
|
@@ -364,219 +399,548 @@ def transcribe(stream, new_chunk, TTS_option, ASR_option, LLM_option, type_optio
|
|
| 364 |
global total_response_arr
|
| 365 |
if stream is None:
|
| 366 |
# Handle user refresh
|
| 367 |
-
|
| 368 |
-
|
|
|
| 369 |
gr.Info("The models are being reloaded due to a browser refresh.")
|
| 370 |
-
yield (stream,asr_output_box,text_box,audio_box,gr.Audio(visible=False))
|
| 371 |
-
stream=y
|
| 372 |
-
|
| 373 |
-
text_str=""
|
| 374 |
audio_output = None
|
| 375 |
audio_output1 = None
|
| 376 |
else:
|
| 377 |
-
stream=np.concatenate((stream,y))
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
chat.append({"role": user_role, "content": prompt})
|
| 410 |
-
chat_messages = chat.to_list()
|
| 411 |
-
generated_text = LM_pipe(chat_messages)
|
| 412 |
-
start_TTS_time=time.time()
|
| 413 |
-
latency_LM=(start_TTS_time - start_LM_time)
|
| 414 |
-
|
| 415 |
-
chat.append({"role": "assistant", "content": generated_text})
|
| 416 |
-
text_str=generated_text
|
| 417 |
-
audio_output=text2speech(text_str)
|
| 418 |
-
latency_TTS=(time.time() - start_TTS_time)
|
| 419 |
-
audio_output1=(orig_sr,stream)
|
| 420 |
-
stream=y
|
| 421 |
-
LLM_response_arr.append(text_str.replace("\n"," "))
|
| 422 |
-
total_response_arr.append(text_str.replace("\n"," "))
|
| 423 |
-
text_str1=text_str
|
| 424 |
-
if ((text_str!="") and (start_record_time is None)):
|
| 425 |
-
start_record_time=time.time()
|
| 426 |
elif start_record_time is not None:
|
| 427 |
-
current_record_time=time.time()
|
| 428 |
-
if current_record_time-start_record_time>300:
|
| 429 |
-
gr.Info(
|
| 430 |
-
|
|
|
|
|
| 431 |
if upload_to_hub is not None:
|
| 432 |
api.upload_folder(
|
| 433 |
folder_path="flagged_data_points",
|
| 434 |
-
path_in_repo="checkpoint_"+str(start_record_time),
|
| 435 |
repo_id=upload_to_hub,
|
| 436 |
repo_type="dataset",
|
| 437 |
token=access_token,
|
| 438 |
)
|
| 439 |
-
chat.buffer=[
|
| 440 |
-
text_str=""
|
| 441 |
audio_output = None
|
| 442 |
audio_output1 = None
|
| 443 |
asr_output_str = ""
|
| 444 |
start_record_time = None
|
| 445 |
-
LLM_response_arr=[]
|
| 446 |
-
total_response_arr=[]
|
| 447 |
-
shutil.rmtree(
|
| 448 |
os.mkdir("flagged_data_points")
|
| 449 |
-
yield (stream,asr_output_str,text_str1, audio_output, audio_output1)
|
| 450 |
-
yield stream,gr.Textbox(visible=True),gr.Textbox(visible=True),gr.Audio(
|
| 451 |
-
|
| 452 |
-
|
| 453 |
|
|
|
|
| 454 |
|
|
| 455 |
with gr.Blocks(
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
)
|
| 496 |
-
with gr.Row():
|
| 497 |
-
natural_btn1 = gr.Button(
|
| 498 |
-
value="Very Natural", visible=False, interactive=False, scale=1
|
| 499 |
-
)
|
| 500 |
-
natural_btn2 = gr.Button(
|
| 501 |
-
value="Somewhat Awkward", visible=False, interactive=False, scale=1
|
| 502 |
-
)
|
| 503 |
-
natural_btn3 = gr.Button(value="Very Awkward", visible=False, interactive=False, scale=1)
|
| 504 |
-
natural_btn4 = gr.Button(
|
| 505 |
-
value="Unnatural", visible=False, interactive=False, scale=1
|
| 506 |
-
)
|
| 507 |
-
with gr.Row():
|
| 508 |
-
relevant_btn1 = gr.Button(
|
| 509 |
-
value="Highly Relevant", visible=False, interactive=False, scale=1
|
| 510 |
-
)
|
| 511 |
-
relevant_btn2 = gr.Button(
|
| 512 |
-
value="Partially Relevant", visible=False, interactive=False, scale=1
|
| 513 |
-
)
|
| 514 |
-
relevant_btn3 = gr.Button(value="Slightly Irrelevant", visible=False, interactive=False, scale=1)
|
| 515 |
-
relevant_btn4 = gr.Button(
|
| 516 |
-
value= "Completely Irrelevant", visible=False, interactive=False, scale=1
|
| 517 |
-
)
|
| 518 |
-
with gr.Column(scale=1):
|
| 519 |
-
output_audio = gr.Audio(label="Output", interactive=False, autoplay=True, visible=True)
|
| 520 |
-
output_audio1 = gr.Audio(label="Output1", autoplay=False, visible=False)
|
| 521 |
-
output_asr_text = gr.Textbox(label="ASR output", interactive=False)
|
| 522 |
-
output_text = gr.Textbox(label="LLM output", interactive=False)
|
| 523 |
-
eval_radio = gr.Radio(
|
| 524 |
-
choices=["Latency", "TTS Intelligibility", "TTS Speech Quality", "ASR WER","Text Dialog Metrics"],
|
| 525 |
-
label="Choose Evaluation metrics:",
|
| 526 |
)
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
|
|
| 530 |
visible=False,
|
| 531 |
)
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
|
|
|
|
|
|
|
| 582 |
demo.launch(share=True)
|
|
|
|
|
|
| 5 |
with open('versa.sh', 'rb') as file:
|
| 6 |
script = file.read()
|
| 7 |
rc = call(script, shell=True)
|
| 8 |
+
|
| 9 |
import os
|
| 10 |
import shutil
|
| 11 |
+
import time
|
| 12 |
+
from typing import Generator, Optional, Tuple
|
| 13 |
+
|
| 14 |
+
import gradio as gr
|
| 15 |
+
import nltk
|
| 16 |
+
import numpy as np
|
|
|
| 17 |
import torch
|
| 18 |
+
from huggingface_hub import HfApi
|
| 19 |
+
from pyscripts.utils.dialog_eval.ASR_WER import handle_espnet_ASR_WER
|
| 20 |
+
from pyscripts.utils.dialog_eval.human_feedback import (
|
| 21 |
+
natural_vote1_last_response,
|
| 22 |
+
natural_vote2_last_response,
|
| 23 |
+
natural_vote3_last_response,
|
| 24 |
+
natural_vote4_last_response,
|
| 25 |
+
relevant_vote1_last_response,
|
| 26 |
+
relevant_vote2_last_response,
|
| 27 |
+
relevant_vote3_last_response,
|
| 28 |
+
relevant_vote4_last_response,
|
| 29 |
+
)
|
| 30 |
+
from pyscripts.utils.dialog_eval.LLM_Metrics import (
|
| 31 |
+
DialoGPT_perplexity,
|
| 32 |
+
bert_score,
|
| 33 |
+
perplexity,
|
| 34 |
+
vert,
|
| 35 |
+
)
|
| 36 |
+
from pyscripts.utils.dialog_eval.TTS_intelligibility import (
|
| 37 |
+
handle_espnet_TTS_intelligibility,
|
| 38 |
+
)
|
| 39 |
+
from pyscripts.utils.dialog_eval.TTS_speech_quality import TTS_psuedomos
|
| 40 |
+
|
| 41 |
+
from espnet2.sds.espnet_model import ESPnetSDSModelInterface
|
| 42 |
+
|
| 43 |
+
# ------------------------
|
| 44 |
+
# Hyperparameters
|
| 45 |
+
# ------------------------
|
| 46 |
|
| 47 |
access_token = os.environ.get("HF_TOKEN")
|
| 48 |
ASR_name="pyf98/owsm_ctc_v3.1_1B"
|
| 49 |
LLM_name="meta-llama/Llama-3.2-1B-Instruct"
|
| 50 |
TTS_name="kan-bayashi/ljspeech_vits"
|
| 51 |
+
ASR_options="pyf98/owsm_ctc_v3.1_1B,espnet/owsm_ctc_v3.2_ft_1B,espnet/owsm_v3.1_ebf,librispeech_asr,whisper-large".split(",")
|
| 52 |
LLM_options="meta-llama/Llama-3.2-1B-Instruct,HuggingFaceTB/SmolLM2-1.7B-Instruct".split(",")
|
| 53 |
TTS_options="kan-bayashi/ljspeech_vits,kan-bayashi/libritts_xvector_vits,kan-bayashi/vctk_multi_spk_vits,ChatTTS".split(",")
|
| 54 |
Eval_options="Latency,TTS Intelligibility,TTS Speech Quality,ASR WER,Text Dialog Metrics"
|
| 55 |
upload_to_hub=None
|
| 56 |
+
dialogue_model = ESPnetSDSModelInterface(
|
| 57 |
+
ASR_name, LLM_name, TTS_name, "Cascaded", access_token
|
| 58 |
+
)
|
| 59 |
ASR_curr_name=None
|
| 60 |
LLM_curr_name=None
|
| 61 |
TTS_curr_name=None
|
|
|
| 62 |
|
| 63 |
+
latency_ASR = 0.0
|
| 64 |
+
latency_LM = 0.0
|
| 65 |
+
latency_TTS = 0.0
|
|
|
| 66 |
|
| 67 |
+
text_str = ""
|
| 68 |
+
asr_output_str = ""
|
| 69 |
+
vad_output = None
|
|
|
| 70 |
audio_output = None
|
| 71 |
audio_output1 = None
|
| 72 |
+
LLM_response_arr = []
|
| 73 |
+
total_response_arr = []
|
| 74 |
+
callback = gr.CSVLogger()
|
| 75 |
+
start_record_time = None
|
| 76 |
+
enable_btn = gr.Button(interactive=True, visible=True)
|
|
|
| 77 |
|
| 78 |
+
# ------------------------
|
| 79 |
+
# Function Definitions
|
| 80 |
+
# ------------------------
|
| 81 |
|
| 82 |
+
def handle_eval_selection(
|
| 83 |
+
option: str,
|
| 84 |
+
TTS_audio_output: str,
|
| 85 |
+
LLM_Output: str,
|
| 86 |
+
ASR_audio_output: str,
|
| 87 |
+
ASR_transcript: str,
|
| 88 |
+
):
|
| 89 |
+
"""
|
| 90 |
+
Handles the evaluation of a selected metric based on
|
| 91 |
+
user input and provided outputs.
|
| 92 |
+
|
| 93 |
+
This function evaluates different aspects of a
|
| 94 |
+
cascaded conversational AI pipeline, such as:
|
| 95 |
+
Latency, TTS intelligibility, TTS speech quality,
|
| 96 |
+
ASR WER, and text dialog metrics.
|
| 97 |
+
It is designed to integrate with Gradio via
|
| 98 |
+
multiple yield statements,
|
| 99 |
+
allowing updates to be displayed in real time.
|
| 100 |
+
|
| 101 |
+
Parameters:
|
| 102 |
+
----------
|
| 103 |
+
option : str
|
| 104 |
+
The evaluation metric selected by the user.
|
| 105 |
+
Supported options include:
|
| 106 |
+
- "Latency"
|
| 107 |
+
- "TTS Intelligibility"
|
| 108 |
+
- "TTS Speech Quality"
|
| 109 |
+
- "ASR WER"
|
| 110 |
+
- "Text Dialog Metrics"
|
| 111 |
+
TTS_audio_output : np.ndarray
|
| 112 |
+
The audio output generated by the TTS module for evaluation.
|
| 113 |
+
LLM_Output : str
|
| 114 |
+
The text output generated by the LLM module for evaluation.
|
| 115 |
+
ASR_audio_output : np.ndarray
|
| 116 |
+
The audio input/output used for ASR evaluation.
|
| 117 |
+
ASR_transcript : str
|
| 118 |
+
The transcript generated by the ASR module for evaluation.
|
| 119 |
+
|
| 120 |
+
Returns:
|
| 121 |
+
-------
|
| 122 |
+
str
|
| 123 |
+
A string representation of the evaluation results.
|
| 124 |
+
The specific result depends on the selected evaluation metric:
|
| 125 |
+
- "Latency": Latencies of ASR, LLM, and TTS modules.
|
| 126 |
+
- "TTS Intelligibility": A range of scores indicating how intelligible
|
| 127 |
+
the TTS audio output is based on different reference ASR models.
|
| 128 |
+
- "TTS Speech Quality": A range of scores representing the
|
| 129 |
+
speech quality of the TTS audio output.
|
| 130 |
+
- "ASR WER": The Word Error Rate (WER) of the ASR output
|
| 131 |
+
based on different judge ASR models.
|
| 132 |
+
- "Text Dialog Metrics": A combination of perplexity,
|
| 133 |
+
diversity metrics, and relevance scores for the dialog.
|
| 134 |
+
|
| 135 |
+
Raises:
|
| 136 |
+
------
|
| 137 |
+
ValueError
|
| 138 |
+
If the `option` parameter does not match any supported evaluation metric.
|
| 139 |
+
|
| 140 |
+
Example:
|
| 141 |
+
-------
|
| 142 |
+
>>> result = handle_eval_selection(
|
| 143 |
+
option="Latency",
|
| 144 |
+
TTS_audio_output=audio_array,
|
| 145 |
+
LLM_Output="Generated response",
|
| 146 |
+
ASR_audio_output=audio_input,
|
| 147 |
+
ASR_transcript="Expected transcript"
|
| 148 |
+
)
|
| 149 |
+
>>> print(result)
|
| 150 |
+
"ASR Latency: 0.14
|
| 151 |
+
LLM Latency: 0.42
|
| 152 |
+
TTS Latency: 0.21"
|
| 153 |
+
"""
|
| 154 |
global LLM_response_arr
|
| 155 |
global total_response_arr
|
| 156 |
+
yield (option, gr.Textbox(visible=True))
|
| 157 |
+
if option == "Latency":
|
| 158 |
+
text = (
|
| 159 |
+
f"ASR Latency: {latency_ASR:.2f}\n"
|
| 160 |
+
f"LLM Latency: {latency_LM:.2f}\n"
|
| 161 |
+
f"TTS Latency: {latency_TTS:.2f}"
|
| 162 |
+
)
|
| 163 |
+
yield (None, text)
|
| 164 |
+
elif option == "TTS Intelligibility":
|
| 165 |
+
yield (None, handle_espnet_TTS_intelligibility(TTS_audio_output, LLM_Output))
|
| 166 |
+
elif option == "TTS Speech Quality":
|
| 167 |
+
yield (None, TTS_psuedomos(TTS_audio_output))
|
| 168 |
+
elif option == "ASR WER":
|
| 169 |
+
yield (None, handle_espnet_ASR_WER(ASR_audio_output, ASR_transcript))
|
| 170 |
+
elif option == "Text Dialog Metrics":
|
| 171 |
+
yield (
|
| 172 |
+
None,
|
| 173 |
+
perplexity(LLM_Output.replace("\n", " "))
|
| 174 |
+
+ vert(LLM_response_arr)
|
| 175 |
+
+ bert_score(total_response_arr)
|
| 176 |
+
+ DialoGPT_perplexity(
|
| 177 |
+
ASR_transcript.replace("\n", " "), LLM_Output.replace("\n", " ")
|
| 178 |
+
),
|
| 179 |
+
)
|
| 180 |
+
elif option is None:
|
| 181 |
+
return
|
| 182 |
+
else:
|
| 183 |
+
raise ValueError(f"Unknown option: {option}")
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
def handle_eval_selection_E2E(
|
| 187 |
+
option: str,
|
| 188 |
+
TTS_audio_output: str,
|
| 189 |
+
LLM_Output: str,
|
| 190 |
+
):
|
| 191 |
+
"""
|
| 192 |
+
Handles the evaluation of a selected metric based on user input
|
| 193 |
+
and provided outputs.
|
| 194 |
+
|
| 195 |
+
This function evaluates different aspects of an E2E
|
| 196 |
+
conversational AI model, such as:
|
| 197 |
+
Latency, TTS intelligibility, TTS speech quality, and
|
| 198 |
+
text dialog metrics.
|
| 199 |
+
It is designed to integrate with Gradio via
|
| 200 |
+
multiple yield statements,
|
| 201 |
+
allowing updates to be displayed in real time.
|
| 202 |
+
|
| 203 |
+
Parameters:
|
| 204 |
+
----------
|
| 205 |
+
option : str
|
| 206 |
+
The evaluation metric selected by the user.
|
| 207 |
+
Supported options include:
|
| 208 |
+
- "Latency"
|
| 209 |
+
- "TTS Intelligibility"
|
| 210 |
+
- "TTS Speech Quality"
|
| 211 |
+
- "Text Dialog Metrics"
|
| 212 |
+
TTS_audio_output : np.ndarray
|
| 213 |
+
The audio output generated by the TTS module for evaluation.
|
| 214 |
+
LLM_Output : str
|
| 215 |
+
The text output generated by the LLM module for evaluation.
|
| 216 |
+
|
| 217 |
+
Returns:
|
| 218 |
+
-------
|
| 219 |
+
str
|
| 220 |
+
A string representation of the evaluation results.
|
| 221 |
+
The specific result depends on the selected evaluation metric:
|
| 222 |
+
- "Latency": Latency of the entire system.
|
| 223 |
+
- "TTS Intelligibility": A range of scores indicating how intelligible the
|
| 224 |
+
TTS audio output is based on different reference ASR models.
|
| 225 |
+
- "TTS Speech Quality": A range of scores representing the
|
| 226 |
+
speech quality of the TTS audio output.
|
| 227 |
+
- "Text Dialog Metrics": A combination of perplexity and
|
| 228 |
+
diversity metrics for the dialog.
|
| 229 |
+
|
| 230 |
+
Raises:
|
| 231 |
+
------
|
| 232 |
+
ValueError
|
| 233 |
+
If the `option` parameter does not match any supported evaluation metric.
|
| 234 |
+
|
| 235 |
+
Example:
|
| 236 |
+
-------
|
| 237 |
+
>>> result = handle_eval_selection(
|
| 238 |
+
option="Latency",
|
| 239 |
+
TTS_audio_output=audio_array,
|
| 240 |
+
LLM_Output="Generated response",
|
| 241 |
+
)
|
| 242 |
+
>>> print(result)
|
| 243 |
+
"Total Latency: 2.34"
|
| 244 |
+
"""
|
| 245 |
global LLM_response_arr
|
| 246 |
global total_response_arr
|
| 247 |
+
yield (option, gr.Textbox(visible=True))
|
| 248 |
+
if option == "Latency":
|
| 249 |
+
text = f"Total Latency: {latency_TTS:.2f}"
|
| 250 |
+
yield (None, text)
|
| 251 |
+
elif option == "TTS Intelligibility":
|
| 252 |
+
yield (None, handle_espnet_TTS_intelligibility(TTS_audio_output, LLM_Output))
|
| 253 |
+
elif option == "TTS Speech Quality":
|
| 254 |
+
yield (None, TTS_psuedomos(TTS_audio_output))
|
| 255 |
+
elif option == "Text Dialog Metrics":
|
| 256 |
+
yield (None, perplexity(LLM_Output.replace("\n", " ")) + vert(LLM_response_arr))
|
| 257 |
+
elif option is None:
|
| 258 |
+
return
|
|
|
| 259 |
else:
|
| 260 |
+
raise ValueError(f"Unknown option: {option}")
|
| 261 |
+
|
|
|
| 262 |
|
| 263 |
def start_warmup():
|
| 264 |
+
"""
|
| 265 |
+
Initializes and warms up the dialogue and evaluation model.
|
| 266 |
+
|
| 267 |
+
This function is designed to ensure that all
|
| 268 |
+
components of the dialogue model are pre-loaded
|
| 269 |
+
and ready for execution, avoiding delays during runtime.
|
| 270 |
+
"""
|
| 271 |
+
global dialogue_model
|
| 272 |
+
global ASR_options
|
| 273 |
+
global LLM_options
|
| 274 |
+
global TTS_options
|
| 275 |
+
global ASR_name
|
| 276 |
+
global LLM_name
|
| 277 |
+
global TTS_name
|
| 278 |
+
for opt_count in range(len(ASR_options)):
|
| 279 |
+
opt = ASR_options[opt_count]
|
| 280 |
+
try:
|
| 281 |
+
for _ in dialogue_model.handle_ASR_selection(opt):
|
| 282 |
+
continue
|
| 283 |
+
except Exception:
|
| 284 |
+
print("Removing " + opt + " from ASR options since it cannot be loaded.")
|
| 285 |
+
ASR_options = ASR_options[:opt_count] + ASR_options[(opt_count + 1) :]
|
| 286 |
+
if opt == ASR_name:
|
| 287 |
+
ASR_name = ASR_options[0]
|
| 288 |
+
for opt_count in range(len(LLM_options)):
|
| 289 |
+
opt = LLM_options[opt_count]
|
| 290 |
+
try:
|
| 291 |
+
for _ in dialogue_model.handle_LLM_selection(opt):
|
| 292 |
+
continue
|
| 293 |
+
except Exception:
|
| 294 |
+
print("Removing " + opt + " from LLM options since it cannot be loaded.")
|
| 295 |
+
LLM_options = LLM_options[:opt_count] + LLM_options[(opt_count + 1) :]
|
| 296 |
+
if opt == LLM_name:
|
| 297 |
+
LLM_name = LLM_options[0]
|
| 298 |
+
for opt_count in range(len(TTS_options)):
|
| 299 |
+
opt = TTS_options[opt_count]
|
| 300 |
+
try:
|
| 301 |
+
for _ in dialogue_model.handle_TTS_selection(opt):
|
| 302 |
+
continue
|
| 303 |
+
except Exception:
|
| 304 |
+
print("Removing " + opt + " from TTS options since it cannot be loaded.")
|
| 305 |
+
TTS_options = TTS_options[:opt_count] + TTS_options[(opt_count + 1) :]
|
| 306 |
+
if opt == TTS_name:
|
| 307 |
+
TTS_name = TTS_options[0]
|
| 308 |
+
dialogue_model.handle_E2E_selection()
|
| 309 |
+
dialogue_model.client = None
|
| 310 |
+
for _ in dialogue_model.handle_TTS_selection(TTS_name):
|
| 311 |
continue
|
| 312 |
+
for _ in dialogue_model.handle_ASR_selection(ASR_name):
|
| 313 |
continue
|
| 314 |
+
for _ in dialogue_model.handle_LLM_selection(LLM_name):
|
| 315 |
continue
|
| 316 |
+
dummy_input = (
|
| 317 |
+
torch.randn(
|
| 318 |
(3000),
|
| 319 |
dtype=getattr(torch, "float16"),
|
| 320 |
device="cpu",
|
| 321 |
+
)
|
| 322 |
+
.cpu()
|
| 323 |
+
.numpy()
|
| 324 |
+
)
|
| 325 |
+
dummy_text = "This is dummy text"
|
| 326 |
for opt in Eval_options:
|
| 327 |
handle_eval_selection(opt, dummy_input, dummy_text, dummy_input, dummy_text)
|
| 328 |
|
|
|
|
|
|
|
| 329 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
def flash_buttons():
|
| 331 |
+
"""
|
| 332 |
+
Enables human feedback buttons after displaying system output.
|
| 333 |
+
"""
|
| 334 |
btn_updates = (enable_btn,) * 8
|
| 335 |
+
yield (
|
| 336 |
+
"",
|
| 337 |
+
"",
|
| 338 |
+
) + btn_updates
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
def transcribe(
|
| 342 |
+
stream: np.ndarray,
|
| 343 |
+
new_chunk: Tuple[int, np.ndarray],
|
| 344 |
+
TTS_option: str,
|
| 345 |
+
ASR_option: str,
|
| 346 |
+
LLM_option: str,
|
| 347 |
+
type_option: str,
|
|
|
|
|
| 348 |
):
|
| 349 |
+
"""
|
| 350 |
+
Processes and transcribes an audio stream in real-time.
|
| 351 |
+
|
| 352 |
+
This function handles the transcription of audio input
|
| 353 |
+
and its transformation through a cascaded
|
| 354 |
+
or E2E conversational AI system.
|
| 355 |
+
It dynamically updates the transcription, text generation,
|
| 356 |
+
and synthesized speech output, while managing global states and latencies.
|
| 357 |
+
|
| 358 |
+
Args:
|
| 359 |
+
stream: The current audio stream buffer.
|
| 360 |
+
`None` if the stream is being reset (e.g., after user refresh).
|
| 361 |
+
new_chunk: A tuple containing:
|
| 362 |
+
- `sr`: Sample rate of the new audio chunk.
|
| 363 |
+
- `y`: New audio data chunk.
|
| 364 |
+
TTS_option: Selected TTS model option.
|
| 365 |
+
ASR_option: Selected ASR model option.
|
| 366 |
+
LLM_option: Selected LLM model option.
|
| 367 |
+
type_option: Type of system ("Cascaded" or "E2E").
|
| 368 |
+
|
| 369 |
+
Yields:
|
| 370 |
+
Tuple[Optional[np.ndarray], Optional[str], Optional[str],
|
| 371 |
+
Optional[Tuple[int, np.ndarray]], Optional[Tuple[int, np.ndarray]]]:
|
| 372 |
+
A tuple containing:
|
| 373 |
+
- Updated stream buffer.
|
| 374 |
+
- ASR output text.
|
| 375 |
+
- Generated LLM output text.
|
| 376 |
+
- Audio output as a tuple of sample rate and audio waveform.
|
| 377 |
+
- User input audio as a tuple of sample rate and audio waveform.
|
| 378 |
+
|
| 379 |
+
Notes:
|
| 380 |
+
- Resets the session if the transcription exceeds 5 minutes.
|
| 381 |
+
- Updates the Gradio interface elements dynamically.
|
| 382 |
+
- Manages latencies.
|
| 383 |
+
"""
|
|
|
| 384 |
sr, y = new_chunk
|
| 385 |
global text_str
|
| 386 |
global chat
|
|
|
|
| 399 |
global total_response_arr
|
| 400 |
if stream is None:
|
| 401 |
# Handle user refresh
|
| 402 |
+
for (
|
| 403 |
+
_,
|
| 404 |
+
_,
|
| 405 |
+
_,
|
| 406 |
+
_,
|
| 407 |
+
asr_output_box,
|
| 408 |
+
text_box,
|
| 409 |
+
audio_box,
|
| 410 |
+
_,
|
| 411 |
+
_,
|
| 412 |
+
) in dialogue_model.handle_type_selection(
|
| 413 |
+
type_option, TTS_option, ASR_option, LLM_option
|
| 414 |
+
):
|
| 415 |
gr.Info("The models are being reloaded due to a browser refresh.")
|
| 416 |
+
yield (stream, asr_output_box, text_box, audio_box, gr.Audio(visible=False))
|
| 417 |
+
stream = y
|
| 418 |
+
text_str = ""
|
|
|
|
| 419 |
audio_output = None
|
| 420 |
audio_output1 = None
|
| 421 |
else:
|
| 422 |
+
stream = np.concatenate((stream, y))
|
| 423 |
+
(
|
| 424 |
+
asr_output_str,
|
| 425 |
+
text_str,
|
| 426 |
+
audio_output,
|
| 427 |
+
audio_output1,
|
| 428 |
+
latency_ASR,
|
| 429 |
+
latency_LM,
|
| 430 |
+
latency_TTS,
|
| 431 |
+
stream,
|
| 432 |
+
change,
|
| 433 |
+
) = dialogue_model(
|
| 434 |
+
y,
|
| 435 |
+
sr,
|
| 436 |
+
stream,
|
| 437 |
+
asr_output_str,
|
| 438 |
+
text_str,
|
| 439 |
+
audio_output,
|
| 440 |
+
audio_output1,
|
| 441 |
+
latency_ASR,
|
| 442 |
+
latency_LM,
|
| 443 |
+
latency_TTS,
|
| 444 |
+
)
|
| 445 |
+
text_str1 = text_str
|
| 446 |
+
if change:
|
| 447 |
+
print("Output changed")
|
| 448 |
+
if asr_output_str != "":
|
| 449 |
+
total_response_arr.append(asr_output_str.replace("\n", " "))
|
| 450 |
+
LLM_response_arr.append(text_str.replace("\n", " "))
|
| 451 |
+
total_response_arr.append(text_str.replace("\n", " "))
|
| 452 |
+
if (text_str != "") and (start_record_time is None):
|
| 453 |
+
start_record_time = time.time()
|
|
|
| 454 |
elif start_record_time is not None:
|
| 455 |
+
current_record_time = time.time()
|
| 456 |
+
if current_record_time - start_record_time > 300:
|
| 457 |
+
gr.Info(
|
| 458 |
+
"Conversations are limited to 5 minutes. "
|
| 459 |
+
"The session will restart in approximately 60 seconds. "
|
| 460 |
+
"Please wait for the demo to reset. "
|
| 461 |
+
"Close this message once you have read it.",
|
| 462 |
+
duration=None,
|
| 463 |
+
)
|
| 464 |
+
yield stream, gr.Textbox(visible=False), gr.Textbox(
|
| 465 |
+
visible=False
|
| 466 |
+
), gr.Audio(visible=False), gr.Audio(visible=False)
|
| 467 |
if upload_to_hub is not None:
|
| 468 |
api.upload_folder(
|
| 469 |
folder_path="flagged_data_points",
|
| 470 |
+
path_in_repo="checkpoint_" + str(start_record_time),
|
| 471 |
repo_id=upload_to_hub,
|
| 472 |
repo_type="dataset",
|
| 473 |
token=access_token,
|
| 474 |
)
|
| 475 |
+
dialogue_model.chat.buffer = []
|
| 476 |
+
text_str = ""
|
| 477 |
audio_output = None
|
| 478 |
audio_output1 = None
|
| 479 |
asr_output_str = ""
|
| 480 |
start_record_time = None
|
| 481 |
+
LLM_response_arr = []
|
| 482 |
+
total_response_arr = []
|
| 483 |
+
shutil.rmtree("flagged_data_points")
|
| 484 |
os.mkdir("flagged_data_points")
|
| 485 |
+
yield (stream, asr_output_str, text_str1, audio_output, audio_output1)
|
| 486 |
+
yield stream, gr.Textbox(visible=True), gr.Textbox(visible=True), gr.Audio(
|
| 487 |
+
visible=True
|
| 488 |
+
), gr.Audio(visible=False)
|
| 489 |
|
| 490 |
+
yield (stream, asr_output_str, text_str1, audio_output, audio_output1)
|
| 491 |
|
| 492 |
+
|
| 493 |
+
# ------------------------
|
| 494 |
+
# Executable Script
|
| 495 |
+
# ------------------------
|
| 496 |
+
api = HfApi()
|
| 497 |
+
nltk.download("averaged_perceptron_tagger_eng")
|
| 498 |
+
start_warmup()
|
| 499 |
with gr.Blocks(
|
| 500 |
+
title="E2E Spoken Dialog System",
|
| 501 |
+
) as demo:
|
| 502 |
+
with gr.Row():
|
| 503 |
+
gr.Markdown(
|
| 504 |
+
"""
|
| 505 |
+
## ESPnet-SDS
|
| 506 |
+
Welcome to our unified web interface for various cascaded and
|
| 507 |
+
E2E spoken dialogue systems built using ESPnet-SDS toolkit,
|
| 508 |
+
supporting real-time automated evaluation metrics, and
|
| 509 |
+
human-in-the-loop feedback collection.
|
| 510 |
+
|
| 511 |
+
For more details on how to use the app, refer to the [README]
|
| 512 |
+
(https://github.com/siddhu001/espnet/tree/sds_demo_recipe/egs2/TEMPLATE/sds1#how-to-use).
|
| 513 |
+
"""
|
| 514 |
+
)
|
| 515 |
+
with gr.Row():
|
| 516 |
+
with gr.Column(scale=1):
|
| 517 |
+
user_audio = gr.Audio(
|
| 518 |
+
sources=["microphone"],
|
| 519 |
+
streaming=True,
|
| 520 |
+
waveform_options=gr.WaveformOptions(sample_rate=16000),
|
| 521 |
+
)
|
| 522 |
+
with gr.Row():
|
| 523 |
+
type_radio = gr.Radio(
|
| 524 |
+
choices=["Cascaded", "E2E"],
|
| 525 |
+
label="Choose type of Spoken Dialog:",
|
| 526 |
+
value="Cascaded",
|
| 527 |
+
)
|
| 528 |
+
with gr.Row():
|
| 529 |
+
ASR_radio = gr.Radio(
|
| 530 |
+
choices=ASR_options,
|
| 531 |
+
label="Choose ASR:",
|
| 532 |
+
value=ASR_name,
|
| 533 |
+
)
|
| 534 |
+
with gr.Row():
|
| 535 |
+
LLM_radio = gr.Radio(
|
| 536 |
+
choices=LLM_options,
|
| 537 |
+
label="Choose LLM:",
|
| 538 |
+
value=LLM_name,
|
|
|
| 539 |
)
|
| 540 |
+
with gr.Row():
|
| 541 |
+
radio = gr.Radio(
|
| 542 |
+
choices=TTS_options,
|
| 543 |
+
label="Choose TTS:",
|
| 544 |
+
value=TTS_name,
|
| 545 |
+
)
|
| 546 |
+
with gr.Row():
|
| 547 |
+
E2Eradio = gr.Radio(
|
| 548 |
+
choices=["mini-omni"],
|
| 549 |
+
label="Choose E2E model:",
|
| 550 |
+
value="mini-omni",
|
| 551 |
visible=False,
|
| 552 |
)
|
| 553 |
+
with gr.Row():
|
| 554 |
+
feedback_btn = gr.Button(
|
| 555 |
+
value=(
|
| 556 |
+
"Please provide your feedback "
|
| 557 |
+
"after each system response below."
|
| 558 |
+
),
|
| 559 |
+
visible=True,
|
| 560 |
+
interactive=False,
|
| 561 |
+
elem_id="button",
|
| 562 |
+
)
|
| 563 |
+
with gr.Row():
|
| 564 |
+
natural_btn1 = gr.Button(
|
| 565 |
+
value="Very Natural", visible=False, interactive=False, scale=1
|
| 566 |
+
)
|
| 567 |
+
natural_btn2 = gr.Button(
|
| 568 |
+
value="Somewhat Awkward", visible=False, interactive=False, scale=1
|
| 569 |
+
)
|
| 570 |
+
natural_btn3 = gr.Button(
|
| 571 |
+
value="Very Awkward", visible=False, interactive=False, scale=1
|
| 572 |
+
)
|
| 573 |
+
natural_btn4 = gr.Button(
|
| 574 |
+
value="Unnatural", visible=False, interactive=False, scale=1
|
| 575 |
+
)
|
| 576 |
+
with gr.Row():
|
| 577 |
+
relevant_btn1 = gr.Button(
|
| 578 |
+
value="Highly Relevant", visible=False, interactive=False, scale=1
|
| 579 |
+
)
|
| 580 |
+
relevant_btn2 = gr.Button(
|
| 581 |
+
value="Partially Relevant",
|
| 582 |
+
visible=False,
|
| 583 |
+
interactive=False,
|
| 584 |
+
scale=1,
|
| 585 |
+
)
|
| 586 |
+
relevant_btn3 = gr.Button(
|
| 587 |
+
value="Slightly Irrelevant",
|
| 588 |
+
visible=False,
|
| 589 |
+
interactive=False,
|
| 590 |
+
scale=1,
|
| 591 |
+
)
|
| 592 |
+
relevant_btn4 = gr.Button(
|
| 593 |
+
value="Completely Irrelevant",
|
| 594 |
+
visible=False,
|
| 595 |
+
interactive=False,
|
| 596 |
+
scale=1,
|
| 597 |
+
)
|
| 598 |
+
with gr.Column(scale=1):
|
| 599 |
+
output_audio = gr.Audio(label="Output", autoplay=True, visible=True)
|
| 600 |
+
output_audio1 = gr.Audio(label="Output1", autoplay=False, visible=False)
|
| 601 |
+
output_asr_text = gr.Textbox(label="ASR output")
|
| 602 |
+
output_text = gr.Textbox(label="LLM output")
|
| 603 |
+
eval_radio = gr.Radio(
|
| 604 |
+
choices=[
|
| 605 |
+
"Latency",
|
| 606 |
+
"TTS Intelligibility",
|
| 607 |
+
"TTS Speech Quality",
|
| 608 |
+
"ASR WER",
|
| 609 |
+
"Text Dialog Metrics",
|
| 610 |
+
],
|
| 611 |
+
label="Choose Evaluation metrics:",
|
| 612 |
+
)
|
| 613 |
+
eval_radio_E2E = gr.Radio(
|
| 614 |
+
choices=[
|
| 615 |
+
"Latency",
|
| 616 |
+
"TTS Intelligibility",
|
| 617 |
+
"TTS Speech Quality",
|
| 618 |
+
"Text Dialog Metrics",
|
| 619 |
+
],
|
| 620 |
+
label="Choose Evaluation metrics:",
|
| 621 |
+
visible=False,
|
| 622 |
+
)
|
| 623 |
+
output_eval_text = gr.Textbox(label="Evaluation Results")
|
| 624 |
+
state = gr.State()
|
| 625 |
+
with gr.Row():
|
| 626 |
+
privacy_text = gr.Textbox(
|
| 627 |
+
label="Privacy Notice",
|
| 628 |
+
interactive=False,
|
| 629 |
+
value=(
|
| 630 |
+
"By using this demo, you acknowledge that"
|
| 631 |
+
"interactions with this dialog system are collected "
|
| 632 |
+
"for research and improvement purposes. The data "
|
| 633 |
+
"will only be used to enhance the performance and "
|
| 634 |
+
"understanding of the system. If you have any "
|
| 635 |
+
"concerns about data collection, please discontinue "
|
| 636 |
+
"use."
|
| 637 |
+
),
|
| 638 |
+
)
|
| 639 |
+
|
| 640 |
+
btn_list = [
|
| 641 |
+
natural_btn1,
|
| 642 |
+
natural_btn2,
|
| 643 |
+
natural_btn3,
|
| 644 |
+
natural_btn4,
|
| 645 |
+
relevant_btn1,
|
| 646 |
+
relevant_btn2,
|
| 647 |
+
relevant_btn3,
|
| 648 |
+
relevant_btn4,
|
| 649 |
+
]
|
| 650 |
+
natural_btn_list = [
|
| 651 |
+
natural_btn1,
|
| 652 |
+
natural_btn2,
|
| 653 |
+
natural_btn3,
|
| 654 |
+
natural_btn4,
|
| 655 |
+
]
|
| 656 |
+
relevant_btn_list = [
|
| 657 |
+
relevant_btn1,
|
| 658 |
+
relevant_btn2,
|
| 659 |
+
relevant_btn3,
|
| 660 |
+
relevant_btn4,
|
| 661 |
+
]
|
| 662 |
+
natural_response = gr.Textbox(
|
| 663 |
+
label="natural_response", visible=False, interactive=False
|
| 664 |
+
)
|
| 665 |
+
diversity_response = gr.Textbox(
|
| 666 |
+
label="diversity_response", visible=False, interactive=False
|
| 667 |
+
)
|
| 668 |
+
ip_address = gr.Textbox(label="ip_address", visible=False, interactive=False)
|
| 669 |
+
callback.setup(
|
| 670 |
+
[
|
| 671 |
+
user_audio,
|
| 672 |
+
output_asr_text,
|
| 673 |
+
output_text,
|
| 674 |
+
output_audio,
|
| 675 |
+
output_audio1,
|
| 676 |
+
type_radio,
|
| 677 |
+
ASR_radio,
|
| 678 |
+
LLM_radio,
|
| 679 |
+
radio,
|
| 680 |
+
E2Eradio,
|
| 681 |
+
natural_response,
|
| 682 |
+
diversity_response,
|
| 683 |
+
ip_address,
|
| 684 |
+
],
|
| 685 |
+
"flagged_data_points",
|
| 686 |
+
)
|
| 687 |
+
user_audio.stream(
|
| 688 |
+
transcribe,
|
| 689 |
+
inputs=[state, user_audio, radio, ASR_radio, LLM_radio, type_radio],
|
| 690 |
+
outputs=[state, output_asr_text, output_text, output_audio, output_audio1],
|
| 691 |
+
).then(
|
| 692 |
+
lambda *args: callback.flag(list(args)), [user_audio], None, preprocess=False
|
| 693 |
+
)
|
| 694 |
+
radio.change(
|
| 695 |
+
fn=dialogue_model.handle_TTS_selection,
|
| 696 |
+
inputs=[radio],
|
| 697 |
+
outputs=[output_asr_text, output_text, output_audio],
|
| 698 |
+
)
|
| 699 |
+
LLM_radio.change(
|
| 700 |
+
fn=dialogue_model.handle_LLM_selection,
|
| 701 |
+
inputs=[LLM_radio],
|
| 702 |
+
outputs=[output_asr_text, output_text, output_audio],
|
| 703 |
+
)
|
| 704 |
+
ASR_radio.change(
|
| 705 |
+
fn=dialogue_model.handle_ASR_selection,
|
| 706 |
+
inputs=[ASR_radio],
|
| 707 |
+
outputs=[output_asr_text, output_text, output_audio],
|
| 708 |
+
)
|
| 709 |
+
eval_radio.change(
|
| 710 |
+
fn=handle_eval_selection,
|
| 711 |
+
inputs=[eval_radio, output_audio, output_text, output_audio1, output_asr_text],
|
| 712 |
+
outputs=[eval_radio, output_eval_text],
|
| 713 |
+
)
|
| 714 |
+
eval_radio_E2E.change(
|
| 715 |
+
fn=handle_eval_selection_E2E,
|
| 716 |
+
inputs=[eval_radio_E2E, output_audio, output_text],
|
| 717 |
+
outputs=[eval_radio_E2E, output_eval_text],
|
| 718 |
+
)
|
| 719 |
+
type_radio.change(
|
| 720 |
+
fn=dialogue_model.handle_type_selection,
|
| 721 |
+
inputs=[type_radio, radio, ASR_radio, LLM_radio],
|
| 722 |
+
outputs=[
|
| 723 |
+
radio,
|
| 724 |
+
ASR_radio,
|
| 725 |
+
LLM_radio,
|
| 726 |
+
E2Eradio,
|
| 727 |
+
output_asr_text,
|
| 728 |
+
output_text,
|
| 729 |
+
output_audio,
|
| 730 |
+
eval_radio,
|
| 731 |
+
eval_radio_E2E,
|
| 732 |
+
],
|
| 733 |
+
)
|
| 734 |
+
output_audio.play(
|
| 735 |
+
flash_buttons, [], [natural_response, diversity_response] + btn_list
|
| 736 |
+
).then(
|
| 737 |
+
lambda *args: callback.flag(list(args)),
|
| 738 |
+
[
|
| 739 |
+
user_audio,
|
| 740 |
+
output_asr_text,
|
| 741 |
+
output_text,
|
| 742 |
+
output_audio,
|
| 743 |
+
output_audio1,
|
| 744 |
+
type_radio,
|
| 745 |
+
ASR_radio,
|
| 746 |
+
LLM_radio,
|
| 747 |
+
radio,
|
| 748 |
+
E2Eradio,
|
| 749 |
+
],
|
| 750 |
+
None,
|
| 751 |
+
preprocess=False,
|
| 752 |
+
)
|
| 753 |
+
natural_btn1.click(
|
| 754 |
+
natural_vote1_last_response,
|
| 755 |
+
[],
|
| 756 |
+
[natural_response, ip_address] + natural_btn_list,
|
| 757 |
+
).then(
|
| 758 |
+
lambda *args: callback.flag(list(args)),
|
| 759 |
+
[
|
| 760 |
+
user_audio,
|
| 761 |
+
output_asr_text,
|
| 762 |
+
output_text,
|
| 763 |
+
output_audio,
|
| 764 |
+
output_audio1,
|
| 765 |
+
type_radio,
|
| 766 |
+
ASR_radio,
|
| 767 |
+
LLM_radio,
|
| 768 |
+
radio,
|
| 769 |
+
E2Eradio,
|
| 770 |
+
natural_response,
|
| 771 |
+
diversity_response,
|
| 772 |
+
ip_address,
|
| 773 |
+
],
|
| 774 |
+
None,
|
| 775 |
+
preprocess=False,
|
| 776 |
+
)
|
| 777 |
+
natural_btn2.click(
|
| 778 |
+
natural_vote2_last_response,
|
| 779 |
+
[],
|
| 780 |
+
[natural_response, ip_address] + natural_btn_list,
|
| 781 |
+
).then(
|
| 782 |
+
lambda *args: callback.flag(list(args)),
|
| 783 |
+
[
|
| 784 |
+
user_audio,
|
| 785 |
+
output_asr_text,
|
| 786 |
+
output_text,
|
| 787 |
+
output_audio,
|
| 788 |
+
output_audio1,
|
| 789 |
+
type_radio,
|
| 790 |
+
ASR_radio,
|
| 791 |
+
LLM_radio,
|
| 792 |
+
radio,
|
| 793 |
+
E2Eradio,
|
| 794 |
+
natural_response,
|
| 795 |
+
diversity_response,
|
| 796 |
+
ip_address,
|
| 797 |
+
],
|
| 798 |
+
None,
|
| 799 |
+
preprocess=False,
|
| 800 |
+
)
|
| 801 |
+
natural_btn3.click(
|
| 802 |
+
natural_vote3_last_response,
|
| 803 |
+
[],
|
| 804 |
+
[natural_response, ip_address] + natural_btn_list,
|
| 805 |
+
).then(
|
| 806 |
+
lambda *args: callback.flag(list(args)),
|
| 807 |
+
[
|
| 808 |
+
user_audio,
|
| 809 |
+
output_asr_text,
|
| 810 |
+
output_text,
|
| 811 |
+
output_audio,
|
| 812 |
+
output_audio1,
|
| 813 |
+
type_radio,
|
| 814 |
+
ASR_radio,
|
| 815 |
+
LLM_radio,
|
| 816 |
+
radio,
|
| 817 |
+
E2Eradio,
|
| 818 |
+
natural_response,
|
| 819 |
+
diversity_response,
|
| 820 |
+
ip_address,
|
| 821 |
+
],
|
| 822 |
+
None,
|
| 823 |
+
preprocess=False,
|
| 824 |
+
)
|
| 825 |
+
natural_btn4.click(
|
| 826 |
+
natural_vote4_last_response,
|
| 827 |
+
[],
|
| 828 |
+
[natural_response, ip_address] + natural_btn_list,
|
| 829 |
+
).then(
|
| 830 |
+
lambda *args: callback.flag(list(args)),
|
| 831 |
+
[
|
| 832 |
+
user_audio,
|
| 833 |
+
output_asr_text,
|
| 834 |
+
output_text,
|
| 835 |
+
output_audio,
|
| 836 |
+
output_audio1,
|
| 837 |
+
type_radio,
|
| 838 |
+
ASR_radio,
|
| 839 |
+
LLM_radio,
|
| 840 |
+
radio,
|
| 841 |
+
E2Eradio,
|
| 842 |
+
natural_response,
|
| 843 |
+
diversity_response,
|
| 844 |
+
ip_address,
|
| 845 |
+
],
|
| 846 |
+
None,
|
| 847 |
+
preprocess=False,
|
| 848 |
+
)
|
| 849 |
+
relevant_btn1.click(
|
| 850 |
+
relevant_vote1_last_response,
|
| 851 |
+
[],
|
| 852 |
+
[diversity_response, ip_address] + relevant_btn_list,
|
| 853 |
+
).then(
|
| 854 |
+
lambda *args: callback.flag(list(args)),
|
| 855 |
+
[
|
| 856 |
+
user_audio,
|
| 857 |
+
output_asr_text,
|
| 858 |
+
output_text,
|
| 859 |
+
output_audio,
|
| 860 |
+
output_audio1,
|
| 861 |
+
type_radio,
|
| 862 |
+
ASR_radio,
|
| 863 |
+
LLM_radio,
|
| 864 |
+
radio,
|
| 865 |
+
E2Eradio,
|
| 866 |
+
natural_response,
|
| 867 |
+
diversity_response,
|
| 868 |
+
ip_address,
|
| 869 |
+
],
|
| 870 |
+
None,
|
| 871 |
+
preprocess=False,
|
| 872 |
+
)
|
| 873 |
+
relevant_btn2.click(
|
| 874 |
+
relevant_vote2_last_response,
|
| 875 |
+
[],
|
| 876 |
+
[diversity_response, ip_address] + relevant_btn_list,
|
| 877 |
+
).then(
|
| 878 |
+
lambda *args: callback.flag(list(args)),
|
| 879 |
+
[
|
| 880 |
+
user_audio,
|
| 881 |
+
output_asr_text,
|
| 882 |
+
output_text,
|
| 883 |
+
output_audio,
|
| 884 |
+
output_audio1,
|
| 885 |
+
type_radio,
|
| 886 |
+
ASR_radio,
|
| 887 |
+
LLM_radio,
|
| 888 |
+
radio,
|
| 889 |
+
E2Eradio,
|
| 890 |
+
natural_response,
|
| 891 |
+
diversity_response,
|
| 892 |
+
ip_address,
|
| 893 |
+
],
|
| 894 |
+
None,
|
| 895 |
+
preprocess=False,
|
| 896 |
+
)
|
| 897 |
+
relevant_btn3.click(
|
| 898 |
+
relevant_vote3_last_response,
|
| 899 |
+
[],
|
| 900 |
+
[diversity_response, ip_address] + relevant_btn_list,
|
| 901 |
+
).then(
|
| 902 |
+
lambda *args: callback.flag(list(args)),
|
| 903 |
+
[
|
| 904 |
+
user_audio,
|
| 905 |
+
output_asr_text,
|
| 906 |
+
output_text,
|
| 907 |
+
output_audio,
|
| 908 |
+
output_audio1,
|
| 909 |
+
type_radio,
|
| 910 |
+
ASR_radio,
|
| 911 |
+
LLM_radio,
|
| 912 |
+
radio,
|
| 913 |
+
E2Eradio,
|
| 914 |
+
natural_response,
|
| 915 |
+
diversity_response,
|
| 916 |
+
ip_address,
|
| 917 |
+
],
|
| 918 |
+
None,
|
| 919 |
+
preprocess=False,
|
| 920 |
+
)
|
| 921 |
+
relevant_btn4.click(
|
| 922 |
+
relevant_vote4_last_response,
|
| 923 |
+
[],
|
| 924 |
+
[diversity_response, ip_address] + relevant_btn_list,
|
| 925 |
+
).then(
|
| 926 |
+
lambda *args: callback.flag(list(args)),
|
| 927 |
+
[
|
| 928 |
+
user_audio,
|
| 929 |
+
output_asr_text,
|
| 930 |
+
output_text,
|
| 931 |
+
output_audio,
|
| 932 |
+
output_audio1,
|
| 933 |
+
type_radio,
|
| 934 |
+
ASR_radio,
|
| 935 |
+
LLM_radio,
|
| 936 |
+
radio,
|
| 937 |
+
E2Eradio,
|
| 938 |
+
natural_response,
|
| 939 |
+
diversity_response,
|
| 940 |
+
ip_address,
|
| 941 |
+
],
|
| 942 |
+
None,
|
| 943 |
+
preprocess=False,
|
| 944 |
+
)
|
| 945 |
demo.launch(share=True)
|
| 946 |
+
|
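
Before the new helper files, a minimal, self-contained sketch of the Gradio streaming-and-flagging pattern that the rewritten app.py above relies on: microphone chunks stream into a handler and every result is logged with gr.CSVLogger. The echo handler, component set, and flagging directory here are placeholders for illustration, not the demo itself.

import gradio as gr
import numpy as np

logger = gr.CSVLogger()

def echo_chunk(state, new_chunk):
    # Accumulate streamed microphone chunks, mirroring transcribe() above.
    sr, y = new_chunk
    state = y if state is None else np.concatenate((state, y))
    return state, f"received {len(state)} samples at {sr} Hz"

with gr.Blocks() as demo:
    mic = gr.Audio(sources=["microphone"], streaming=True)
    status = gr.Textbox(label="status")
    state = gr.State()
    logger.setup([mic, status], "flagged_data_points")
    mic.stream(echo_chunk, inputs=[state, mic], outputs=[state, status]).then(
        lambda *args: logger.flag(list(args)), [mic, status], None, preprocess=False
    )

if __name__ == "__main__":
    demo.launch()
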
pyscripts/utils/dialog_eval/ASR_WER.py ADDED
|
@@ -0,0 +1,165 @@
|
|
|
| 1 |
+
from typing import Tuple
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
from espnet2.sds.utils.utils import int2float
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def handle_espnet_ASR_WER(
|
| 9 |
+
ASR_audio_output: Tuple[int, np.ndarray], ASR_transcript: str
|
| 10 |
+
) -> str:
|
| 11 |
+
"""
|
| 12 |
+
Compute and return Word Error Rate (WER) and Character Error Rate (CER) metrics
|
| 13 |
+
for multiple judge ASR systems (ESPnet, OWSM, Whisper) using the Versa library.
|
| 14 |
+
|
| 15 |
+
This function performs the following:
|
| 16 |
+
1. Imports necessary metrics and setup functions from Versa.
|
| 17 |
+
2. Prepares configuration arguments for each ASR system (ESPnet, OWSM, Whisper).
|
| 18 |
+
3. Runs the Levenshtein-based WER/CER calculations.
|
| 19 |
+
4. Returns a formatted string summarizing WER and CER
|
| 20 |
+
results for reference produced by each ASR system.
|
| 21 |
+
|
| 22 |
+
Args:
|
| 23 |
+
ASR_audio_output (tuple):
|
| 24 |
+
A tuple where:
|
| 25 |
+
- The first element is the frame rate.
|
| 26 |
+
- The second element is the audio signal (NumPy array).
|
| 27 |
+
ASR_transcript (str):
|
| 28 |
+
The transcript produced by the ASR model in the cascaded
|
| 29 |
+
conversational AI pipeline.
|
| 30 |
+
|
| 31 |
+
Returns:
|
| 32 |
+
str:
|
| 33 |
+
A formatted string showing the WER and CER percentages
|
| 34 |
+
for ESPnet, OWSM, and Whisper. Example output:
|
| 35 |
+
|
| 36 |
+
"ESPnet WER: 10.50
|
| 37 |
+
ESPnet CER: 7.20
|
| 38 |
+
OWSM WER: 11.30
|
| 39 |
+
OWSM CER: 8.00
|
| 40 |
+
Whisper WER: 9.25
|
| 41 |
+
Whisper CER: 6.50"
|
| 42 |
+
|
| 43 |
+
Raises:
|
| 44 |
+
ImportError:
|
| 45 |
+
If Versa is not installed or cannot be imported.
|
| 46 |
+
|
| 47 |
+
Example:
|
| 48 |
+
>>> asr_audio_output = (16000, audio_array)
|
| 49 |
+
>>> asr_transcript = "This is the ASR transcript."
|
| 50 |
+
>>> result = handle_espnet_ASR_WER(asr_audio_output, asr_transcript)
|
| 51 |
+
>>> print(result)
|
| 52 |
+
"ESPnet WER: 10.50
|
| 53 |
+
ESPnet CER: 7.20
|
| 54 |
+
OWSM WER: 11.30
|
| 55 |
+
OWSM CER: 8.00
|
| 56 |
+
Whisper WER: 9.25
|
| 57 |
+
Whisper CER: 6.50"
|
| 58 |
+
"""
|
| 59 |
+
try:
|
| 60 |
+
from versa import (
|
| 61 |
+
espnet_levenshtein_metric,
|
| 62 |
+
espnet_wer_setup,
|
| 63 |
+
owsm_levenshtein_metric,
|
| 64 |
+
owsm_wer_setup,
|
| 65 |
+
whisper_levenshtein_metric,
|
| 66 |
+
whisper_wer_setup,
|
| 67 |
+
)
|
| 68 |
+
except Exception as e:
|
| 69 |
+
print("Error: Versa is not properly installed.")
|
| 70 |
+
raise e
|
| 71 |
+
score_modules_espnet = {
|
| 72 |
+
"module": espnet_levenshtein_metric,
|
| 73 |
+
"args": espnet_wer_setup(
|
| 74 |
+
model_tag="default",
|
| 75 |
+
beam_size=1,
|
| 76 |
+
text_cleaner="whisper_en",
|
| 77 |
+
use_gpu=True,
|
| 78 |
+
),
|
| 79 |
+
}
|
| 80 |
+
dict1 = score_modules_espnet["module"](
|
| 81 |
+
score_modules_espnet["args"],
|
| 82 |
+
int2float(ASR_audio_output[1]),
|
| 83 |
+
ASR_transcript,
|
| 84 |
+
ASR_audio_output[0],
|
| 85 |
+
)
|
| 86 |
+
espnet_wer = (
|
| 87 |
+
dict1["espnet_wer_delete"]
|
| 88 |
+
+ dict1["espnet_wer_insert"]
|
| 89 |
+
+ dict1["espnet_wer_replace"]
|
| 90 |
+
) / (
|
| 91 |
+
dict1["espnet_wer_insert"]
|
| 92 |
+
+ dict1["espnet_wer_replace"]
|
| 93 |
+
+ dict1["espnet_wer_equal"]
|
| 94 |
+
)
|
| 95 |
+
espnet_cer = (
|
| 96 |
+
dict1["espnet_cer_delete"]
|
| 97 |
+
+ dict1["espnet_cer_insert"]
|
| 98 |
+
+ dict1["espnet_cer_replace"]
|
| 99 |
+
) / (
|
| 100 |
+
dict1["espnet_cer_insert"]
|
| 101 |
+
+ dict1["espnet_cer_replace"]
|
| 102 |
+
+ dict1["espnet_cer_equal"]
|
| 103 |
+
)
|
| 104 |
+
score_modules_owsm = {
|
| 105 |
+
"module": owsm_levenshtein_metric,
|
| 106 |
+
"args": owsm_wer_setup(
|
| 107 |
+
model_tag="default",
|
| 108 |
+
beam_size=1,
|
| 109 |
+
text_cleaner="whisper_en",
|
| 110 |
+
use_gpu=True,
|
| 111 |
+
),
|
| 112 |
+
}
|
| 113 |
+
dict1 = score_modules_owsm["module"](
|
| 114 |
+
score_modules_owsm["args"],
|
| 115 |
+
int2float(ASR_audio_output[1]),
|
| 116 |
+
ASR_transcript,
|
| 117 |
+
ASR_audio_output[0],
|
| 118 |
+
)
|
| 119 |
+
owsm_wer = (
|
| 120 |
+
dict1["owsm_wer_delete"] + dict1["owsm_wer_insert"] + dict1["owsm_wer_replace"]
|
| 121 |
+
) / (dict1["owsm_wer_insert"] + dict1["owsm_wer_replace"] + dict1["owsm_wer_equal"])
|
| 122 |
+
owsm_cer = (
|
| 123 |
+
dict1["owsm_cer_delete"] + dict1["owsm_cer_insert"] + dict1["owsm_cer_replace"]
|
| 124 |
+
) / (dict1["owsm_cer_insert"] + dict1["owsm_cer_replace"] + dict1["owsm_cer_equal"])
|
| 125 |
+
score_modules_whisper = {
|
| 126 |
+
"module": whisper_levenshtein_metric,
|
| 127 |
+
"args": whisper_wer_setup(
|
| 128 |
+
model_tag="default",
|
| 129 |
+
beam_size=1,
|
| 130 |
+
text_cleaner="whisper_en",
|
| 131 |
+
use_gpu=True,
|
| 132 |
+
),
|
| 133 |
+
}
|
| 134 |
+
dict1 = score_modules_whisper["module"](
|
| 135 |
+
score_modules_whisper["args"],
|
| 136 |
+
int2float(ASR_audio_output[1]),
|
| 137 |
+
ASR_transcript,
|
| 138 |
+
ASR_audio_output[0],
|
| 139 |
+
)
|
| 140 |
+
whisper_wer = (
|
| 141 |
+
dict1["whisper_wer_delete"]
|
| 142 |
+
+ dict1["whisper_wer_insert"]
|
| 143 |
+
+ dict1["whisper_wer_replace"]
|
| 144 |
+
) / (
|
| 145 |
+
dict1["whisper_wer_insert"]
|
| 146 |
+
+ dict1["whisper_wer_replace"]
|
| 147 |
+
+ dict1["whisper_wer_equal"]
|
| 148 |
+
)
|
| 149 |
+
whisper_cer = (
|
| 150 |
+
dict1["whisper_cer_delete"]
|
| 151 |
+
+ dict1["whisper_cer_insert"]
|
| 152 |
+
+ dict1["whisper_cer_replace"]
|
| 153 |
+
) / (
|
| 154 |
+
dict1["whisper_cer_insert"]
|
| 155 |
+
+ dict1["whisper_cer_replace"]
|
| 156 |
+
+ dict1["whisper_cer_equal"]
|
| 157 |
+
)
|
| 158 |
+
return (
|
| 159 |
+
f"ESPnet WER: {espnet_wer*100:.2f}\n"
|
| 160 |
+
f"ESPnet CER: {espnet_cer*100:.2f}\n"
|
| 161 |
+
f"OWSM WER: {owsm_wer*100:.2f}\n"
|
| 162 |
+
f"OWSM CER: {owsm_cer*100:.2f}\n"
|
| 163 |
+
f"Whisper WER: {whisper_wer*100:.2f}\n"
|
| 164 |
+
f"Whisper CER: {whisper_cer*100:.2f}"
|
| 165 |
+
)
|
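A hypothetical call sketch for the function above (the waveform and transcript are placeholders; Versa downloads the judge ASR models on first use and, with use_gpu=True, expects a CUDA device):

import numpy as np
from pyscripts.utils.dialog_eval.ASR_WER import handle_espnet_ASR_WER

sr = 16000
audio = (0.01 * 32767 * np.random.randn(sr)).astype(np.int16)  # 1 s of placeholder audio
print(handle_espnet_ASR_WER((sr, audio), "this is the asr transcript"))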
pyscripts/utils/dialog_eval/LLM_Metrics.py
ADDED
@@ -0,0 +1,245 @@
from multiprocessing import Pool
from typing import List

import numpy as np
import torch
from pyscripts.utils.dialog_eval.vert import (
    get_auto_bleu2_geometric,
    get_self_bleu2_geometric,
    run_f,
)
from scipy.stats import gmean
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer


def perplexity(LLM_Output: str, model_id: str = "gpt2") -> str:
    """
    Compute the perplexity of the given text using a specified model from the
    `evaluate` library (default: GPT-2).

    Args:
        LLM_Output (str):
            The text (string) for which perplexity is to be computed.
        model_id (str, optional):
            The identifier of the model to use for computing
            perplexity. Defaults to "gpt2".

    Returns:
        str:
            A formatted string showing the perplexity of the
            provided text(s), for example:
            "Perplexity: 45.23\n"

    Raises:
        ImportError:
            If the `evaluate` library is not installed or cannot be imported.

    Example:
        >>> text = "Hello world, this is a test."
        >>> result = perplexity(text, model_id="gpt2")
        >>> print(result)
        "Perplexity: 27.34\n"
    """
    try:
        import evaluate
    except Exception as e:
        print("Error: evaluate is not properly installed.")
        raise e
    perplexity = evaluate.load("perplexity", module_type="metric")
    results = perplexity.compute(model_id=model_id, predictions=[LLM_Output])
    return f"Perplexity: {results['mean_perplexity']:.2f}\n"


def vert(LLM_response_arr: List[str]) -> str:
    """
    Calculate and return Self BLEU-2, Auto BLEU-2 and VERT-2
    metrics for a list of LLM responses.

    Args:
        LLM_response_arr (List[str]):
            A list of responses (strings) generated by the language
            model acting as text dialog response generator.

    Returns:
        str:
            A formatted string that includes each computed metric and the final
            VERT value, for example:

            "Self-BLEU2-geometric: 42.13
            Auto-BLEU2-geometric: 38.94
            VERT: 40.5
            "

    Example:
        >>> # Suppose we have the following LLM responses:
        >>> responses = ["Hello world", "Foo bar", "Lorem ipsum dolor sit amet"]
        >>> result = vert(responses)
        >>> print(result)
        "Self-BLEU2-geometric: 42.13
        Auto-BLEU2-geometric: 38.94
        VERT: 40.5
        "
    """
    terms = [x.strip().split() for x in LLM_response_arr]

    tasks = [
        ("Self-BLEU2-geometric", get_self_bleu2_geometric),
        ("Auto-BLEU2-geometric", get_auto_bleu2_geometric),
    ]
    n_processes = min(16, len(tasks))
    with Pool(n_processes) as pool:
        metrics = pool.map(run_f, [(t[1], terms) for t in tasks])
    metric_arr = []
    str1 = ""
    for (metric_name, _), metric in zip(tasks, metrics):
        metric, sem = np.mean(metric), np.std(metric) / np.sqrt(len(metric))

        metric, sem = [round(100 * x, 2) for x in [metric, sem]]
        metric_arr.append(metric)

        str1 += f"{metric_name}: {metric}\n"
    str1 += f"VERT: {round(gmean(metric_arr), 2)}\n"
    return str1


def bert_score(
    total_response_arr: List[str], bert_model_name: str = "bert-base-uncased"
) -> str:
    """
    Compute a cosine similarity score between the concatenated context
    (all but the last element) and the final response (last element)
    using a BERT-based model. This serves as a simplified measure of how
    closely the response aligns with the preceding context semantically.

    Args:
        total_response_arr (List[str]):
            A list of strings. The last element represents the response,
            while all other elements are treated as the context.
        bert_model_name (str, optional):
            The name or path of the BERT model to use (from the Hugging Face Model Hub).
            Defaults to "bert-base-uncased".

    Returns:
        str:
            A string containing the cosine similarity
            (as a percentage) followed by a newline.
            For example:
            "Cosine Similarity: 85.67\n"

    Example:
        >>> total_responses = [
        ...     "User: Hi, how are you?",
        ...     "Assistant: I'm good! How can I help you today?",
        ...     "User: Can you tell me a joke?",
        ...     "Assistant: Sure! Here's one: Why did the chicken join a band?"
        ... ]
        >>> result = bert_score(total_responses, bert_model_name="bert-base-uncased")
        >>> print(result)
        "Cosine Similarity: 75.89\n"
    """

    def cosine_similarity_context_response(context, response, model, tokenizer):
        # Tokenize and encode both context and response
        context_inputs = tokenizer(context, return_tensors="pt", truncation=True)
        response_inputs = tokenizer(response, return_tensors="pt", truncation=True)
        for k in context_inputs:
            context_inputs[k] = context_inputs[k].cuda()
        for k in response_inputs:
            response_inputs[k] = response_inputs[k].cuda()

        # Get embeddings from the model
        with torch.no_grad():
            context_embedding = model(**context_inputs).last_hidden_state.mean(dim=1)
            response_embedding = model(**response_inputs).last_hidden_state.mean(dim=1)

        # Compute cosine similarity
        similarity = cosine_similarity(
            context_embedding.cpu().numpy(), response_embedding.cpu().numpy()
        )
        return similarity[0][0]

    bert_model = AutoModel.from_pretrained(bert_model_name).cuda()
    bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
    similarity = cosine_similarity_context_response(
        " ".join(total_response_arr[:-1]),
        total_response_arr[-1],
        bert_model,
        bert_tokenizer,
    )
    return f"Cosine Similarity: {similarity*100:.2f}" + "\n"


def DialoGPT_perplexity(
    user_utterance: str,
    response: str,
    dialog_model_name: str = "microsoft/DialoGPT-medium",
) -> str:
    """
    Compute the perplexity of a response given a user utterance using a pre-trained
    DialoGPT model. The function loads DialoGPT (medium by default)
    from the Hugging Face Model Hub, then calculates the perplexity
    for the (context + response) sequence.

    Args:
        user_utterance (str):
            The user utterance preceding the model's response.
        response (str):
            The generated response whose perplexity needs to be evaluated.

    Returns:
        str:
            A formatted string containing the DialoGPT perplexity score. For example:
            "DialoGPT Perplexity: 25.67\n"

    Example:
        >>> user_text = "Hi, how are you today?"
        >>> system_response = "I'm good, thank you! How can I help you?"
        >>> result = DialoGPT_perplexity(user_text, system_response)
        >>> print(result)
        "DialoGPT Perplexity: 31.45\n"
    """

    def evaluate_response_with_dialoGPT(context, response, model, tokenizer):
        """
        Evaluate the appropriateness of a response based on the
        given context using DialoGPT.

        Args:
            context (str): The dialogue context (previous conversation).
            response (str): The generated response to evaluate.
            model: Pre-trained DialoGPT model.
            tokenizer: Corresponding tokenizer for the DialoGPT model.

        Returns:
            float: Perplexity score of the response given the context.
        """
        model.eval()

        # Combine context and response as input
        input_text = context + tokenizer.eos_token + response + tokenizer.eos_token
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
        inputs["input_ids"] = inputs["input_ids"].cuda()
        inputs["attention_mask"] = inputs["attention_mask"].cuda()
        # import pdb;pdb.set_trace()

        # Compute model outputs and loss
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"].cuda())
            loss = outputs.loss

        # Calculate perplexity
        perplexity = torch.exp(loss)
        return perplexity.cpu().item()

    # Load DialoGPT model and tokenizer
    model_name = dialog_model_name
    model = AutoModelForCausalLM.from_pretrained(model_name).cuda()
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    perplexity = evaluate_response_with_dialoGPT(
        user_utterance, response, model, tokenizer
    )
    return f"DialoGPT Perplexity: {perplexity:.2f}" + "\n"
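Illustrative usage of the four text-dialog metrics above (model weights are fetched from the Hugging Face Hub; bert_score and DialoGPT_perplexity call .cuda(), so a GPU is assumed, and the printed numbers are only examples):

from pyscripts.utils.dialog_eval.LLM_Metrics import (
    DialoGPT_perplexity,
    bert_score,
    perplexity,
    vert,
)

print(perplexity("The quick brown fox jumps over the lazy dog."))
print(vert(["How can I help you?", "Sure, here is an example.", "Let me check that for you."]))
print(bert_score(["User: Tell me a joke.", "Why did the chicken cross the road?"]))
print(DialoGPT_perplexity("Tell me a joke.", "Why did the chicken cross the road?"))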
pyscripts/utils/dialog_eval/TTS_intelligibility.py
ADDED
@@ -0,0 +1,169 @@
from typing import Tuple

import numpy as np

from espnet2.sds.utils.utils import int2float


def handle_espnet_TTS_intelligibility(
    TTS_audio_output: Tuple[int, np.ndarray], LLM_Output: str
) -> str:
    """
    Compute and return Word Error Rate (WER) and Character Error Rate (CER) metrics
    for multiple ASR systems (ESPnet, OWSM, Whisper) using the Versa library.

    This function:
    1. Imports the necessary metrics and setup functions from Versa.
    2. Prepares configuration arguments for each ASR system (ESPnet, OWSM, Whisper).
    3. Runs the Levenshtein-based WER/CER calculations on the provided TTS audio.
    4. Returns a formatted string summarizing WER and CER results for hypotheses
       produced by each ASR system when transcribing the TTS audio, using
       the LLM output as the reference text.

    Args:
        TTS_audio_output (Tuple[int, np.ndarray]):
            A tuple consisting of:
            - The first element (int): the frame rate of the audio.
            - The second element (np.ndarray):
              the audio signal (e.g., a NumPy array).
        LLM_Output (str):
            The reference text generated by the LLM, which serves as the ground truth
            for evaluating the TTS audio.

    Returns:
        str:
            A formatted string showing the WER and CER percentages
            for ESPnet, OWSM, and Whisper.
            Example:

            ESPnet WER: 10.50
            ESPnet CER: 7.20
            OWSM WER: 11.30
            OWSM CER: 8.00
            Whisper WER: 9.25
            Whisper CER: 6.50

    Raises:
        ImportError:
            If the Versa library is not installed or cannot be imported.

    Example:
        >>> tts_audio_output = (16000, audio_array)
        >>> llm_output = "This is the reference text for evaluation."
        >>> result = handle_espnet_TTS_intelligibility(tts_audio_output, llm_output)
        >>> print(result)
        ESPnet WER: 10.50
        ESPnet CER: 7.20
        OWSM WER: 11.30
        OWSM CER: 8.00
        Whisper WER: 9.25
        Whisper CER: 6.50
    """
    try:
        from versa import (
            espnet_levenshtein_metric,
            espnet_wer_setup,
            owsm_levenshtein_metric,
            owsm_wer_setup,
            whisper_levenshtein_metric,
            whisper_wer_setup,
        )
    except Exception as e:
        print("Error: Versa is not properly installed.")
        raise e
    score_modules_espnet = {
        "module": espnet_levenshtein_metric,
        "args": espnet_wer_setup(
            model_tag="default",
            beam_size=1,
            text_cleaner="whisper_en",
            use_gpu=True,
        ),
    }
    dict1 = score_modules_espnet["module"](
        score_modules_espnet["args"],
        int2float(TTS_audio_output[1]),
        LLM_Output,
        TTS_audio_output[0],
    )
    espnet_wer = (
        dict1["espnet_wer_delete"]
        + dict1["espnet_wer_insert"]
        + dict1["espnet_wer_replace"]
    ) / (
        dict1["espnet_wer_delete"]
        + dict1["espnet_wer_replace"]
        + dict1["espnet_wer_equal"]
    )
    espnet_cer = (
        dict1["espnet_cer_delete"]
        + dict1["espnet_cer_insert"]
        + dict1["espnet_cer_replace"]
    ) / (
        dict1["espnet_cer_delete"]
        + dict1["espnet_cer_replace"]
        + dict1["espnet_cer_equal"]
    )
    score_modules_owsm = {
        "module": owsm_levenshtein_metric,
        "args": owsm_wer_setup(
            model_tag="default",
            beam_size=1,
            text_cleaner="whisper_en",
            use_gpu=True,
        ),
    }
    dict1 = score_modules_owsm["module"](
        score_modules_owsm["args"],
        int2float(TTS_audio_output[1]),
        LLM_Output,
        TTS_audio_output[0],
    )
    owsm_wer = (
        dict1["owsm_wer_delete"] + dict1["owsm_wer_insert"] + dict1["owsm_wer_replace"]
    ) / (dict1["owsm_wer_delete"] + dict1["owsm_wer_replace"] + dict1["owsm_wer_equal"])
    owsm_cer = (
        dict1["owsm_cer_delete"] + dict1["owsm_cer_insert"] + dict1["owsm_cer_replace"]
    ) / (dict1["owsm_cer_delete"] + dict1["owsm_cer_replace"] + dict1["owsm_cer_equal"])
    score_modules_whisper = {
        "module": whisper_levenshtein_metric,
        "args": whisper_wer_setup(
            model_tag="default",
            beam_size=1,
            text_cleaner="whisper_en",
            use_gpu=True,
        ),
    }
    dict1 = score_modules_whisper["module"](
        score_modules_whisper["args"],
        int2float(TTS_audio_output[1]),
        LLM_Output,
        TTS_audio_output[0],
    )
    whisper_wer = (
        dict1["whisper_wer_delete"]
        + dict1["whisper_wer_insert"]
        + dict1["whisper_wer_replace"]
    ) / (
        dict1["whisper_wer_delete"]
        + dict1["whisper_wer_replace"]
        + dict1["whisper_wer_equal"]
    )
    whisper_cer = (
        dict1["whisper_cer_delete"]
        + dict1["whisper_cer_insert"]
        + dict1["whisper_cer_replace"]
    ) / (
        dict1["whisper_cer_delete"]
        + dict1["whisper_cer_replace"]
        + dict1["whisper_cer_equal"]
    )
    return (
        f"ESPnet WER: {espnet_wer*100:.2f}\n"
        f"ESPnet CER: {espnet_cer*100:.2f}\n"
        f"OWSM WER: {owsm_wer*100:.2f}\n"
        f"OWSM CER: {owsm_cer*100:.2f}\n"
        f"Whisper WER: {whisper_wer*100:.2f}\n"
        f"Whisper CER: {whisper_cer*100:.2f}"
    )
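The WER/CER assembly in this file reduces to errors divided by reference length, where the reference length is the sum of the delete, replace, and equal counts. A toy recomputation with made-up edit counts:

ops = {"delete": 1, "insert": 2, "replace": 3, "equal": 44}
wer = (ops["delete"] + ops["insert"] + ops["replace"]) / (
    ops["delete"] + ops["replace"] + ops["equal"]
)
print(f"WER: {wer * 100:.2f}")  # 6 errors over 48 reference words -> 12.50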
pyscripts/utils/dialog_eval/TTS_speech_quality.py
ADDED
@@ -0,0 +1,98 @@
from typing import Tuple

import numpy as np

from espnet2.sds.utils.utils import int2float


def TTS_psuedomos(TTS_audio_output: Tuple[int, np.ndarray]) -> str:
    """
    Compute and return speech quality metrics
    for the given synthesized audio output
    using the Versa library.

    Args:
        TTS_audio_output (Tuple[int, np.ndarray]):
            A tuple containing:
            - The first element (int): The frame rate of the audio.
            - The second element (np.ndarray): The audio signal,
              typically a NumPy array.

    Returns:
        str:
            A formatted string containing each metric name
            and its corresponding score, for example:

            utmos: 3.54
            dnsmos: 3.47
            plcmos: 3.62
            sheet_ssqa: 4.03

    Raises:
        ImportError:
            If the Versa library is not installed or cannot be imported.

    Example:
        >>> tts_audio_output = (16000, audio_array)
        >>> result = TTS_psuedomos(tts_audio_output)
        >>> print(result)
        utmos: 3.54
        dnsmos: 3.47
        plcmos: 3.62
        sheet_ssqa: 4.03
    """
    try:
        from versa import (
            pseudo_mos_metric,
            pseudo_mos_setup,
            sheet_ssqa,
            sheet_ssqa_setup,
        )
    except Exception as e:
        print("Error: Versa is not properly installed.")
        raise e

    predictor_dict, predictor_fs = pseudo_mos_setup(
        use_gpu=True,
        predictor_types=["utmos", "dnsmos", "plcmos"],
        predictor_args={
            "utmos": {"fs": 16000},
            "dnsmos": {"fs": 16000},
            "plcmos": {"fs": 16000},
        },
    )
    score_modules = {
        "module": pseudo_mos_metric,
        "args": {
            "predictor_dict": predictor_dict,
            "predictor_fs": predictor_fs,
            "use_gpu": True,
        },
    }
    dict1 = score_modules["module"](
        int2float(TTS_audio_output[1]),
        TTS_audio_output[0],
        **score_modules["args"],
    )
    str1 = ""
    for k in dict1:
        str1 = str1 + f"{k}: {dict1[k]:.2f}\n"
    sheet_model = sheet_ssqa_setup(
        model_tag="default",
        model_path=None,
        model_config=None,
        use_gpu=True,
    )
    score_modules = {
        "module": sheet_ssqa,
        "args": {"model": sheet_model, "use_gpu": True},
    }
    dict1 = score_modules["module"](
        score_modules["args"]["model"],
        int2float(TTS_audio_output[1]),
        TTS_audio_output[0],
        use_gpu=score_modules["args"]["use_gpu"],
    )
    for k in dict1:
        str1 = str1 + f"{k}: {dict1[k]:.2f}\n"
    return str1
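Sketch of a direct call (assumes Versa's pseudo-MOS and Sheet-SSQA models are installed and a GPU is available; the sine tone is only a stand-in for real TTS output):

import numpy as np
from pyscripts.utils.dialog_eval.TTS_speech_quality import TTS_psuedomos

sr = 16000
tone = (0.3 * 32767 * np.sin(2 * np.pi * 220 * np.arange(sr) / sr)).astype(np.int16)
print(TTS_psuedomos((sr, tone)))  # one "metric: score" line per predictor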
pyscripts/utils/dialog_eval/__pycache__/ASR_WER.cpython-39.pyc
ADDED
Binary file (4.12 kB)

pyscripts/utils/dialog_eval/__pycache__/LLM_Metrics.cpython-39.pyc
ADDED
Binary file (8.51 kB)

pyscripts/utils/dialog_eval/__pycache__/TTS_intelligibility.cpython-39.pyc
ADDED
Binary file (4.34 kB)

pyscripts/utils/dialog_eval/__pycache__/TTS_speech_quality.cpython-39.pyc
ADDED
Binary file (2.39 kB)

pyscripts/utils/dialog_eval/__pycache__/human_feedback.cpython-39.pyc
ADDED
Binary file (7.34 kB)

pyscripts/utils/dialog_eval/__pycache__/vert.cpython-39.pyc
ADDED
Binary file (9.13 kB)
pyscripts/utils/dialog_eval/human_feedback.py
ADDED
@@ -0,0 +1,242 @@
import gradio as gr

disable_btn = gr.Button(interactive=False, visible=False)


def get_ip(request: gr.Request) -> str:
    """
    Retrieve the IP address from an incoming HTTP request.

    Args:
        request (gr.Request):
            The incoming HTTP request from which the IP address will be extracted.

    Returns:
        str:
            The IP address as a string.
    """
    if "cf-connecting-ip" in request.headers:
        ip = request.headers["cf-connecting-ip"]
    elif "x-forwarded-for" in request.headers:
        ip = request.headers["x-forwarded-for"]
        if "," in ip:
            ip = ip.split(",")[0]
    else:
        ip = request.client.host
    return ip


def natural_vote1_last_response(request: gr.Request):
    """
    Handle a user vote for naturalness as "Very Natural".

    Args:
        request (gr.Request):
            The Gradio request object providing access to HTTP headers and metadata.

    Returns:
        tuple:
            A tuple containing:
            ("Very Natural", <ip_address>, (disable_btn,) * 4)

            - "Very Natural": The selected vote or label.
            - <ip_address>: The IP address of the client retrieved from the request.
            - disable_btn: An object repeated four times,
              to disable natural vote buttons.
    """
    ip_address1 = get_ip(request)
    print(f"Very Natural (voted). ip: {ip_address1}")
    return (
        "Very Natural",
        ip_address1,
    ) + (disable_btn,) * 4


def natural_vote2_last_response(request: gr.Request):
    """
    Handle a user vote for naturalness as "Somewhat Awkward".

    Args:
        request (gr.Request):
            The Gradio request object providing access to HTTP headers and metadata.

    Returns:
        tuple:
            A tuple containing:
            ("Somewhat Awkward", <ip_address>, (disable_btn,) * 4)

            - "Somewhat Awkward": The selected vote or label.
            - <ip_address>: The IP address of the client retrieved from the request.
            - disable_btn: An object repeated four times,
              to disable natural vote buttons.
    """
    ip_address1 = get_ip(request)
    print(f"Somewhat Awkward (voted). ip: {ip_address1}")
    return (
        "Somewhat Awkward",
        ip_address1,
    ) + (disable_btn,) * 4


def natural_vote3_last_response(request: gr.Request):
    """
    Handle a user vote for naturalness as "Very Awkward".

    Args:
        request (gr.Request):
            The Gradio request object providing access to HTTP headers and metadata.

    Returns:
        tuple:
            A tuple containing:
            ("Very Awkward", <ip_address>, (disable_btn,) * 4)

            - "Very Awkward": The selected vote or label.
            - <ip_address>: The IP address of the client retrieved from the request.
            - disable_btn: An object repeated four times,
              to disable natural vote buttons.
    """
    ip_address1 = get_ip(request)
    print(f"Very Awkward (voted). ip: {ip_address1}")
    return (
        "Very Awkward",
        ip_address1,
    ) + (disable_btn,) * 4


def natural_vote4_last_response(request: gr.Request):
    """
    Handle a user vote for naturalness as "Unnatural".

    Args:
        request (gr.Request):
            The Gradio request object providing access to HTTP headers and metadata.

    Returns:
        tuple:
            A tuple containing:
            ("Unnatural", <ip_address>, (disable_btn,) * 4)

            - "Unnatural": The selected vote or label.
            - <ip_address>: The IP address of the client retrieved from the request.
            - disable_btn: An object repeated four times,
              to disable natural vote buttons.
    """
    ip_address1 = get_ip(request)
    print(f"Unnatural (voted). ip: {ip_address1}")
    return (
        "Unnatural",
        ip_address1,
    ) + (disable_btn,) * 4


def relevant_vote1_last_response(request: gr.Request):
    """
    Handle a user vote for relevance as "Highly Relevant".

    Args:
        request (gr.Request):
            The Gradio request object providing access to HTTP headers and metadata.

    Returns:
        tuple:
            A tuple containing:
            ("Highly Relevant", <ip_address>, (disable_btn,) * 4)

            - "Highly Relevant": The selected vote or label.
            - <ip_address>: The IP address of the client retrieved from the request.
            - disable_btn: An object repeated four times,
              to disable relevance vote buttons.
    """
    ip_address1 = get_ip(request)
    print(f"Highly Relevant (voted). ip: {ip_address1}")
    return (
        "Highly Relevant",
        ip_address1,
    ) + (disable_btn,) * 4


def relevant_vote2_last_response(request: gr.Request):
    """
    Handle a user vote for relevance as "Partially Relevant".

    Args:
        request (gr.Request):
            The Gradio request object providing access to HTTP headers and metadata.

    Returns:
        tuple:
            A tuple containing:
            ("Partially Relevant", <ip_address>, (disable_btn,) * 4)

            - "Partially Relevant": The selected vote or label.
            - <ip_address>: The IP address of the client retrieved from the request.
            - disable_btn: An object repeated four times,
              to disable relevance vote buttons.
    """
    ip_address1 = get_ip(request)
    print(f"Partially Relevant (voted). ip: {ip_address1}")
    return (
        "Partially Relevant",
        ip_address1,
    ) + (disable_btn,) * 4


def relevant_vote3_last_response(request: gr.Request):
    """
    Handle a user vote for relevance as "Slightly Irrelevant".

    Args:
        request (gr.Request):
            The Gradio request object providing access to HTTP headers and metadata.

    Returns:
        tuple:
            A tuple containing:
            ("Slightly Irrelevant", <ip_address>, (disable_btn,) * 4)

            - "Slightly Irrelevant": The selected vote or label.
            - <ip_address>: The IP address of the client retrieved from the request.
            - disable_btn: An object repeated four times,
              to disable relevance vote buttons.
    """
    ip_address1 = get_ip(request)
    print(f"Slightly Irrelevant (voted). ip: {ip_address1}")
    return (
        "Slightly Irrelevant",
        ip_address1,
    ) + (disable_btn,) * 4


def relevant_vote4_last_response(request: gr.Request):
    """
    Handle a user vote for relevance as "Completely Irrelevant".

    Args:
        request (gr.Request):
            The Gradio request object providing access to HTTP headers and metadata.

    Returns:
        tuple:
            A tuple containing:
            ("Completely Irrelevant", <ip_address>, (disable_btn,) * 4)

            - "Completely Irrelevant": The selected vote or label.
            - <ip_address>: The IP address of the client retrieved from the request.
            - disable_btn: An object repeated four times,
              to disable relevance vote buttons.
    """
    ip_address1 = get_ip(request)
    print(f"Completely Irrelevant (voted). ip: {ip_address1}")
    return (
        "Completely Irrelevant",
        ip_address1,
    ) + (disable_btn,) * 4
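Each handler returns (label, ip, disabled button x 4), which app.py maps onto the feedback textbox, the hidden IP field, and the four vote buttons. A quick check of that contract without starting a server (_FakeRequest is a made-up stand-in for gr.Request, not part of the demo):

from pyscripts.utils.dialog_eval.human_feedback import natural_vote1_last_response


class _FakeRequest:
    headers = {"x-forwarded-for": "203.0.113.7, 10.0.0.1"}

    class client:
        host = "127.0.0.1"


label, ip, *buttons = natural_vote1_last_response(_FakeRequest())
print(label, ip, len(buttons))  # Very Natural 203.0.113.7 4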
pyscripts/utils/dialog_eval/vert.py
ADDED
@@ -0,0 +1,299 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import math
import sys
import warnings
from collections import Counter
from fractions import Fraction

import nltk
import numpy as np
from nltk.translate.bleu_score import (
    SmoothingFunction,
    brevity_penalty,
    closest_ref_length,
    modified_precision,
)


def corpus_bleu(
    list_of_references,
    hypotheses,
    weights=(0.25, 0.25, 0.25, 0.25),
    smoothing_function=None,
    auto_reweigh=False,
    averaging_mode="geometric",
    no_length_penalty=False,
):
    """
    Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all
    the hypotheses and their respective references.

    Instead of averaging the sentence level BLEU scores (i.e. macro-average
    precision), the original BLEU metric (Papineni et al. 2002) accounts for
    the micro-average precision (i.e. summing the numerators and denominators
    for each hypothesis-reference(s) pairs before the division).

    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...         'ensures', 'that', 'the', 'military', 'always',
    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...          'heed', 'Party', 'commands']
    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...          'guarantees', 'the', 'military', 'forces', 'always',
    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
    ...          'of', 'the', 'party']

    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
    ...         'interested', 'in', 'world', 'history']
    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
    ...          'because', 'he', 'read', 'the', 'book']

    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
    >>> hypotheses = [hyp1, hyp2]
    >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
    0.5920...

    The example below shows that corpus_bleu() is different from averaging
    sentence_bleu() for hypotheses

    >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
    >>> score2 = sentence_bleu([ref2a], hyp2)
    >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
    0.6223...

    :param list_of_references: a corpus of lists of reference
        sentences, w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param weights: weights for unigrams, bigrams, trigrams and so on
    :type weights: list(float)
    :param smoothing_function:
    :type smoothing_function: SmoothingFunction
    :param auto_reweigh: Option to re-normalize the weights uniformly.
    :type auto_reweigh: bool
    :return: The corpus-level BLEU score.
    :rtype: float
    """
    # Before proceeding to compute BLEU, perform sanity checks.

    p_numerators = Counter()  # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter()  # Key = ngram order, and value = no. of ngram in ref.
    hyp_lengths, ref_lengths = 0, 0

    assert len(list_of_references) == len(hypotheses), (
        "The number of hypotheses and their reference(s) should be the " "same "
    )

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Add them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    if no_length_penalty and averaging_mode == "geometric":
        bp = 1.0
    elif no_length_penalty and averaging_mode == "arithmetic":
        bp = 0.0
    else:
        assert not no_length_penalty
        assert (
            averaging_mode != "arithmetic"
        ), "Not sure how to apply length penalty in arithmetic mode"
        bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Uniformly re-weighting based on maximum hypothesis lengths if largest
    # order of n-grams < 4 and weights is set at default.
    if auto_reweigh:
        if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
            weights = (1 / hyp_lengths,) * hyp_lengths

    # Collects the various precision values for the different ngram orders.
    p_n = [
        Fraction(p_numerators[i], p_denominators[i], _normalize=False)
        for i, _ in enumerate(weights, start=1)
    ]

    # Returns 0 if there's no matching n-grams
    # We only need to check for p_numerators[1] == 0, since if there's
    # no unigrams, there won't be any higher order ngrams.
    if p_numerators[1] == 0:
        return 0

    # If there's no smoothing, use method0 from the SmoothingFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0
    # Smooth the modified precision.
    # Note: smoothing_function() may convert values into floats;
    # it tries to retain the Fraction object as much as the
    # smoothing method allows.
    p_n = smoothing_function(
        p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths
    )

    if averaging_mode == "geometric":
        s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n))
        s = bp * math.exp(math.fsum(s))
    elif averaging_mode == "arithmetic":
        s = (w_i * p_i for w_i, p_i in zip(weights, p_n))
        s = math.fsum(s)

    return s


def sentence_bleu(
    references,
    hypothesis,
    weights=(0.25, 0.25, 0.25, 0.25),
    smoothing_function=None,
    auto_reweigh=False,
    averaging_mode="geometric",
    no_length_penalty=False,
):
    return corpus_bleu(
        [references],
        [hypothesis],
        weights,
        smoothing_function,
        auto_reweigh,
        averaging_mode,
        no_length_penalty,
    )


def get_target_sequences(manifest, ground_truth, to_take=1000):
    import json
    import pathlib

    with open(ground_truth, "r") as fin:
        original_continuations = json.loads(fin.read())

    sequence2length = [(k, v[0]) for k, v in original_continuations.items()]
    assert all(float(v) >= 6.0 for (_, v) in sequence2length)  # 6 seconds

    sequence2length.sort(key=lambda x: x[1])
    to_take_sequences = set(v[0] for v in sequence2length[:to_take])
    to_take_ids = []

    with open(manifest, "r") as f:
        f.readline()

        for i, line in enumerate(f.readlines()):
            seq_id = line.split()[0]
            seq_id = pathlib.Path(seq_id).name.split("__")[0]

            if seq_id in to_take_sequences:
                to_take_ids.append(i)

    print(f"Took {len(to_take_ids)} ids")
    return set(to_take_ids)


def get_self_bleu(utterances, averaging_mode, weights):
    self_bleu = []

    for i in range(len(utterances)):
        hypo = utterances[i]
        rest = utterances[:i] + utterances[i + 1 :]

        self_bleu.append(
            sentence_bleu(
                rest,
                hypo,
                weights,
                no_length_penalty=True,
                averaging_mode=averaging_mode,
            )
        )

    return self_bleu


def get_self_bleu2_arithmetic(utterances):
    weights = (0.5, 0.5)  # equal weight for unigrams and bigrams
    return get_self_bleu(utterances, averaging_mode="arithmetic", weights=weights)


def get_self_bleu2_geometric(utterances):
    weights = (0.5, 0.5)
    return get_self_bleu(utterances, averaging_mode="geometric", weights=weights)


def get_auto_bleu2_arithmetic(utterances):
    weights = (0.5, 0.5)
    return [auto_bleu(u, mean_mode="arithmetic", weights=weights) for u in utterances]


def get_auto_bleu2_geometric(utterances):
    weights = (0.5, 0.5)
    return [auto_bleu(u, mean_mode="geometric", weights=weights) for u in utterances]


def get_auto_bleu3_geometric(utterances):
    weights = (1.0 / 3, 1.0 / 3, 1.0 / 3)
    return [auto_bleu(u, mean_mode="geometric", weights=weights) for u in utterances]


def get_auto_bleu3_arithmetic(utterances):
    weights = (1.0 / 3, 1.0 / 3, 1.0 / 3)
    return [auto_bleu(u, mean_mode="arithmetic", weights=weights) for u in utterances]


def get_self_bleu3_arithmetic(utterances):
    weights = (1.0 / 3, 1.0 / 3, 1.0 / 3)
    return get_self_bleu(utterances, averaging_mode="arithmetic", weights=weights)


def get_self_bleu3_geometric(utterances):
    weights = (1.0 / 3, 1.0 / 3, 1.0 / 3)
    return get_self_bleu(utterances, averaging_mode="geometric", weights=weights)


def auto_bleu(sentence, weights, mean_mode="arithmetic"):
    if len(sentence) <= 1:
        return 0

    N = len(weights)

    bleu_n = np.zeros([N])
    for n in range(N):
        targ_ngrams = list(nltk.ngrams(sentence, n + 1))
        for p in range(len(targ_ngrams)):
            left = sentence[:p]
            right = sentence[(p + n + 1) :]
            rest_ngrams = list(nltk.ngrams(left, n + 1)) + list(
                nltk.ngrams(right, n + 1)
            )
            # compute the nb of matching ngrams
            bleu_n[n] += targ_ngrams[p] in rest_ngrams
        bleu_n[n] /= len(targ_ngrams)  # average them to get a proportion

    weights = np.array(weights)
    if mean_mode == "arithmetic":
        return (bleu_n * weights).sum()
    elif mean_mode == "geometric":
        return (bleu_n**weights).prod()
    else:
        raise ValueError(f"Unknown aggregation mode {mean_mode}")


def run_f(task_params):
    f, terms = task_params
    return f(terms)
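A toy run of the diversity helpers above, assuming the repo root is on PYTHONPATH (numbers are illustrative only; auto_bleu rewards n-grams that repeat inside a single utterance, while get_self_bleu2_geometric scores each utterance against the others):

from pyscripts.utils.dialog_eval.vert import auto_bleu, get_self_bleu2_geometric

utterances = [
    "the cat sat on the mat".split(),
    "a cat sat on a mat".split(),
    "dogs sleep on the mat".split(),
]
print(auto_bleu(utterances[0], weights=(0.5, 0.5), mean_mode="arithmetic"))
print(get_self_bleu2_geometric(utterances))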