Spaces:
Sleeping
Sleeping
| import torch | |
| import spaces | |
| import numpy as np | |
| import gradio as gr | |
| from gtts import gTTS | |
| from transformers import pipeline | |
| from huggingface_hub import InferenceClient | |
# Model ids: speech recognition (used by `pipe`) and text generation (used by `client`).
ASR_MODEL_NAME = "openai/whisper-small"
LLM_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

# Seed exchange in the same "<s>[INST] ... [/INST] ...</s>" layout that
# generate() and transcribe() use for every later turn.
# The literal previously opened with four quote characters, which placed a
# stray '"' in front of "<s>" in every prompt sent to the model.
system_prompt = """<s>[INST] You are Friday, a helpful and conversational AI assistant and You respond with one to two sentences. [/INST] Hello there! I'm friday how can I help you?</s>"""

# Running prompt sent to the LLM; generate() and transcribe() append to it.
chat_history = system_prompt
# Human-readable transcript of the conversation, printed after each turn.
formatted_history = ""

client = InferenceClient(LLM_MODEL_NAME)

# Device handed to the pipeline: index 0 when CUDA is available, else "cpu".
device = 0 if torch.cuda.is_available() else "cpu"
pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    device=device,
)
def generate(user_prompt, temperature=0.1, max_new_tokens=128, top_p=0.95, repetition_penalty=1.0):
    """Append a user turn to the running prompt and return the model's reply.

    Args:
        user_prompt: Text of the user's message.
        temperature: Sampling temperature; values below 1e-2 are raised to 1e-2.
        max_new_tokens: Forwarded to ``client.text_generation``.
        top_p: Forwarded to ``client.text_generation`` after conversion to float.
        repetition_penalty: Forwarded to ``client.text_generation``.

    Returns:
        Whatever ``client.text_generation`` returns for the accumulated prompt
        (called with ``return_full_text=False`` and ``stream=False``).

    Note:
        Only the user turn is appended to the module-level ``chat_history``
        here; transcribe() appends the reply afterwards.
    """
    # The augmented assignment below would otherwise make chat_history a
    # function-local name and raise UnboundLocalError on every call.
    global chat_history

    temperature = max(float(temperature), 1e-2)
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    chat_history += f""" <s>[INST] {user_prompt} [/INST] """

    output = client.text_generation(
        chat_history,
        **generate_kwargs,
        stream=False,
        details=False,
        return_full_text=False,
    )
    print(output)
    return output
def transcribe(audio):
    """Run one voice turn: speech -> text -> LLM reply -> spoken reply.

    Args:
        audio: ``(sample_rate, samples)`` pair, or ``None`` when nothing was
            recorded. ``samples`` is used as a NumPy array.

    Returns:
        The path ``"response.mp3"`` holding the synthesized reply, or ``None``
        when there is no audio to process.
    """
    # Both histories are rebound with "+=" below; without this declaration
    # they would be treated as locals and raise UnboundLocalError.
    global chat_history, formatted_history

    # Nothing recorded: leave the output empty instead of failing on unpack.
    if audio is None:
        return None
    sr, y = audio
    if y.size == 0:
        return None

    # The ASR pipeline takes single-channel input; average channels down.
    # NOTE(review): assumes a (samples, channels) layout for 2-D input - confirm.
    if y.ndim > 1:
        y = y.mean(axis=1)

    y = y.astype(np.float32)
    # Peak-normalize, skipping all-zero (silent) audio to avoid dividing by 0.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    inputs = pipe({"sampling_rate": sr, "raw": y})["text"]
    formatted_history += f"""Human: {inputs}\n"""

    llm_response = generate(inputs)
    # generate() already appended the user turn; close it with the reply.
    chat_history += f""" {llm_response}</s>"""
    formatted_history += f"""Friday: {llm_response}\n"""

    audio_response = gTTS(llm_response)
    audio_response.save("response.mp3")

    print(formatted_history)
    return "response.mp3"
# Gradio UI: microphone input on the left, spoken reply on the right, and a
# button that runs transcribe() on the recording.
with gr.Blocks() as demo:
    # The header markup previously repeated the opening tags ("<h1><center>")
    # instead of closing them.
    gr.HTML("<center><h1>Friday: AI Virtual Assistant</h1></center>")
    with gr.Row():
        audio_input = gr.Audio(label="Human", sources="microphone")
        output_audio = gr.Audio(
            label="Friday",
            type="filepath",
            interactive=False,
            autoplay=True,
            elem_classes="audio",
        )
    # NOTE(review): the source's indentation was lost; the button is assumed
    # to sit below the row rather than inside it - confirm the intended layout.
    transcribe_btn = gr.Button("Transcribe")
    transcribe_btn.click(fn=transcribe, inputs=audio_input,
                         outputs=output_audio)

demo.queue()
demo.launch()