Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -50,10 +50,19 @@ def cut_dialogue_history(history_memory, keep_last_n_words = 500):
|
|
| 50 |
return '\n' + '\n'.join(paragraphs)
|
| 51 |
|
| 52 |
class ConversationBot:
|
| 53 |
-
def __init__(self):
|
| 54 |
print("Initializing AudioGPT")
|
| 55 |
self.tools = []
|
| 56 |
self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
def run_text(self, text, state):
|
| 58 |
print("===============Running run_text =============")
|
| 59 |
print("Inputs:", text, state)
|
|
@@ -147,7 +156,7 @@ class ConversationBot:
|
|
| 147 |
print("===============Running inpainting =============")
|
| 148 |
print("Inputs:", state)
|
| 149 |
print("======>Previous memory:\n %s" % self.agent.memory)
|
| 150 |
-
inpaint = Inpaint(device="cuda:0")
|
| 151 |
new_image_filename, new_audio_filename = inpaint.inference(audio_filename, image_filename)
|
| 152 |
AI_prompt = "Here are the predict audio and the mel spectrum." + f"*{new_audio_filename}*" + f"*{new_image_filename}*"
|
| 153 |
self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
|
|
@@ -163,52 +172,6 @@ class ConversationBot:
|
|
| 163 |
return gr.Button.update(visible=False)
|
| 164 |
def init_agent(self, openai_api_key):
|
| 165 |
self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
|
| 166 |
-
self.t2i = T2I(device="cuda:0")
|
| 167 |
-
# self.i2t = ImageCaptioning(device="cuda:0")
|
| 168 |
-
self.t2a = T2A(device="cuda:0")
|
| 169 |
-
self.tts = TTS(device="cpu")
|
| 170 |
-
self.t2s = T2S(device="cpu")
|
| 171 |
-
self.i2a = I2A(device="cuda:0")
|
| 172 |
-
self.a2t = A2T(device="cpu")
|
| 173 |
-
self.asr = ASR(device="cuda:0")
|
| 174 |
-
self.inpaint = Inpaint(device="cuda:0")
|
| 175 |
-
# self.tts_ood = TTS_OOD(device="cpu")
|
| 176 |
-
self.tools = [
|
| 177 |
-
Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
|
| 178 |
-
description="useful for when you want to generate an image from a user input text and it saved it to a file. like: generate an image of an object or something, or generate an image that includes some objects. "
|
| 179 |
-
"The input to this tool should be a string, representing the text used to generate image. "),
|
| 180 |
-
# Tool(name="Get Photo Description", func=self.i2t.inference,
|
| 181 |
-
# description="useful for when you want to know what is inside the photo. receives image_path as input. "
|
| 182 |
-
# "The input to this tool should be a string, representing the image_path. "),
|
| 183 |
-
Tool(name="Generate Audio From User Input Text", func=self.t2a.inference,
|
| 184 |
-
description="useful for when you want to generate an audio from a user input text and it saved it to a file."
|
| 185 |
-
"The input to this tool should be a string, representing the text used to generate audio."),
|
| 186 |
-
# Tool(
|
| 187 |
-
# name="Generate human speech with style derived from a speech reference and user input text and save it to a file", func= self.tts_ood.inference,
|
| 188 |
-
# description="useful for when you want to generate speech samples with styles (e.g., timbre, emotion, and prosody) derived from a reference custom voice."
|
| 189 |
-
# "Like: Generate a speech with style transferred from this voice. The text is xxx., or speak using the voice of this audio. The text is xxx."
|
| 190 |
-
# "The input to this tool should be a comma seperated string of two, representing reference audio path and input text."),
|
| 191 |
-
Tool(name="Generate singing voice From User Input Text, Note and Duration Sequence", func= self.t2s.inference,
|
| 192 |
-
description="useful for when you want to generate a piece of singing voice (Optional: from User Input Text, Note and Duration Sequence) and save it to a file."
|
| 193 |
-
"If Like: Generate a piece of singing voice, the input to this tool should be \"\" since there is no User Input Text, Note and Duration Sequence ."
|
| 194 |
-
"If Like: Generate a piece of singing voice. Text: xxx, Note: xxx, Duration: xxx. "
|
| 195 |
-
"Or Like: Generate a piece of singing voice. Text is xxx, note is xxx, duration is xxx."
|
| 196 |
-
"The input to this tool should be a comma seperated string of three, representing text, note and duration sequence since User Input Text, Note and Duration Sequence are all provided."),
|
| 197 |
-
Tool(name="Synthesize Speech Given the User Input Text", func=self.tts.inference,
|
| 198 |
-
description="useful for when you want to convert a user input text into speech audio it saved it to a file."
|
| 199 |
-
"The input to this tool should be a string, representing the text used to be converted to speech."),
|
| 200 |
-
Tool(name="Generate Audio From The Image", func=self.i2a.inference,
|
| 201 |
-
description="useful for when you want to generate an audio based on an image."
|
| 202 |
-
"The input to this tool should be a string, representing the image_path. "),
|
| 203 |
-
Tool(name="Generate Text From The Audio", func=self.a2t.inference,
|
| 204 |
-
description="useful for when you want to describe an audio in text, receives audio_path as input."
|
| 205 |
-
"The input to this tool should be a string, representing the audio_path."),
|
| 206 |
-
Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
|
| 207 |
-
description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input, "
|
| 208 |
-
"The input to this tool should be a string, representing the audio_path."),
|
| 209 |
-
Tool(name="Transcribe speech", func=self.asr.inference,
|
| 210 |
-
description="useful for when you want to know the text corresponding to a human speech, receives audio_path as input."
|
| 211 |
-
"The input to this tool should be a string, representing the audio_path.")]
|
| 212 |
self.agent = initialize_agent(
|
| 213 |
self.tools,
|
| 214 |
self.llm,
|
|
@@ -221,8 +184,16 @@ class ConversationBot:
|
|
| 221 |
|
| 222 |
|
| 223 |
|
| 224 |
-
if __name__ == '__main__':
|
| 225 |
-
bot = ConversationBot()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
with gr.Blocks(css="#chatbot {overflow:auto; height:500px;}") as demo:
|
| 227 |
with gr.Row():
|
| 228 |
openai_api_key_textbox = gr.Textbox(
|
|
|
|
| 50 |
return '\n' + '\n'.join(paragraphs)
|
| 51 |
|
| 52 |
class ConversationBot:
|
| 53 |
+
def __init__(self, load_dict):
|
| 54 |
print("Initializing AudioGPT")
|
| 55 |
self.tools = []
|
| 56 |
self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
|
| 57 |
+
self.models = dict()
|
| 58 |
+
for class_name, device in load_dict.items():
|
| 59 |
+
self.models[class_name] = globals()[class_name](device=device)
|
| 60 |
+
for class_name, instance in self.models.items():
|
| 61 |
+
for e in dir(instance):
|
| 62 |
+
if e.startswith('inference'):
|
| 63 |
+
func = getattr(instance, e)
|
| 64 |
+
self.tools.append(Tool(name=func.name, description=func.description, func=func))
|
| 65 |
+
|
| 66 |
def run_text(self, text, state):
|
| 67 |
print("===============Running run_text =============")
|
| 68 |
print("Inputs:", text, state)
|
|
|
|
| 156 |
print("===============Running inpainting =============")
|
| 157 |
print("Inputs:", state)
|
| 158 |
print("======>Previous memory:\n %s" % self.agent.memory)
|
| 159 |
+
inpaint = Inpaint(device="cpu")
|
| 160 |
new_image_filename, new_audio_filename = inpaint.inference(audio_filename, image_filename)
|
| 161 |
AI_prompt = "Here are the predict audio and the mel spectrum." + f"*{new_audio_filename}*" + f"*{new_image_filename}*"
|
| 162 |
self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
|
|
|
|
| 172 |
return gr.Button.update(visible=False)
|
| 173 |
def init_agent(self, openai_api_key):
|
| 174 |
self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
self.agent = initialize_agent(
|
| 176 |
self.tools,
|
| 177 |
self.llm,
|
|
|
|
| 184 |
|
| 185 |
|
| 186 |
|
| 187 |
+
if __name__ == '__main__':
|
| 188 |
+
bot = ConversationBot({'T2I': 'cuda:0',
|
| 189 |
+
'T2A': 'cuda:0',
|
| 190 |
+
'I2A': 'cuda:0',
|
| 191 |
+
'TTS': 'cpu',
|
| 192 |
+
'T2S': 'cpu',
|
| 193 |
+
'Inpaint': 'cpu',
|
| 194 |
+
'ASR': 'cuda:0',
|
| 195 |
+
'A2T': 'cpu',
|
| 196 |
+
})
|
| 197 |
with gr.Blocks(css="#chatbot {overflow:auto; height:500px;}") as demo:
|
| 198 |
with gr.Row():
|
| 199 |
openai_api_key_textbox = gr.Textbox(
|