Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -70,7 +70,7 @@ class ConversationBot:
|
|
| 70 |
tool = res['intermediate_steps'][0][0].tool
|
| 71 |
if tool == "Generate Image From User Input Text" or tool == "Generate Text From The Audio" or tool == "Transcribe speech":
|
| 72 |
print("======>Current memory:\n %s" % self.agent.memory)
|
| 73 |
-
response = re.sub('(image/\S*png)', lambda m: f'})*{m.group(0)}*', res['output'])
|
| 74 |
state = state + [(text, response)]
|
| 75 |
print("Outputs:", state)
|
| 76 |
return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
|
|
@@ -85,7 +85,7 @@ class ConversationBot:
|
|
| 85 |
print("Outputs:", state)
|
| 86 |
return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Image.update(value=image_filename,visible=True), gr.Button.update(visible=True)
|
| 87 |
print("======>Current memory:\n %s" % self.agent.memory)
|
| 88 |
-
response = re.sub('(image/\S*png)', lambda m: f'})*{m.group(0)}*', res['output'])
|
| 89 |
audio_filename = res['intermediate_steps'][0][1]
|
| 90 |
state = state + [(text, response)]
|
| 91 |
print("Outputs:", state)
|
|
@@ -134,7 +134,7 @@ class ConversationBot:
|
|
| 134 |
AI_prompt = "Received. "
|
| 135 |
self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
|
| 136 |
print("======>Current memory:\n %s" % self.agent.memory)
|
| 137 |
-
state = state + [(f"*{image_filename}*", AI_prompt)]
|
| 138 |
print("Outputs:", state)
|
| 139 |
return state, state, txt + ' ' + image_filename + ' ', gr.Audio.update(visible=False)
|
| 140 |
|
|
@@ -144,7 +144,7 @@ class ConversationBot:
|
|
| 144 |
print("======>Previous memory:\n %s" % self.agent.memory)
|
| 145 |
inpaint = Inpaint(device="cuda:0")
|
| 146 |
new_image_filename, new_audio_filename = inpaint.inference(audio_filename, image_filename)
|
| 147 |
-
AI_prompt = "Here are the predict audio and the mel spectrum." + f"*{new_audio_filename}*" + f"*{new_image_filename}*"
|
| 148 |
self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
|
| 149 |
print("======>Current memory:\n %s" % self.agent.memory)
|
| 150 |
state = state + [(f"Audio Inpainting", AI_prompt)]
|
|
@@ -160,13 +160,13 @@ class ConversationBot:
|
|
| 160 |
self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
|
| 161 |
# self.t2i = T2I(device="cuda:0")
|
| 162 |
# self.i2t = ImageCaptioning(device="cuda:0")
|
| 163 |
-
self.t2a = T2A(device="cpu")
|
| 164 |
self.tts = TTS(device="cpu")
|
| 165 |
# self.t2s = T2S(device="cuda:0")
|
| 166 |
-
self.i2a = I2A(device="cpu")
|
| 167 |
-
self.a2t = A2T(device="cpu")
|
| 168 |
# self.asr = ASR(device="cuda:0")
|
| 169 |
-
self.inpaint = Inpaint(device="cpu")
|
| 170 |
#self.tts_ood = TTS_OOD(device="cuda:0")
|
| 171 |
self.tools = [
|
| 172 |
# Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
|
|
@@ -175,9 +175,9 @@ class ConversationBot:
|
|
| 175 |
# Tool(name="Get Photo Description", func=self.i2t.inference,
|
| 176 |
# description="useful for when you want to know what is inside the photo. receives image_path as input. "
|
| 177 |
# "The input to this tool should be a string, representing the image_path. "),
|
| 178 |
-
Tool(name="Generate Audio From User Input Text", func=self.t2a.inference,
|
| 179 |
-
|
| 180 |
-
|
| 181 |
# Tool(
|
| 182 |
# name="Generate human speech with style derived from a speech reference and user input text and save it to a file", func= self.tts_ood.inference,
|
| 183 |
# description="useful for when you want to generate speech samples with styles (e.g., timbre, emotion, and prosody) derived from a reference custom voice."
|
|
@@ -191,16 +191,16 @@ class ConversationBot:
|
|
| 191 |
# "The input to this tool should be a comma separated string of three, representing text, note and duration sequence since User Input Text, Note and Duration Sequence are all provided."),
|
| 192 |
Tool(name="Synthesize Speech Given the User Input Text", func=self.tts.inference,
|
| 193 |
description="useful for when you want to convert a user input text into speech audio it saved it to a file."
|
| 194 |
-
"The input to this tool should be a string, representing the text used to be converted to speech.")
|
| 195 |
-
Tool(name="Generate Audio From The Image", func=self.i2a.inference,
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
Tool(name="Generate Text From The Audio", func=self.a2t.inference,
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
|
| 202 |
-
|
| 203 |
-
|
| 204 |
# Tool(name="Transcribe speech", func=self.asr.inference,
|
| 205 |
# description="useful for when you want to know the text corresponding to a human speech, receives audio_path as input."
|
| 206 |
# "The input to this tool should be a string, representing the audio_path.")]
|
|
|
|
| 70 |
tool = res['intermediate_steps'][0][0].tool
|
| 71 |
if tool == "Generate Image From User Input Text" or tool == "Generate Text From The Audio" or tool == "Transcribe speech":
|
| 72 |
print("======>Current memory:\n %s" % self.agent.memory)
|
| 73 |
+
response = re.sub('(image/\S*png)', lambda m: f'})*{m.group(0)}*', res['output'])
|
| 74 |
state = state + [(text, response)]
|
| 75 |
print("Outputs:", state)
|
| 76 |
return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
|
|
|
|
| 85 |
print("Outputs:", state)
|
| 86 |
return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Image.update(value=image_filename,visible=True), gr.Button.update(visible=True)
|
| 87 |
print("======>Current memory:\n %s" % self.agent.memory)
|
| 88 |
+
response = re.sub('(image/\S*png)', lambda m: f'})*{m.group(0)}*', res['output'])
|
| 89 |
audio_filename = res['intermediate_steps'][0][1]
|
| 90 |
state = state + [(text, response)]
|
| 91 |
print("Outputs:", state)
|
|
|
|
| 134 |
AI_prompt = "Received. "
|
| 135 |
self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
|
| 136 |
print("======>Current memory:\n %s" % self.agent.memory)
|
| 137 |
+
state = state + [(f"*{image_filename}*", AI_prompt)]
|
| 138 |
print("Outputs:", state)
|
| 139 |
return state, state, txt + ' ' + image_filename + ' ', gr.Audio.update(visible=False)
|
| 140 |
|
|
|
|
| 144 |
print("======>Previous memory:\n %s" % self.agent.memory)
|
| 145 |
inpaint = Inpaint(device="cuda:0")
|
| 146 |
new_image_filename, new_audio_filename = inpaint.inference(audio_filename, image_filename)
|
| 147 |
+
AI_prompt = "Here are the predict audio and the mel spectrum." + f"*{new_audio_filename}*" + f"*{new_image_filename}*"
|
| 148 |
self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
|
| 149 |
print("======>Current memory:\n %s" % self.agent.memory)
|
| 150 |
state = state + [(f"Audio Inpainting", AI_prompt)]
|
|
|
|
| 160 |
self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
|
| 161 |
# self.t2i = T2I(device="cuda:0")
|
| 162 |
# self.i2t = ImageCaptioning(device="cuda:0")
|
| 163 |
+
# self.t2a = T2A(device="cpu")
|
| 164 |
self.tts = TTS(device="cpu")
|
| 165 |
# self.t2s = T2S(device="cuda:0")
|
| 166 |
+
# self.i2a = I2A(device="cpu")
|
| 167 |
+
# self.a2t = A2T(device="cpu")
|
| 168 |
# self.asr = ASR(device="cuda:0")
|
| 169 |
+
# self.inpaint = Inpaint(device="cpu")
|
| 170 |
#self.tts_ood = TTS_OOD(device="cuda:0")
|
| 171 |
self.tools = [
|
| 172 |
# Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
|
|
|
|
| 175 |
# Tool(name="Get Photo Description", func=self.i2t.inference,
|
| 176 |
# description="useful for when you want to know what is inside the photo. receives image_path as input. "
|
| 177 |
# "The input to this tool should be a string, representing the image_path. "),
|
| 178 |
+
# Tool(name="Generate Audio From User Input Text", func=self.t2a.inference,
|
| 179 |
+
# description="useful for when you want to generate an audio from a user input text and it saved it to a file."
|
| 180 |
+
# "The input to this tool should be a string, representing the text used to generate audio."),
|
| 181 |
# Tool(
|
| 182 |
# name="Generate human speech with style derived from a speech reference and user input text and save it to a file", func= self.tts_ood.inference,
|
| 183 |
# description="useful for when you want to generate speech samples with styles (e.g., timbre, emotion, and prosody) derived from a reference custom voice."
|
|
|
|
| 191 |
# "The input to this tool should be a comma separated string of three, representing text, note and duration sequence since User Input Text, Note and Duration Sequence are all provided."),
|
| 192 |
Tool(name="Synthesize Speech Given the User Input Text", func=self.tts.inference,
|
| 193 |
description="useful for when you want to convert a user input text into speech audio it saved it to a file."
|
| 194 |
+
"The input to this tool should be a string, representing the text used to be converted to speech.")]
|
| 195 |
+
# Tool(name="Generate Audio From The Image", func=self.i2a.inference,
|
| 196 |
+
# description="useful for when you want to generate an audio based on an image."
|
| 197 |
+
# "The input to this tool should be a string, representing the image_path. "),
|
| 198 |
+
# Tool(name="Generate Text From The Audio", func=self.a2t.inference,
|
| 199 |
+
# description="useful for when you want to describe an audio in text, receives audio_path as input."
|
| 200 |
+
# "The input to this tool should be a string, representing the audio_path."),
|
| 201 |
+
# Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
|
| 202 |
+
# description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input, "
|
| 203 |
+
# "The input to this tool should be a string, representing the audio_path.")]
|
| 204 |
# Tool(name="Transcribe speech", func=self.asr.inference,
|
| 205 |
# description="useful for when you want to know the text corresponding to a human speech, receives audio_path as input."
|
| 206 |
# "The input to this tool should be a string, representing the audio_path.")]
|