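# Gradio demo: Visual Question Answering with the VLE model, combined with a BLIP
# image captioner and an OpenAI LLM (text-davinci-003) that post-processes the VQA answers.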
import gradio as gr
import torch
from models.VLE import VLEForVQA, VLEProcessor, VLEForVQAPipeline
from PIL import Image

# VLE VQA model (kept on CPU)
model_name = "hfl/vle-base-for-vqa"
model = VLEForVQA.from_pretrained(model_name)
vle_processor = VLEProcessor.from_pretrained(model_name)
vqa_pipeline = VLEForVQAPipeline(model=model, device='cpu', vle_processor=vle_processor)
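# Note (inferred from how vqa_pipeline is used in vle() below): calling
# vqa_pipeline({"image": <PIL.Image>, "question": <str>}, top_k=4) is expected to
# return a list of dicts of the form [{"answer": "...", "score": 0.9}, ...].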
from transformers import BlipForQuestionAnswering, BlipProcessor, BlipForConditionalGeneration

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# BLIP VQA model; only referenced by the commented-out BLIP path in inference_chat below
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
model_vqa = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-capfilt-large").to(device)

# BLIP captioning model used to describe the image for the LLM prompts
cap_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
cap_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
def caption(input_image):
    # Generate a natural-language caption for the image with BLIP
    inputs = cap_processor(input_image, return_tensors="pt")
    out = cap_model.generate(**inputs)
    return "\n".join(cap_processor.batch_decode(out, skip_special_tokens=True))
import openai
import os

# The OpenAI API key is read from the Space's `openai_appkey` secret / environment variable
openai.api_key = os.getenv('openai_appkey')
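# gpt3_short: phrase the task as a multiple-choice question for the LLM. The caption,
# the user question and the top-4 VQA answers (labelled A-D, with their scores) are
# packed into a single prompt; the completion is then scanned for a choice letter,
# which is mapped back to the corresponding VQA answer. If no letter is found, the
# top-ranked VQA answer (A) is used as a fallback.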
def gpt3_short(question, vqa_answer, caption):
    vqa_answer, vqa_score = vqa_answer
    prompt = "This is the caption of a picture: " + caption + ". Question: " + question \
        + " VQA model predicts: " \
        + "A: " + vqa_answer[0] + ", score: " + str(vqa_score[0]) \
        + "; B: " + vqa_answer[1] + ", score: " + str(vqa_score[1]) \
        + "; C: " + vqa_answer[2] + ", score: " + str(vqa_score[2]) \
        + "; D: " + vqa_answer[3] + ", score: " + str(vqa_score[3]) \
        + ". Choose A if it is not in conflict with the description of the picture and A's score is bigger than 0.8; otherwise choose B, C or D based on the description."
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=prompt,
        max_tokens=10,
        n=1,
        stop=None,
        temperature=0.7,
    )
    answer = response.choices[0].text.strip()
    # Extract the choice letter from the completion and map it back to a VQA answer
    llm_ans = answer
    choice = {"A", "B", "C", "D"}
    llm_ans = llm_ans.replace("\n", " ").replace(":", " ").replace(".", " ").replace(",", " ")
    sllm_ans = llm_ans.split(" ")
    for cho in sllm_ans:
        if cho in choice:
            llm_ans = cho
            break
    if llm_ans not in choice:
        llm_ans = "A"  # fall back to the top-ranked VQA answer
    llm_ans = vqa_answer[ord(llm_ans) - ord("A")]
    answer = llm_ans
    return answer
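# gpt3_long: instead of asking for a choice letter, give the LLM the caption and the
# scored VQA candidates as context and ask it to answer the question with a sentence.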
def gpt3_long(question, vqa_answer, caption):
    vqa_answer, vqa_score = vqa_answer
    prompt = "This is the caption of a picture: " + caption + ". Question: " + question \
        + " VQA model predicts: " \
        + vqa_answer[0] + ", score: " + str(vqa_score[0]) \
        + "; " + vqa_answer[1] + ", score: " + str(vqa_score[1]) \
        + "; " + vqa_answer[2] + ", score: " + str(vqa_score[2]) \
        + "; " + vqa_answer[3] + ", score: " + str(vqa_score[3]) \
        + ". Question: " + question + " Tell me the right answer with a sentence."
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=prompt,
        max_tokens=30,
        n=1,
        stop=None,
        temperature=0.7,
    )
    answer = response.choices[0].text.strip()
    return answer
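# gpt3: a simpler free-form prompt variant; it is only referenced by the commented-out
# "Submit_GPT3" button handler below and is not part of the main pipeline.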
def gpt3(question, vqa_answer, caption):
    prompt = caption + "\n" + question + "\n" + vqa_answer + "\n Tell me the right answer."
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=prompt,
        max_tokens=30,
        n=1,
        stop=None,
        temperature=0.7,
    )
    answer = response.choices[0].text.strip()
    return answer
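# vle: run the VLE VQA pipeline and split its top-4 predictions into two parallel
# lists, one of answer strings and one of their scores.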
def vle(input_image, input_text):
    vqa_answers = vqa_pipeline({"image": input_image, "question": input_text}, top_k=4)
    return [vqa['answer'] for vqa in vqa_answers], [vqa['score'] for vqa in vqa_answers]
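# inference_chat: the end-to-end pipeline behind "Submit" / pressing Enter.
# 1) caption the image with BLIP, 2) get the top-4 VLE answers with scores,
# 3) post-process them with the LLM (long-sentence and short multiple-choice variants).
# Returns (top VQA answer, VQA+LLM long answer, VQA+LLM short answer).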
def inference_chat(input_image, input_text):
    cap = caption(input_image)
    print(cap)
    # Alternative BLIP-VQA path (currently disabled):
    # inputs = processor(images=input_image, text=input_text, return_tensors="pt")
    # inputs["max_length"] = 10
    # inputs["num_beams"] = 5
    # inputs['num_return_sequences'] = 4
    # out = model_vqa.generate(**inputs)
    # out = processor.batch_decode(out, skip_special_tokens=True)
    out = vle(input_image, input_text)
    gpt3_out = gpt3_long(input_text, out, cap)
    gpt3_out1 = gpt3_short(input_text, out, cap)
    return out[0][0], gpt3_out, gpt3_out1
title = """# VQA with VLE and LLM"""
description = """**VLE** (Visual-Language Encoder) is an image-text multimodal understanding model built on pre-trained text and image encoders. See https://github.com/iflytek/VLE for more details.
We demonstrate visual question answering systems built with VLE and LLM."""
description1 = """**VQA**: The image and the question are fed to a VQA model (VLEForVQA), which predicts the answer.
**VQA+LLM**: We feed the caption, the question, and the answers predicted by the VQA model to the LLM and ask it to generate the final answer. The outputs of VQA+LLM may vary due to the decoding strategy of the LLM."""
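# Gradio UI: image and question inputs (with Clear/Submit) on the left; three output
# boxes on the right (plain VQA answer, VQA+LLM short answer, VQA+LLM long answer),
# all filled by inference_chat.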
with gr.Blocks(
    css="""
    .message.svelte-w6rprc.svelte-w6rprc.svelte-w6rprc {font-size: 20px; margin-top: 20px}
    #component-21 > div.wrap.svelte-w6rprc {height: 600px;}
    """
) as iface:
    state = gr.State([])

    gr.Markdown(title)
    gr.Markdown(description)

    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", label="VQA Image Input")
            with gr.Row():
                with gr.Column(scale=1):
                    chat_input = gr.Textbox(lines=1, label="VQA Question Input")
                    with gr.Row():
                        clear_button = gr.Button(value="Clear", interactive=True, width=30)
                        submit_button = gr.Button(
                            value="Submit", interactive=True, variant="primary"
                        )
            '''
            cap_submit_button = gr.Button(
                value="Submit_CAP", interactive=True, variant="primary"
            )
            gpt3_submit_button = gr.Button(
                value="Submit_GPT3", interactive=True, variant="primary"
            )
            '''

        with gr.Column():
            gr.Markdown(description1)
            caption_output = gr.Textbox(lines=0, label="VQA")
            caption_output_v1 = gr.Textbox(lines=0, label="VQA + LLM (short answer)")
            gpt3_output_v1 = gr.Textbox(lines=0, label="VQA + LLM (long answer)")

    # Pressing Enter in the question box or clicking Submit runs the full pipeline
    chat_input.submit(
        inference_chat,
        [
            image_input,
            chat_input,
        ],
        [caption_output, gpt3_output_v1, caption_output_v1],
    )
    clear_button.click(
        lambda: ("", [], "", "", ""),
        [],
        [chat_input, state, caption_output, gpt3_output_v1, caption_output_v1],
        queue=False,
    )
    submit_button.click(
        inference_chat,
        [
            image_input,
            chat_input,
        ],
        [caption_output, gpt3_output_v1, caption_output_v1],
    )
    '''
    cap_submit_button.click(
        caption,
        [
            image_input,
        ],
        [caption_output_v1],
    )
    gpt3_submit_button.click(
        gpt3,
        [
            chat_input,
            caption_output,
            caption_output_v1,
        ],
        [gpt3_output_v1],
    )
    '''
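    # Example rows pre-fill the image, the question, and the three output boxes
    # (VQA, VQA+LLM short, VQA+LLM long) with reference answers.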
    examples = [
        ['bird.jpeg', "How many birds are there in the tree?", "2", "2", "2"],
        ['qa9.jpg', "What type of vehicle is being pulled by the horses ?", 'carriage', 'sled', 'Sled'],
        ['upload4.jpg', "What is this old man doing?", "fishing", "fishing", "Fishing"],
    ]
    examples = gr.Examples(
        examples=examples,
        inputs=[image_input, chat_input, caption_output, caption_output_v1, gpt3_output_v1],
    )

iface.queue(concurrency_count=1, api_open=False, max_size=10)
iface.launch(enable_queue=True)