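# app.py: Gradio demo pairing GLIP (text-prompted object detection /
# phrase grounding) with a BLIP VQA model (visual question answering),
# with tabs for image, video, and webcam input.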
import os
from datetime import datetime
import gradio as gr
import warnings
warnings.filterwarnings("ignore")

# Build the GLIP extensions in place before importing maskrcnn_benchmark.
os.system("python setup.py build develop --user")

from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.engine.predictor_glip import GLIPDemo
import vqa  # local wrapper around the BLIP VQA model
import cv2
# Configuration for evaluating the GLIP-T model
config_file = "configs/glip_Swin_T_O365_GoldG.yaml"
weight_file = "checkpoints/glip_tiny_model_o365_goldg_cc_sbu.pth"

# Manually override some options
cfg.local_rank = 0
cfg.num_gpus = 1
cfg.merge_from_file(config_file)
cfg.merge_from_list(["MODEL.WEIGHT", weight_file])
cfg.merge_from_list(["MODEL.DEVICE", "cuda"])

glip_demo = GLIPDemo(
    cfg,
    min_image_size=800,
    confidence_threshold=0.7,
    show_mask_heatmaps=False
)
blip_demo = vqa.VQA(
    model_path='checkpoints/model_base_vqa_capfilt_large.pth'
)
def predict_image(image, objects, question):
    now = datetime.now()
    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
    print("TimeStamp {}".format(dt_string))
    # Gradio supplies RGB; GLIP expects BGR, so swap the channels on the way
    # in and swap the rendered result back to RGB for display.
    result, _ = glip_demo.run_on_web_image(image[:, :, [2, 1, 0]], objects, 0.5)
    result = result[:, :, [2, 1, 0]]
    answer = blip_demo.vqa_demo(image, question)
    return result, answer
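# Hypothetical local sanity check (path and prompts are placeholders):
#   img = cv2.imread("samples/example.jpg")[:, :, [2, 1, 0]]  # BGR -> RGB
#   boxes_img, ans = predict_image(img, "person. dog.", "how many dogs?")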
def predict_video(video, objects, question, frame_drop_value):
    now = datetime.now()
    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
    print("TimeStamp {}".format(dt_string))
    vid = cv2.VideoCapture(video)
    count = 0
    result = None
    answer = None
    while True:
        ret, frame = vid.read()
        if not ret:
            break
        count += 1
        # Only process every frame_drop_value-th frame to keep the demo responsive.
        if count % frame_drop_value == 0:
            image = frame
            # Stamp the frame index onto the frame.
            cv2.putText(
                img=image,
                text=str(count),
                org=(20, 20),
                fontFace=cv2.FONT_HERSHEY_DUPLEX,
                fontScale=0.5,
                color=(125, 246, 55),
                thickness=1)
            # OpenCV frames are already BGR, which GLIP expects (see
            # predict_image); swap the rendered result, and the BLIP input,
            # to RGB.
            result, _ = glip_demo.run_on_web_image(image, objects, 0.5)
            result = result[:, :, [2, 1, 0]]
            answer = blip_demo.vqa_demo(image[:, :, [2, 1, 0]], question)
            yield result, answer
    vid.release()
    # Repeat the last processed frame and answer so the outputs persist.
    yield result, answer
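# predict_video is a generator, so Gradio streams each (frame, answer) pair
# to the UI as it is produced; this requires the queue enabled via
# demo.queue() below.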
with gr.Blocks() as demo:
    gr.Markdown("Text-Based Object Detection and Visual Question Answering")
    with gr.Tab("Image"):
        with gr.Row():
            with gr.Column():
                image_input = gr.Image(label='input image')
                obj_input = gr.Textbox(label='Objects', lines=1, placeholder="Objects here..")
                vqa_input = gr.Textbox(label='Question', lines=1, placeholder="Question here..")
                image_button = gr.Button("Submit")
            with gr.Column():
                # gr.outputs.Image is deprecated; gr.Image covers the same use.
                image_output = gr.Image(type="pil", label="grounding results")
                vqa_output = gr.Textbox(label="Answer")
    with gr.Tab("Video"):
        with gr.Row():
            with gr.Column():
                video_input = gr.PlayableVideo(label='input video', mirror_webcam=False)
                obj_input_video = gr.Textbox(label='Objects', lines=1, placeholder="Objects here..")
                vqa_input_video = gr.Textbox(label='Question', lines=1, placeholder="Question here..")
                # Minimum of 1: a value of 0 would make predict_video divide by zero.
                frame_drop_input = gr.Slider(label='Frames drop value', minimum=1, maximum=30, step=1, value=5)
                video_button = gr.Button("Submit")
            with gr.Column():
                video_output = gr.Image(type="pil", label="grounding results")
                vqa_output_video = gr.Textbox(label="Answer")
    with gr.Tab("Webcam"):
        with gr.Row():
            with gr.Column():
                cam_input = gr.Video(label='input video', mirror_webcam=False, source="webcam")
                obj_input_cam = gr.Textbox(label='Objects', lines=1, placeholder="Objects here..")
                vqa_input_cam = gr.Textbox(label='Question', lines=1, placeholder="Question here..")
                frame_drop_input_cam = gr.Slider(label='Frames drop value', minimum=1, maximum=30, step=1, value=5)
                cam_button = gr.Button("Submit")
            with gr.Column():
                cam_output = gr.Image(type="pil", label="grounding results")
                vqa_output_cam = gr.Textbox(label="Answer")

    image_button.click(predict_image, inputs=[image_input, obj_input, vqa_input], outputs=[image_output, vqa_output])
    video_button.click(predict_video, inputs=[video_input, obj_input_video, vqa_input_video, frame_drop_input], outputs=[video_output, vqa_output_video])
    cam_button.click(predict_video, inputs=[cam_input, obj_input_cam, vqa_input_cam, frame_drop_input_cam], outputs=[cam_output, vqa_output_cam])
demo.queue()
demo.launch()
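# To run locally (assuming a CUDA GPU and the checkpoints referenced above):
#   python app.py
# then open the local URL Gradio prints (http://127.0.0.1:7860 by default).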