Spaces:
Build error
Build error
| """VIP.""" | |
| import json | |
| import re | |
| import cv2 | |
| from tqdm import trange | |
| import numpy as np | |
| import vip | |
def make_prompt(description, top_n=3):
    """Build the text prompt that asks the model to pick annotated circles.

    Args:
        description: Free-text description of the object, region, or point
            to locate; interpolated after ``DESCRIPTION:``.
        top_n: Number of circles the model is asked to choose.

    Returns:
        The prompt with surrounding whitespace stripped. It ends with
        ``IMAGE:`` and asks for the answer as JSON of the form
        ``{"points": []}``.
    """
    # The doubled braces below are f-string escapes for literal { and }.
    return f"""
INSTRUCTIONS:
You are tasked to locate an object, region, or point in space in the given annotated image according to a description.
The image is annotated with numbered circles.
Choose the top {top_n} circles that have the most overlap with and/or is closest to what the description is describing in the image.
You are a five-time world champion in this game.
Give a one sentence analysis of why you chose those points.
Provide your answer at the end in a valid JSON of this format:
{{"points": []}}
DESCRIPTION: {description}
IMAGE:
""".strip()
def extract_json(response, key):
    """Return the value stored under ``key`` in a JSON object inside ``response``.

    The response is free text that is expected to contain a JSON object.
    Every ``{`` is tried as the start of an object, so stray braces in the
    surrounding prose do not break parsing. When several decodable objects
    contain ``key``, the last one wins, because the prompt asks for the
    answer at the end of the response.

    Args:
        response: Text returned by the model.
        key: Key to look up in the decoded JSON object.

    Returns:
        The value stored under ``key``.

    Raises:
        KeyError: If no decodable JSON object containing ``key`` is found.
    """
    decoder = json.JSONDecoder()
    found = False
    value = None

    start = response.find("{")
    while start != -1:
        try:
            candidate, end = decoder.raw_decode(response, start)
        except json.JSONDecodeError:
            # Not a JSON object at this brace; try the next one.
            start = response.find("{", start + 1)
            continue
        if isinstance(candidate, dict) and key in candidate:
            value = candidate[key]
            found = True
        # Resume scanning after the object that was just decoded.
        start = response.find("{", end)

    if not found:
        print("No JSON data found ******\n", response)
        raise KeyError(key)
    return value
def vip_perform_selection(prompter, vlm, im, desc, arm_coord, samples, top_n):
    """Run a single selection round over the given samples.

    The samples are drawn onto the image, the annotated image is PNG-encoded
    and sent to the VLM together with the text prompt, and the "points"
    entry of the reply is extracted.

    Args:
        prompter: Object providing ``add_arrow_overlay_plt``.
        vlm: Object providing ``query``; called with ``[prompt, encoded_png]``.
        im: Image to annotate.
        desc: Description forwarded to ``make_prompt``.
        arm_coord: Passed to the overlay call as ``arm_xy``.
        samples: Candidate samples to draw on the image.
        top_n: Number of circles the prompt asks the model to choose.

    Returns:
        A ``(ids, annotated_image)`` tuple. ``ids`` is whatever the reply
        stored under "points", or ``[]`` if extraction failed.
    """
    annotated = prompter.add_arrow_overlay_plt(
        image=im, samples=samples, arm_xy=arm_coord
    )
    _status, png_buffer = cv2.imencode(".png", annotated)

    text_prompt = make_prompt(desc, top_n=top_n)
    reply = vlm.query([text_prompt, png_buffer])

    # Best effort: any extraction failure is printed and treated as
    # "nothing selected" rather than propagated.
    try:
        chosen_ids = extract_json(reply, "points")
    except Exception as err:
        print(err)
        chosen_ids = []
    return chosen_ids, annotated
def _select_samples(pool, ids):
    """Look up model-chosen ids in ``pool`` and colour the hits red.

    ``ids`` comes from the model's reply, so it is validated here: a bare
    integer is treated as a one-element list, and entries that are not
    non-negative integers or do not resolve in ``pool`` are skipped.

    Returns:
        A ``(valid_ids, picked_samples)`` tuple, both in reply order.
    """
    if isinstance(ids, int) and not isinstance(ids, bool):
        ids = [ids]
    elif not isinstance(ids, (list, tuple)):
        return [], []

    valid_ids = []
    picked = []
    for idx in ids:
        # bool is a subclass of int; negative values would silently wrap.
        if isinstance(idx, bool) or not isinstance(idx, int) or idx < 0:
            continue
        try:
            sample = pool[idx]
        except (IndexError, KeyError):
            continue
        sample.coord.color = (255, 0, 0)
        valid_ids.append(idx)
        picked.append(sample)
    return valid_ids, picked


def vip_runner(
    vlm,
    im,
    desc,
    style,
    action_spec,
    n_samples_init=25,
    n_samples_opt=10,
    n_iters=3,
    n_parallel_trials=1,
):
    """Run VIP: iteratively sample candidates and let the VLM narrow them down.

    This is a generator. For each parallel trial it runs ``n_iters`` rounds
    of sample -> select -> refit. On the last round of a trial it asks the
    model for a single pick among the selected samples. With more than one
    trial, a final selection is made across the per-trial picks.

    Args:
        vlm: Model wrapper forwarded to ``vip_perform_selection``.
        im: Image array; ``im.shape[1] / 2`` and ``im.shape[0] / 2`` give
            the arm coordinate.
        desc: Text description of the target.
        style: Dict passed to the prompter; its "num_samples" entry is
            overwritten on every round (the caller's dict is mutated).
        action_spec: Dict whose "loc" and "scale" entries seed each trial.
        n_samples_init: Value of ``style["num_samples"]`` for round 0.
        n_samples_opt: Value of ``style["num_samples"]`` for later rounds.
        n_iters: Number of rounds per trial.
        n_parallel_trials: Number of independent trials.

    Yields:
        ``(output_ims, message)`` tuples. ``output_ims`` is the same list
        object on every yield and grows as images are appended. The last
        item carries the final selected coordinate, or
        ``([], "Unable to understand query")`` if no image was produced.
    """
    prompter = vip.VisualIterativePrompter(
        style, action_spec, vip.SupportedEmbodiments.HF_DEMO
    )

    output_ims = []
    arm_coord = (int(im.shape[1] / 2), int(im.shape[0] / 2))

    new_samples = []
    center_mean = action_spec["loc"]
    for i in range(n_parallel_trials):
        # Every trial restarts from the distribution given in action_spec.
        center_mean = action_spec["loc"]
        center_std = action_spec["scale"]
        for itr in trange(n_iters):
            # presumably read by prompter.sample_actions - TODO confirm
            style["num_samples"] = n_samples_init if itr == 0 else n_samples_opt
            samples = prompter.sample_actions(
                im, arm_coord, center_mean, center_std
            )
            arrow_ids, image_circles_np = vip_perform_selection(
                prompter, vlm, im, desc, arm_coord, samples, top_n=3
            )

            # plot sampled circles as red
            arrow_ids, selected_samples = _select_samples(samples, arrow_ids)
            image_circles_marked_np = prompter.add_arrow_overlay_plt(
                image_circles_np, selected_samples, arm_coord
            )
            output_ims.append(image_circles_marked_np)
            yield output_ims, f"Image generated for parallel sample {i+1}/{n_parallel_trials} iteration {itr+1}/{n_iters}. Still working..."

            # if at last iteration, pick one answer out of the selected ones
            if itr == n_iters - 1:
                arrow_ids, _ = vip_perform_selection(
                    prompter, vlm, im, desc, arm_coord, selected_samples, top_n=1
                )
                # The returned ids are looked up in the full sample list,
                # not in the shortlist that was shown.
                arrow_ids, selected_samples = _select_samples(samples, arrow_ids)
                image_circles_marked_np = prompter.add_arrow_overlay_plt(
                    im, selected_samples, arm_coord
                )
                output_ims.append(image_circles_marked_np)
                new_samples += selected_samples
                yield output_ims, f"Image generated for parallel sample {i+1}/{n_parallel_trials} last iteration. Still working..."

            # Only validated ids are handed to the fit.
            center_mean, center_std = prompter.fit(arrow_ids, samples)

    if n_parallel_trials > 1:
        # adjust sample label to avoid duplications
        for sample_id, sample in enumerate(new_samples):
            sample.label = str(sample_id)
        arrow_ids, _ = vip_perform_selection(
            prompter, vlm, im, desc, arm_coord, new_samples, top_n=1
        )

        arrow_ids, selected_samples = _select_samples(new_samples, arrow_ids)
        image_circles_marked_np = prompter.add_arrow_overlay_plt(
            im, selected_samples, arm_coord
        )
        output_ims.append(image_circles_marked_np)
        center_mean, _ = prompter.fit(arrow_ids, new_samples)

    if output_ims:
        yield (
            output_ims,
            (
                "Final selected coordinate:"
                f" {np.round(prompter.action_to_coord(center_mean, im, arm_coord).xy, decimals=0)}"
            ),
        )
    else:
        # A generator's return value is not seen by a consumer that simply
        # iterates, so the fallback has to be yielded.
        yield [], "Unable to understand query"