Spaces:

TencentARC
/

Caption-Anything

Runtime error

App Files Community

ttengwang committed on May 1, 2023

Commit

ccb14a3

1 Parent(s): cd2f644

support "segment everything in a paragraph"

Browse files

Files changed (12) hide show

app.py +89 -65
caption_anything/captioner/base_captioner.py +38 -30
caption_anything/captioner/blip.py +12 -5
caption_anything/captioner/blip2.py +24 -11
caption_anything/captioner/git.py +14 -7
caption_anything/model.py +210 -71
caption_anything/segmenter/__init__.py +11 -2
caption_anything/segmenter/base_segmenter.py +8 -3
caption_anything/utils/densecap_painter.py +64 -0
caption_anything/utils/parser.py +6 -0
caption_anything/utils/utils.py +31 -0
requirements.txt +4 -1

app.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import os
 import json
-import PIL
 import gradio as gr
 import numpy as np
 from gradio import processing_utils
@@ -11,7 +10,7 @@ import functools
 from caption_anything.model import CaptionAnything
 from caption_anything.utils.image_editing_utils import create_bubble_frame
-from caption_anything.utils.utils import mask_painter, seg_model_map, prepare_segmenter
 from caption_anything.utils.parser import parse_augment
 from caption_anything.captioner import build_captioner
 from caption_anything.text_refiner import build_text_refiner
@@ -23,6 +22,7 @@ from segment_anything import sam_model_registry
 args = parse_augment()
 args.segmenter = "huge"
 args.segmenter_checkpoint = "sam_vit_h_4b8939.pth"
 if args.segmenter_checkpoint is None:
     _, segmenter_checkpoint = prepare_segmenter(args.segmenter)
 else:
@@ -53,9 +53,7 @@ class ImageSketcher(gr.Image):
                 mask = np.zeros((height, width, 4), dtype=np.uint8)
                 mask[..., -1] = 255
                 mask = self.postprocess(mask)
                 x['mask'] = mask
         return super().preprocess(x)
@@ -74,16 +72,19 @@ def init_openai_api_key(api_key=""):
     if api_key and len(api_key) > 30:
         try:
             text_refiner = build_text_refiner(args.text_refiner, args.device, args, api_key)
-            text_refiner.llm('hi')  # test
             visual_chatgpt = ConversationBot(shared_chatbot_tools, api_key)
         except:
             text_refiner = None
             visual_chatgpt = None
     openai_available = text_refiner is not None
-    return gr.update(visible=openai_available), gr.update(visible=openai_available), gr.update(
-        visible=openai_available), gr.update(visible=True), gr.update(visible=True), gr.update(
-        visible=True), text_refiner, visual_chatgpt
 def get_click_prompt(chat_input, click_state, click_mode):
     inputs = json.loads(chat_input)
@@ -130,18 +131,15 @@ def chat_input_callback(*args):
         state = state + [(chat_input, response)]
         return state, state
 def upload_callback(image_input, state, visual_chatgpt=None):
     if isinstance(image_input, dict):  # if upload from sketcher_input, input contains image and mask
         image_input, mask = image_input['image'], image_input['mask']
     click_state = [[], [], []]
-    res = 1024
-    width, height = image_input.size
-    ratio = min(1.0 * res / max(width, height), 1.0)
-    if ratio < 1.0:
-        image_input = image_input.resize((int(width * ratio), int(height * ratio)))
-        print('Scaling input image to {}'.format(image_input.size))
     model = build_caption_anything_with_models(
         args,
@@ -159,8 +157,8 @@ def upload_callback(image_input, state, visual_chatgpt=None):
         new_image_path = get_new_image_name('chat_image', func_name='upload')
         image_input.save(new_image_path)
         visual_chatgpt.current_image = new_image_path
-        img_caption, _ = model.captioner.inference_seg(image_input)
-        Human_prompt = f'\nHuman: provide a new figure with path {new_image_path}. The description is: {img_caption}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
         AI_prompt = "Received."
         visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
         visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
@@ -201,11 +199,10 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
     model.setup(image_embedding, original_size, input_size, is_image_set=True)
     enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
-    out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki)
     state = state + [("Image point: {}, Input label: {}".format(prompt["input_point"], prompt["input_label"]), None)]
     state = state + [(None, "raw_caption: {}".format(out['generated_captions']['raw_caption']))]
-    wiki = out['generated_captions'].get('wiki', "")
     update_click_state(click_state, out['generated_captions']['raw_caption'], click_mode)
     text = out['generated_captions']['raw_caption']
     input_mask = np.array(out['mask'].convert('P'))
@@ -221,21 +218,22 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
         point_prompt = f'You should primarly use tools on the selected regional image (description: {text}, path: {new_crop_save_path}), which is a part of the whole image (path: {visual_chatgpt.current_image}). If human mentioned some objects not in the selected region, you can use tools on the whole image.'
         visual_chatgpt.point_prompt = point_prompt
-    yield state, state, click_state, image_input, wiki
     if not args.disable_gpt and model.text_refiner:
         refined_caption = model.text_refiner.inference(query=text, controls=controls, context=out['context_captions'],
                                                        enable_wiki=enable_wiki)
         # new_cap = 'Original: ' + text + '. Refined: ' + refined_caption['caption']
         new_cap = refined_caption['caption']
-        wiki = refined_caption['wiki']
         state = state + [(None, f"caption: {new_cap}")]
         refined_image_input = create_bubble_frame(origin_image_input, new_cap, (click_index[0], click_index[1]),
                                                   input_mask,
                                                   input_points=input_points, input_labels=input_labels)
-        yield state, state, click_state, refined_image_input, wiki
-def get_sketch_prompt(mask: PIL.Image.Image):
     """
     Get the prompt for the sketcher.
     TODO: This is a temporary solution. We should cluster the sketch and get the bounding box of each cluster.
@@ -282,12 +280,11 @@ def inference_traject(sketcher_image, enable_wiki, language, sentiment, factuali
     model.setup(image_embedding, original_size, input_size, is_image_set=True)
     enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
-    out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki)
     # Update components and states
     state.append((f'Box: {boxes}', None))
     state.append((None, f'raw_caption: {out["generated_captions"]["raw_caption"]}'))
-    wiki = out['generated_captions'].get('wiki', "")
     text = out['generated_captions']['raw_caption']
     input_mask = np.array(out['mask'].convert('P'))
     image_input = mask_painter(np.array(image_input), input_mask)
@@ -297,18 +294,19 @@ def inference_traject(sketcher_image, enable_wiki, language, sentiment, factuali
     fake_click_index = (int((boxes[0][0] + boxes[0][2]) / 2), int((boxes[0][1] + boxes[0][3]) / 2))
     image_input = create_bubble_frame(image_input, text, fake_click_index, input_mask)
-    yield state, state, image_input, wiki
     if not args.disable_gpt and model.text_refiner:
         refined_caption = model.text_refiner.inference(query=text, controls=controls, context=out['context_captions'],
                                                        enable_wiki=enable_wiki)
         new_cap = refined_caption['caption']
-        wiki = refined_caption['wiki']
         state = state + [(None, f"caption: {new_cap}")]
         refined_image_input = create_bubble_frame(origin_image_input, new_cap, fake_click_index, input_mask)
-        yield state, state, refined_image_input, wiki
 def clear_chat_memory(visual_chatgpt, keep_global=False):
     if visual_chatgpt is not None:
@@ -319,7 +317,26 @@ def clear_chat_memory(visual_chatgpt, keep_global=False):
         else:
             visual_chatgpt.current_image = None
             visual_chatgpt.global_prompt = ""
 def get_style():
     current_version = version.parse(gr.__version__)
     if current_version <= version.parse('3.24.1'):
@@ -400,7 +417,7 @@ def create_ui():
                         with gr.Row():
                             submit_button_sketcher = gr.Button(value="Submit", interactive=True)
-                with gr.Column(visible=False) as modules_need_gpt:
                     with gr.Row(scale=1.0):
                         language = gr.Dropdown(
                             ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
@@ -431,26 +448,31 @@ def create_ui():
                             value="No",
                             label="Enable Wiki",
                             interactive=True)
-                with gr.Column(visible=True) as modules_not_need_gpt3:
-                    gr.Examples(
-                        examples=examples,
-                        inputs=[example_image],
-                    )
             with gr.Column(scale=0.5):
-                openai_api_key = gr.Textbox(
-                    placeholder="Input openAI API key",
-                    show_label=False,
-                    label="OpenAI API Key",
-                    lines=1,
-                    type="password")
-                with gr.Row(scale=0.5):
-                    enable_chatGPT_button = gr.Button(value="Run with ChatGPT", interactive=True, variant='primary')
-                    disable_chatGPT_button = gr.Button(value="Run without ChatGPT (Faster)", interactive=True,
-                                                       variant='primary')
-                with gr.Column(visible=False) as modules_need_gpt2:
-                    wiki_output = gr.Textbox(lines=5, label="Wiki", max_lines=5)
-                with gr.Column(visible=False) as modules_not_need_gpt2:
-                    chatbot = gr.Chatbot(label="Chat about Selected Object", ).style(height=550, scale=0.5)
                     with gr.Column(visible=False) as modules_need_gpt3:
                         chat_input = gr.Textbox(show_label=False, placeholder="Enter text and press Enter").style(
                             container=False)
@@ -459,36 +481,38 @@ def create_ui():
                             submit_button_text = gr.Button(value="Submit", interactive=True, variant="primary")
         openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
-                              outputs=[modules_need_gpt, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
-                                       modules_not_need_gpt2, modules_not_need_gpt3, text_refiner, visual_chatgpt])
         enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
-                                    outputs=[modules_need_gpt, modules_need_gpt2, modules_need_gpt3,
                                              modules_not_need_gpt,
-                                             modules_not_need_gpt2, modules_not_need_gpt3, text_refiner, visual_chatgpt])
-        disable_chatGPT_button.click(init_openai_api_key,
-                                     outputs=[modules_need_gpt, modules_need_gpt2, modules_need_gpt3,
                                               modules_not_need_gpt,
-                                              modules_not_need_gpt2, modules_not_need_gpt3, text_refiner, visual_chatgpt])
         enable_chatGPT_button.click(
             lambda: (None, [], [], [[], [], []], "", "", ""),
             [],
-            [image_input, chatbot, state, click_state, wiki_output, origin_image],
             queue=False,
             show_progress=False
         )
         openai_api_key.submit(
             lambda: (None, [], [], [[], [], []], "", "", ""),
             [],
-            [image_input, chatbot, state, click_state, wiki_output, origin_image],
             queue=False,
             show_progress=False
         )
         clear_button_click.click(
-            lambda x: ([[], [], []], x, ""),
             [origin_image],
-            [click_state, image_input, wiki_output],
             queue=False,
             show_progress=False
         )
@@ -496,7 +520,7 @@ def create_ui():
         clear_button_image.click(
             lambda: (None, [], [], [[], [], []], "", "", ""),
             [],
-            [image_input, chatbot, state, click_state, wiki_output, origin_image],
             queue=False,
             show_progress=False
         )
@@ -513,7 +537,7 @@ def create_ui():
         image_input.clear(
             lambda: (None, [], [], [[], [], []], "", "", ""),
             [],
-            [image_input, chatbot, state, click_state, wiki_output, origin_image],
             queue=False,
             show_progress=False
         )
@@ -544,7 +568,7 @@ def create_ui():
                 origin_image, point_prompt, click_mode, enable_wiki, language, sentiment, factuality, length,
                 image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt
             ],
-            outputs=[chatbot, state, click_state, image_input, wiki_output],
             show_progress=False, queue=True
         )
@@ -554,7 +578,7 @@ def create_ui():
                 sketcher_input, enable_wiki, language, sentiment, factuality, length, image_embedding, state,
                 original_size, input_size, text_refiner
             ],
-            outputs=[chatbot, state, sketcher_input, wiki_output],
             show_progress=False, queue=True
         )

 import os
 import json
 import gradio as gr
 import numpy as np
 from gradio import processing_utils
 from caption_anything.model import CaptionAnything
 from caption_anything.utils.image_editing_utils import create_bubble_frame
+from caption_anything.utils.utils import mask_painter, seg_model_map, prepare_segmenter, image_resize
 from caption_anything.utils.parser import parse_augment
 from caption_anything.captioner import build_captioner
 from caption_anything.text_refiner import build_text_refiner
 args = parse_augment()
 args.segmenter = "huge"
 args.segmenter_checkpoint = "sam_vit_h_4b8939.pth"
 if args.segmenter_checkpoint is None:
     _, segmenter_checkpoint = prepare_segmenter(args.segmenter)
 else:
                 mask = np.zeros((height, width, 4), dtype=np.uint8)
                 mask[..., -1] = 255
                 mask = self.postprocess(mask)
                 x['mask'] = mask
         return super().preprocess(x)
     if api_key and len(api_key) > 30:
         try:
             text_refiner = build_text_refiner(args.text_refiner, args.device, args, api_key)
+            assert len(text_refiner.llm('hi')) > 0 # test
             visual_chatgpt = ConversationBot(shared_chatbot_tools, api_key)
         except:
             text_refiner = None
             visual_chatgpt = None
     openai_available = text_refiner is not None
+    if openai_available:
+        return [gr.update(visible=True)]*6 + [gr.update(visible=False)]*2 + [text_refiner, visual_chatgpt, None]
+    else:
+        return [gr.update(visible=False)]*6 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']
+def init_wo_openai_api_key():
+        return  [gr.update(visible=False)]*4 + [gr.update(visible=True)]*2 + [gr.update(visible=False)]*2 + [None, None, None]
 def get_click_prompt(chat_input, click_state, click_mode):
     inputs = json.loads(chat_input)
         state = state + [(chat_input, response)]
         return state, state
 def upload_callback(image_input, state, visual_chatgpt=None):
     if isinstance(image_input, dict):  # if upload from sketcher_input, input contains image and mask
         image_input, mask = image_input['image'], image_input['mask']
     click_state = [[], [], []]
+    image_input = image_resize(image_input, res=1024)
     model = build_caption_anything_with_models(
         args,
         new_image_path = get_new_image_name('chat_image', func_name='upload')
         image_input.save(new_image_path)
         visual_chatgpt.current_image = new_image_path
+        img_caption = model.captioner.inference(image_input, filter=False, args={'text_prompt':''})['caption']
+        Human_prompt = f'\nHuman: The description of the image with path {new_image_path} is: {img_caption}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
         AI_prompt = "Received."
         visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
         visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
     model.setup(image_embedding, original_size, input_size, is_image_set=True)
     enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
+    out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki, verbose=True, args={'clip_filter': False})[0]
     state = state + [("Image point: {}, Input label: {}".format(prompt["input_point"], prompt["input_label"]), None)]
     state = state + [(None, "raw_caption: {}".format(out['generated_captions']['raw_caption']))]
     update_click_state(click_state, out['generated_captions']['raw_caption'], click_mode)
     text = out['generated_captions']['raw_caption']
     input_mask = np.array(out['mask'].convert('P'))
         point_prompt = f'You should primarly use tools on the selected regional image (description: {text}, path: {new_crop_save_path}), which is a part of the whole image (path: {visual_chatgpt.current_image}). If human mentioned some objects not in the selected region, you can use tools on the whole image.'
         visual_chatgpt.point_prompt = point_prompt
+    yield state, state, click_state, image_input
     if not args.disable_gpt and model.text_refiner:
         refined_caption = model.text_refiner.inference(query=text, controls=controls, context=out['context_captions'],
                                                        enable_wiki=enable_wiki)
         # new_cap = 'Original: ' + text + '. Refined: ' + refined_caption['caption']
         new_cap = refined_caption['caption']
+        if refined_caption['wiki']:
+            state = state + [(None, "Wiki: {}".format(refined_caption['wiki']))]
         state = state + [(None, f"caption: {new_cap}")]
         refined_image_input = create_bubble_frame(origin_image_input, new_cap, (click_index[0], click_index[1]),
                                                   input_mask,
                                                   input_points=input_points, input_labels=input_labels)
+        yield state, state, click_state, refined_image_input
+def get_sketch_prompt(mask: Image.Image):
     """
     Get the prompt for the sketcher.
     TODO: This is a temporary solution. We should cluster the sketch and get the bounding box of each cluster.
     model.setup(image_embedding, original_size, input_size, is_image_set=True)
     enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
+    out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki)[0]
     # Update components and states
     state.append((f'Box: {boxes}', None))
     state.append((None, f'raw_caption: {out["generated_captions"]["raw_caption"]}'))
     text = out['generated_captions']['raw_caption']
     input_mask = np.array(out['mask'].convert('P'))
     image_input = mask_painter(np.array(image_input), input_mask)
     fake_click_index = (int((boxes[0][0] + boxes[0][2]) / 2), int((boxes[0][1] + boxes[0][3]) / 2))
     image_input = create_bubble_frame(image_input, text, fake_click_index, input_mask)
+    yield state, state, image_input
     if not args.disable_gpt and model.text_refiner:
         refined_caption = model.text_refiner.inference(query=text, controls=controls, context=out['context_captions'],
                                                        enable_wiki=enable_wiki)
         new_cap = refined_caption['caption']
+        if refined_caption['wiki']:
+            state = state + [(None, "Wiki: {}".format(refined_caption['wiki']))]
         state = state + [(None, f"caption: {new_cap}")]
         refined_image_input = create_bubble_frame(origin_image_input, new_cap, fake_click_index, input_mask)
+        yield state, state, refined_image_input
 def clear_chat_memory(visual_chatgpt, keep_global=False):
     if visual_chatgpt is not None:
         else:
             visual_chatgpt.current_image = None
             visual_chatgpt.global_prompt = ""
+def cap_everything(image_input, visual_chatgpt, text_refiner):
+    model = build_caption_anything_with_models(
+        args,
+        api_key="",
+        captioner=shared_captioner,
+        sam_model=shared_sam_model,
+        text_refiner=text_refiner,
+        session_id=iface.app_id
+    )
+    paragraph = model.inference_cap_everything(image_input, verbose=True)
+    # state = state + [(None, f"Caption Everything: {paragraph}")]
+    Human_prompt = f'\nThe description of the image with path {visual_chatgpt.current_image} is:\n{paragraph}\nThis information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
+    AI_prompt = "Received."
+    visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
+    visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
+    return paragraph
 def get_style():
     current_version = version.parse(gr.__version__)
     if current_version <= version.parse('3.24.1'):
                         with gr.Row():
                             submit_button_sketcher = gr.Button(value="Submit", interactive=True)
+                with gr.Column(visible=False) as modules_need_gpt1:
                     with gr.Row(scale=1.0):
                         language = gr.Dropdown(
                             ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
                             value="No",
                             label="Enable Wiki",
                             interactive=True)
+                # with gr.Column(visible=True) as modules_not_need_gpt3:
+                gr.Examples(
+                    examples=examples,
+                    inputs=[example_image],
+                )
             with gr.Column(scale=0.5):
+                with gr.Column(visible=True) as module_key_input:
+                    openai_api_key = gr.Textbox(
+                        placeholder="Input openAI API key",
+                        show_label=False,
+                        label="OpenAI API Key",
+                        lines=1,
+                        type="password")
+                    with gr.Row(scale=0.5):
+                        enable_chatGPT_button = gr.Button(value="Run with ChatGPT", interactive=True, variant='primary')
+                        disable_chatGPT_button = gr.Button(value="Run without ChatGPT (Faster)", interactive=True,
+                                                        variant='primary')
+                with gr.Column(visible=False) as module_notification_box:
+                    notification_box = gr.Textbox(lines=1, label="Notification", max_lines=5, show_label=False)
+                with gr.Column(visible=False) as modules_need_gpt2:
+                    paragraph_output = gr.Textbox(lines=7, label="Describe Everything", max_lines=7)
+                with gr.Column(visible=False) as modules_need_gpt0:
+                    cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)
+                with gr.Column(visible=False) as modules_not_need_gpt2:
+                    chatbot = gr.Chatbot(label="Chatbox", ).style(height=550, scale=0.5)
                     with gr.Column(visible=False) as modules_need_gpt3:
                         chat_input = gr.Textbox(show_label=False, placeholder="Enter text and press Enter").style(
                             container=False)
                             submit_button_text = gr.Button(value="Submit", interactive=True, variant="primary")
         openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
+                              outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
+                                       modules_not_need_gpt2, module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box])
         enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
+                                    outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
                                              modules_not_need_gpt,
+                                             modules_not_need_gpt2, module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box])
+        disable_chatGPT_button.click(init_wo_openai_api_key,
+                                     outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
                                               modules_not_need_gpt,
+                                              modules_not_need_gpt2, module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box])
         enable_chatGPT_button.click(
             lambda: (None, [], [], [[], [], []], "", "", ""),
             [],
+            [image_input, chatbot, state, click_state, paragraph_output, origin_image],
             queue=False,
             show_progress=False
         )
         openai_api_key.submit(
             lambda: (None, [], [], [[], [], []], "", "", ""),
             [],
+            [image_input, chatbot, state, click_state, paragraph_output, origin_image],
             queue=False,
             show_progress=False
         )
+        cap_everything_button.click(cap_everything, [origin_image, visual_chatgpt, text_refiner], [paragraph_output])
         clear_button_click.click(
+            lambda x: ([[], [], []], x),
             [origin_image],
+            [click_state, image_input],
             queue=False,
             show_progress=False
         )
         clear_button_image.click(
             lambda: (None, [], [], [[], [], []], "", "", ""),
             [],
+            [image_input, chatbot, state, click_state, paragraph_output, origin_image],
             queue=False,
             show_progress=False
         )
         image_input.clear(
             lambda: (None, [], [], [[], [], []], "", "", ""),
             [],
+            [image_input, chatbot, state, click_state, paragraph_output, origin_image],
             queue=False,
             show_progress=False
         )
                 origin_image, point_prompt, click_mode, enable_wiki, language, sentiment, factuality, length,
                 image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt
             ],
+            outputs=[chatbot, state, click_state, image_input],
             show_progress=False, queue=True
         )
                 sketcher_input, enable_wiki, language, sentiment, factuality, length, image_embedding, state,
                 original_size, input_size, text_refiner
             ],
+            outputs=[chatbot, state, sketcher_input],
             show_progress=False, queue=True
         )

caption_anything/captioner/base_captioner.py CHANGED Viewed

@@ -5,7 +5,7 @@ import json
 import pdb
 import cv2
 import numpy as np
-from typing import Union
 import time
 import clip
@@ -16,13 +16,10 @@ def boundary(inputs):
     col = inputs.shape[1]
     inputs = inputs.reshape(-1)
     lens = len(inputs)
     start = np.argmax(inputs)
     end = lens - 1 - np.argmax(np.flip(inputs))
     top = start // col
     bottom = end // col
     return top, bottom
@@ -84,27 +81,27 @@ class BaseCaptioner:
         self.enable_filter = enable_filter
         if enable_filter:
             self.filter, self.preprocess = clip.load('ViT-B/32', device)
-        self.threshold = 0.2
     @torch.no_grad()
-    def filter_caption(self, image: Union[np.ndarray, Image.Image, str], caption: str):
         image = load_image(image, return_type='pil')
         image = self.preprocess(image).unsqueeze(0).to(self.device)  # (1, 3, 224, 224)
-        text = clip.tokenize(caption).to(self.device)  # (1, 77)
         image_features = self.filter.encode_image(image)  # (1, 512)
-        text_features = self.filter.encode_text(text)  # (1, 512)
         image_features /= image_features.norm(dim=-1, keepdim=True)
         text_features /= text_features.norm(dim=-1, keepdim=True)
-        similarity = torch.matmul(image_features, text_features.transpose(1, 0)).item()
-        if similarity < self.threshold:
-            print('There seems to be nothing where you clicked.')
-            out = ""
         else:
-            out = caption
         print(f'Clip score of the caption is {similarity}')
-        return out
     def inference(self, image: Union[np.ndarray, Image.Image, str], filter: bool = False):
         raise NotImplementedError()
@@ -112,7 +109,7 @@ class BaseCaptioner:
     def inference_with_reduced_tokens(self, image: Union[np.ndarray, Image.Image, str], seg_mask, filter: bool = False):
         raise NotImplementedError()
-    def inference_box(self, image: Union[np.ndarray, Image.Image, str], box: Union[list, np.ndarray], filter=False):
         image = load_image(image, return_type="pil")
         if np.array(box).size == 4:
@@ -123,23 +120,31 @@ class BaseCaptioner:
         elif np.array(box).size == 8:  # four corners of an irregular rectangle
             image_crop = cut_box(np.array(image), box)
-        crop_save_path = f'result/crop_{time.time()}.png'
-        Image.fromarray(image_crop).save(crop_save_path)
-        print(f'croped image saved in {crop_save_path}')
-        caption = self.inference(image_crop, filter)
-        return caption, crop_save_path
-    def inference_seg(self, image: Union[np.ndarray, str], seg_mask: Union[np.ndarray, Image.Image, str] = None,
-                      crop_mode="w_bg", filter=False, disable_regular_box=False):
         if seg_mask is None:
             seg_mask = np.ones(image.size).astype(bool)
         image = load_image(image, return_type="pil")
         seg_mask = load_image(seg_mask, return_type="pil")
         seg_mask = seg_mask.resize(image.size)
         seg_mask = np.array(seg_mask) > 0
         if crop_mode == "wo_bg":
             image = np.array(image) * seg_mask[:, :, np.newaxis] + (1 - seg_mask[:, :, np.newaxis]) * 255
             image = np.uint8(image)
@@ -150,10 +155,13 @@ class BaseCaptioner:
             min_area_box = seg_to_box(seg_mask)
         else:
             min_area_box = new_seg_to_box(seg_mask)
-        return self.inference_box(image, min_area_box, filter)
-    def generate_seg_cropped_image(self, image: Union[np.ndarray, str], seg_mask: Union[np.ndarray, Image.Image, str],
-                                   crop_mode="w_bg", disable_regular_box=False):
         image = load_image(image, return_type="pil")
         seg_mask = load_image(seg_mask, return_type="pil")

 import pdb
 import cv2
 import numpy as np
+from typing import Any, Union, List
 import time
 import clip
     col = inputs.shape[1]
     inputs = inputs.reshape(-1)
     lens = len(inputs)
     start = np.argmax(inputs)
     end = lens - 1 - np.argmax(np.flip(inputs))
     top = start // col
     bottom = end // col
     return top, bottom
         self.enable_filter = enable_filter
         if enable_filter:
             self.filter, self.preprocess = clip.load('ViT-B/32', device)
     @torch.no_grad()
     def filter_caption(self, image: Union[np.ndarray, Image.Image, str], caption: str,
                        reference_caption: Union[List[str], None] = None):
         """Score how well ``caption`` matches ``image`` with the CLIP filter model.

         Args:
             image: Anything accepted by ``load_image`` (array, PIL image or path).
             caption: The candidate caption to score.
             reference_caption: Optional competing captions. When given, the
                 returned score is the softmax probability of ``caption``
                 against these references; otherwise it is the raw cosine
                 similarity between the image and ``caption``.

         Returns:
             The score as a Python float.
         """
         # Normalise to a fresh list: avoids a shared mutable default and makes
         # the truthiness checks below safe for any sequence type.
         reference_caption = list(reference_caption) if reference_caption is not None else []

         image = load_image(image, return_type='pil')
         image = self.preprocess(image).unsqueeze(0).to(self.device)  # (1, 3, 224, 224)
         captions = [caption, *reference_caption]
         text = clip.tokenize(captions).to(self.device)  # (len(captions), 77)

         image_features = self.filter.encode_image(image)  # (1, 512)
         text_features = self.filter.encode_text(text)  # (len(captions), 512)
         image_features /= image_features.norm(dim=-1, keepdim=True)
         text_features /= text_features.norm(dim=-1, keepdim=True)

         # Cosine similarity of the image against every caption.
         logits = torch.matmul(image_features, text_features.transpose(1, 0))
         if reference_caption:
             # Logits are divided by 0.07 before the softmax (presumably CLIP's
             # temperature -- TODO confirm); index [0, 0] is the candidate caption.
             similarity = (logits / 0.07).softmax(dim=1)[0, 0].item()
         else:
             similarity = logits.item()
         print(f'Clip score of the caption is {similarity}')
         return similarity
     def inference(self, image: Union[np.ndarray, Image.Image, str], filter: bool = False):
         """Caption a whole image; must be implemented by concrete captioners.

         NOTE(review): ``inference_box`` calls this with a third positional
         argument (``caption_args``), so overriding implementations need to
         accept an extra ``args`` parameter.

         Raises:
             NotImplementedError: always, in this base class.
         """
         raise NotImplementedError()
     def inference_with_reduced_tokens(self, image: Union[np.ndarray, Image.Image, str], seg_mask, filter: bool = False):
         """Caption the region of ``image`` selected by ``seg_mask``; must be overridden.

         Raises:
             NotImplementedError: always, in this base class.
         """
         raise NotImplementedError()
+    def inference_box(self, image: Union[np.ndarray, Image.Image, str], box: Union[list, np.ndarray], filter=False, verbose=False, caption_args={}):
         image = load_image(image, return_type="pil")
         if np.array(box).size == 4:
         elif np.array(box).size == 8:  # four corners of an irregular rectangle
             image_crop = cut_box(np.array(image), box)
+        crop_save_path = None
+        if verbose:
+            crop_save_path = f'result/crop_{time.time()}.png'
+            Image.fromarray(image_crop).save(crop_save_path)
+            print(f'croped image saved in {crop_save_path}')
+        caption = self.inference(image_crop, filter, caption_args)
+        caption.update({'crop_save_path': crop_save_path})
+        return caption
+    def inference_seg(self,
+                      image: Union[np.ndarray, str],
+                      seg_mask: Union[np.ndarray, Image.Image, str] = None,
+                      crop_mode="w_bg",
+                      filter=False,
+                      disable_regular_box=False,
+                      verbose=False,
+                      caption_args={}):
         if seg_mask is None:
             seg_mask = np.ones(image.size).astype(bool)
         image = load_image(image, return_type="pil")
         seg_mask = load_image(seg_mask, return_type="pil")
         seg_mask = seg_mask.resize(image.size)
         seg_mask = np.array(seg_mask) > 0
         if crop_mode == "wo_bg":
             image = np.array(image) * seg_mask[:, :, np.newaxis] + (1 - seg_mask[:, :, np.newaxis]) * 255
             image = np.uint8(image)
             min_area_box = seg_to_box(seg_mask)
         else:
             min_area_box = new_seg_to_box(seg_mask)
+        return self.inference_box(image, min_area_box, filter, verbose, caption_args)
+    def generate_seg_cropped_image(self,
+                                   image: Union[np.ndarray, str],
+                                   seg_mask: Union[np.ndarray, Image.Image, str],
+                                   crop_mode="w_bg",
+                                   disable_regular_box=False):
         image = load_image(image, return_type="pil")
         seg_mask = load_image(seg_mask, return_type="pil")

caption_anything/captioner/blip.py CHANGED Viewed

@@ -20,19 +20,24 @@ class BLIPCaptioner(BaseCaptioner):
                                                                   torch_dtype=self.torch_dtype).to(self.device)
     @torch.no_grad()
-    def inference(self, image: Union[np.ndarray, Image.Image, str], filter=False):
         image = load_image(image, return_type="pil")
         inputs = self.processor(image, return_tensors="pt").to(self.device, self.torch_dtype)
         out = self.model.generate(**inputs, max_new_tokens=50)
         captions = self.processor.decode(out[0], skip_special_tokens=True).strip()
         if self.enable_filter and filter:
-            captions = self.filter_caption(image, captions)
         print(f"\nProcessed ImageCaptioning by BLIPCaptioner, Output Text: {captions}")
-        return captions
     @torch.no_grad()
     def inference_with_reduced_tokens(self, image: Union[np.ndarray, Image.Image, str], seg_mask, crop_mode="w_bg",
                                       filter=False, disable_regular_box=False):
         crop_save_path = self.generate_seg_cropped_image(image=image, seg_mask=seg_mask, crop_mode=crop_mode,
                                                          disable_regular_box=disable_regular_box)
         image = load_image(image, return_type="pil")
@@ -47,9 +52,11 @@ class BLIPCaptioner(BaseCaptioner):
         out = self.model.generate(pixel_values=pixel_values, pixel_masks=pixel_masks, max_new_tokens=50)
         captions = self.processor.decode(out[0], skip_special_tokens=True).strip()
         if self.enable_filter and filter:
-            captions = self.filter_caption(image, captions)
         print(f"\nProcessed ImageCaptioning by BLIPCaptioner, Output Text: {captions}")
-        return captions, crop_save_path
 if __name__ == '__main__':

                                                                   torch_dtype=self.torch_dtype).to(self.device)
     @torch.no_grad()
     def inference(self, image: Union[np.ndarray, Image.Image, str], filter=False, args=None):
         """Caption ``image`` with the BLIP model.

         Args:
             image: Anything accepted by ``load_image``.
             filter: When true (and the captioner was built with
                 ``enable_filter``), also compute a CLIP score for the caption.
             args: Accepted for interface compatibility with the other
                 captioners; not used by this implementation.

         Returns:
             A dict with key ``'caption'`` and, when filtering ran,
             ``'clip_score'``.
         """
         image = load_image(image, return_type="pil")
         inputs = self.processor(image, return_tensors="pt").to(self.device, self.torch_dtype)
         out = self.model.generate(**inputs, max_new_tokens=50)
         captions = self.processor.decode(out[0], skip_special_tokens=True).strip()

         result = {'caption': captions}
         if self.enable_filter and filter:
             result['clip_score'] = self.filter_caption(image, captions)
         print(f"\nProcessed ImageCaptioning by BLIPCaptioner, Output Text: {captions}")
         # Return the full result; returning only the caption would silently
         # drop the clip_score computed above.
         return result
     @torch.no_grad()
     def inference_with_reduced_tokens(self, image: Union[np.ndarray, Image.Image, str], seg_mask, crop_mode="w_bg",
                                       filter=False, disable_regular_box=False):
+        result = {}
         crop_save_path = self.generate_seg_cropped_image(image=image, seg_mask=seg_mask, crop_mode=crop_mode,
                                                          disable_regular_box=disable_regular_box)
         image = load_image(image, return_type="pil")
         out = self.model.generate(pixel_values=pixel_values, pixel_masks=pixel_masks, max_new_tokens=50)
         captions = self.processor.decode(out[0], skip_special_tokens=True).strip()
         if self.enable_filter and filter:
+            clip_score = self.filter_caption(image, captions)
+            result['clip_score'] = clip_score
+        result.update({'caption':captions, 'crop_save_path':crop_save_path})
         print(f"\nProcessed ImageCaptioning by BLIPCaptioner, Output Text: {captions}")
+        return result
 if __name__ == '__main__':

caption_anything/captioner/blip2.py CHANGED Viewed

@@ -20,18 +20,31 @@ class BLIP2Captioner(BaseCaptioner):
             self.model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", device_map='sequential', load_in_8bit=True)
     @torch.no_grad()
-    def inference(self, image: Union[np.ndarray, Image.Image, str], filter=False):
         image = load_image(image, return_type="pil")
         if not self.dialogue:
-            text_prompt = 'The image shows'
-            inputs = self.processor(image, text = text_prompt, return_tensors="pt").to(self.device, self.torch_dtype)
-            out = self.model.generate(**inputs, max_new_tokens=50)
-            captions = self.processor.decode(out[0], skip_special_tokens=True).strip()
             if self.enable_filter and filter:
-                captions = self.filter_caption(image, captions)
-            print(f"\nProcessed ImageCaptioning by BLIP2Captioner, Output Text: {captions}")
-            return captions
         else:
             context = []
             template = "Question: {} Answer: {}."
@@ -44,8 +57,8 @@ class BLIP2Captioner(BaseCaptioner):
                 out = self.model.generate(**inputs, max_new_tokens=50)
                 captions = self.processor.decode(out[0], skip_special_tokens=True).strip()
                 context.append((input_texts, captions))
-        return captions
 if __name__ == '__main__':

             self.model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", device_map='sequential', load_in_8bit=True)
     @torch.no_grad()
+    def inference(self,
+                  image: Union[np.ndarray, Image.Image, str],
+                  filter=False,
+                  args={}):
+        args['return_ppl'] = args.get('return_ppl', False)
+        args['text_prompt'] = args.get('text_prompt', 'Question: what does the image show? Answer:')
+        args['reference_caption'] = args.get('reference_caption', [])
         image = load_image(image, return_type="pil")
+        result = {}
         if not self.dialogue:
+            inputs = self.processor(image, text = args['text_prompt'], return_tensors="pt").to(self.device, self.torch_dtype)
+            out = self.model.generate(**inputs, return_dict_in_generate=True, output_scores=True, max_new_tokens=50)
+            captions = self.processor.batch_decode(out.sequences, skip_special_tokens=True)
+            caption = [caption.strip() for caption in captions][0]
             if self.enable_filter and filter:
+                print('reference caption: {}, caption: {}'.format(args['reference_caption'], caption))
+                clip_score = self.filter_caption(image, caption, args['reference_caption'])
+                result['clip_score'] = clip_score
+            if args['return_ppl']:
+                ppl_score = torch.stack(out.scores, dim=1).softmax(dim=2).log().max(dim=2)[0].sum(dim=1)[0]
+                result['ppl_score'] = ppl_score.item()
+            print(f"\nProcessed ImageCaptioning by BLIP2Captioner, Output Text: {caption}")
+            result['caption'] = caption
+            return result
         else:
             context = []
             template = "Question: {} Answer: {}."
                 out = self.model.generate(**inputs, max_new_tokens=50)
                 captions = self.processor.decode(out[0], skip_special_tokens=True).strip()
                 context.append((input_texts, captions))
+                result['caption'] = captions
+            return result
 if __name__ == '__main__':

caption_anything/captioner/git.py CHANGED Viewed

@@ -19,19 +19,24 @@ class GITCaptioner(BaseCaptioner):
         self.model = GitForCausalLM.from_pretrained("microsoft/git-large", torch_dtype=self.torch_dtype).to(self.device)
     @torch.no_grad()
-    def inference(self, image: Union[np.ndarray, Image.Image, str], filter=False):
         image = load_image(image, return_type="pil")
         pixel_values = self.processor(images=image, return_tensors="pt").pixel_values.to(self.device, self.torch_dtype)
         generated_ids = self.model.generate(pixel_values=pixel_values, max_new_tokens=50)
-        generated_caption = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
         if self.enable_filter and filter:
-            captions = self.filter_caption(image, captions)
-        print(f"\nProcessed ImageCaptioning by GITCaptioner, Output Text: {generated_caption}")
-        return generated_caption
     @torch.no_grad()
     def inference_with_reduced_tokens(self, image: Union[np.ndarray, Image.Image, str], seg_mask, crop_mode="w_bg",
                                       filter=False, disable_regular_box=False):
         crop_save_path = self.generate_seg_cropped_image(image=image, seg_mask=seg_mask, crop_mode=crop_mode,
                                                          disable_regular_box=disable_regular_box)
         image = load_image(image, return_type="pil")
@@ -46,9 +51,11 @@ class GITCaptioner(BaseCaptioner):
         out = self.model.generate(pixel_values=pixel_values, pixel_masks=pixel_masks, max_new_tokens=50)
         captions = self.processor.decode(out[0], skip_special_tokens=True).strip()
         if self.enable_filter and filter:
-            captions = self.filter_caption(image, captions)
         print(f"\nProcessed ImageCaptioning by BLIPCaptioner, Output Text: {captions}")
-        return captions, crop_save_path
 if __name__ == '__main__':

         self.model = GitForCausalLM.from_pretrained("microsoft/git-large", torch_dtype=self.torch_dtype).to(self.device)
     @torch.no_grad()
     def inference(self, image: Union[np.ndarray, Image.Image, str], filter=False, args=None):
         """Caption ``image`` with the GIT model.

         Args:
             image: Anything accepted by ``load_image``.
             filter: When true (and the captioner was built with
                 ``enable_filter``), also compute a CLIP score for the caption.
             args: Accepted for interface compatibility with the other
                 captioners; not used by this implementation.

         Returns:
             A dict with key ``'caption'`` and, when filtering ran,
             ``'clip_score'``.
         """
         image = load_image(image, return_type="pil")
         pixel_values = self.processor(images=image, return_tensors="pt").pixel_values.to(self.device, self.torch_dtype)
         generated_ids = self.model.generate(pixel_values=pixel_values, max_new_tokens=50)
         captions = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

         result = {'caption': captions}
         if self.enable_filter and filter:
             result['clip_score'] = self.filter_caption(image, captions)
         print(f"\nProcessed ImageCaptioning by GITCaptioner, Output Text: {captions}")
         # Return the full result; returning only the caption would silently
         # drop the clip_score computed above.
         return result
     @torch.no_grad()
     def inference_with_reduced_tokens(self, image: Union[np.ndarray, Image.Image, str], seg_mask, crop_mode="w_bg",
                                       filter=False, disable_regular_box=False):
+        result = {}
         crop_save_path = self.generate_seg_cropped_image(image=image, seg_mask=seg_mask, crop_mode=crop_mode,
                                                          disable_regular_box=disable_regular_box)
         image = load_image(image, return_type="pil")
         out = self.model.generate(pixel_values=pixel_values, pixel_masks=pixel_masks, max_new_tokens=50)
         captions = self.processor.decode(out[0], skip_special_tokens=True).strip()
         if self.enable_filter and filter:
+            clip_score = self.filter_caption(image, captions)
+            result['clip_score'] = clip_score
         print(f"\nProcessed ImageCaptioning by BLIPCaptioner, Output Text: {captions}")
+        result.update({'caption':captions, 'crop_save_path':crop_save_path})
+        return result
 if __name__ == '__main__':

caption_anything/model.py CHANGED Viewed

@@ -5,24 +5,33 @@ import time
 from PIL import Image
 import cv2
 import numpy as np
 from caption_anything.captioner import build_captioner, BaseCaptioner
-from caption_anything.segmenter import build_segmenter
 from caption_anything.text_refiner import build_text_refiner
 class CaptionAnything:
     def __init__(self, args, api_key="", captioner=None, segmenter=None, text_refiner=None):
         self.args = args
         self.captioner = build_captioner(args.captioner, args.device, args) if captioner is None else captioner
         self.segmenter = build_segmenter(args.segmenter, args.device, args) if segmenter is None else segmenter
         self.text_refiner = None
         if not args.disable_gpt:
             if text_refiner is not None:
                 self.text_refiner = text_refiner
-            else:
                 self.init_refiner(api_key)
     @property
     def image_embedding(self):
         return self.segmenter.image_embedding
@@ -61,65 +70,195 @@ class CaptionAnything:
             self.text_refiner = None
             print('OpenAI GPT is not available')
-    def inference(self, image, prompt, controls, disable_gpt=False, enable_wiki=False):
-        # TODO: Add support to multiple seg masks.
         #  segment with prompt
         print("CA prompt: ", prompt, "CA controls", controls)
-        seg_mask = self.segmenter.inference(image, prompt)[0, ...]
-        if self.args.enable_morphologyex:
-            seg_mask = 255 * seg_mask.astype(np.uint8)
-            seg_mask = np.stack([seg_mask, seg_mask, seg_mask], axis=-1)
-            seg_mask = cv2.morphologyEx(seg_mask, cv2.MORPH_OPEN, kernel=np.ones((6, 6), np.uint8))
-            seg_mask = cv2.morphologyEx(seg_mask, cv2.MORPH_CLOSE, kernel=np.ones((6, 6), np.uint8))
-            seg_mask = seg_mask[:, :, 0] > 0
-        mask_save_path = f'result/mask_{time.time()}.png'
-        if not os.path.exists(os.path.dirname(mask_save_path)):
-            os.makedirs(os.path.dirname(mask_save_path))
-        seg_mask_img = Image.fromarray(seg_mask.astype('int') * 255.)
-        if seg_mask_img.mode != 'RGB':
-            seg_mask_img = seg_mask_img.convert('RGB')
-        seg_mask_img.save(mask_save_path)
-        print('seg_mask path: ', mask_save_path)
-        print("seg_mask.shape: ", seg_mask.shape)
-        #  captioning with mask
-        if self.args.enable_reduce_tokens:
-            caption, crop_save_path = self.captioner. \
-                inference_with_reduced_tokens(image, seg_mask,
-                                              crop_mode=self.args.seg_crop_mode,
-                                              filter=self.args.clip_filter,
-                                              disable_regular_box=self.args.disable_regular_box)
-        else:
-            caption, crop_save_path = self.captioner. \
-                inference_seg(image, seg_mask, crop_mode=self.args.seg_crop_mode,
-                              filter=self.args.clip_filter,
-                              disable_regular_box=self.args.disable_regular_box)
-        #  refining with TextRefiner
-        context_captions = []
-        if self.args.context_captions:
-            context_captions.append(self.captioner.inference(image))
-        if not disable_gpt and self.text_refiner is not None:
-            refined_caption = self.text_refiner.inference(query=caption, controls=controls, context=context_captions,
-                                                          enable_wiki=enable_wiki)
         else:
-            refined_caption = {'raw_caption': caption}
-        out = {'generated_captions': refined_caption,
-               'crop_save_path': crop_save_path,
-               'mask_save_path': mask_save_path,
-               'mask': seg_mask_img,
-               'context_captions': context_captions}
-        return out
 if __name__ == "__main__":
     from caption_anything.utils.parser import parse_augment
     args = parse_augment()
-    # image_path = 'test_images/img3.jpg'
-    image_path = 'test_images/img1.jpg'
     prompts = [
         {
             "prompt_type": ["click"],
@@ -127,12 +266,12 @@ if __name__ == "__main__":
             "input_label": [1, 0],
             "multimask_output": "True",
         },
-        {
-            "prompt_type": ["click"],
-            "input_point": [[300, 800]],
-            "input_label": [1],
-            "multimask_output": "True",
-        }
     ]
     controls = {
         "length": "30",
@@ -143,11 +282,11 @@ if __name__ == "__main__":
     }
     model = CaptionAnything(args, os.environ['OPENAI_API_KEY'])
-    for prompt in prompts:
-        print('*' * 30)
-        print('Image path: ', image_path)
-        image = Image.open(image_path)
-        print(image)
-        print('Visual controls (SAM prompt):\n', prompt)
-        print('Language controls:\n', controls)
-        out = model.inference(image_path, prompt, controls)

 from PIL import Image
 import cv2
 import numpy as np
+from PIL import Image
+import easyocr
+import copy
 from caption_anything.captioner import build_captioner, BaseCaptioner
+from caption_anything.segmenter import build_segmenter, build_segmenter_densecap
 from caption_anything.text_refiner import build_text_refiner
+from caption_anything.utils.utils import prepare_segmenter, seg_model_map, load_image, get_image_shape
+from caption_anything.utils.utils import mask_painter_foreground_all, mask_painter, xywh_to_x1y1x2y2, image_resize
+from caption_anything.utils.densecap_painter import draw_bbox
 class CaptionAnything:
     def __init__(self, args, api_key="", captioner=None, segmenter=None, text_refiner=None):
         """Assemble the captioner, segmenters, OCR reader and optional text refiner.

         Pre-built ``captioner`` / ``segmenter`` / ``text_refiner`` objects are
         used as-is when supplied; otherwise the captioner and segmenter are
         built from ``args``. A text refiner is only set up when GPT is not
         disabled in ``args`` and either a refiner or a non-empty ``api_key``
         is provided.
         """
         self.args = args

         if captioner is None:
             captioner = build_captioner(args.captioner, args.device, args)
         self.captioner = captioner

         if segmenter is None:
             segmenter = build_segmenter(args.segmenter, args.device, args)
         self.segmenter = segmenter
         # The dense-caption segmenter is built around the same underlying model object.
         self.segmenter_densecap = build_segmenter_densecap(args.segmenter, args.device, args, model=segmenter.model)

         self.lang = ["ch_tra", "en"]
         self.reader = easyocr.Reader(self.lang)

         self.text_refiner = None
         gpt_enabled = not args.disable_gpt
         if gpt_enabled and text_refiner is not None:
             self.text_refiner = text_refiner
         elif gpt_enabled and api_key != "":
             self.init_refiner(api_key)

         self.require_caption_prompt = args.captioner == 'blip2'
     @property
     def image_embedding(self):
         """Expose the ``image_embedding`` attribute of the underlying segmenter."""
         return self.segmenter.image_embedding
             self.text_refiner = None
             print('OpenAI GPT is not available')
+    def inference(self, image, prompt, controls, disable_gpt=False, enable_wiki=False, verbose=False, is_densecap=False, args={}):
         #  segment with prompt
         print("CA prompt: ", prompt, "CA controls", controls)
+        is_seg_everything = 'everything' in prompt['prompt_type']
+        args['seg_crop_mode'] = args.get('seg_crop_mode', self.args.seg_crop_mode)
+        args['clip_filter'] = args.get('clip_filter', self.args.clip_filter)
+        args['disable_regular_box'] = args.get('disable_regular_box', self.args.disable_regular_box)
+        args['context_captions'] = args.get('context_captions', self.args.context_captions)
+        args['enable_reduce_tokens'] = args.get('enable_reduce_tokens', self.args.enable_reduce_tokens)
+        args['enable_morphologyex'] = args.get('enable_morphologyex', self.args.enable_morphologyex)
+        args['topN'] = args.get('topN', 10) if is_seg_everything else 1
+        args['min_mask_area'] = args.get('min_mask_area', 0)
+        if not is_densecap:
+            seg_results = self.segmenter.inference(image, prompt)
         else:
+            seg_results = self.segmenter_densecap.inference(image, prompt)
+        seg_masks, seg_bbox, seg_area = seg_results if is_seg_everything else (seg_results, None, None)
+        if args['topN'] > 1: # sort by area
+            samples = list(zip(*[seg_masks, seg_bbox, seg_area]))
+            # top_samples = sorted(samples, key=lambda x: x[2], reverse=True)
+            # seg_masks, seg_bbox, seg_area = list(zip(*top_samples))
+            samples = list(filter(lambda x: x[2] > args['min_mask_area'], samples))
+            samples = samples[:args['topN']]
+            seg_masks, seg_bbox, seg_area = list(zip(*samples))
+        out_list = []
+        for i, seg_mask in enumerate(seg_masks):
+            if args['enable_morphologyex']:
+                seg_mask = 255 * seg_mask.astype(np.uint8)
+                seg_mask = np.stack([seg_mask, seg_mask, seg_mask], axis=-1)
+                seg_mask = cv2.morphologyEx(seg_mask, cv2.MORPH_OPEN, kernel=np.ones((6, 6), np.uint8))
+                seg_mask = cv2.morphologyEx(seg_mask, cv2.MORPH_CLOSE, kernel=np.ones((6, 6), np.uint8))
+                seg_mask = seg_mask[:, :, 0] > 0
+            seg_mask_img = Image.fromarray(seg_mask.astype('int') * 255.)
+            mask_save_path = None
+            if verbose:
+                mask_save_path = f'result/mask_{time.time()}.png'
+                if not os.path.exists(os.path.dirname(mask_save_path)):
+                    os.makedirs(os.path.dirname(mask_save_path))
+                if seg_mask_img.mode != 'RGB':
+                    seg_mask_img = seg_mask_img.convert('RGB')
+                seg_mask_img.save(mask_save_path)
+                print('seg_mask path: ', mask_save_path)
+                print("seg_mask.shape: ", seg_mask.shape)
+            #  captioning with mask
+            if args['enable_reduce_tokens']:
+                result = self.captioner.inference_with_reduced_tokens(image, seg_mask,
+                                                  crop_mode=args['seg_crop_mode'],
+                                                  filter=args['clip_filter'],
+                                                  disable_regular_box=args['disable_regular_box'],
+                                                  verbose=verbose,
+                                                  caption_args=args)
+            else:
+                result = self.captioner.inference_seg(image, seg_mask,
+                                  crop_mode=args['seg_crop_mode'],
+                                  filter=args['clip_filter'],
+                                  disable_regular_box=args['disable_regular_box'],
+                                  verbose=verbose,
+                                  caption_args=args)
+            caption = result.get('caption', None)
+            crop_save_path = result.get('crop_save_path', None)
+            #  refining with TextRefiner
+            context_captions = []
+            if args['context_captions']:
+                context_captions.append(self.captioner.inference(image)['caption'])
+            if not disable_gpt and self.text_refiner is not None:
+                refined_caption = self.text_refiner.inference(query=caption, controls=controls, context=context_captions,
+                                                            enable_wiki=enable_wiki)
+            else:
+                refined_caption = {'raw_caption': caption}
+            out = {'generated_captions': refined_caption,
+                'crop_save_path': crop_save_path,
+                'mask_save_path': mask_save_path,
+                'mask': seg_mask_img,
+                'bbox': seg_bbox[i] if seg_bbox is not None else None,
+                'area': seg_area[i] if seg_area is not None else None,
+                'context_captions': context_captions,
+                'ppl_score': result.get('ppl_score', -100.),
+                'clip_score': result.get('clip_score', 0.)
+                }
+            out_list.append(out)
+        return out_list
+    def parse_dense_caption(self, image, topN=10, reference_caption=[], verbose=False):
+        width, height = get_image_shape(image)
+        prompt = {'prompt_type': ['everything']}
+        densecap_args = {
+            'return_ppl': True,
+            'clip_filter': True,
+            'reference_caption': reference_caption,
+            'text_prompt': "", # 'Question: what does the image show? Answer:'
+            'seg_crop_mode': 'w_bg',
+            # 'text_prompt': "",
+            # 'seg_crop_mode': 'wo_bg',
+            'disable_regular_box': False,
+            'topN': topN,
+            'min_ppl_score': -1.8,
+            'min_clip_score': 0.30,
+            'min_mask_area': 2500,
+            }
+        dense_captions = self.inference(image, prompt,
+                                        controls=None,
+                                        disable_gpt=True,
+                                        verbose=verbose,
+                                        is_densecap=True,
+                                        args=densecap_args)
+        print('Process Dense Captioning: \n', dense_captions)
+        dense_captions = list(filter(lambda x: x['ppl_score'] / (1+len(x['generated_captions']['raw_caption'].split())) >= densecap_args['min_ppl_score'], dense_captions))
+        dense_captions = list(filter(lambda x: x['clip_score'] >= densecap_args['min_clip_score'], dense_captions))
+        dense_cap_prompt = []
+        for cap in dense_captions:
+            x, y, w, h = cap['bbox']
+            cx, cy = x + w/2, (y + h/2)
+            dense_cap_prompt.append("({}: X:{:.0f}, Y:{:.0f}, Width:{:.0f}, Height:{:.0f})".format(cap['generated_captions']['raw_caption'], cx, cy, w, h))
+        if verbose:
+            all_masks = [np.array(item['mask'].convert('P')) for item in dense_captions]
+            new_image = mask_painter_foreground_all(np.array(image), all_masks, background_alpha=0.4)
+            save_path = 'result/dense_caption_mask.png'
+            Image.fromarray(new_image).save(save_path)
+            print(f'Dense captioning mask saved in {save_path}')
+            vis_path = 'result/dense_caption_vis_{}.png'.format(time.time())
+            dense_cap_painter_input = [{'bbox': xywh_to_x1y1x2y2(cap['bbox']),
+                                        'caption': cap['generated_captions']['raw_caption']} for cap in dense_captions]
+            draw_bbox(load_image(image, return_type='numpy'), vis_path, dense_cap_painter_input, show_caption=True)
+            print(f'Dense Captioning visualization saved in {vis_path}')
+        return ','.join(dense_cap_prompt)
+    def parse_ocr(self, image, thres=0.2):
+        width, height = get_image_shape(image)
+        image = load_image(image, return_type='numpy')
+        bounds = self.reader.readtext(image)
+        bounds = [bound for bound in bounds if bound[2] > thres]
+        print('Process OCR Text:\n', bounds)
+        ocr_prompt = []
+        for box, text, conf in bounds:
+            p0, p1, p2, p3 = box
+            ocr_prompt.append('(\"{}\": X:{:.0f}, Y:{:.0f})'.format(text, (p0[0]+p1[0]+p2[0]+p3[0])/4, (p0[1]+p1[1]+p2[1]+p3[1])/4))
+        ocr_prompt = '\n'.join(ocr_prompt)
+        # ocr_prompt = self.text_refiner.llm(f'The image have some scene texts with their locations: {ocr_prompt}. Please group these individual words into one or several phrase based on their relative positions (only give me your answer, do not show explanination)').strip()
+        # ocr_prefix1 = f'The image have some scene texts with their locations: {ocr_prompt}. Please group these individual words into one or several phrase based on their relative positions (only give me your answer, do not show explanination)'
+        # ocr_prefix2 = f'Please group these individual words into 1-3 phrases, given scene texts with their locations: {ocr_prompt}. You return is one or several strings and infer their locations. (only give me your answer like (“man working”, X: value, Y: value), do not show explanination)'
+        # ocr_prefix4 = f'summarize the individual scene text words detected by OCR tools into a fluent sentence based on their positions and distances. You should strictly describe all of the given scene text words. Do not miss any given word. Do not create non-exist words. Do not appear numeric positions. The individual words are given:\n{ocr_prompt}\n'
+        # ocr_prefix3 = f'combine the individual scene text words detected by OCR tools into one/several fluent phrases/sentences based on their positions and distances. You should strictly copy or correct all of the given scene text words. Do not miss any given word. Do not create non-exist words. The response is several strings seperate with their location (X, Y), each of which represents a phrase. The individual words are given:\n{ocr_prompt}\n'
+        # response = self.text_refiner.llm(ocr_prefix3).strip() if len(ocr_prompt) else ""
+        return ocr_prompt
+    def inference_cap_everything(self, image, verbose=False):
+        image = load_image(image, return_type='pil')
+        image = image_resize(image, res=1024)
+        width, height = get_image_shape(image)
+        other_args = {'text_prompt': ""} if self.require_caption_prompt else {}
+        img_caption = self.captioner.inference(image, filter=False, args=other_args)['caption']
+        dense_caption_prompt = self.parse_dense_caption(image, topN=10, verbose=verbose, reference_caption=[])
+        scene_text_prompt = self.parse_ocr(image, thres=0.2)
+        # scene_text_prompt = "N/A"
+        # the summarize_prompt is modified from https://github.com/JialianW/GRiT and https://github.com/showlab/Image2Paragraph
+        summarize_prompt = "Imagine you are a blind but intelligent image captioner. You should generate a descriptive, coherent and human-like paragraph based on the given information (a,b,c,d) instead of imagination:\na) Image Resolution: {image_size}\nb) Image Caption:{image_caption}\nc) Dense Caption: {dense_caption}\nd) Scene Text: {scene_text}\nThere are some rules for your response: Show objects with their attributes (e.g. position, color, size, shape, texture).\nPrimarily describe common objects with large size.\nProvide context of the image.\nShow relative position between objects.\nLess than 6 sentences.\nDo not appear number.\nDo not describe any individual letter.\nDo not show the image resolution.\nIngore the white background."
+        prompt = summarize_prompt.format(**{
+            "image_size": "width {} height {}".format(width, height),
+            "image_caption":img_caption,
+            "dense_caption": dense_caption_prompt,
+            "scene_text": scene_text_prompt})
+        print(f'caption everything prompt: {prompt}')
+        response = self.text_refiner.llm(prompt).strip()
+        # chinese_response = self.text_refiner.llm('Translate it into Chinese: {}'.format(response)).strip()
+        return response
 if __name__ == "__main__":
     from caption_anything.utils.parser import parse_augment
     args = parse_augment()
+    image_path = 'image/ocr/Untitled.png'
+    image = Image.open(image_path)
     prompts = [
         {
             "prompt_type": ["click"],
             "input_label": [1, 0],
             "multimask_output": "True",
         },
+        # {
+        #     "prompt_type": ["click"],
+        #     "input_point": [[300, 800]],
+        #     "input_label": [1],
+        #     "multimask_output": "True",
+        # }
     ]
     controls = {
         "length": "30",
     }
     model = CaptionAnything(args, os.environ['OPENAI_API_KEY'])
+    img_dir = 'test_images/memes'
+    for image_file in os.listdir(img_dir):
+        image_path = os.path.join(img_dir, image_file)
+        print('image_path:', image_path)
+        paragraph = model.inference_cap_everything(image_path, verbose=True)
+        print('Caption Everything:\n', paragraph)
+        ocr = model.parse_ocr(image_path)
+        print('OCR', ocr)

caption_anything/segmenter/__init__.py CHANGED Viewed

@@ -1,5 +1,14 @@
 from .base_segmenter import BaseSegmenter
 from caption_anything.utils.utils import seg_model_map
-def build_segmenter(model_name, device, args=None, model=None):
-        return BaseSegmenter(device, args.segmenter_checkpoint, model_name, reuse_feature=not args.disable_reuse_features, model=model)

 from .base_segmenter import BaseSegmenter
 from caption_anything.utils.utils import seg_model_map
+import copy
+def build_segmenter(model_name, device, args, model=None):
+    """Build the default segmenter from the parsed command-line arguments.
+
+    The checkpoint path comes from ``args.segmenter_checkpoint`` and feature
+    reuse is enabled unless ``args.disable_reuse_features`` is set. ``model``
+    and ``args`` are forwarded to ``BaseSegmenter`` unchanged.
+    """
+    checkpoint = args.segmenter_checkpoint
+    reuse_feature = not args.disable_reuse_features
+    return BaseSegmenter(
+        device,
+        checkpoint,
+        model_name,
+        reuse_feature=reuse_feature,
+        model=model,
+        args=args,
+    )
+def build_segmenter_densecap(model_name, device, args, model=None):
+    """Build a segmenter configured for dense captioning.
+
+    Works on a deep copy of ``args`` so the caller's namespace is untouched,
+    overrides the mask-generator settings on that copy, and hands the
+    overridden copy to ``BaseSegmenter``.
+    """
+    args_for_densecap = copy.deepcopy(args)
+    args_for_densecap.pred_iou_thresh = 0.88
+    args_for_densecap.min_mask_region_area = 400
+    args_for_densecap.stability_score_thresh = 0.95
+    args_for_densecap.box_nms_thresh = 0.3
+    # Pass the overridden copy rather than the original ``args``; otherwise
+    # the dense-caption settings above never reach the segmenter.
+    return BaseSegmenter(
+        device,
+        args_for_densecap.segmenter_checkpoint,
+        model_name,
+        reuse_feature=not args_for_densecap.disable_reuse_features,
+        model=model,
+        args=args_for_densecap,
+    )

caption_anything/segmenter/base_segmenter.py CHANGED Viewed

@@ -11,7 +11,7 @@ import PIL
 class BaseSegmenter:
-    def __init__(self, device, checkpoint, model_name='huge', reuse_feature=True, model=None):
         print(f"Initializing BaseSegmenter to {device}")
         self.device = device
         self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
@@ -26,7 +26,10 @@ class BaseSegmenter:
             self.model = model
         self.reuse_feature = reuse_feature
         self.predictor = SamPredictor(self.model)
-        self.mask_generator = SamAutomaticMaskGenerator(self.model)
         self.image_embedding = None
         self.image = None
@@ -69,7 +72,9 @@ class BaseSegmenter:
         if 'everything' in control['prompt_type']:
             masks = self.mask_generator.generate(image)
             new_masks = np.concatenate([mask["segmentation"][np.newaxis, :] for mask in masks])
-            return new_masks
         else:
             if not self.reuse_feature or self.image_embedding is None:
                 self.set_image(image)

 class BaseSegmenter:
+    def __init__(self, device, checkpoint, model_name='huge', reuse_feature=True, model=None, args=None):
         print(f"Initializing BaseSegmenter to {device}")
         self.device = device
         self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
             self.model = model
         self.reuse_feature = reuse_feature
         self.predictor = SamPredictor(self.model)
+        sam_generator_keys = ['pred_iou_thresh', 'min_mask_region_area', 'stability_score_thresh', 'box_nms_thresh']
+        generator_args = {k:v for k,v in vars(args).items() if k in sam_generator_keys}
+        self.mask_generator = SamAutomaticMaskGenerator(model=self.model, **generator_args)
         self.image_embedding = None
         self.image = None
         if 'everything' in control['prompt_type']:
             masks = self.mask_generator.generate(image)
             new_masks = np.concatenate([mask["segmentation"][np.newaxis, :] for mask in masks])
+            bbox = np.array([mask["bbox"] for mask in masks])
+            area = np.array([mask["area"] for mask in masks])
+            return new_masks, bbox, area
         else:
             if not self.reuse_feature or self.image_embedding is None:
                 self.set_image(image)

caption_anything/utils/densecap_painter.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import cv2
+import json
+import numpy as np
+from typing import List
+import random
+from typing import Union
+def draw_bbox(img: Union[np.ndarray, str], save_name: str, bbox: List[dict], show_caption: bool = False):
+    """Draw bounding boxes (and optionally captions) on an image and save it.
+
+    Args:
+        img: image array, or a path that is read with ``cv2.imread``.
+        save_name: output path handed to ``cv2.imwrite``.
+        bbox: [{'image_id': str, 'bbox': [x1, y1, x2, y2], 'caption': str}, ...]
+        show_caption: when True, write each caption at the box's top-left corner.
+
+    Raises:
+        FileNotFoundError: if ``img`` is a path that ``cv2.imread`` cannot read.
+
+    Neither the input array nor the ``bbox`` dicts are modified.
+    """
+    if isinstance(img, str):
+        img_path = img
+        img = cv2.imread(img_path)
+        if img is None:
+            # cv2.imread reports failure by returning None instead of raising.
+            raise FileNotFoundError('cannot read image: {}'.format(img_path))
+    else:
+        # Draw on a copy so repeated calls on one array do not accumulate boxes.
+        img = img.copy()
+    RGB = [0, 50, 100, 150, 200, 250]
+    font = cv2.FONT_HERSHEY_SIMPLEX
+    for box in bbox:
+        # Convert to ints locally instead of rewriting the caller's dict.
+        x1, y1, x2, y2 = (int(v) for v in box['bbox'])
+        box_color = random.choices(RGB, k=3)
+        cv2.rectangle(img, (x1, y1), (x2, y2), color=box_color, thickness=2)
+        if show_caption:
+            # The caption is only read and measured when it is actually drawn.
+            caption = box['caption']
+            (_, text_height), _ = cv2.getTextSize(caption, font, fontScale=0.5, thickness=2)
+            cv2.putText(img, caption, (x1, y1 + text_height), font, fontScale=0.5, color=box_color, thickness=2)
+    cv2.imwrite(save_name, img)
+def parse_bbox(anno, image_id: int = None):
+    """Load a predictions JSON file and return the entry for one image.
+
+    When ``image_id`` is None, the first key found in the file is used.
+    """
+    with open(anno, 'r') as handle:
+        all_predictions = json.load(handle)
+    # Fall back to the first key in the file when no id was requested.
+    key = image_id if image_id is not None else next(iter(all_predictions))
+    return all_predictions[key]
+def gt_bbox(anno, img_name: Union[int, str] = None):
+    """Collect the ground-truth boxes of one image from an annotation file.
+
+    Args:
+        anno: path to a JSON file with an ``annotations`` list whose items
+            carry ``image_id``, ``bbox`` ([x, y, w, h]) and ``caption``.
+        img_name: numeric image id, or a file name such as ``'63.jpg'`` whose
+            stem is the numeric image id.
+
+    Returns:
+        A list of ``{'bbox': [x1, y1, x2, y2], 'caption': ...}`` dicts.
+
+    Raises:
+        TypeError: if ``img_name`` is not given.
+    """
+    if img_name is None:
+        raise TypeError('gt_bbox() requires img_name (an image id or file name)')
+    if isinstance(img_name, str):
+        # Drop the file extension: '63.jpg' -> 63.
+        image_id = int(img_name.rsplit('.', 1)[0])
+    else:
+        image_id = int(img_name)
+    with open(anno, 'r') as f:
+        annotations = json.load(f)['annotations']
+    gt = []
+    for annotation in annotations:
+        # Match the requested image rather than a hard-coded id.
+        if annotation['image_id'] != image_id:
+            continue
+        x1, y1, w, h = annotation['bbox']
+        gt.append({'bbox': [x1, y1, x1 + w, y1 + h], 'caption': annotation['caption']})
+    return gt
+if __name__ == '__main__':
+    # Ad-hoc visual check: write ground-truth and predicted boxes for one image
+    # to GT.jpg and Pred.jpg.
+    img_name = '63.jpg'
+    show_caption = True
+    anno = 'vg_dense_captioning_blip2_top48_0.88_1000_0.96_debugTrue_predictions_shard_all.json'
+    img = cv2.imread(img_name)
+    examp_bbox = parse_bbox(anno)
+    ground_truth_bbox = gt_bbox('test.json', img_name)
+    # Give each call its own copy of the image so the two output files never
+    # share drawn boxes.
+    draw_bbox(img.copy(), 'GT.jpg', ground_truth_bbox, show_caption)
+    draw_bbox(img.copy(), 'Pred.jpg', examp_bbox, show_caption)

caption_anything/utils/parser.py CHANGED Viewed

@@ -22,6 +22,12 @@ def parse_augment():
     parser.add_argument('--disable_reuse_features', action="store_true", default=False)
     parser.add_argument('--enable_morphologyex', action="store_true", default=False)
     parser.add_argument('--chat_tools_dict', type=str, default='VisualQuestionAnswering_cuda:0', help='Visual ChatGPT tools, only useful when running gradio applications')
     args = parser.parse_args()
     if args.debug:

     parser.add_argument('--disable_reuse_features', action="store_true", default=False)
     parser.add_argument('--enable_morphologyex', action="store_true", default=False)
     parser.add_argument('--chat_tools_dict', type=str, default='VisualQuestionAnswering_cuda:0', help='Visual ChatGPT tools, only useful when running gradio applications')
+    parser.add_argument('--pred_iou_thresh', type=float, default=0.88, help="sam post-precessing")
+    parser.add_argument('--min_mask_region_area', type=int, default=0, help="sam post-precessing")
+    parser.add_argument('--stability_score_thresh', type=float, default=0.95, help='sam post-processing')
+    parser.add_argument('--box_nms_thresh', type=float, default=0.7, help='sam post-processing')
     args = parser.parse_args()
     if args.debug:

caption_anything/utils/utils.py CHANGED Viewed

@@ -29,6 +29,9 @@ def load_image(image: Union[np.ndarray, Image.Image, str], return_type='numpy'):
     elif isinstance(image, np.ndarray):
         image = Image.fromarray(image)
     if return_type == 'pil':
         return image
     elif return_type == 'numpy':
@@ -37,6 +40,34 @@ def load_image(image: Union[np.ndarray, Image.Image, str], return_type='numpy'):
         raise NotImplementedError()
 def is_platform_win():
     return sys.platform == "win32"

     elif isinstance(image, np.ndarray):
         image = Image.fromarray(image)
+    if image.mode == "RGBA":
+        image = image.convert("RGB")
     if return_type == 'pil':
         return image
     elif return_type == 'numpy':
         raise NotImplementedError()
+def image_resize(image: Image.Image, res=1024):
+    """Shrink ``image`` so that its longer side is at most ``res`` pixels.
+
+    The aspect ratio is kept. Images that already fit are returned unchanged;
+    images are never enlarged.
+    """
+    org_size = image.size
+    width, height = org_size
+    longest_side = max(width, height)
+    ratio = min(1.0 * res / longest_side, 1.0)
+    if ratio >= 1.0:
+        # Already within the limit: nothing to do.
+        return image
+    image = image.resize((int(width * ratio), int(height * ratio)))
+    print('Scaling image from {} to {}'.format(org_size, image.size))
+    return image
+def xywh_to_x1y1x2y2(bbox):
+    """Convert an (x, y, w, h) box into an (x1, y1, x2, y2) tuple."""
+    left, top, box_w, box_h = bbox
+    right = left + box_w
+    bottom = top + box_h
+    return left, top, right, bottom
+def x1y1x2y2_to_xywh(bbox):
+    """Convert an (x1, y1, x2, y2) box into an (x, y, w, h) tuple."""
+    left, top, right, bottom = bbox
+    box_w = right - left
+    box_h = bottom - top
+    return left, top, box_w, box_h
+def get_image_shape(image):
+    """Return the size/shape of an image given as a path, array or PIL image.
+
+    Paths and PIL images yield ``Image.size``; numpy arrays yield
+    ``ndarray.shape``.
+
+    NOTE(review): PIL documents ``size`` as (width, height) while an array
+    shape is usually (height, width[, channels]) -- confirm that callers
+    handle both layouts.
+
+    Raises:
+        NotImplementedError: for any other input type.
+    """
+    if isinstance(image, str):
+        # Close the file handle as soon as the size has been read.
+        with Image.open(image) as opened:
+            return opened.size
+    if isinstance(image, np.ndarray):
+        return image.shape
+    if isinstance(image, Image.Image):
+        return image.size
+    raise NotImplementedError
 def is_platform_win():
     return sys.platform == "win32"

requirements.txt CHANGED Viewed

@@ -17,4 +17,7 @@ onnxruntime
 onnx
 https://gradio-builds.s3.amazonaws.com/3e68e5e882a6790ac5b457bd33f4edf9b695af90/gradio-3.24.1-py3-none-any.whl
 accelerate
-bitsandbytes

 onnx
 https://gradio-builds.s3.amazonaws.com/3e68e5e882a6790ac5b457bd33f4edf9b695af90/gradio-3.24.1-py3-none-any.whl
 accelerate
+bitsandbytes
+packaging~=23.1
+easyocr
+tensorboardX