Spaces:

Samarth991
/

CV-Agent

Sleeping

Samarth991 committed on Feb 23

Commit

428e149

1 Parent(s): d766b17

added image detection code to display predicted bboxes

Files changed (1) hide show

extract_tools.py CHANGED Viewed

@@ -61,9 +61,10 @@ def panoptic_image_segemntation(image_path:str)->str:
     labels = []
     for segment in prediction['segments_info']:
         label_names = maskformer_model.config.id2label[segment['label_id']]
-        print(label_names)
         labels.append(label_names)
-    return 'Panoptic Segmentation image {} created with labels {} '.format(save_mask_path,labels)
 @tool
 def image_description(img_path:str)->str:
@@ -88,16 +89,16 @@ def image_description(img_path:str)->str:
     output = caption_model.generate(**inputs, max_new_tokens=50)
     caption = processor.decode(output[0], skip_special_tokens=True)
-    # conditional image captioning
-    obj_text = "Total number of objects in image "
-    inputs_2 = processor(image, obj_text ,return_tensors ='pt').to(device)
-    out_2 = caption_model.generate(**inputs_2,max_new_tokens=50)
-    object_caption = processor.decode(out_2[0], skip_special_tokens=True)
     ## clear the GPU cache
     with torch.no_grad():
         torch.cuda.empty_cache()
-    text = caption + " ."+ object_caption+" ."
     return text
@@ -120,7 +121,7 @@ def generate_bounding_box_tool(input_data:str)->str:
     data = input_data.split(",")
     image_path = data[0]
     object_prompts = data[1:]
-    object_data = yolo_world_model.run_inference(image_path,object_prompts)
     return object_data
 @tool

     labels = []
     for segment in prediction['segments_info']:
         label_names = maskformer_model.config.id2label[segment['label_id']]
         labels.append(label_names)
+    labels = " ".join([label_name for label_name in labels])
+    return 'Panoptic Segmentation image {} Found labels {} in the image '.format(save_mask_path,labels)
 @tool
 def image_description(img_path:str)->str:
     output = caption_model.generate(**inputs, max_new_tokens=50)
     caption = processor.decode(output[0], skip_special_tokens=True)
+    # # conditional image captioning
+    # obj_text = "Total number of objects in image "
+    # inputs_2 = processor(image, obj_text ,return_tensors ='pt').to(device)
+    # out_2 = caption_model.generate(**inputs_2,max_new_tokens=50)
+    # object_caption = processor.decode(out_2[0], skip_special_tokens=True)
     ## clear the GPU cache
     with torch.no_grad():
         torch.cuda.empty_cache()
+    text = caption + " ."
     return text
     data = input_data.split(",")
     image_path = data[0]
     object_prompts = data[1:]
+    object_data = yolo_world_model.run_yolo_infer(image_path,object_prompts)
     return object_data
 @tool