Commit: updated

Files changed:
- app.py (+75 -43)
- models/clip/_clip/__init__.py (+0 -31)
- models/clip/_clip/prepare.py (+1 -8)
app.py
CHANGED
@@ -40,8 +40,6 @@ truncation = 4
 reduction = 8
 granularity = "fine"
 anchor_points = "average"
-
-model_name = "clip_vit_l_14"
 input_size = 224

 # Comment the lines below to test non-CLIP models.
@@ -50,8 +48,19 @@ num_vpt = 32
 vpt_drop = 0.
 deep_vpt = True

-
+repo_id = "Yiming-M/CLIP-EBC"
+model_configs = {
+    "CLIP_EBC_ViT_L_14": {
+        "model_name": "clip_vit_l_14",
+        "filename": "nwpu_weights/CLIP_EBC_ViT_L_14/model.safetensors",
+    },
+    "CLIP_EBC_ViT_B_16": {
+        "model_name": "clip_vit_b_16",
+        "filename": "nwpu_weights/CLIP_EBC_ViT_B_16/model.safetensors",
+    },
+}

+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

 if truncation is None: # regression, no truncation.
     bins, anchor_points = None, None
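A note on the new model_configs dict: each entry pairs a backbone name with a checkpoint path inside the Yiming-M/CLIP-EBC repo, and hf_hub_download resolves that pair to a local file. A minimal sketch using only values that appear in this diff (the printed path depends on the local Hugging Face cache):

from huggingface_hub import hf_hub_download

# Resolve one model_configs entry to a local checkpoint path.
repo_id = "Yiming-M/CLIP-EBC"
cfg = {
    "model_name": "clip_vit_b_16",
    "filename": "nwpu_weights/CLIP_EBC_ViT_B_16/model.safetensors",
}
weights_path = hf_hub_download(repo_id, cfg["filename"])
print(weights_path)  # local path inside the Hugging Face cache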
@@ -62,32 +71,48 @@ else:
     anchor_points = config["anchor_points"][granularity]["average"] if anchor_points == "average" else config["anchor_points"][granularity]["middle"]
     bins = [(float(b[0]), float(b[1])) for b in bins]
     anchor_points = [float(p) for p in anchor_points]
+# Use a global reference to store the model instance
+loaded_model = None
+
+def load_model(model_choice: str):
+    global loaded_model
+
+    config = model_configs[model_choice]
+    model_name = config["model_name"]
+    filename = config["filename"]
+
+    # Prepare bins and anchor_points if using classification
+    if truncation is None:
+        bins_, anchor_points_ = None, None
+    else:
+        with open(os.path.join("configs", f"reduction_{reduction}.json"), "r") as f:
+            config_json = json.load(f)[str(truncation)]["nwpu"]
+        bins_ = config_json["bins"][granularity]
+        anchor_points_ = config_json["anchor_points"][granularity]["average"] if anchor_points == "average" else config_json["anchor_points"][granularity]["middle"]
+        bins_ = [(float(b[0]), float(b[1])) for b in bins_]
+        anchor_points_ = [float(p) for p in anchor_points_]
+
+    # Build model
+    model = get_model(
+        backbone=model_name,
+        input_size=input_size,
+        reduction=reduction,
+        bins=bins_,
+        anchor_points=anchor_points_,
+        prompt_type=prompt_type,
+        num_vpt=num_vpt,
+        vpt_drop=vpt_drop,
+        deep_vpt=deep_vpt,
+    )

-
-
-
-
-
-
-
-
-    prompt_type=prompt_type,
-    num_vpt=num_vpt,
-    vpt_drop=vpt_drop,
-    deep_vpt=deep_vpt
-)
-
-repo_id = "Yiming-M/CLIP-EBC"
-filename = "nwpu_weights/CLIP_EBC_ViT_L_14/model.safetensors"
-weights_path = hf_hub_download(repo_id, filename)
-# weights_path = os.path.join("CLIP_EBC_ViT_L_14", "model.safetensors")
-state_dict = load_file(weights_path)
-new_state_dict = {}
-for k, v in state_dict.items():
-    new_state_dict[k.replace("model.", "")] = v
-model.load_state_dict(new_state_dict)
-model.to(device)
-model.eval()
+    weights_path = hf_hub_download(repo_id, filename)
+    state_dict = load_file(weights_path)
+    new_state_dict = {k.replace("model.", ""): v for k, v in state_dict.items()}
+    model.load_state_dict(new_state_dict)
+    model.to(device)
+    model.eval()
+
+    loaded_model = model


 # -----------------------------
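One change worth calling out: the removed for-loop that renamed checkpoint keys becomes a dict comprehension inside load_model. Both strip the leading "model." prefix so the keys match the bare backbone's state dict. A self-contained sketch with dummy tensors (the key names here are illustrative, not taken from the real checkpoint):

import torch

state_dict = {
    "model.backbone.weight": torch.zeros(2, 2),  # illustrative key
    "model.head.bias": torch.zeros(2),           # illustrative key
}

# Old style (removed in this commit):
old = {}
for k, v in state_dict.items():
    old[k.replace("model.", "")] = v

# New style (inside load_model):
new = {k.replace("model.", ""): v for k, v in state_dict.items()}

assert old.keys() == new.keys() == {"backbone.weight", "head.bias"}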
@@ -114,17 +139,22 @@ def transform(image: Image.Image):
 # -----------------------------
 # Inference function
 # -----------------------------
-def predict(image: Image.Image):
+def predict(image: Image.Image, model_choice: str = "CLIP_EBC_ViT_B_16"):
     """
     Given an input image, preprocess it, run the model to obtain a density map,
     compute the total crowd count, and prepare the density map for display.
     """
+    global loaded_model
+
+    if loaded_model is None or model_configs[model_choice]["model_name"] not in loaded_model.__class__.__name__:
+        load_model(model_choice)
+
     # Preprocess the image
     input_width, input_height = image.size
     input_tensor = transform(image).to(device) # shape: (1, 3, H, W)

     with torch.no_grad():
-        density_map = model(input_tensor)
+        density_map = loaded_model(input_tensor) # expected shape: (1, 1, H, W)
         total_count = density_map.sum().item()
     resized_density_map = resize_density_map(density_map, (input_height, input_width)).cpu().squeeze().numpy()

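For context, the new predict signature takes the dropdown value as a second argument and lazily (re)loads the requested backbone before inference. A rough usage sketch outside Gradio, assuming it runs alongside the diffed app.py code and that example1.jpg (one of the bundled examples) is present:

from PIL import Image

img = Image.open("example1.jpg").convert("RGB")

# The first call triggers load_model("CLIP_EBC_ViT_B_16"); later calls reuse
# the cached loaded_model unless a different model_choice is passed.
results = predict(img, model_choice="CLIP_EBC_ViT_B_16")
# results holds the values for the three Gradio outputs wired up in the next
# hunk (input image, density-map image, total-count text).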
@@ -149,32 +179,34 @@ def predict(image: Image.Image):
 # Build Gradio Interface using Blocks for a two-column layout
 # -----------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("# Crowd Counting
+    gr.Markdown("# Crowd Counting by CLIP-EBC (Pre-trained on NWPU-Crowd)")
     gr.Markdown("Upload an image or select an example below to see the predicted crowd density map and total count.")
-
+
     with gr.Row():
         with gr.Column():
-
-
-
-
+            model_choice = gr.Dropdown(
+                choices=list(model_configs.keys()),
+                value="CLIP_EBC_ViT_B_16",
+                label="Select Model"
             )
+            input_img = gr.Image(label="Input Image", sources=["upload", "clipboard"], type="pil")
             submit_btn = gr.Button("Predict")
         with gr.Column():
             output_img = gr.Image(label="Predicted Density Map", type="pil")
             output_text = gr.Textbox(label="Total Count")
-
-    submit_btn.click(fn=predict, inputs=input_img, outputs=[input_img, output_img, output_text])
-
-    # Optional: add example images. Ensure these files are in your repo.
+
+    submit_btn.click(fn=predict, inputs=[input_img, model_choice], outputs=[input_img, output_img, output_text])
+
     gr.Examples(
         examples=[
             ["example1.jpg"],
-            ["example2.jpg"]
+            ["example2.jpg"],
+            ["example3.jpg"],
+            ["example4.jpg"],
+            ["example5.jpg"],
         ],
         inputs=input_img,
         label="Try an example"
     )

-
-demo.launch()
+demo.launch()
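The event wiring changes from inputs=input_img to inputs=[input_img, model_choice]: Gradio passes one positional argument per input component, in list order, which is what lets the dropdown value reach predict's new model_choice parameter. A stripped-down sketch of the same pattern (the component names and handler here are illustrative, not part of the app):

import gradio as gr

def handler(image, choice):
    # Receives the Image value first and the Dropdown value second,
    # matching the order of the inputs list below.
    return f"choice={choice}, image type={type(image).__name__}"

with gr.Blocks() as mini_demo:
    choice = gr.Dropdown(choices=["A", "B"], value="A", label="Select Model")
    img = gr.Image(type="pil", label="Input Image")
    out = gr.Textbox(label="Result")
    gr.Button("Run").click(fn=handler, inputs=[img, choice], outputs=out)

# mini_demo.launch()  # launching is not needed just to show the input ordering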
models/clip/_clip/__init__.py
CHANGED
@@ -13,15 +13,8 @@ from .model import CLIP
 curr_dir = os.path.dirname(os.path.abspath(__file__))

 clip_model_names = [
-    "clip_resnet50",
-    "clip_resnet101",
-    "clip_resnet50x4",
-    "clip_resnet50x16",
-    "clip_resnet50x64",
-    "clip_vit_b_32",
     "clip_vit_b_16",
     "clip_vit_l_14",
-    "clip_vit_l_14_336px",
 ]

 clip_image_encoder_names = [f"clip_image_encoder_{name[5:]}" for name in clip_model_names]
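Since clip_image_encoder_names is derived from clip_model_names via the name[5:] slice (which drops the "clip_" prefix), trimming the list automatically trims the derived names as well. A quick check of what remains:

clip_model_names = ["clip_vit_b_16", "clip_vit_l_14"]
clip_image_encoder_names = [f"clip_image_encoder_{name[5:]}" for name in clip_model_names]
print(clip_image_encoder_names)
# ['clip_image_encoder_vit_b_16', 'clip_image_encoder_vit_l_14']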
@@ -240,34 +233,10 @@ __all__ = [
     # utils
     "tokenize",
     "transform",
-    # clip models
-    "resnet50_clip",
-    "resnet101_clip",
-    "resnet50x4_clip",
-    "resnet50x16_clip",
-    "resnet50x64_clip",
-    "vit_b_32_clip",
-    "vit_b_16_clip",
-    "vit_l_14_clip",
-    "vit_l_14_336px_clip",
     # clip image encoders
-    "resnet50_img",
-    "resnet101_img",
-    "resnet50x4_img",
-    "resnet50x16_img",
-    "resnet50x64_img",
-    "vit_b_32_img",
     "vit_b_16_img",
     "vit_l_14_img",
-    "vit_l_14_336px_img",
     # clip text encoders
-    "resnet50_txt",
-    "resnet101_txt",
-    "resnet50x4_txt",
-    "resnet50x16_txt",
-    "resnet50x64_txt",
-    "vit_b_32_txt",
     "vit_b_16_txt",
     "vit_l_14_txt",
-    "vit_l_14_336px_txt",
 ]
models/clip/_clip/prepare.py
CHANGED
@@ -9,15 +9,8 @@ from .utils import load


 model_name_map = {
-    "RN50": "resnet50",
-    "RN101": "resnet101",
-    "RN50x4": "resnet50x4",
-    "RN50x16": "resnet50x16",
-    "RN50x64": "resnet50x64",
-    "ViT-B/32": "vit_b_32",
     "ViT-B/16": "vit_b_16",
     "ViT-L/14": "vit_l_14",
-    "ViT-L/14@336px": "vit_l_14_336px",
 }


@@ -49,7 +42,7 @@ def prepare() -> None:
     os.makedirs(config_dir, exist_ok=True)
     device = torch.device("cpu")

-    for model_name in tqdm(["
+    for model_name in tqdm(["ViT-B/16", "ViT-L/14"]):
         model = load(model_name, device=device).to(device)
         image_encoder = model.visual.to(device)
         text_encoder = CLIPTextEncoderTemp(model).to(device)
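With both the map and the loop reduced to ViT-B/16 and ViT-L/14, prepare() now only converts the two backbones the app actually serves. A small sketch of the name translation the loop relies on (the print is illustrative; the real loop goes on to split each checkpoint into its image and text encoders, as shown above):

from tqdm import tqdm

model_name_map = {
    "ViT-B/16": "vit_b_16",
    "ViT-L/14": "vit_l_14",
}

for model_name in tqdm(["ViT-B/16", "ViT-L/14"]):
    # Translate the OpenAI CLIP name to the local backbone name.
    print(model_name, "->", model_name_map[model_name])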