Spaces:

NeuralFalcon
/

Image-to-Prompt

Running

App Files Files Community

NeuralFalcon committed on Aug 8

Commit

4255784

verified ·

1 Parent(s): bcf1741

Create app.py

Browse files

Files changed (1) hide show

app.py +82 -0

app.py ADDED Viewed

	@@ -0,0 +1,82 @@

+from transformers import AutoProcessor, AutoModelForImageTextToText
+import torch
+# https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct
+# https://huggingface.co/HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+# model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+# model_path = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
+# Load model & processor
+model_name= "SmolVLM2-2.2B-Instruct"
+model_path=f"HuggingFaceTB/{model_name}"
+processor = AutoProcessor.from_pretrained(model_path)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = AutoModelForImageTextToText.from_pretrained(
+    model_path,
+    torch_dtype=torch.float16,  # Use FP16 for better performance on T4
+    device_map="auto"  # Auto-assign model to GPU
+).to(device)
+import torch
+import os
+def describe_image(image_path, user_prompt="Describe the image in detail.",system_role=""):
+    global model, processor
+    messages=[]
+    if not os.path.exists(image_path):
+      return None
+    if system_role!="":
+      messages.append( {
+                "role": "system",
+                "content": [{"type": "text", "text": system_role}]
+         })
+    messages.append(
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": user_prompt},
+                {"type": "image", "path": image_path},
+            ]
+        }
+    )
+    # Prepare input
+    inputs = processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+    ).to(model.device)
+    # Convert only float32 tensors to float16
+    for k, v in inputs.items():
+        if v.dtype == torch.float32:
+            inputs[k] = v.to(torch.float16)
+    # Generate response
+    generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=1024)
+    # Decode and return output
+    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
+    return generated_texts[0].split("Assistant:")[-1].replace("\n\n\n\n\n\n", "").strip()
+import gradio as gr
+def ui():
+    return gr.Interface(
+        fn=describe_image,
+        inputs=[
+            gr.Image(type="filepath", label="Upload Image"),
+            gr.Textbox(value="Describe the image in detail.", label="User Prompt"),
+            gr.Textbox(value="", label="System Role (Optional)")
+        ],
+        outputs=gr.Textbox(label="Image Description"),
+        title="Image Captioning App",
+        description="Upload an image and customize prompts to get a detailed description."
+    )
+demo=ui()
+demo.queue().launch()