# broadfield-dev's picture
# Update app.py
# 1aaaced verified
import gradio as gr
import torch
from PIL import Image
import requests
from io import BytesIO
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
# --- Configuration ---
# Hugging Face Hub identifier of the checkpoint loaded below.
MODEL_PATH = "Qwen/Qwen3-VL-2B-Instruct"
# Device label; nothing in the visible part of this file reads it.
CPU_DEVICE = "cpu"

# --- Model and Processor Loading ---
# The processor and model are created once, at import time, and are then
# shared by every call to the inference function.
print("Loading model and processor... This will take a few minutes on a CPU.")
processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)

# Loader options collected in one place so they are easy to audit.
_MODEL_LOAD_KWARGS = {
    "trust_remote_code": True,
    "dtype": "auto",  # defer the dtype choice to the library
    "device_map": "auto",  # defer device placement (CPU or GPU) to the library
}
model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_PATH, **_MODEL_LOAD_KWARGS
)
print("Model and processor loaded successfully.")
# --- Inference Function ---
def process_and_generate(image_input, text_prompt):
    """Generate the model's response to a text prompt about an image.

    Args:
        image_input: The image as a numpy array (the form produced by the
            ``gr.Image(type="numpy")`` component), or ``None`` when no image
            was supplied.
        text_prompt: The user's instruction or question. ``None``, an empty
            string, and whitespace-only strings all count as "missing".

    Returns:
        str: The decoded model output on success; a reminder message when
        either input is missing; or an error description if preparing the
        inputs or generating the output raised an exception.
    """
    # `text_prompt or ""` keeps the check safe when the prompt is None
    # (e.g. a programmatic call) instead of failing on `None.strip()`.
    if image_input is None or not (text_prompt or "").strip():
        return "Please provide both an image and a text prompt."

    # Convert Gradio's numpy array to a PIL Image
    pil_image = Image.fromarray(image_input)

    # Prepare the single-turn chat payload: one user message carrying the
    # image followed by the text prompt.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": pil_image},
                {"type": "text", "text": text_prompt},
            ],
        }
    ]

    print("Processing inputs and generating response... This will be slow.")
    try:
        # Render the chat template and tokenize in one step, returning
        # PyTorch tensors in a dict-like batch.
        inputs = processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        )
        inputs = inputs.to(model.device)

        # Inference: generation of the output
        generated_ids = model.generate(**inputs, max_new_tokens=1024)

        # The generated sequences start with the prompt tokens; drop them so
        # only the newly generated tokens are decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]

        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        # batch_decode returns a list; there is exactly one conversation.
        return output_text[0]
    except Exception as e:
        # Top-level UI boundary: report the failure to the user, but also
        # keep the full traceback in the server logs for debugging.
        import traceback

        traceback.print_exc()
        return f"An error occurred during generation: {str(e)}"
# --- Gradio Interface ---
# Builds the page (header text, an input column and an output column) and
# wires the button to the inference function.
with gr.Blocks() as demo:
    # Header text, assembled from adjacent literals; the resulting string is
    # the same as a triple-quoted block with these three lines.
    gr.Markdown(
        "\n"
        "# Qwen3-VL-2B-Instruct CPU Demo\n"
        "This Space runs the `Qwen/Qwen3-VL-2B-Instruct` model using the standard `transformers` library.\n"
        "**Warning:** Running this on a free CPU Space is **very slow**. Duplicate this space for solo experience.\n"
    )

    with gr.Row():
        # Input side: image upload, prompt box, and the trigger button.
        with gr.Column():
            image_input = gr.Image(label="Upload Image", type="numpy")
            text_prompt = gr.Textbox(
                placeholder="e.g., Describe this image in detail.",
                label="Prompt",
            )
            submit_button = gr.Button("Generate Response")

        # Output side: read-only text area for the model's answer.
        with gr.Column():
            output_text = gr.Textbox(
                interactive=False,
                lines=10,
                label="Model Output",
            )

    # Clicking the button sends (image, prompt) to the inference function and
    # shows its return value in the output box.
    submit_button.click(
        process_and_generate,
        inputs=[image_input, text_prompt],
        outputs=output_text,
    )

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()