import gradio as gr
import torch
from PIL import Image
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor

MODEL_PATH = "Qwen/Qwen3-VL-2B-Instruct"
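
# Load the processor and model once at startup: device_map="auto" places the
# weights on whatever hardware is available (the CPU on a free Space) and
# dtype="auto" keeps the dtype recorded in the checkpoint config.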
print("Loading model and processor... This will take a few minutes on a CPU.")
processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    dtype="auto",
    device_map="auto",
)
print("Model and processor loaded successfully.")


def process_and_generate(image_input, text_prompt):
    """
    Processes the image and text prompt, and generates a response from the model.
    """
    if image_input is None or not text_prompt.strip():
        return "Please provide both an image and a text prompt."
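
    # gr.Image(type="numpy") delivers the upload as a NumPy array; convert it
    # to a PIL image, which the processor accepts directly.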
    pil_image = Image.fromarray(image_input)
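
    # Build the chat-style message list the processor expects: a single user
    # turn containing the image followed by the text prompt.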
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": pil_image},
                {"type": "text", "text": text_prompt},
            ],
        }
    ]

    print("Processing inputs and generating response... This will be slow.")
    try:
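        # apply_chat_template tokenizes the prompt, preprocesses the image, and
        # returns a dict of tensors ready to pass to generate().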
        inputs = processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        )
        inputs = inputs.to(model.device)
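
        # Generate up to 1024 new tokens; on a CPU this step dominates the runtime.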
        generated_ids = model.generate(**inputs, max_new_tokens=1024)
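
        # Strip the prompt tokens from each sequence so only the newly
        # generated tokens are decoded.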
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]

        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

        return output_text[0]

    except Exception as e:
        return f"An error occurred during generation: {str(e)}"
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Qwen3-VL-2B-Instruct CPU Demo
        This Space runs the `Qwen/Qwen3-VL-2B-Instruct` model using the standard `transformers` library.
        **Warning:** Running this on a free CPU Space is **very slow**. Duplicate this Space to run it on your own hardware (ideally a GPU) for a faster experience.
        """
    )

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="numpy", label="Upload Image")
            text_prompt = gr.Textbox(label="Prompt", placeholder="e.g., Describe this image in detail.")
            submit_button = gr.Button("Generate Response")
        with gr.Column():
            output_text = gr.Textbox(label="Model Output", lines=10, interactive=False)

    submit_button.click(
        fn=process_and_generate,
        inputs=[image_input, text_prompt],
        outputs=output_text,
    )
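

# Spaces runs this script directly, so the __main__ guard starts the Gradio server.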
if __name__ == "__main__":
    demo.launch()