Spaces:

broadfield-dev
/

qwen3-vl-2b-instruct

Running

File size: 3,464 Bytes

c2ef06d
 
 
 
 
3356d92
c2ef06d
 
c4e46f7
3356d92
c2ef06d
3356d92
 
 
 
 
 
 
 
 
 
c2ef06d
 
 
 
3356d92
c2ef06d
3356d92
c2ef06d
 
0c9364d
c2ef06d
 
3356d92
 
 
 
 
 
 
 
 
 
c2ef06d
3356d92
c2ef06d
3356d92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0c9364d
3356d92
 
0c9364d
c2ef06d
 
 
 
 
 
 
3356d92
 
1aaaced
c2ef06d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32dfa9a
c2ef06d

import gradio as gr
import torch
from PIL import Image
import requests
from io import BytesIO
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor

# --- Configuration ---
# Hub identifier of the checkpoint served by this app.
MODEL_PATH = "Qwen/Qwen3-VL-2B-Instruct"
# NOTE(review): not referenced anywhere in the visible code; kept for any
# external users of this module.
CPU_DEVICE = "cpu"

# --- Model and Processor Loading ---
# Both objects are created once at import time and shared by every request.
print("Loading model and processor... This will take a few minutes on a CPU.")
processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)

# Loading options gathered in one place so they are easy to audit.
_MODEL_LOAD_OPTIONS = {
    "trust_remote_code": True,
    # Defer the dtype choice to the library.
    "dtype": "auto",
    # Defer device placement to the library, so the same code runs on CPU-only
    # and GPU hosts.
    "device_map": "auto",
}
model = Qwen3VLForConditionalGeneration.from_pretrained(MODEL_PATH, **_MODEL_LOAD_OPTIONS)
print("Model and processor loaded successfully.")

# --- Inference Function ---
def process_and_generate(image_input, text_prompt):
    """
    Generate the model's reply to a text prompt about an uploaded image.

    Args:
        image_input: The uploaded image as a numpy array (the UI component is
            declared with ``type="numpy"``), or ``None`` when no image was given.
        text_prompt: The user's prompt. ``None``, empty and whitespace-only
            values are all treated as a missing prompt.

    Returns:
        The decoded text of the newly generated tokens; a reminder message when
        either input is missing; or an error message if generation raises.
    """
    # Treat a missing prompt (None) the same as a blank one: calling
    # ``.strip()`` directly on None would raise AttributeError.
    if image_input is None or not (text_prompt or "").strip():
        return "Please provide both an image and a text prompt."

    # Convert Gradio's numpy array to a PIL Image
    pil_image = Image.fromarray(image_input)

    # Single-turn chat payload: one user message carrying the image and the text.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": pil_image},
                {"type": "text", "text": text_prompt},
            ],
        }
    ]

    print("Processing inputs and generating response... This will be slow.")
    try:
        # Tokenize the chat template and move the tensors to the model's device.
        inputs = processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        )
        inputs = inputs.to(model.device)

        # Inference: generation of the output
        generated_ids = model.generate(**inputs, max_new_tokens=1024)

        # The generated sequences start with the prompt tokens; slice them off
        # so that only the newly generated tokens are decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]

        # Decode the trimmed IDs to text
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

        # batch_decode returns a list; there is a single conversation, so
        # return its only element.
        return output_text[0]

    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing the
        # handler, but keep the full traceback in the server log for debugging.
        import traceback

        traceback.print_exc()
        return f"An error occurred during generation: {str(e)}"

# --- Gradio Interface ---
# Introductory text rendered at the top of the page (whitespace kept verbatim).
_INTRO_MARKDOWN = """
        # Qwen3-VL-2B-Instruct CPU Demo
        This Space runs the `Qwen/Qwen3-VL-2B-Instruct` model using the standard `transformers` library.
        **Warning:** Running this on a free CPU Space is **very slow**. Duplicate this space for solo experience.
        """

with gr.Blocks() as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        # First column: the two inputs and the button that triggers generation.
        with gr.Column():
            image_input = gr.Image(type="numpy", label="Upload Image")
            text_prompt = gr.Textbox(
                label="Prompt",
                placeholder="e.g., Describe this image in detail.",
            )
            submit_button = gr.Button("Generate Response")
        # Second column: read-only box that displays the model's reply.
        with gr.Column():
            output_text = gr.Textbox(label="Model Output", lines=10, interactive=False)

    # Clicking the button sends (image, prompt) to the inference function and
    # writes its return value into the output box.
    submit_button.click(process_and_generate, [image_input, text_prompt], output_text)


# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()