import gradio as gr
import torch
from PIL import Image
import requests
from io import BytesIO
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor

# --- Configuration ---
MODEL_PATH = "Qwen/Qwen3-VL-2B-Instruct"

# --- Model and Processor Loading ---
# Loading happens once, when the Space starts.
# device_map="auto" (which requires the accelerate package) places the model on
# the CPU in this environment, or on a GPU when one is available.
print("Loading model and processor... This will take a few minutes on a CPU.")
processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    dtype="auto",  # Use 'auto' for dtype for better compatibility
    device_map="auto" # This is the key for CPU (and GPU) compatibility
)
print("Model and processor loaded successfully.")

# --- Inference Function ---
def process_and_generate(image_input, text_prompt):
    """
    Processes the image and text prompt, and generates a response from the model.
    """
    if image_input is None or not (text_prompt or "").strip():
        return "Please provide both an image and a text prompt."

    # Convert Gradio's numpy array to a PIL Image
    pil_image = Image.fromarray(image_input)

    # Prepare the messages payload for the model
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": pil_image},
                {"type": "text", "text": text_prompt},
            ],
        }
    ]

    print("Processing inputs and generating response... This will be slow.")
    try:
        # Preparation for inference
        inputs = processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        )
        inputs = inputs.to(model.device)

        # Inference: generate the output (no gradients are needed, so
        # inference_mode saves memory and a little time)
        with torch.inference_mode():
            generated_ids = model.generate(**inputs, max_new_tokens=1024)

        # To get only the new tokens, we trim the input IDs from the generated IDs
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        
        # Decode the trimmed IDs to text
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        
        # batch_decode returns a list; return its first (and only) element
        return output_text[0]
        
    except Exception as e:
        return f"An error occurred during generation: {str(e)}"

# --- Gradio Interface ---
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Qwen3-VL-2B-Instruct CPU Demo
        This Space runs the `Qwen/Qwen3-VL-2B-Instruct` model using the standard `transformers` library.
        **Warning:** Running this on a free CPU Space is **very slow**. Please be patient after clicking the generate button.
        """
    )

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="numpy", label="Upload Image")
            text_prompt = gr.Textbox(label="Prompt", placeholder="e.g., Describe this image in detail.")
            submit_button = gr.Button("Generate Response")
        with gr.Column():
            output_text = gr.Textbox(label="Model Output", lines=10, interactive=False)

    submit_button.click(
        fn=process_and_generate,
        inputs=[image_input, text_prompt],
        outputs=output_text
    )

    gr.Examples(
        examples=[
            ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", "Describe this image."],
            ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/receipt.png", "Read the text from this receipt."],
            ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/what_is_in_the_box.jpg", "What is inside the red box?"],
        ],
        inputs=[image_input, text_prompt]
    )

if __name__ == "__main__":
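    # Queueing is standard Gradio API and keeps long CPU generations from
    # tripping default request timeouts; harmless at default concurrency.
    demo.queue()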
    demo.launch()