broadfield-dev committed
Commit 3356d92 · verified · 1 parent: 44d72e6

Update app.py

Files changed (1): app.py (+57, -55)
app.py CHANGED
@@ -1,86 +1,87 @@
 import gradio as gr
 import torch
 from PIL import Image
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from transformers.generation import GenerationConfig
 import requests
 from io import BytesIO
-import os
+from transformers import Qwen3VLForConditionalGeneration, AutoProcessor

 # --- Configuration ---
-# Using a CPU-compatible model from the Qwen family
 MODEL_PATH = "Qwen/Qwen3-VL-2B-Instruct"
-CPU_DEVICE = "cpu"  # Explicitly use CPU
+CPU_DEVICE = "cpu"

-# --- Model and Tokenizer Loading ---
-# This will be done once when the Space starts. It will be slow on a CPU.
-print("Loading model and tokenizer... This may take a while on a CPU.")
-tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
-
-# For CPU, we load the model in bfloat16 if supported, otherwise float32.
-# Note: This will consume a significant amount of RAM.
-try:
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_PATH,
-        device_map=CPU_DEVICE,
-        trust_remote_code=True,
-        bf16=torch.cuda.is_bf16_supported(),  # bf16 on CPU can be slow, but uses less memory
-    ).eval()
-except RuntimeError:
-    # Fallback to float32 if bf16 is not supported or causes issues
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_PATH,
-        device_map=CPU_DEVICE,
-        trust_remote_code=True
-    ).eval()
-
-# Specify generation configuration
-model.generation_config = GenerationConfig.from_pretrained(MODEL_PATH, trust_remote_code=True)
-print("Model and tokenizer loaded successfully.")
+# --- Model and Processor Loading ---
+# This will be done once when the Space starts.
+# 'device_map="auto"' will correctly assign the model to the CPU in this environment.
+print("Loading model and processor... This will take a few minutes on a CPU.")
+processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
+model = Qwen3VLForConditionalGeneration.from_pretrained(
+    MODEL_PATH,
+    trust_remote_code=True,
+    dtype="auto",      # Use 'auto' for dtype for better compatibility
+    device_map="auto"  # This is the key for CPU (and GPU) compatibility
+)
+print("Model and processor loaded successfully.")

 # --- Inference Function ---
 def process_and_generate(image_input, text_prompt):
     """
-    Processes the image and text prompt, and generates a response from the model on the CPU.
+    Processes the image and text prompt, and generates a response from the model.
     """
-    if image_input is None or text_prompt.strip() == "":
+    if image_input is None or not text_prompt.strip():
         return "Please provide both an image and a text prompt."

     # Convert Gradio's numpy array to a PIL Image
     pil_image = Image.fromarray(image_input)
-    # Create a temporary path to save the image
-    temp_image_path = "temp_image.png"
-    pil_image.save(temp_image_path)

-    # The model's tokenizer can directly handle an image path.
-    # We construct the query according to the model's required format.
-    query = tokenizer.from_list_format([
-        {'image': temp_image_path},
-        {'text': text_prompt},
-    ])
+    # Prepare the messages payload for the model
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": pil_image},
+                {"type": "text", "text": text_prompt},
+            ],
+        }
+    ]

-    print("Generating response... This will be slow.")
+    print("Processing inputs and generating response... This will be slow.")
     try:
-        # Generate the response
-        response, history = model.chat(tokenizer, query=query, history=None)
+        # Preparation for inference
+        inputs = processor.apply_chat_template(
+            messages,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_dict=True,
+            return_tensors="pt"
+        )
+        inputs = inputs.to(model.device)
+
+        # Inference: Generation of the output
+        generated_ids = model.generate(**inputs, max_new_tokens=1024)
+
+        # To get only the new tokens, we trim the input IDs from the generated IDs
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+
+        # Decode the trimmed IDs to text
+        output_text = processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )

-        # Clean up the temporary image file
-        os.remove(temp_image_path)
+        # batch_decode returns a list, we return the first element
+        return output_text[0]

-        return response
     except Exception as e:
-        # Clean up even if there's an error
-        if os.path.exists(temp_image_path):
-            os.remove(temp_image_path)
         return f"An error occurred during generation: {str(e)}"

 # --- Gradio Interface ---
 with gr.Blocks() as demo:
     gr.Markdown(
         """
-        # Qwen-VL-Chat CPU Demo
-        This Space demonstrates the `Qwen/Qwen-VL-Chat` model, a CPU-compatible alternative to Qwen3-VL.
-        **Warning:** Running this vision-language model on a CPU is very slow. Please be patient after clicking generate.
+        # Qwen3-VL-2B-Instruct CPU Demo
+        This Space runs the `Qwen/Qwen3-VL-2B-Instruct` model using the standard `transformers` library.
+        **Warning:** Running this on a free CPU Space is **very slow**. Please be patient after clicking the generate button.
         """
     )

@@ -100,8 +101,9 @@ with gr.Blocks() as demo:

     gr.Examples(
         examples=[
-            ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/QWEN-VL/assets/demo.jpeg", "这是什么?"],
-            ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/QWEN-VL/assets/demo.jpeg", "框出图中礼服和帽子"],
+            ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", "Describe this image."],
+            ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/receipt.png", "Read the text from this receipt."],
+            ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/what_is_in_the_box.jpg", "What is inside the red box?"],
         ],
         inputs=[image_input, text_prompt]
     )
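
Note that the new app.py keeps the `requests` and `BytesIO` imports even though neither is used in the hunks above; presumably they remain for loading images from URLs such as the `gr.Examples` entries. As a quick sanity check of the new inference path outside the Gradio UI, a minimal sketch follows. It assumes the updated file is importable as `app` and that the first example URL is reachable; both are assumptions on top of the commit, not part of it.

# Local smoke test for the new inference path (illustrative sketch; not part of the commit).
# Assumes the updated app.py is on the import path as `app` and that importing it is
# acceptable (the model is loaded at import time, which is slow on a CPU).
import numpy as np
import requests
from io import BytesIO
from PIL import Image

import app  # hypothetical module name for the rewritten Space file

URL = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"

# Fetch one of the gr.Examples images and convert it to the numpy array that
# Gradio's Image component would normally hand to process_and_generate().
image = Image.open(BytesIO(requests.get(URL, timeout=30).content)).convert("RGB")
image_array = np.asarray(image)

print(app.process_and_generate(image_array, "Describe this image."))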
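
The commit only touches app.py, so the Space's dependency list is not shown. For orientation, the package set below is inferred from the imports and from `device_map="auto"` (which requires `accelerate`); it is a guess, and no versions are pinned because none appear in the commit.

# requirements.txt (hypothetical, inferred from app.py; not part of the commit)
gradio
torch
transformers  # must be recent enough to provide Qwen3VLForConditionalGeneration
accelerate    # needed for device_map="auto"
pillow
requests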