import gradio as gr
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
import requests
from io import BytesIO
import os

# Model and device configuration. The rest of this script uses the Qwen-VL-Chat
# interface (tokenizer.from_list_format / model.chat), so MODEL_PATH points at
# that checkpoint rather than a Qwen3-VL one.
MODEL_PATH = "Qwen/Qwen-VL-Chat"
CPU_DEVICE = "cpu"

print("Loading model and tokenizer... This may take a while on a CPU.")

# Qwen-VL-Chat ships custom modeling and tokenization code, so trust_remote_code
# is required for both the tokenizer and the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

try:
    # Guard the bf16 check so it cannot raise on CUDA-less machines; on a
    # pure-CPU box it evaluates to False and the weights load in float32.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        device_map=CPU_DEVICE,
        trust_remote_code=True,
        bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
    ).eval()
except RuntimeError:
    # Fallback: load with the default dtype (float32) if the first attempt fails.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        device_map=CPU_DEVICE,
        trust_remote_code=True,
    ).eval()

model.generation_config = GenerationConfig.from_pretrained(MODEL_PATH, trust_remote_code=True)
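
# Optional tweak (not in the original script): capping the number of generated
# tokens keeps CPU latency manageable. max_new_tokens is a standard
# GenerationConfig field; 256 is an illustrative value.
# model.generation_config.max_new_tokens = 256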

print("Model and tokenizer loaded successfully.")


def process_and_generate(image_input, text_prompt):
    """
    Processes the image and text prompt, and generates a response from the model on the CPU.
    """
    if image_input is None or text_prompt.strip() == "":
        return "Please provide both an image and a text prompt."

    # Gradio delivers the image as a NumPy array; convert it to a PIL image and
    # save it to disk, since from_list_format expects a file path or URL.
    pil_image = Image.fromarray(image_input)
    temp_image_path = "temp_image.png"
    pil_image.save(temp_image_path)

    # Build the multimodal query in the format Qwen-VL-Chat's tokenizer expects:
    # a list of dicts, each holding either an image reference or a text segment.
    query = tokenizer.from_list_format([
        {'image': temp_image_path},
        {'text': text_prompt},
    ])

    print("Generating response... This will be slow.")
    try:
        # model.chat is the high-level chat interface provided by Qwen-VL-Chat's
        # remote code; history=None starts a fresh conversation.
        response, history = model.chat(tokenizer, query=query, history=None)
        os.remove(temp_image_path)
        return response
    except Exception as e:
        # Clean up the temporary image even when generation fails.
        if os.path.exists(temp_image_path):
            os.remove(temp_image_path)
        return f"An error occurred during generation: {str(e)}"


with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Qwen-VL-Chat CPU Demo

        This Space demonstrates the `Qwen/Qwen-VL-Chat` model, a CPU-compatible alternative to Qwen3-VL.

        **Warning:** Running this vision-language model on a CPU is very slow. Please be patient after clicking generate.
        """
    )

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="numpy", label="Upload Image")
            text_prompt = gr.Textbox(label="Prompt", placeholder="e.g., Describe this image in detail.")
            submit_button = gr.Button("Generate Response")
        with gr.Column():
            output_text = gr.Textbox(label="Model Output", lines=10, interactive=False)

    submit_button.click(
        fn=process_and_generate,
        inputs=[image_input, text_prompt],
        outputs=output_text
    )

    # Example prompts (in Chinese: "What is this?" and
    # "Draw boxes around the dress and the hat in the picture").
    gr.Examples(
        examples=[
            ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/QWEN-VL/assets/demo.jpeg", "这是什么?"],
            ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/QWEN-VL/assets/demo.jpeg", "框出图中礼服和帽子"],
        ],
        inputs=[image_input, text_prompt]
    )
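
# Optional (not in the original script): Gradio's request queue helps long CPU
# generations avoid HTTP timeouts; call `demo.queue()` before `demo.launch()`.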


if __name__ == "__main__":
    demo.launch()