import gradio as gr
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
import os
# --- Configuration ---
# Using a CPU-compatible model from the Qwen family
MODEL_PATH = "Qwen/Qwen-VL-Chat"  # Qwen3-VL checkpoints do not expose the chat()/from_list_format() API used below
CPU_DEVICE = "cpu" # Explicitly use CPU
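# Optional CPU tuning (not part of the original app; assumes the Space has more
# than one core available). Uncomment to let torch use every available core:
# torch.set_num_threads(os.cpu_count() or 1)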
# --- Model and Tokenizer Loading ---
# This will be done once when the Space starts. It will be slow on a CPU.
print("Loading model and tokenizer... This may take a while on a CPU.")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
# For CPU, we try to load the model in bfloat16 first and fall back to float32 if that fails.
# Note: This will consume a significant amount of RAM.
try:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        device_map=CPU_DEVICE,
        trust_remote_code=True,
        bf16=True,  # bf16 on CPU can be slow, but uses less memory
    ).eval()
except RuntimeError:
    # Fallback to float32 if bf16 is not supported or causes issues
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        device_map=CPU_DEVICE,
        trust_remote_code=True
    ).eval()
# Specify generation configuration
model.generation_config = GenerationConfig.from_pretrained(MODEL_PATH, trust_remote_code=True)
print("Model and tokenizer loaded successfully.")
# --- Inference Function ---
def process_and_generate(image_input, text_prompt):
    """
    Processes the image and text prompt, and generates a response from the model on the CPU.
    """
    if image_input is None or text_prompt.strip() == "":
        return "Please provide both an image and a text prompt."
    # Convert Gradio's numpy array to a PIL Image
    pil_image = Image.fromarray(image_input)
    # Create a temporary path to save the image
    temp_image_path = "temp_image.png"
    pil_image.save(temp_image_path)
    # The model's tokenizer can directly handle an image path.
    # We construct the query according to the model's required format.
    query = tokenizer.from_list_format([
        {'image': temp_image_path},
        {'text': text_prompt},
    ])
    print("Generating response... This will be slow.")
    try:
        # Generate the response
        response, history = model.chat(tokenizer, query=query, history=None)
        # Clean up the temporary image file
        os.remove(temp_image_path)
        return response
    except Exception as e:
        # Clean up even if there's an error
        if os.path.exists(temp_image_path):
            os.remove(temp_image_path)
        return f"An error occurred during generation: {str(e)}"
# --- Gradio Interface ---
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Qwen-VL-Chat CPU Demo
        This Space demonstrates the `Qwen/Qwen-VL-Chat` model, a CPU-compatible alternative to Qwen3-VL.
        **Warning:** Running this vision-language model on a CPU is very slow. Please be patient after clicking generate.
        """
    )
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="numpy", label="Upload Image")
            text_prompt = gr.Textbox(label="Prompt", placeholder="e.g., Describe this image in detail.")
            submit_button = gr.Button("Generate Response")
        with gr.Column():
            output_text = gr.Textbox(label="Model Output", lines=10, interactive=False)
    submit_button.click(
        fn=process_and_generate,
        inputs=[image_input, text_prompt],
        outputs=output_text
    )
    gr.Examples(
        examples=[
            ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/QWEN-VL/assets/demo.jpeg", "What is this?"],
            ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/QWEN-VL/assets/demo.jpeg", "Draw boxes around the dress and the hat in the image."],
        ],
        inputs=[image_input, text_prompt]
    )
if __name__ == "__main__":
    demo.launch()