import gradio as gr
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
import os
# --- Configuration ---
# Using a CPU-compatible model from the Qwen family
MODEL_PATH = "Qwen/Qwen-VL-Chat"  # Qwen3-VL checkpoints do not expose the chat()/from_list_format() API used below
CPU_DEVICE = "cpu" # Explicitly use CPU
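# Optional CPU tuning (not part of the original app; assumes the Space has more
# than one core available). Uncomment to let torch use every available core:
# torch.set_num_threads(os.cpu_count() or 1)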
# --- Model and Tokenizer Loading ---
# This will be done once when the Space starts. It will be slow on a CPU.
print("Loading model and tokenizer... This may take a while on a CPU.")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
# For CPU, we try to load the model in bfloat16 first and fall back to float32 if that fails.
# Note: This will consume a significant amount of RAM.
try:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        device_map=CPU_DEVICE,
        trust_remote_code=True,
        bf16=True,  # bf16 on CPU can be slow, but uses less memory
    ).eval()
except RuntimeError:
    # Fallback to float32 if bf16 is not supported or causes issues
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        device_map=CPU_DEVICE,
        trust_remote_code=True
    ).eval()
# Specify generation configuration
model.generation_config = GenerationConfig.from_pretrained(MODEL_PATH, trust_remote_code=True)
print("Model and tokenizer loaded successfully.")
# --- Inference Function ---
def process_and_generate(image_input, text_prompt):
    """
    Processes the image and text prompt, and generates a response from the model on the CPU.
    """
    if image_input is None or text_prompt.strip() == "":
        return "Please provide both an image and a text prompt."
    # Convert Gradio's numpy array to a PIL Image
    pil_image = Image.fromarray(image_input)
    # Create a temporary path to save the image
    temp_image_path = "temp_image.png"
    pil_image.save(temp_image_path)
    # The model's tokenizer can directly handle an image path.
    # We construct the query according to the model's required format.
    query = tokenizer.from_list_format([
        {'image': temp_image_path},
        {'text': text_prompt},
    ])
    print("Generating response... This will be slow.")
    try:
        # Generate the response
        response, history = model.chat(tokenizer, query=query, history=None)
        # Clean up the temporary image file
        os.remove(temp_image_path)
        return response
    except Exception as e:
        # Clean up even if there's an error
        if os.path.exists(temp_image_path):
            os.remove(temp_image_path)
        return f"An error occurred during generation: {str(e)}"
# --- Gradio Interface ---
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Qwen-VL-Chat CPU Demo
        This Space demonstrates the `Qwen/Qwen-VL-Chat` model, a CPU-compatible alternative to Qwen3-VL.
        **Warning:** Running this vision-language model on a CPU is very slow. Please be patient after clicking generate.
        """
    )
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="numpy", label="Upload Image")
            text_prompt = gr.Textbox(label="Prompt", placeholder="e.g., Describe this image in detail.")
            submit_button = gr.Button("Generate Response")
        with gr.Column():
            output_text = gr.Textbox(label="Model Output", lines=10, interactive=False)
    submit_button.click(
        fn=process_and_generate,
        inputs=[image_input, text_prompt],
        outputs=output_text
    )
    gr.Examples(
        examples=[
            ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/QWEN-VL/assets/demo.jpeg", "What is this?"],
            ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/QWEN-VL/assets/demo.jpeg", "Draw boxes around the dress and the hat in the image."],
        ],
        inputs=[image_input, text_prompt]
    )
if __name__ == "__main__":
    demo.launch()