Qwen2-VL-2B

Build error

App Files Files Community

Qwen2-VL-2B / app.py

vykanand

Update app.py

73d58c2 verified 10 months ago

raw

history blame

1.68 kB

	import torch
	from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
	from PIL import Image
	import requests
	from io import BytesIO

	# Initialize the model and processor
	model_name = "Qwen/Qwen2-VL-2B-Instruct"
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	model = Qwen2VLForConditionalGeneration.from_pretrained(model_name).to(device)
	processor = AutoProcessor.from_pretrained(model_name)

	# Load the image from URL
	image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
	response = requests.get(image_url)
	img = Image.open(BytesIO(response.content))

	# Automatically preprocess the image and text input using the processor
	text_input = "Describe this image."

	# The processor automatically handles resizing, normalization, and tokenization
	inputs = processor(
	images=img,
	text=text_input,
	return_tensors="pt",
	padding=True, # Automatically pad to match model input size
	)

	# Check the number of tokens generated by the processor and the shape of inputs
	print("Input tokens:", inputs.input_ids.shape)
	print("Image features shape:", inputs.pixel_values.shape)

	# Ensure image and text are properly tokenized and features align
	assert inputs.input_ids.shape[1] > 0, "No tokens generated for text input!"
	assert inputs.pixel_values.shape[0] > 0, "No features generated for the image!"

	# Move inputs to the device (either GPU or CPU)
	inputs = {key: value.to(device) for key, value in inputs.items()}

	# Inference
	generated_ids = model.generate(**inputs, max_new_tokens=128)

	# Decode the output
	output_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
	print(output_text)