import torch
from PIL import Image
import requests
from io import BytesIO # Importing BytesIO from the io module
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
# Check if CUDA is available and set the device accordingly (used for the inputs;
# device_map="auto" below already handles weight placement for the model)
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the model; do not call model.to(device) afterwards, since weights
# dispatched with device_map="auto" must not be moved again
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto"
)
# Default processor
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
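# Optional alternative (a sketch following the Qwen2-VL model card): the processor
# accepts min_pixels/max_pixels to bound the visual token count per image, which
# can replace the manual resize below. Uncomment to use it instead:
# processor = AutoProcessor.from_pretrained(
#     "Qwen/Qwen2-VL-2B-Instruct",
#     min_pixels=256 * 28 * 28,
#     max_pixels=1280 * 28 * 28,
# )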
# Download the demo image
image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
response = requests.get(image_url)
response.raise_for_status()  # Fail early if the download did not succeed
img = Image.open(BytesIO(response.content))  # Read the image from the byte stream
# Resize the image to a smaller resolution (e.g., 512x512) to cut the visual token count
img_resized = img.resize((512, 512))
# Build the chat messages (image + text prompt)
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": img_resized,
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
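# Note (per the qwen-vl-utils documentation): the "image" field also accepts an
# http(s) URL or a local "file://" path, so passing image_url here directly would
# work too; the manual download above is only needed to resize the image first.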
# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to(device) # Move inputs to the same device as the model
# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
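# Alternative (standard transformers generation kwargs, shown for illustration
# and not used below): disable sampling for reproducible greedy decoding:
# generated_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)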
# Trim the prompt tokens from each generated sequence, keeping only the new tokens
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
# Decode the generated tokens into text
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
# Print the output
print(output_text)
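# output_text is a list with one string per batch item; for this single image,
# print(output_text[0]) prints just the description without the list brackets.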