import torch
from PIL import Image
import requests
from io import BytesIO  # BytesIO wraps the downloaded bytes so PIL can open them
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Check if CUDA is available and set the device accordingly
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model; device_map="auto" lets accelerate place the weights,
# so an explicit model.to(device) is unnecessary (and can raise an error).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto"
)

# Default processor
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

# Download the demo image
image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
response = requests.get(image_url)
img = Image.open(BytesIO(response.content))  # Read the image from the byte stream

# Resize the image to a smaller resolution (512x512); the model inputs are
# built later from the messages via process_vision_info and the processor call.
img_resized = img.resize((512, 512))

# Prepare the text input
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": img_resized,
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to(device)  # Move inputs to the same device as the model

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)

# Trim the output tokens
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]

# Decode the generated text
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)

# Print the output
print(output_text)