Fix recursion, NoneType errors for min_pixels & max_pixels, and enforced bf16 on SM75 and earlier devices
Browse files- modeling_dots_ocr_vllm.py +16 -7
modeling_dots_ocr_vllm.py
CHANGED
|
@@ -110,12 +110,16 @@ class DotsOCRProcessingInfo(Qwen2_5_VLProcessingInfo):
|
|
| 110 |
size: Optional[dict[str, int]] = None,
|
| 111 |
**kwargs: object,
|
| 112 |
) -> Qwen2VLProcessor:
|
| 113 |
-
|
| 114 |
-
processor =
|
| 115 |
-
|
| 116 |
-
|
|
|
|
| 117 |
**kwargs,
|
| 118 |
)
|
|
|
|
|
|
|
|
|
|
| 119 |
processor.image_token = "<|imgpad|>"
|
| 120 |
processor.video_token = "<|video_pad|>"
|
| 121 |
return processor
|
|
@@ -139,12 +143,13 @@ class DotsOCRProcessingInfo(Qwen2_5_VLProcessingInfo):
|
|
| 139 |
temporal_patch_size = vision_config.temporal_patch_size
|
| 140 |
|
| 141 |
if do_resize:
|
|
|
|
| 142 |
resized_height, resized_width = smart_resize(
|
| 143 |
height=image_height,
|
| 144 |
width=image_width,
|
| 145 |
factor=patch_size * merge_size,
|
| 146 |
-
min_pixels=image_processor.min_pixels,
|
| 147 |
-
max_pixels=image_processor.max_pixels,
|
| 148 |
)
|
| 149 |
preprocessed_size = ImageSize(width=resized_width, height=resized_height)
|
| 150 |
else:
|
|
@@ -285,7 +290,11 @@ class DotsOCRForCausalLM(nn.Module, SupportsMultiModal):
|
|
| 285 |
idx_end = image_sizes_consum[tp_rank].item()
|
| 286 |
pixel_values_part = pixel_values[idx_start:idx_end]
|
| 287 |
image_grid_thw_part = image_grid_thw_chunk[tp_rank]
|
| 288 |
-
image_embedding_part = self.vision_tower(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
image_embedding[idx_start // merge_size_square : idx_end // merge_size_square] = image_embedding_part
|
| 290 |
|
| 291 |
group = get_tensor_model_parallel_group().device_group
|
|
|
|
| 110 |
size: Optional[dict[str, int]] = None,
|
| 111 |
**kwargs: object,
|
| 112 |
) -> Qwen2VLProcessor:
|
| 113 |
+
# First, call the parent class method via super() to build the processor correctly and avoid recursion.
|
| 114 |
+
processor = super().get_hf_processor(
|
| 115 |
+
min_pixels=min_pixels,
|
| 116 |
+
max_pixels=max_pixels,
|
| 117 |
+
size=size,
|
| 118 |
**kwargs,
|
| 119 |
)
|
| 120 |
+
|
| 121 |
+
# 然后,在父类返回的 processor 对象上应用修改。
|
| 122 |
+
self.get_tokenizer().image_token = "<|imgpad|>"
|
| 123 |
processor.image_token = "<|imgpad|>"
|
| 124 |
processor.video_token = "<|video_pad|>"
|
| 125 |
return processor
|
|
|
|
| 143 |
temporal_patch_size = vision_config.temporal_patch_size
|
| 144 |
|
| 145 |
if do_resize:
|
| 146 |
+
# Add default values to avoid a TypeError; defaults taken from preprocessor_config.json.
|
| 147 |
resized_height, resized_width = smart_resize(
|
| 148 |
height=image_height,
|
| 149 |
width=image_width,
|
| 150 |
factor=patch_size * merge_size,
|
| 151 |
+
min_pixels=image_processor.min_pixels if image_processor.min_pixels is not None else 3136,
|
| 152 |
+
max_pixels=image_processor.max_pixels if image_processor.max_pixels is not None else 11289600,
|
| 153 |
)
|
| 154 |
preprocessed_size = ImageSize(width=resized_width, height=resized_height)
|
| 155 |
else:
|
|
|
|
| 290 |
idx_end = image_sizes_consum[tp_rank].item()
|
| 291 |
pixel_values_part = pixel_values[idx_start:idx_end]
|
| 292 |
image_grid_thw_part = image_grid_thw_chunk[tp_rank]
|
| 293 |
+
image_embedding_part = self.vision_tower(
|
| 294 |
+
pixel_values_part,
|
| 295 |
+
image_grid_thw_part,
|
| 296 |
+
bf16=(self.vision_tower.dtype == torch.bfloat16),  # attempt to fix errors on SM75 and earlier devices that do not support BF16
|
| 297 |
+
)
|
| 298 |
image_embedding[idx_start // merge_size_square : idx_end // merge_size_square] = image_embedding_part
|
| 299 |
|
| 300 |
group = get_tensor_model_parallel_group().device_group
|