RoadToNowhere committed on
Commit
aa609e3
·
verified ·
1 Parent(s): ba670c5

fix recursion, NoneType errors for min_pixels & max_pixels, and enforce bf16 on SM75 and lower devices

Browse files
Files changed (1) hide show
  1. modeling_dots_ocr_vllm.py +16 -7
modeling_dots_ocr_vllm.py CHANGED
@@ -110,12 +110,16 @@ class DotsOCRProcessingInfo(Qwen2_5_VLProcessingInfo):
110
  size: Optional[dict[str, int]] = None,
111
  **kwargs: object,
112
  ) -> Qwen2VLProcessor:
113
- self.get_tokenizer().image_token = "<|imgpad|>" # Ensure image token is set
114
- processor = self.ctx.get_hf_processor(
115
- Qwen2VLProcessor,
116
- image_processor=self.get_image_processor(min_pixels=min_pixels, max_pixels=max_pixels, size=size),
 
117
  **kwargs,
118
  )
 
 
 
119
  processor.image_token = "<|imgpad|>"
120
  processor.video_token = "<|video_pad|>"
121
  return processor
@@ -139,12 +143,13 @@ class DotsOCRProcessingInfo(Qwen2_5_VLProcessingInfo):
139
  temporal_patch_size = vision_config.temporal_patch_size
140
 
141
  if do_resize:
 
142
  resized_height, resized_width = smart_resize(
143
  height=image_height,
144
  width=image_width,
145
  factor=patch_size * merge_size,
146
- min_pixels=image_processor.min_pixels,
147
- max_pixels=image_processor.max_pixels,
148
  )
149
  preprocessed_size = ImageSize(width=resized_width, height=resized_height)
150
  else:
@@ -285,7 +290,11 @@ class DotsOCRForCausalLM(nn.Module, SupportsMultiModal):
285
  idx_end = image_sizes_consum[tp_rank].item()
286
  pixel_values_part = pixel_values[idx_start:idx_end]
287
  image_grid_thw_part = image_grid_thw_chunk[tp_rank]
288
- image_embedding_part = self.vision_tower(pixel_values_part, image_grid_thw_part)
 
 
 
 
289
  image_embedding[idx_start // merge_size_square : idx_end // merge_size_square] = image_embedding_part
290
 
291
  group = get_tensor_model_parallel_group().device_group
 
110
  size: Optional[dict[str, int]] = None,
111
  **kwargs: object,
112
  ) -> Qwen2VLProcessor:
113
+ # First, call the parent class's method via super() to build the processor correctly and avoid recursion.
114
+ processor = super().get_hf_processor(
115
+ min_pixels=min_pixels,
116
+ max_pixels=max_pixels,
117
+ size=size,
118
  **kwargs,
119
  )
120
+
121
+ # Then, apply the modifications on the processor object returned by the parent class.
122
+ self.get_tokenizer().image_token = "<|imgpad|>"
123
  processor.image_token = "<|imgpad|>"
124
  processor.video_token = "<|video_pad|>"
125
  return processor
 
143
  temporal_patch_size = vision_config.temporal_patch_size
144
 
145
  if do_resize:
146
+ # Add default values to avoid a TypeError; the defaults come from preprocessor_config.json.
147
  resized_height, resized_width = smart_resize(
148
  height=image_height,
149
  width=image_width,
150
  factor=patch_size * merge_size,
151
+ min_pixels=image_processor.min_pixels if image_processor.min_pixels is not None else 3136,
152
+ max_pixels=image_processor.max_pixels if image_processor.max_pixels is not None else 11289600,
153
  )
154
  preprocessed_size = ImageSize(width=resized_width, height=resized_height)
155
  else:
 
290
  idx_end = image_sizes_consum[tp_rank].item()
291
  pixel_values_part = pixel_values[idx_start:idx_end]
292
  image_grid_thw_part = image_grid_thw_chunk[tp_rank]
293
+ image_embedding_part = self.vision_tower(
294
+ pixel_values_part,
295
+ image_grid_thw_part,
296
+ bf16=(self.vision_tower.dtype == torch.bfloat16), # attempt to fix errors on SM75 and earlier devices that do not support BF16
297
+ )
298
  image_embedding[idx_start // merge_size_square : idx_end // merge_size_square] = image_embedding_part
299
 
300
  group = get_tensor_model_parallel_group().device_group