Fix recursion, NoneType errors for min_pixels & max_pixels, and enforced bf16 on SM75 and earlier devices
Browse files- modeling_dots_ocr_vllm.py +16 -7
modeling_dots_ocr_vllm.py
CHANGED
|
@@ -110,12 +110,16 @@ class DotsOCRProcessingInfo(Qwen2_5_VLProcessingInfo):
|
|
| 110 |
size: Optional[dict[str, int]] = None,
|
| 111 |
**kwargs: object,
|
| 112 |
) -> Qwen2VLProcessor:
|
| 113 |
-
|
| 114 |
-
processor =
|
| 115 |
-
|
| 116 |
-
|
|
|
|
| 117 |
**kwargs,
|
| 118 |
)
|
|
|
|
|
|
|
|
|
|
| 119 |
processor.image_token = "<|imgpad|>"
|
| 120 |
processor.video_token = "<|video_pad|>"
|
| 121 |
return processor
|
|
@@ -139,12 +143,13 @@ class DotsOCRProcessingInfo(Qwen2_5_VLProcessingInfo):
|
|
| 139 |
temporal_patch_size = vision_config.temporal_patch_size
|
| 140 |
|
| 141 |
if do_resize:
|
|
|
|
| 142 |
resized_height, resized_width = smart_resize(
|
| 143 |
height=image_height,
|
| 144 |
width=image_width,
|
| 145 |
factor=patch_size * merge_size,
|
| 146 |
-
min_pixels=image_processor.min_pixels,
|
| 147 |
-
max_pixels=image_processor.max_pixels,
|
| 148 |
)
|
| 149 |
preprocessed_size = ImageSize(width=resized_width, height=resized_height)
|
| 150 |
else:
|
|
@@ -285,7 +290,11 @@ class DotsOCRForCausalLM(nn.Module, SupportsMultiModal):
|
|
| 285 |
idx_end = image_sizes_consum[tp_rank].item()
|
| 286 |
pixel_values_part = pixel_values[idx_start:idx_end]
|
| 287 |
image_grid_thw_part = image_grid_thw_chunk[tp_rank]
|
| 288 |
-
image_embedding_part = self.vision_tower(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
image_embedding[idx_start // merge_size_square : idx_end // merge_size_square] = image_embedding_part
|
| 290 |
|
| 291 |
group = get_tensor_model_parallel_group().device_group
|
|
|
|
| 110 |
size: Optional[dict[str, int]] = None,
|
| 111 |
**kwargs: object,
|
| 112 |
) -> Qwen2VLProcessor:
|
| 113 |
+
# First, call the parent class method via super() to build the processor correctly and avoid recursion.
|
| 114 |
+
processor = super().get_hf_processor(
|
| 115 |
+
min_pixels=min_pixels,
|
| 116 |
+
max_pixels=max_pixels,
|
| 117 |
+
size=size,
|
| 118 |
**kwargs,
|
| 119 |
)
|
| 120 |
+
|
| 121 |
+
# 然后,在父类返回的 processor 对象上应用修改。
|
| 122 |
+
self.get_tokenizer().image_token = "<|imgpad|>"
|
| 123 |
processor.image_token = "<|imgpad|>"
|
| 124 |
processor.video_token = "<|video_pad|>"
|
| 125 |
return processor
|
|
|
|
| 143 |
temporal_patch_size = vision_config.temporal_patch_size
|
| 144 |
|
| 145 |
if do_resize:
|
| 146 |
+
# Add default values to avoid a TypeError; defaults taken from preprocessor_config.json.
|
| 147 |
resized_height, resized_width = smart_resize(
|
| 148 |
height=image_height,
|
| 149 |
width=image_width,
|
| 150 |
factor=patch_size * merge_size,
|
| 151 |
+
min_pixels=image_processor.min_pixels if image_processor.min_pixels is not None else 3136,
|
| 152 |
+
max_pixels=image_processor.max_pixels if image_processor.max_pixels is not None else 11289600,
|
| 153 |
)
|
| 154 |
preprocessed_size = ImageSize(width=resized_width, height=resized_height)
|
| 155 |
else:
|
|
|
|
| 290 |
idx_end = image_sizes_consum[tp_rank].item()
|
| 291 |
pixel_values_part = pixel_values[idx_start:idx_end]
|
| 292 |
image_grid_thw_part = image_grid_thw_chunk[tp_rank]
|
| 293 |
+
image_embedding_part = self.vision_tower(
|
| 294 |
+
pixel_values_part,
|
| 295 |
+
image_grid_thw_part,
|
| 296 |
+
bf16=(self.vision_tower.dtype == torch.bfloat16),  # attempt to fix errors on SM75 and earlier devices that do not support BF16
|
| 297 |
+
)
|
| 298 |
image_embedding[idx_start // merge_size_square : idx_end // merge_size_square] = image_embedding_part
|
| 299 |
|
| 300 |
group = get_tensor_model_parallel_group().device_group
|