Fix model loading (#17)

- Fix model loading (bd36ab0de70e8c918ebaef21ab674e45760e489b)
- Update README.md (333d6e0ec3210304dc43a4c7d75731378c4aa612)
- Update modeling_gme_qwen2vl.py (280b82673f3afce78487f0287755a7cc23199420)
- Update config.json (4174ed03d1015398958742e9dcc4c34277640a4f)
- Update README.md (dca92bddc7528a380b87c21f5383c8c856c27143)
- Update modeling_gme_qwen2vl.py (39953e5c545ac83822ad50cb3ef09214db7fbba5)
- Update README.md (62f5d69e260988e066a2e60c59180d25151d5e9b)

Files changed (3) hide show

README.md +12 -0
config.json +7 -4
modeling_gme_qwen2vl.py +39 -16

README.md CHANGED Viewed

@@ -3696,7 +3696,19 @@ The `GME` models support three types of input: **text**, **image**, and **image-
 **Transformers**
 ```python
 t2i_prompt = 'Find an image that matches the given text.'
 texts = [
     "The Tesla Cybertruck is a battery electric pickup truck built by Tesla, Inc. since 2023.",

 **Transformers**
+The remote code has known issues with `transformers>=4.52.0`; please downgrade (e.g. `pip install transformers==4.51.3`) or use `sentence_transformers` instead.
 ```python
+from transformers import AutoModel
+from transformers.utils.versions import require_version
+require_version(
+    "transformers<4.52.0",
+    "The remote code has some issues with transformers>=4.52.0, please downgrade: pip install transformers==4.51.3"
+)
 t2i_prompt = 'Find an image that matches the given text.'
 texts = [
     "The Tesla Cybertruck is a battery electric pickup truck built by Tesla, Inc. since 2023.",

config.json CHANGED Viewed

@@ -1,9 +1,12 @@
 {
   "_name_or_path": "Alibaba-NLP/gme-Qwen2-VL-2B-Instruct",
-  "architectures": ["GmeQwen2VLForVision2Seq"],
   "auto_map": {
-    "AutoModel": "modeling_gme_qwen2vl.GmeQwen2VLForVision2Seq",
-    "AutoConfig": "modeling_gme_qwen2vl.GmeQwen2VLConfig"
   },
   "attention_dropout": 0.0,
   "bos_token_id": 151643,
@@ -15,7 +18,7 @@
   "intermediate_size": 8960,
   "max_position_embeddings": 32768,
   "max_window_layers": 28,
-  "model_type": "gme_qwen2_vl",
   "num_attention_heads": 12,
   "num_hidden_layers": 28,
   "num_key_value_heads": 2,

 {
   "_name_or_path": "Alibaba-NLP/gme-Qwen2-VL-2B-Instruct",
+  "architectures": [
+    "Qwen2VLForConditionalGeneration",
+    "GmeQwen2VL"
+  ],
   "auto_map": {
+    "AutoConfig": "modeling_gme_qwen2vl.GmeQwen2VLConfig",
+    "AutoModel": "modeling_gme_qwen2vl.GmeQwen2VL"
   },
   "attention_dropout": 0.0,
   "bos_token_id": 151643,
   "intermediate_size": 8960,
   "max_position_embeddings": 32768,
   "max_window_layers": 28,
+  "model_type": "qwen2_vl",
   "num_attention_heads": 12,
   "num_hidden_layers": 28,
   "num_key_value_heads": 2,

modeling_gme_qwen2vl.py CHANGED Viewed

@@ -12,16 +12,25 @@ import torch
 from PIL import Image
 from torch.utils.data import DataLoader
 from tqdm.autonotebook import tqdm
-from transformers import (
-    AutoProcessor,
-    PreTrainedModel,
     Qwen2VLConfig,
     Qwen2VLForConditionalGeneration,
 )
-import os
 class GmeQwen2VLConfig(Qwen2VLConfig):
     def __init__(
         self,
         min_image_tokens: int = 256,
@@ -35,14 +44,25 @@ class GmeQwen2VLConfig(Qwen2VLConfig):
         self.max_length = max_length
-class GmeQwen2VLForVision2Seq(PreTrainedModel):
     config_class = GmeQwen2VLConfig
-    base_model_prefix: str = "base"
     def __init__(self, config: GmeQwen2VLConfig, **kwargs: Any) -> None:
         super().__init__(config)
-        self.base = Qwen2VLForConditionalGeneration.from_pretrained(config._name_or_path)
-        self.base.tie_weights()  # It's important to produce same outputs.
         min_pixels: int = config.min_image_tokens * 28 * 28
         max_pixels: int = config.max_image_tokens * 28 * 28
@@ -55,6 +75,9 @@ class GmeQwen2VLForVision2Seq(PreTrainedModel):
         self.default_instruction: str = "You are a helpful assistant."
         self.sep: str = " "
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
@@ -70,21 +93,21 @@ class GmeQwen2VLForVision2Seq(PreTrainedModel):
         **kwargs
     ) -> torch.Tensor:
         if inputs_embeds is None:
-            inputs_embeds = self.base.model.embed_tokens(input_ids)
             if pixel_values is not None:
-                pixel_values = pixel_values.type(self.base.visual.get_dtype())
-                image_embeds = self.base.visual(pixel_values, grid_thw=image_grid_thw).to(inputs_embeds.device)
-                image_mask = input_ids == self.base.config.image_token_id
                 inputs_embeds[image_mask] = image_embeds
             # if pixel_values_videos is not None:
-            #     pixel_values_videos = pixel_values_videos.type(self.base.visual.get_dtype())
-            #     video_embeds = self.base.visual(pixel_values_videos, grid_thw=video_grid_thw).to(inputs_embeds.device)
-            #     video_mask = input_ids == self.base.config.video_token_id
             #     inputs_embeds[video_mask] = video_embeds
             if attention_mask is not None:
                 attention_mask = attention_mask.to(inputs_embeds.device)
-        outputs = self.base.model(
             input_ids=None,
             position_ids=position_ids,
             attention_mask=attention_mask,

 from PIL import Image
 from torch.utils.data import DataLoader
 from tqdm.autonotebook import tqdm
+from transformers import AutoProcessor, PreTrainedModel
+from transformers.models.qwen2_vl.modeling_qwen2_vl import (
+    Qwen2VisionTransformerPretrainedModel,
     Qwen2VLConfig,
     Qwen2VLForConditionalGeneration,
+    Qwen2VLModel,
+)
+from transformers.utils.versions import require_version
+require_version(
+    "transformers<4.52.0",
+    "This code has some issues with transformers>=4.52.0, please downgrade: pip install transformers==4.51.3"
 )
 class GmeQwen2VLConfig(Qwen2VLConfig):
+    # model_type = ''
     def __init__(
         self,
         min_image_tokens: int = 256,
         self.max_length = max_length
+class GmeQwen2VL(PreTrainedModel):
     config_class = GmeQwen2VLConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Qwen2VLDecoderLayer", "Qwen2VLVisionBlock"]
+    # _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    # _supports_cache_class = True
+    _supports_static_cache = False  # TODO (joao): fix. torch.compile failing probably due to `cache_positions`
+    # _tied_weights_keys = ["lm_head.weight"]
     def __init__(self, config: GmeQwen2VLConfig, **kwargs: Any) -> None:
         super().__init__(config)
+        self.visual = Qwen2VisionTransformerPretrainedModel._from_config(config.vision_config)
+        self.model = Qwen2VLModel(config)
+        self.vocab_size = config.vocab_size
+        # self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.rope_deltas = None  # cache rope_deltas here
         min_pixels: int = config.min_image_tokens * 28 * 28
         max_pixels: int = config.max_image_tokens * 28 * 28
         self.default_instruction: str = "You are a helpful assistant."
         self.sep: str = " "
+        # Initialize weights and apply final processing
+        self.post_init()
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
         **kwargs
     ) -> torch.Tensor:
         if inputs_embeds is None:
+            inputs_embeds = self.model.get_input_embeddings()(input_ids)
             if pixel_values is not None:
+                pixel_values = pixel_values.type(self.visual.get_dtype())
+                image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw).to(inputs_embeds.device)
+                image_mask = input_ids == self.config.image_token_id
                 inputs_embeds[image_mask] = image_embeds
             # if pixel_values_videos is not None:
+            #     pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
+            #     video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw).to(inputs_embeds.device)
+            #     video_mask = input_ids == self.config.video_token_id
             #     inputs_embeds[video_mask] = video_embeds
             if attention_mask is not None:
                 attention_mask = attention_mask.to(inputs_embeds.device)
+        outputs = self.model(
             input_ids=None,
             position_ids=position_ids,
             attention_mask=attention_mask,