Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
 - added_tokens.json +47 -0
 - chat_template.jinja +15 -0
 - config.json +99 -0
 - configuration_dots.py +77 -0
 - generation_config.json +8 -0
 - merges.txt +0 -0
 - model.safetensors +3 -0
 - modeling_dots_ocr.py +131 -0
 - modeling_dots_vision.py +520 -0
 - special_tokens_map.json +31 -0
 - tokenizer.json +3 -0
 - tokenizer_config.json +395 -0
 - vocab.json +0 -0
 
    	
        .gitattributes
    CHANGED
    
    | 
         @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text 
     | 
|
| 33 | 
         
             
            *.zip filter=lfs diff=lfs merge=lfs -text
         
     | 
| 34 | 
         
             
            *.zst filter=lfs diff=lfs merge=lfs -text
         
     | 
| 35 | 
         
             
            *tfevents* filter=lfs diff=lfs merge=lfs -text
         
     | 
| 
         | 
| 
         | 
|
| 33 | 
         
             
            *.zip filter=lfs diff=lfs merge=lfs -text
         
     | 
| 34 | 
         
             
            *.zst filter=lfs diff=lfs merge=lfs -text
         
     | 
| 35 | 
         
             
            *tfevents* filter=lfs diff=lfs merge=lfs -text
         
     | 
| 36 | 
         
            +
            tokenizer.json filter=lfs diff=lfs merge=lfs -text
         
     | 
    	
        added_tokens.json
    ADDED
    
    | 
         @@ -0,0 +1,47 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            {
         
     | 
| 2 | 
         
            +
              "</tool_call>": 151658,
         
     | 
| 3 | 
         
            +
              "<tool_call>": 151657,
         
     | 
| 4 | 
         
            +
              "<|assistant|>": 151672,
         
     | 
| 5 | 
         
            +
              "<|box_end|>": 151649,
         
     | 
| 6 | 
         
            +
              "<|box_start|>": 151648,
         
     | 
| 7 | 
         
            +
              "<|endofassistant|>": 151673,
         
     | 
| 8 | 
         
            +
              "<|endofimg|>": 151667,
         
     | 
| 9 | 
         
            +
              "<|endofslice|>": 151682,
         
     | 
| 10 | 
         
            +
              "<|endofsystemprompt|>": 151669,
         
     | 
| 11 | 
         
            +
              "<|endoftext|>": 151643,
         
     | 
| 12 | 
         
            +
              "<|endofuser|>": 151671,
         
     | 
| 13 | 
         
            +
              "<|file_sep|>": 151664,
         
     | 
| 14 | 
         
            +
              "<|fim_middle|>": 151660,
         
     | 
| 15 | 
         
            +
              "<|fim_pad|>": 151662,
         
     | 
| 16 | 
         
            +
              "<|fim_prefix|>": 151659,
         
     | 
| 17 | 
         
            +
              "<|fim_suffix|>": 151661,
         
     | 
| 18 | 
         
            +
              "<|im_end|>": 151645,
         
     | 
| 19 | 
         
            +
              "<|im_start|>": 151644,
         
     | 
| 20 | 
         
            +
              "<|image_gen_end|>": 151687,
         
     | 
| 21 | 
         
            +
              "<|image_gen_start|>": 151686,
         
     | 
| 22 | 
         
            +
              "<|image_pad|>": 151655,
         
     | 
| 23 | 
         
            +
              "<|imgpad|>": 151665,
         
     | 
| 24 | 
         
            +
              "<|imgrowend|>": 151683,
         
     | 
| 25 | 
         
            +
              "<|img|>": 151666,
         
     | 
| 26 | 
         
            +
              "<|object_ref_end|>": 151647,
         
     | 
| 27 | 
         
            +
              "<|object_ref_start|>": 151646,
         
     | 
| 28 | 
         
            +
              "<|pictotext|>": 151679,
         
     | 
| 29 | 
         
            +
              "<|pic|>": 151677,
         
     | 
| 30 | 
         
            +
              "<|polygon_end|>": 151685,
         
     | 
| 31 | 
         
            +
              "<|polygon_start|>": 151684,
         
     | 
| 32 | 
         
            +
              "<|quad_end|>": 151651,
         
     | 
| 33 | 
         
            +
              "<|quad_start|>": 151650,
         
     | 
| 34 | 
         
            +
              "<|ref_end|>": 151675,
         
     | 
| 35 | 
         
            +
              "<|ref_start|>": 151674,
         
     | 
| 36 | 
         
            +
              "<|repo_name|>": 151663,
         
     | 
| 37 | 
         
            +
              "<|slice|>": 151681,
         
     | 
| 38 | 
         
            +
              "<|systemprompt|>": 151668,
         
     | 
| 39 | 
         
            +
              "<|text|>": 151678,
         
     | 
| 40 | 
         
            +
              "<|user|>": 151670,
         
     | 
| 41 | 
         
            +
              "<|video_pad|>": 151656,
         
     | 
| 42 | 
         
            +
              "<|vision_end|>": 151653,
         
     | 
| 43 | 
         
            +
              "<|vision_pad|>": 151654,
         
     | 
| 44 | 
         
            +
              "<|vision_start|>": 151652,
         
     | 
| 45 | 
         
            +
              "[PAD]": 151680,
         
     | 
| 46 | 
         
            +
              "[SEP]": 151676
         
     | 
| 47 | 
         
            +
            }
         
     | 
    	
        chat_template.jinja
    ADDED
    
    | 
         @@ -0,0 +1,15 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            {#- Chat template: renders `messages` in order.
                system    -> '<|system|>' + content + '<|endofsystem|>\n'
                user      -> '<|user|>' + content + '<|endofuser|>'
                assistant -> '<|assistant|>' + content, followed by '<|endofassistant|>'
                             unless it is the last message in the list.
                After the loop, '<|assistant|>' is appended when the final message is not
                from the assistant, so generation starts on an assistant turn.
                NOTE(review): '<|system|>' / '<|endofsystem|>' are not among the tokens listed
                in added_tokens.json (which has '<|systemprompt|>' / '<|endofsystemprompt|>');
                confirm these markers are intended to be tokenized as plain text. #}
            {%- for m in messages %}
         
     | 
| 2 | 
         
            +
                {%- if m.role == 'system' %}
         
     | 
| 3 | 
         
            +
                    {{- '<|system|>' + m.content + '<|endofsystem|>\n' }}
         
     | 
| 4 | 
         
            +
                {%- elif m.role == 'user' %}
         
     | 
| 5 | 
         
            +
                    {{- '<|user|>' + m.content + '<|endofuser|>' }}
         
     | 
| 6 | 
         
            +
                {%- elif m.role == 'assistant' %}
         
     | 
| 7 | 
         
            +
                    {{- '<|assistant|>' + m.content }}
         
     | 
| 8 | 
         
            +
                    {%- if not loop.last %}
         
     | 
| 9 | 
         
            +
                        {{- '<|endofassistant|>' }}
         
     | 
| 10 | 
         
            +
                    {%- endif %}
         
     | 
| 11 | 
         
            +
                {%- endif %}
         
     | 
| 12 | 
         
            +
            {%- endfor %}
         
     | 
| 13 | 
         
            +
            {%- if messages[-1].role != 'assistant' %}
         
     | 
| 14 | 
         
            +
                {{- '<|assistant|>' }}
         
     | 
| 15 | 
         
            +
            {%- endif %}
         
     | 
    	
        config.json
    ADDED
    
    | 
         @@ -0,0 +1,99 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            {
         
     | 
| 2 | 
         
            +
              "architectures": [
         
     | 
| 3 | 
         
            +
                "DotsOCRForCausalLM"
         
     | 
| 4 | 
         
            +
              ],
         
     | 
| 5 | 
         
            +
              "attention_bias": true,
         
     | 
| 6 | 
         
            +
              "attention_dropout": 0.0,
         
     | 
| 7 | 
         
            +
              "auto_map": {
         
     | 
| 8 | 
         
            +
                "AutoConfig": "configuration_dots.DotsOCRConfig",
         
     | 
| 9 | 
         
            +
                "AutoModelForCausalLM": "modeling_dots_ocr.DotsOCRForCausalLM"
         
     | 
| 10 | 
         
            +
              },
         
     | 
| 11 | 
         
            +
              "dtype": "float16",
         
     | 
| 12 | 
         
            +
              "hidden_act": "silu",
         
     | 
| 13 | 
         
            +
              "hidden_size": 1536,
         
     | 
| 14 | 
         
            +
              "image_token_id": 151665,
         
     | 
| 15 | 
         
            +
              "initializer_range": 0.02,
         
     | 
| 16 | 
         
            +
              "intermediate_size": 8960,
         
     | 
| 17 | 
         
            +
              "layer_types": [
         
     | 
| 18 | 
         
            +
                "full_attention",
         
     | 
| 19 | 
         
            +
                "full_attention",
         
     | 
| 20 | 
         
            +
                "full_attention",
         
     | 
| 21 | 
         
            +
                "full_attention",
         
     | 
| 22 | 
         
            +
                "full_attention",
         
     | 
| 23 | 
         
            +
                "full_attention",
         
     | 
| 24 | 
         
            +
                "full_attention",
         
     | 
| 25 | 
         
            +
                "full_attention",
         
     | 
| 26 | 
         
            +
                "full_attention",
         
     | 
| 27 | 
         
            +
                "full_attention",
         
     | 
| 28 | 
         
            +
                "full_attention",
         
     | 
| 29 | 
         
            +
                "full_attention",
         
     | 
| 30 | 
         
            +
                "full_attention",
         
     | 
| 31 | 
         
            +
                "full_attention",
         
     | 
| 32 | 
         
            +
                "full_attention",
         
     | 
| 33 | 
         
            +
                "full_attention",
         
     | 
| 34 | 
         
            +
                "full_attention",
         
     | 
| 35 | 
         
            +
                "full_attention",
         
     | 
| 36 | 
         
            +
                "full_attention",
         
     | 
| 37 | 
         
            +
                "full_attention",
         
     | 
| 38 | 
         
            +
                "full_attention",
         
     | 
| 39 | 
         
            +
                "full_attention",
         
     | 
| 40 | 
         
            +
                "full_attention",
         
     | 
| 41 | 
         
            +
                "full_attention",
         
     | 
| 42 | 
         
            +
                "full_attention",
         
     | 
| 43 | 
         
            +
                "full_attention",
         
     | 
| 44 | 
         
            +
                "full_attention",
         
     | 
| 45 | 
         
            +
                "full_attention"
         
     | 
| 46 | 
         
            +
              ],
         
     | 
| 47 | 
         
            +
              "max_position_embeddings": 131072,
         
     | 
| 48 | 
         
            +
              "max_window_layers": 28,
         
     | 
| 49 | 
         
            +
              "model_type": "dots_ocr",
         
     | 
| 50 | 
         
            +
              "num_attention_heads": 12,
         
     | 
| 51 | 
         
            +
              "num_hidden_layers": 28,
         
     | 
| 52 | 
         
            +
              "num_key_value_heads": 2,
         
     | 
| 53 | 
         
            +
              "quantization_config": {
         
     | 
| 54 | 
         
            +
                "_load_in_4bit": true,
         
     | 
| 55 | 
         
            +
                "_load_in_8bit": false,
         
     | 
| 56 | 
         
            +
                "bnb_4bit_compute_dtype": "bfloat16",
         
     | 
| 57 | 
         
            +
                "bnb_4bit_quant_storage": "uint8",
         
     | 
| 58 | 
         
            +
                "bnb_4bit_quant_type": "nf4",
         
     | 
| 59 | 
         
            +
                "bnb_4bit_use_double_quant": true,
         
     | 
| 60 | 
         
            +
                "llm_int8_enable_fp32_cpu_offload": false,
         
     | 
| 61 | 
         
            +
                "llm_int8_has_fp16_weight": false,
         
     | 
| 62 | 
         
            +
                "llm_int8_skip_modules": null,
         
     | 
| 63 | 
         
            +
                "llm_int8_threshold": 6.0,
         
     | 
| 64 | 
         
            +
                "load_in_4bit": true,
         
     | 
| 65 | 
         
            +
                "load_in_8bit": false,
         
     | 
| 66 | 
         
            +
                "quant_method": "bitsandbytes"
         
     | 
| 67 | 
         
            +
              },
         
     | 
| 68 | 
         
            +
              "rms_norm_eps": 1e-06,
         
     | 
| 69 | 
         
            +
              "rope_scaling": null,
         
     | 
| 70 | 
         
            +
              "rope_theta": 1000000,
         
     | 
| 71 | 
         
            +
              "sliding_window": null,
         
     | 
| 72 | 
         
            +
              "tie_word_embeddings": false,
         
     | 
| 73 | 
         
            +
              "transformers_version": "4.56.2",
         
     | 
| 74 | 
         
            +
              "use_cache": true,
         
     | 
| 75 | 
         
            +
              "use_sliding_window": false,
         
     | 
| 76 | 
         
            +
              "video_token_id": 151656,
         
     | 
| 77 | 
         
            +
              "vision_config": {
         
     | 
| 78 | 
         
            +
                "_attn_implementation_autoset": true,
         
     | 
| 79 | 
         
            +
                "attn_implementation": "flash_attention_2",
         
     | 
| 80 | 
         
            +
                "embed_dim": 1536,
         
     | 
| 81 | 
         
            +
                "gradient_checkpointing": false,
         
     | 
| 82 | 
         
            +
                "hidden_size": 1536,
         
     | 
| 83 | 
         
            +
                "init_merger_std": 0.02,
         
     | 
| 84 | 
         
            +
                "initializer_range": 0.02,
         
     | 
| 85 | 
         
            +
                "intermediate_size": 4224,
         
     | 
| 86 | 
         
            +
                "is_causal": false,
         
     | 
| 87 | 
         
            +
                "model_type": "dots_vit",
         
     | 
| 88 | 
         
            +
                "num_attention_heads": 12,
         
     | 
| 89 | 
         
            +
                "num_channels": 3,
         
     | 
| 90 | 
         
            +
                "num_hidden_layers": 42,
         
     | 
| 91 | 
         
            +
                "patch_size": 14,
         
     | 
| 92 | 
         
            +
                "post_norm": true,
         
     | 
| 93 | 
         
            +
                "rms_norm_eps": 1e-05,
         
     | 
| 94 | 
         
            +
                "spatial_merge_size": 2,
         
     | 
| 95 | 
         
            +
                "temporal_patch_size": 1,
         
     | 
| 96 | 
         
            +
                "use_bias": false
         
     | 
| 97 | 
         
            +
              },
         
     | 
| 98 | 
         
            +
              "vocab_size": 151936
         
     | 
| 99 | 
         
            +
            }
         
     | 
    	
        configuration_dots.py
    ADDED
    
    | 
         @@ -0,0 +1,77 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            from typing import Any, Optional
         
     | 
| 2 | 
         
            +
            from transformers.configuration_utils import PretrainedConfig
         
     | 
| 3 | 
         
            +
            from transformers.models.qwen2 import Qwen2Config
         
     | 
| 4 | 
         
            +
            from transformers import Qwen2_5_VLProcessor, AutoProcessor
         
     | 
| 5 | 
         
            +
            from transformers.models.auto.configuration_auto import CONFIG_MAPPING
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
             
     | 
| 8 | 
         
            +
            class DotsVisionConfig(PretrainedConfig):
                """Configuration holder for the ``dots_vit`` vision encoder.

                Each constructor argument is stored unchanged as an attribute of the
                same name. Any additional keyword arguments are handed to
                ``PretrainedConfig.__init__``.
                """

                model_type: str = "dots_vit"

                def __init__(
                    self,
                    embed_dim: int = 1536,  # vision encoder embed size
                    hidden_size: int = 1536,  # after merger hidden size
                    intermediate_size: int = 4224,
                    num_hidden_layers: int = 42,
                    num_attention_heads: int = 12,
                    num_channels: int = 3,
                    patch_size: int = 14,
                    spatial_merge_size: int = 2,
                    temporal_patch_size: int = 1,
                    rms_norm_eps: float = 1e-5,
                    use_bias: bool = False,
                    attn_implementation="flash_attention_2",  # "eager","sdpa","flash_attention_2"
                    initializer_range=0.02,
                    init_merger_std=0.02,
                    is_causal=False,  # NOTE(review): presumably causal attention in the vision encoder - confirm
                    post_norm=True,
                    gradient_checkpointing=False,
                    **kwargs: Any,
                ):
                    # The base class consumes the generic config kwargs before the
                    # vision-specific attributes are attached.
                    super().__init__(**kwargs)

                    # Layer widths and depth.
                    self.embed_dim, self.hidden_size = embed_dim, hidden_size
                    self.intermediate_size, self.num_hidden_layers = (
                        intermediate_size,
                        num_hidden_layers,
                    )
                    self.num_attention_heads = num_attention_heads

                    # Input patching parameters.
                    self.num_channels, self.patch_size = num_channels, patch_size
                    self.spatial_merge_size, self.temporal_patch_size = (
                        spatial_merge_size,
                        temporal_patch_size,
                    )

                    # Normalisation, bias and attention backend.
                    self.rms_norm_eps, self.use_bias = rms_norm_eps, use_bias
                    self.attn_implementation = attn_implementation

                    # Weight-initialisation scales.
                    self.initializer_range, self.init_merger_std = (
                        initializer_range,
                        init_merger_std,
                    )

                    # Remaining behaviour flags, stored as given.
                    self.is_causal, self.post_norm = is_causal, post_norm
                    self.gradient_checkpointing = gradient_checkpointing
         
     | 
| 50 | 
         
            +
             
     | 
| 51 | 
         
            +
             
     | 
| 52 | 
         
            +
             
     | 
| 53 | 
         
            +
            class DotsOCRConfig(Qwen2Config):
                """Qwen2 configuration extended with vision settings for ``dots_ocr``.

                Args:
                    image_token_id: Stored as ``self.image_token_id``.
                    video_token_id: Stored as ``self.video_token_id``.
                    vision_config: Either ``None`` (use ``DotsVisionConfig`` defaults),
                        a dict of ``DotsVisionConfig`` keyword arguments, or an
                        already-built ``DotsVisionConfig`` instance.
                    *args, **kwargs: Forwarded to ``Qwen2Config.__init__``.
                """

                model_type = "dots_ocr"

                def __init__(self,
                    image_token_id = 151665,
                    video_token_id = 151656,
                    vision_config: Optional[dict] = None, *args, **kwargs):
                    super().__init__(*args, **kwargs)
                    self.image_token_id = image_token_id
                    self.video_token_id = video_token_id
                    if isinstance(vision_config, DotsVisionConfig):
                        # An instance cannot be expanded with ``**``; keep it as-is so
                        # configs built programmatically (or copied) do not raise TypeError.
                        self.vision_config = vision_config
                    else:
                        # ``None`` and ``{}`` both fall back to the DotsVisionConfig defaults.
                        self.vision_config = DotsVisionConfig(**(vision_config or {}))

                def save_pretrained(self, save_directory, **kwargs):
                    """Save the config to ``save_directory`` after clearing ``_auto_class``.

                    ``_auto_class`` is reset to ``None`` on this instance before the
                    parent implementation is called with the same arguments.
                    """
                    self._auto_class = None
                    super().save_pretrained(save_directory, **kwargs)
         
     | 
| 67 | 
         
            +
             
     | 
| 68 | 
         
            +
             
     | 
| 69 | 
         
            +
            class DotsVLProcessor(Qwen2_5_VLProcessor):
                """``Qwen2_5_VLProcessor`` variant that pins the dots image placeholder.

                After the parent constructor runs, ``image_token`` and
                ``image_token_id`` are taken from the tokenizer when it exposes
                them, and otherwise default to ``"<|imgpad|>"`` and ``151665``.
                """

                def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
                    # Extra keyword arguments are accepted but not forwarded to the parent.
                    super().__init__(image_processor, tokenizer, chat_template=chat_template)

                    # Prefer the tokenizer's own values; fall back to the fixed defaults.
                    fallback_token = "<|imgpad|>"
                    fallback_token_id = 151665
                    self.image_token = getattr(tokenizer, "image_token", fallback_token)
                    self.image_token_id = getattr(tokenizer, "image_token_id", fallback_token_id)
         
     | 
| 74 | 
         
            +
             
     | 
| 75 | 
         
            +
             
     | 
| 76 | 
         
            +
            # Import-time side effect: register DotsVLProcessor with AutoProcessor
            # under the "dots_ocr" key.
            AutoProcessor.register("dots_ocr", DotsVLProcessor)
         
     | 
| 77 | 
         
            +
            # Import-time side effect: map the "dots_ocr" model type to DotsOCRConfig
            # in the auto-config mapping.
            CONFIG_MAPPING.register("dots_ocr", DotsOCRConfig)
         
     | 
    	
        generation_config.json
    ADDED
    
    | 
         @@ -0,0 +1,8 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            {
         
     | 
| 2 | 
         
            +
              "eos_token_id": [
         
     | 
| 3 | 
         
            +
                151643,
         
     | 
| 4 | 
         
            +
                151673
         
     | 
| 5 | 
         
            +
              ],
         
     | 
| 6 | 
         
            +
              "max_length": 32768,
         
     | 
| 7 | 
         
            +
              "transformers_version": "4.56.2"
         
     | 
| 8 | 
         
            +
            }
         
     | 
    	
        merges.txt
    ADDED
    
    | 
         The diff for this file is too large to render. 
		See raw diff 
     | 
| 
         | 
    	
        model.safetensors
    ADDED
    
    | 
         @@ -0,0 +1,3 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:4981cbc9901ae21fa5e7716bdb53f156b62016db3a9d0f70b374a8b68fdc1b85
         
     | 
| 3 | 
         
            +
            size 2263132677
         
     | 
    	
        modeling_dots_ocr.py
    ADDED
    
    | 
         @@ -0,0 +1,131 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            from typing import List, Optional, Tuple, Union
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
            +
            import torch
         
     | 
| 4 | 
         
            +
            from transformers.modeling_outputs import CausalLMOutputWithPast
         
     | 
| 5 | 
         
            +
            from transformers.models.qwen2 import Qwen2ForCausalLM
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
            from .configuration_dots import DotsVisionConfig, DotsOCRConfig
         
     | 
| 8 | 
         
            +
            from .modeling_dots_vision import DotsVisionTransformer
         
     | 
| 9 | 
         
            +
             
     | 
| 10 | 
         
            +
             
     | 
| 11 | 
         
            +
            # Soft cap on the number of images per call: prepare_inputs_embeds only prints a
            # warning (it does not raise) when grid_thw has more rows than this.
            DOTS_VLM_MAX_IMAGES = 200
         
     | 
| 12 | 
         
            +
             
     | 
| 13 | 
         
            +
             
     | 
| 14 | 
         
            +
            class DotsOCRForCausalLM(Qwen2ForCausalLM):
                """Qwen2 causal LM extended with a ``DotsVisionTransformer`` vision tower.

                Positions of ``input_ids`` equal to ``config.image_token_id`` have their
                token embeddings overwritten with the vision tower's output before the
                sequence is handed to the parent ``Qwen2ForCausalLM.forward``.
                """

                config_class = DotsOCRConfig

                def __init__(self, config: DotsOCRConfig):
                    """Build the language model, then the vision tower from ``config.vision_config``."""
                    super().__init__(config)

                    # vision_config may be a plain dict; normalise it to a DotsVisionConfig
                    # and store the object back on the config so later readers see one type.
                    vision_config = self.config.vision_config
                    if isinstance(vision_config, dict):
                        vision_config = DotsVisionConfig(**vision_config)
                        self.config.vision_config = vision_config

                    self.vision_tower = DotsVisionTransformer(vision_config)

                def prepare_inputs_embeds(
                    self,
                    input_ids: torch.LongTensor,
                    pixel_values: Optional[torch.FloatTensor] = None,
                    grid_thw: Optional[torch.FloatTensor] = None,
                    img_mask: Optional[torch.BoolTensor] = None,
                ) -> torch.Tensor:
                    """Embed ``input_ids`` and scatter vision embeddings into the image slots.

                    Args:
                        input_ids: Token ids to embed with the model's input embedding layer.
                        pixel_values: Image input for the vision tower. When ``None`` the plain
                            token embeddings are returned unchanged.
                        grid_thw: Passed to the vision tower together with ``pixel_values``;
                            its first dimension is compared against ``DOTS_VLM_MAX_IMAGES``.
                        img_mask: Boolean mask marking the positions to overwrite. Required
                            when ``pixel_values`` is given. It is indexed as a 2-D tensor in
                            the truncation path.

                    Returns:
                        The input embeddings, with masked positions replaced by vision
                        embeddings when ``pixel_values`` is provided.

                    Raises:
                        AssertionError: If ``img_mask`` is missing, or if the number of
                            masked positions (after truncation) differs from the number of
                            vision embeddings.
                    """
                    inputs_embeds = self.get_input_embeddings()(input_ids)

                    if pixel_values is None:
                        return inputs_embeds

                    assert img_mask is not None
                    if grid_thw.shape[0] > DOTS_VLM_MAX_IMAGES:
                        print(
                            f"Num image exceeded: {grid_thw.shape[0]} > {DOTS_VLM_MAX_IMAGES}, which may cause FSDP hang"
                        )

                    vision_embeddings = self.vision_tower(pixel_values, grid_thw)

                    # Keep the (num_true, ndim) layout returned by nonzero. The original
                    # called .squeeze() here, which collapsed a single hit to shape (ndim,)
                    # so len() reported ndim instead of 1 and the [:, 0] indexing below failed.
                    true_indices = torch.nonzero(img_mask)
                    if len(true_indices) > vision_embeddings.size(0):
                        print(
                            f"img_mask sum > VE and will be truncated, mask.sum()={len(true_indices)} {vision_embeddings.size(0)=}"
                        )
                        # Keep only the first size(0) masked positions (nonzero order) so the
                        # mask matches the number of available vision embeddings.
                        true_indices = true_indices[: vision_embeddings.size(0)]
                        new_img_mask = torch.zeros_like(img_mask, device=img_mask.device)
                        new_img_mask[true_indices[:, 0], true_indices[:, 1]] = True
                    else:
                        new_img_mask = img_mask

                    assert (
                        vision_embeddings.size(0) == new_img_mask.sum()
                    ), f"{vision_embeddings.size(0)=}, {new_img_mask.sum()=}"

                    # masked_scatter fills the True positions, in order, from the source
                    # tensor; the mask is broadcast over the hidden dimension.
                    inputs_embeds = inputs_embeds.masked_scatter(
                        new_img_mask.to(inputs_embeds.device).unsqueeze(-1).expand_as(inputs_embeds),
                        vision_embeddings.to(inputs_embeds.device).type(inputs_embeds.dtype),
                    )

                    return inputs_embeds

                def forward(
                    self,
                    input_ids: torch.LongTensor,
                    pixel_values: Optional[torch.FloatTensor] = None,
                    image_grid_thw: Optional[torch.FloatTensor] = None,
                    inputs_embeds: Optional[torch.Tensor] = None,
                    attention_mask: Optional[torch.Tensor] = None,
                    position_ids: Optional[torch.LongTensor] = None,
                    past_key_values: Optional[List[torch.FloatTensor]] = None,
                    labels: Optional[torch.LongTensor] = None,
                    output_attentions: Optional[bool] = None,
                    output_hidden_states: Optional[bool] = None,
                    return_dict: Optional[bool] = None,
                    use_cache: Optional[bool] = None,
                    logits_to_keep: int = 0,
                    **loss_kwargs,
                ) -> Union[Tuple, CausalLMOutputWithPast]:
                    """Run the decoder on text tokens merged with image embeddings.

                    When ``inputs_embeds`` is not supplied it is built from ``input_ids`` via
                    ``prepare_inputs_embeds``, using ``input_ids == config.image_token_id`` as
                    the image mask. Everything else is forwarded to the parent ``forward``.

                    ``return_dict`` is accepted for interface compatibility but is not
                    forwarded to the parent (the original code had that argument commented out).
                    NOTE(review): presumably deliberate for the targeted transformers version - confirm.

                    Raises:
                        ValueError: If both ``input_ids`` and ``inputs_embeds`` are ``None``.
                        AssertionError: If ``input_ids`` is given but empty.
                    """
                    if input_ids is not None:
                        assert len(input_ids) >= 1, f"empty input_ids {input_ids.shape=} will cause gradnorm nan"

                    if inputs_embeds is None:
                        # input_ids may legitimately be None when the caller supplies
                        # precomputed embeddings, but at least one of the two is required.
                        if input_ids is None:
                            raise ValueError("You must specify either input_ids or inputs_embeds")
                        img_mask = input_ids == self.config.image_token_id
                        inputs_embeds = self.prepare_inputs_embeds(input_ids, pixel_values, image_grid_thw, img_mask)

                    outputs = super().forward(
                        inputs_embeds=inputs_embeds,
                        attention_mask=attention_mask,
                        position_ids=position_ids,
                        past_key_values=past_key_values,
                        labels=labels,
                        use_cache=use_cache if use_cache is not None else self.config.use_cache,
                        output_attentions=output_attentions,
                        output_hidden_states=output_hidden_states,
                        logits_to_keep=logits_to_keep,
                        **loss_kwargs,
                    )

                    return outputs

                def prepare_inputs_for_generation(
                    self,
                    input_ids,
                    past_key_values=None,
                    inputs_embeds=None,
                    pixel_values=None,
                    attention_mask=None,
                    cache_position=None,
                    num_logits_to_keep=None,
                    **kwargs,
                ):
                    """Assemble per-step generation inputs, attaching ``pixel_values`` on the first step only.

                    Delegates to the parent implementation, then adds ``pixel_values`` to the
                    returned dict when ``cache_position[0] == 0``. On other steps the key is
                    not added by this method.
                    """
                    model_inputs = super().prepare_inputs_for_generation(
                        input_ids,
                        past_key_values=past_key_values,
                        inputs_embeds=inputs_embeds,
                        attention_mask=attention_mask,
                        cache_position=cache_position,
                        num_logits_to_keep=num_logits_to_keep,
                        **kwargs,
                    )

                    if cache_position[0] == 0:
                        model_inputs["pixel_values"] = pixel_values

                    return model_inputs
         
     | 
    	
        modeling_dots_vision.py
    ADDED
    
    | 
         @@ -0,0 +1,520 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            import math
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
            +
            import torch
         
     | 
| 4 | 
         
            +
            import torch.nn as nn
         
     | 
| 5 | 
         
            +
            import torch.nn.functional as F
         
     | 
| 6 | 
         
            +
            import torch.utils.checkpoint
         
     | 
| 7 | 
         
            +
             
     | 
| 8 | 
         
            +
            flash_attn_available = True
         
     | 
| 9 | 
         
            +
            npu_available = True
         
     | 
| 10 | 
         
            +
             
     | 
| 11 | 
         
            +
            try:
         
     | 
| 12 | 
         
            +
                from flash_attn import flash_attn_varlen_func
         
     | 
| 13 | 
         
            +
            except ImportError:
         
     | 
| 14 | 
         
            +
                flash_attn_available = False
         
     | 
| 15 | 
         
            +
             
     | 
| 16 | 
         
            +
            from torch.nn import LayerNorm
         
     | 
| 17 | 
         
            +
            from transformers.modeling_utils import PreTrainedModel
         
     | 
| 18 | 
         
            +
            from .configuration_dots import DotsVisionConfig
         
     | 
| 19 | 
         
            +
             
     | 
| 20 | 
         
            +
            try:
         
     | 
| 21 | 
         
            +
                import torch_npu
         
     | 
| 22 | 
         
            +
            except ImportError:
         
     | 
| 23 | 
         
            +
                npu_available = False
         
     | 
| 24 | 
         
            +
             
     | 
| 25 | 
         
            +
             
     | 
| 26 | 
         
            +
            def rotate_half(x):
                """Split the last dimension at its midpoint and return ``[-second_half, first_half]``."""
                midpoint = x.shape[-1] // 2
                front = x[..., :midpoint]
                back = x[..., midpoint:]
                return torch.cat((-back, front), dim=-1)
         
     | 
| 31 | 
         
            +
             
     | 
| 32 | 
         
            +
             
     | 
| 33 | 
         
            +
            def apply_rotary_pos_emb_vision(tensor: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
                """Apply rotary position embedding to ``tensor`` using the angles in ``freqs``.

                The computation runs in float32 and the result is cast back to the
                dtype ``tensor`` had on entry.

                Args:
                    tensor: Values to rotate.
                    freqs: Rotation angles; their cosine and sine are duplicated along the
                        last axis and given two singleton axes so they broadcast against
                        ``tensor``. Assumes ``freqs`` is (seq, head_dim // 2) and ``tensor``
                        is (1, seq, heads, head_dim) - TODO confirm against callers.

                Returns:
                    ``tensor * cos + rotate_half(tensor) * sin`` in the original dtype.
                """
                input_dtype = tensor.dtype
                values = tensor.float()

                def _broadcastable(table: torch.Tensor) -> torch.Tensor:
                    # Insert an axis at position 1, repeat the last axis twice, then prepend an axis.
                    return table.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()

                cos_table = _broadcastable(freqs.cos())
                sin_table = _broadcastable(freqs.sin())

                rotated = (values * cos_table) + (rotate_half(values) * sin_table)
                return rotated.to(input_dtype)
         
     | 
| 48 | 
         
            +
             
     | 
| 49 | 
         
            +
             
     | 
| 50 | 
         
            +
            class VisionRotaryEmbedding(nn.Module):
                """Builds the rotary angle table: outer product of positions and inverse frequencies."""

                def __init__(self, dim: int, theta: float = 10000.0) -> None:
                    """Precompute ``1 / theta ** (i / dim)`` for ``i = 0, 2, 4, ... < dim``."""
                    super().__init__()
                    exponents = torch.arange(0, dim, 2, dtype=torch.float) / dim
                    inverse_frequencies = 1.0 / (theta ** exponents)
                    # persistent=False keeps the buffer out of the module's state_dict.
                    self.register_buffer("inv_freq", inverse_frequencies, persistent=False)

                def forward(self, seqlen: int) -> torch.Tensor:
                    """Return a ``(seqlen, len(inv_freq))`` tensor of ``position * inv_freq`` angles."""
                    positions = torch.arange(
                        seqlen,
                        device=self.inv_freq.device,
                        dtype=self.inv_freq.dtype,
                    )
                    return torch.outer(positions, self.inv_freq)
         
     | 
| 60 | 
         
            +
             
     | 
| 61 | 
         
            +
             
     | 
| 62 | 
         
            +
            class PatchMerger(nn.Module):
                """Optionally normalises patch features, groups them, and projects them with a 2-layer MLP.

                Rows of the input are regrouped with ``view(-1, hidden_size)`` where
                ``hidden_size = context_dim * spatial_merge_size ** 2``, then passed through
                ``Linear -> GELU -> Linear`` to produce ``dim`` output features per group.
                """

                def __init__(
                    self,
                    dim: int,
                    context_dim: int,
                    spatial_merge_size: int = 2,
                    pre_norm="layernorm",
                    init_merger_std=None,
                ) -> None:
                    """Create the optional pre-norm and the merger MLP.

                    Args:
                        dim: Output feature size of the MLP.
                        context_dim: Feature size of each incoming row (and of the pre-norm).
                        spatial_merge_size: ``spatial_merge_size ** 2`` incoming rows are
                            concatenated into one MLP input row.
                        pre_norm: ``"layernorm"`` or ``"rmsnorm"`` selects the norm applied
                            before merging; any other value disables the norm.
                        init_merger_std: When not ``None``, both linear layers get
                            normal(0, std) weights and zero biases.
                    """
                    super().__init__()
                    self.hidden_size = context_dim * (spatial_merge_size ** 2)
                    self.pre_norm = pre_norm
                    if self.pre_norm == "layernorm":
                        self.ln_q = LayerNorm(context_dim, eps=1e-6)
                    elif self.pre_norm == "rmsnorm":
                        self.ln_q = RMSNorm(context_dim, eps=1e-6)
                    else:
                        print("no norm in patch merger")
                        # Always define ln_q so forward can branch on it. Previously an
                        # unrecognised but truthy pre_norm (e.g. "none") left ln_q undefined
                        # and forward raised AttributeError.
                        self.ln_q = None

                    self.mlp = nn.Sequential(
                        nn.Linear(self.hidden_size, self.hidden_size),
                        nn.GELU(),
                        nn.Linear(self.hidden_size, dim),
                    )

                    if init_merger_std is not None:
                        # Indices 0 and 2 are the two Linear layers; index 1 is the GELU.
                        for layer in (self.mlp[0], self.mlp[2]):
                            nn.init.normal_(layer.weight, mean=0.0, std=init_merger_std)
                            nn.init.zeros_(layer.bias)

                def forward(self, x: torch.Tensor) -> torch.Tensor:
                    """Normalise (if configured), regroup to ``(-1, hidden_size)`` and apply the MLP."""
                    if self.ln_q is not None:
                        x = self.ln_q(x)
                    return self.mlp(x.view(-1, self.hidden_size))
         
     | 
| 99 | 
         
            +
             
     | 
| 100 | 
         
            +
             
     | 
| 101 | 
         
            +
            class VisionAttention(nn.Module):
                """Eager multi-head self-attention over a packed token sequence.

                ``hidden_states`` holds several segments concatenated along dim 0 and
                ``cu_seqlens`` holds the segment boundaries. A dense additive mask keeps
                every token from attending outside its own segment.
                """

                def __init__(self, config, dim: int, num_heads: int = 16, bias=True) -> None:
                    # ``config`` is accepted but not read by this implementation.
                    super().__init__()
                    self.num_heads = num_heads
                    self.head_dim = dim // num_heads
                    self.qkv = nn.Linear(dim, dim * 3, bias=bias)
                    self.proj = nn.Linear(dim, dim, bias=bias)

                def forward(
                        self,
                        hidden_states: torch.Tensor,
                        cu_seqlens: torch.Tensor,
                        rotary_pos_emb: torch.Tensor = None,
                ) -> torch.Tensor:
                    """Return the projected attention output, one row per input token."""
                    n_tokens = hidden_states.shape[0]

                    # Fused projection -> three tensors of shape (n_tokens, num_heads, head_dim).
                    fused = self.qkv(hidden_states).reshape(n_tokens, 3, self.num_heads, -1)
                    q, k, v = fused.permute(1, 0, 2, 3).unbind(0)
                    q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
                    k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)

                    # Start fully masked (most negative finite value), then open the
                    # square block on the diagonal that belongs to each segment.
                    mask = torch.full(
                        [1, n_tokens, n_tokens], torch.finfo(q.dtype).min, device=q.device, dtype=q.dtype
                    )
                    for start, end in zip(cu_seqlens[:-1], cu_seqlens[1:]):
                        mask[..., start:end, start:end] = 0

                    # Move heads to the leading dim: (num_heads, n_tokens, head_dim).
                    q_h = q.transpose(0, 1)
                    k_h = k.transpose(0, 1)
                    v_h = v.transpose(0, 1)

                    scores = torch.matmul(q_h, k_h.transpose(1, 2)) / math.sqrt(self.head_dim)
                    scores = scores + mask
                    # Softmax is evaluated in float32 and cast back to the working dtype.
                    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(q_h.dtype)

                    context = torch.matmul(probs, v_h).transpose(0, 1).reshape(n_tokens, -1)
                    return self.proj(context)
         
     | 
| 138 | 
         
            +
             
     | 
| 139 | 
         
            +
             
     | 
| 140 | 
         
            +
            class VisionFlashAttention2(nn.Module):
                """Self-attention over a packed sequence via ``flash_attn_varlen_func``.

                Segment boundaries are passed straight through as ``cu_seqlens``, so no
                dense attention mask is built here.
                """

                def __init__(self, config, dim: int, num_heads: int = 16, bias=True) -> None:
                    super().__init__()
                    self.num_heads = num_heads
                    self.qkv = nn.Linear(dim, dim * 3, bias=bias)
                    self.proj = nn.Linear(dim, dim, bias=bias)
                    self.config = config
                    # Forwarded as the ``causal`` argument of the flash-attention call.
                    self.is_causal = config.is_causal

                def forward(
                        self,
                        hidden_states: torch.Tensor,
                        cu_seqlens: torch.Tensor,
                        rotary_pos_emb: torch.Tensor = None,
                ) -> torch.Tensor:
                    """Return the projected attention output, one row per input token."""
                    n_tokens = hidden_states.shape[0]

                    # Each of q/k/v ends up as (n_tokens, num_heads, head_dim), i.e. 'shd'.
                    fused = self.qkv(hidden_states).reshape(n_tokens, 3, self.num_heads, -1)
                    q, k, v = fused.permute(1, 0, 2, 3).unbind(0)
                    q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
                    k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)

                    # Longest segment length, as a Python number.
                    segment_lengths = cu_seqlens[1:] - cu_seqlens[:-1]
                    max_seqlen = segment_lengths.max().item()

                    context = flash_attn_varlen_func(
                        q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen, causal=self.is_causal
                    )
                    return self.proj(context.reshape(n_tokens, -1))
         
     | 
| 168 | 
         
            +
             
     | 
| 169 | 
         
            +
             
     | 
| 170 | 
         
            +
            class VisionAttentionV2(nn.Module):
                """Eager self-attention evaluated one segment at a time.

                Instead of one mask over the whole packed sequence, q/k/v are split at
                the ``cu_seqlens`` boundaries and attention is computed per segment.
                """

                def __init__(self, config, dim: int, num_heads: int = 16, bias=True) -> None:
                    # ``config`` is accepted but not read by this implementation.
                    super().__init__()
                    self.num_heads = num_heads
                    self.head_dim = dim // num_heads
                    self.qkv = nn.Linear(dim, dim * 3, bias=bias)
                    self.proj = nn.Linear(dim, dim, bias=bias)

                def forward(
                        self,
                        hidden_states: torch.Tensor,
                        cu_seqlens: torch.Tensor,
                        rotary_pos_emb: torch.Tensor = None,
                ) -> torch.Tensor:
                    """Return the projected attention output, one row per input token."""
                    n_tokens = hidden_states.shape[0]

                    fused = self.qkv(hidden_states).reshape(n_tokens, 3, self.num_heads, -1)
                    q, k, v = fused.permute(1, 0, 2, 3).unbind(0)
                    q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
                    k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)

                    segment_lengths = torch.diff(cu_seqlens).tolist()
                    segments = zip(
                        torch.split(q, segment_lengths, 0),
                        torch.split(k, segment_lengths, 0),
                        torch.split(v, segment_lengths, 0),
                    )

                    # Eager attention needs O(n^2) memory with n = batch_size * seq_len, so
                    # long packed sequences easily run out of memory. Splitting by segment
                    # reduces the memory requirement, at the cost of being slower than
                    # continuous batching.
                    scale = math.sqrt(self.head_dim)
                    per_segment = []
                    for q_seg, k_seg, v_seg in segments:
                        # (seg_len, num_heads, head_dim) -> (num_heads, seg_len, head_dim)
                        q_h = q_seg.transpose(0, 1)
                        k_h = k_seg.transpose(0, 1)
                        v_h = v_seg.transpose(0, 1)
                        scores = torch.matmul(q_h, k_h.transpose(1, 2)) / scale
                        probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(q.dtype)
                        per_segment.append(torch.matmul(probs, v_h).transpose(0, 1))

                    context = torch.concat(per_segment, dim=0).reshape(n_tokens, -1)
                    return self.proj(context)
         
     | 
| 212 | 
         
            +
             
     | 
| 213 | 
         
            +
             
     | 
| 214 | 
         
            +
            class VisionAscendAttention(nn.Module):
                """Self-attention dispatched to ``torch_npu.npu_prompt_flash_attention``.

                A boolean mask is built that is ``True`` everywhere except inside the
                diagonal block of each segment delimited by ``cu_seqlens``.
                """

                def __init__(self, config, dim: int, num_heads: int = 16, bias=True) -> None:
                    super().__init__()
                    self.num_heads = num_heads
                    self.head_dim = dim // num_heads
                    self.qkv = nn.Linear(dim, dim * 3, bias=bias)
                    self.proj = nn.Linear(dim, dim, bias=bias)
                    self.config = config

                def forward(
                        self,
                        hidden_states: torch.Tensor,
                        cu_seqlens: torch.Tensor,
                        rotary_pos_emb: torch.Tensor = None,
                ) -> torch.Tensor:
                    """Return the projected attention output, one row per input token."""
                    n_tokens = hidden_states.shape[0]

                    fused = self.qkv(hidden_states).reshape(n_tokens, 3, self.num_heads, -1)
                    q, k, v = fused.permute(1, 0, 2, 3).unbind(0)
                    q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
                    k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)

                    # All True, then False inside every segment's own block.
                    mask = torch.ones([1, n_tokens, n_tokens], device=q.device, dtype=torch.bool)
                    for start, end in zip(cu_seqlens[:-1], cu_seqlens[1:]):
                        mask[..., start:end, start:end] = False

                    # (n_tokens, num_heads, head_dim) -> (1, num_heads, n_tokens, head_dim),
                    # matching the "BNSD" layout requested below.
                    q_b = q.transpose(0, 1).unsqueeze(0)
                    k_b = k.transpose(0, 1).unsqueeze(0)
                    v_b = v.transpose(0, 1).unsqueeze(0)

                    context = torch_npu.npu_prompt_flash_attention(
                        q_b,
                        k_b,
                        v_b,
                        atten_mask=mask,
                        num_heads=self.num_heads,
                        input_layout="BNSD",
                        scale_value=self.head_dim ** -0.5,
                    )
                    context = context.squeeze(0).transpose(0, 1).reshape(n_tokens, -1)
                    return self.proj(context)
         
     | 
| 251 | 
         
            +
             
     | 
| 252 | 
         
            +
             
     | 
| 253 | 
         
            +
            class VisionSdpaAttention(nn.Module):
                """Self-attention via ``F.scaled_dot_product_attention``.

                A boolean mask is built that is ``True`` only inside the diagonal block
                of each segment delimited by ``cu_seqlens``; the call is pinned to the
                memory-efficient SDPA backend.
                """

                def __init__(self, config, dim: int, num_heads: int = 16, bias=True) -> None:
                    super().__init__()
                    self.num_heads = num_heads
                    self.qkv = nn.Linear(dim, dim * 3, bias=bias)
                    self.proj = nn.Linear(dim, dim, bias=bias)
                    self.config = config

                def forward(
                        self,
                        hidden_states: torch.Tensor,
                        cu_seqlens: torch.Tensor,
                        rotary_pos_emb: torch.Tensor = None,
                ) -> torch.Tensor:
                    """Return the projected attention output, one row per input token."""
                    n_tokens = hidden_states.shape[0]

                    fused = self.qkv(hidden_states).reshape(n_tokens, 3, self.num_heads, -1)
                    q, k, v = fused.permute(1, 0, 2, 3).unbind(0)
                    q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
                    k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)

                    # All False, then True inside every segment's own block.
                    mask = torch.zeros([1, n_tokens, n_tokens], device=q.device, dtype=torch.bool)
                    for start, end in zip(cu_seqlens[:-1], cu_seqlens[1:]):
                        mask[..., start:end, start:end] = True

                    # SDPA wants 4D inputs: (1, num_heads, n_tokens, head_dim).
                    q_b = q.transpose(0, 1).unsqueeze(0)
                    k_b = k.transpose(0, 1).unsqueeze(0)
                    v_b = v.transpose(0, 1).unsqueeze(0)

                    # See: https://github.com/pytorch/pytorch/issues/127523
                    # Re-materialise the mask if its last dim is not unit-stride.
                    if mask.stride(-1) != 1:
                        mask = torch.empty_like(mask, memory_format=torch.contiguous_format).copy_(mask)

                    # Restrict SDPA to the memory-efficient backend for this call.
                    from torch.nn.attention import SDPBackend, sdpa_kernel
                    with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
                        context = F.scaled_dot_product_attention(q_b, k_b, v_b, mask, dropout_p=0.0)

                    # (1, num_heads, n_tokens, head_dim) -> (n_tokens, num_heads * head_dim)
                    context = context.squeeze(0).transpose(0, 1).reshape(n_tokens, -1)
                    return self.proj(context)
         
     | 
| 296 | 
         
            +
             
     | 
| 297 | 
         
            +
             
     | 
| 298 | 
         
            +
            # Registry mapping an ``attn_implementation`` name to the attention module
            # class that DotsVisionBlock instantiates.
            DOTS_VISION_ATTENTION_CLASSES = {
                "eager": VisionAttention,
                "eager_v2": VisionAttentionV2,  # uses less memory
                "flash_attention_2": VisionFlashAttention2,
                "sdpa": VisionSdpaAttention,
                "ascend_fa": VisionAscendAttention,  # Ascend; precision degrades severely on long sequences.
            }
         
     | 
| 305 | 
         
            +
             
     | 
| 306 | 
         
            +
             
     | 
| 307 | 
         
            +
            class RMSNorm(nn.Module):
                """Root-mean-square normalisation over the last dimension with a learned scale."""

                def __init__(self, dim: int, eps: float = 1e-6):
                    super().__init__()
                    self.weight = nn.Parameter(torch.ones(dim))
                    self.eps = eps

                def _norm(self, x: torch.Tensor) -> torch.Tensor:
                    """Divide ``x`` by the RMS of its last dimension (``eps`` added under the root)."""
                    mean_square = x.pow(2).mean(-1, keepdim=True)
                    return x * torch.rsqrt(mean_square + self.eps)

                def forward(self, x: torch.Tensor) -> torch.Tensor:
                    # Normalise in float32, cast back to the input dtype, then apply the scale.
                    normalised = self._norm(x.float())
                    return normalised.type_as(x) * self.weight

                def extra_repr(self) -> str:
                    shape = tuple(self.weight.shape)
                    return f"{shape}, eps={self.eps}"
         
     | 
| 322 | 
         
            +
             
     | 
| 323 | 
         
            +
             
     | 
| 324 | 
         
            +
            class DotsSwiGLUFFN(nn.Module):
                """SwiGLU feed-forward: ``fc2(silu(fc1(x)) * fc3(x))``."""

                def __init__(self, config):
                    super().__init__()
                    width = config.embed_dim
                    inner = config.intermediate_size
                    use_bias = config.use_bias

                    # fc1 = gate branch, fc3 = linear branch, fc2 = output projection.
                    self.fc1 = nn.Linear(width, inner, bias=use_bias)
                    self.fc2 = nn.Linear(inner, width, bias=use_bias)
                    self.fc3 = nn.Linear(width, inner, bias=use_bias)

                def forward(self, x: torch.Tensor) -> torch.Tensor:
                    gate = F.silu(self.fc1(x))
                    value = self.fc3(x)
                    return self.fc2(gate * value)
         
     | 
| 339 | 
         
            +
             
     | 
| 340 | 
         
            +
             
     | 
| 341 | 
         
            +
            class DotsPatchEmbed(nn.Module):
                """Embed flattened image patches with a strided 2D convolution followed by RMSNorm."""

                def __init__(self, config):
                    super().__init__()
                    self.num_channels = config.num_channels
                    self.patch_size = config.patch_size
                    self.temporal_patch_size = config.temporal_patch_size
                    self.embed_dim = config.embed_dim
                    self.config = config

                    # Kernel and stride both equal the patch size, so each patch
                    # collapses to a single spatial position.
                    patch_hw = (config.patch_size, config.patch_size)
                    self.proj = nn.Conv2d(
                        config.num_channels,
                        config.embed_dim,
                        kernel_size=patch_hw,
                        stride=patch_hw,
                    )
                    self.norm = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)

                def forward(self, x: torch.Tensor, grid_thw=None) -> torch.Tensor:
                    """Return one ``embed_dim`` vector per patch; ``grid_thw`` is accepted but unused."""
                    patches = x.view(
                        -1, self.num_channels, self.temporal_patch_size, self.patch_size, self.patch_size
                    )
                    # Only index 0 along the temporal-patch axis is used.
                    first_slice = patches[:, :, 0]
                    embedded = self.proj(first_slice).view(-1, self.embed_dim)
                    return self.norm(embedded)
         
     | 
| 362 | 
         
            +
             
     | 
| 363 | 
         
            +
             
     | 
| 364 | 
         
            +
            class DotsViTPreprocessor(nn.Module):
                """Thin wrapper that turns raw patch input into tokens via ``DotsPatchEmbed``."""

                def __init__(self, config):
                    super().__init__()
                    self.config = config
                    self.embed_dim = config.embed_dim
                    # Patch height and width are both taken from ``config.patch_size``.
                    self.patch_h = self.patch_w = config.patch_size
                    self.patchifier = DotsPatchEmbed(config)

                def forward(self, x: torch.Tensor, grid_thw=None) -> torch.Tensor:
                    """Delegate to the patchifier and return its tokens unchanged."""
                    return self.patchifier(x, grid_thw)
         
     | 
| 376 | 
         
            +
             
     | 
| 377 | 
         
            +
             
     | 
| 378 | 
         
            +
            class DotsVisionBlock(nn.Module):
                """Pre-norm transformer block: attention and SwiGLU FFN, each with a residual add.

                Args:
                    config: Vision config; ``embed_dim``, ``num_attention_heads``,
                        ``use_bias`` and ``rms_norm_eps`` are read here.
                    attn_implementation: Key into ``DOTS_VISION_ATTENTION_CLASSES``.
                        ``"flash_attention_2"`` and ``"ascend_fa"`` fall back to
                        ``"eager"`` when their backend is flagged unavailable.

                Raises:
                    KeyError: If ``attn_implementation`` is not a registered name.
                """

                def __init__(self, config, attn_implementation: str = "flash_attention_2"):
                    super().__init__()

                    if attn_implementation == "flash_attention_2" and not flash_attn_available:
                        # fallback to eager
                        attn_implementation = "eager"
                        print("flash attention not available! fallback to eager implementation ")

                    if attn_implementation == "ascend_fa" and not npu_available:
                        # Name the backend that is actually missing; the previous message
                        # was copied from the flash-attention branch.
                        attn_implementation = "eager"
                        print("ascend flash attention (torch_npu) not available! fallback to eager implementation ")

                    try:
                        attn_cls = DOTS_VISION_ATTENTION_CLASSES[attn_implementation]
                    except KeyError:
                        # Same exception type as the plain lookup, but with the valid choices listed.
                        raise KeyError(
                            f"Unknown attn_implementation {attn_implementation!r}; "
                            f"expected one of {sorted(DOTS_VISION_ATTENTION_CLASSES)}"
                        ) from None

                    self.attn = attn_cls(
                        config, config.embed_dim, num_heads=config.num_attention_heads, bias=config.use_bias
                    )
                    self.norm1 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)
                    self.mlp = DotsSwiGLUFFN(config)
                    self.norm2 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)

                def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> torch.Tensor:
                    """Apply the attention and FFN sub-layers, each on normalised input plus residual."""
                    hidden_states = hidden_states + self.attn(
                        self.norm1(hidden_states), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb
                    )
                    hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
                    return hidden_states
         
     | 
| 404 | 
         
            +
             
     | 
| 405 | 
         
            +
             
     | 
| 406 | 
         
            +
            class DotsVisionTransformer(PreTrainedModel):
         
     | 
| 407 | 
         
            +
                def __init__(self, config: DotsVisionConfig) -> None:
                    """Build the patch embedder, rotary embedding, transformer blocks and patch merger.

                    Args:
                        config: Vision config supplying the sizes, the attention
                            implementation name and the normalisation settings read below.
                    """
                    super().__init__(config)
                    self.config = config
                    self.spatial_merge_size = config.spatial_merge_size

                    self.patch_embed = DotsViTPreprocessor(config)
                    # Explicitly run the weight initialiser on the patch projection layer.
                    self._init_weights(self.patch_embed.patchifier.proj)

                    head_dim = config.embed_dim // config.num_attention_heads

                    # The rotary embedding is built with half the per-head dimension.
                    self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)

                    _num_hidden_layers = config.num_hidden_layers
                    # Every block uses the same attention implementation from the config.
                    self.blocks = nn.ModuleList(
                        [DotsVisionBlock(config, config.attn_implementation) for _ in range(_num_hidden_layers)]
                    )

                    # ``post_trunk_norm`` only exists as an attribute when ``config.post_norm`` is truthy.
                    if self.config.post_norm:
                        self.post_trunk_norm = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)

                    self.merger = PatchMerger(
                        dim=config.hidden_size,
                        context_dim=config.embed_dim,
                        spatial_merge_size=config.spatial_merge_size,
                        init_merger_std=self.config.init_merger_std,
                    )

                    # Gradient checkpointing is off by default; the checkpoint function is preset.
                    self.gradient_checkpointing = False
                    self._gradient_checkpointing_func = torch.utils.checkpoint.checkpoint
         
     | 
| 436 | 
         
            +
             
     | 
| 437 | 
         
            +
                def _init_weights(self, module):
                    """Initialise the parameters of one sub-module in place.

                    ``nn.Linear``, ``nn.Conv3d`` and ``nn.Embedding`` weights are drawn from a
                    normal distribution with mean 0 and standard deviation
                    ``self.config.initializer_range``. A linear/conv bias, when present, is
                    zeroed, as is the embedding row at ``padding_idx`` when one is set.
                    Modules of any other type are left untouched.

                    Args:
                        module: The sub-module whose parameters should be initialised.
                    """
                    sigma = self.config.initializer_range

                    if isinstance(module, (nn.Linear, nn.Conv3d)):
                        module.weight.data.normal_(mean=0.0, std=sigma)
                        bias = module.bias
                        if bias is not None:
                            bias.data.zero_()
                        return

                    if not isinstance(module, nn.Embedding):
                        return

                    module.weight.data.normal_(mean=0.0, std=sigma)
                    pad_row = module.padding_idx
                    if pad_row is not None:
                        # Keep the padding embedding at exactly zero after random init.
                        module.weight.data[pad_row].zero_()
         
     | 
| 447 | 
         
            +
             
     | 
| 448 | 
         
            +
                @property
                def dtype(self) -> torch.dtype:
                    """Return the dtype of the first block's ``mlp.fc2`` weight.

                    That single parameter is used as the representative for the module.
                    """
                    reference_weight = self.blocks[0].mlp.fc2.weight
                    return reference_weight.dtype
         
     | 
| 451 | 
         
            +
             
     | 
| 452 | 
         
            +
                @property
                def device(self) -> torch.device:
                    """Return the device of the first block's ``mlp.fc2`` weight.

                    That single parameter is used as the representative for the module.
                    """
                    reference_weight = self.blocks[0].mlp.fc2.weight
                    return reference_weight.device
         
     | 
| 455 | 
         
            +
             
     | 
| 456 | 
         
            +
                def get_pos_ids_by_grid(self, grid_thw):
                    """Build (row, column) position ids for every entry of ``grid_thw``.

                    For each ``(t, h, w)`` row of ``grid_thw`` an ``h x w`` grid of row indices
                    and one of column indices is created. Both are reordered so that the
                    positions of each ``spatial_merge_size x spatial_merge_size`` window are
                    contiguous, stacked into an ``(h * w, 2)`` tensor and repeated ``t`` times
                    along the first axis.

                    ``h`` and ``w`` must be divisible by ``self.spatial_merge_size``; otherwise
                    the reshape fails.

                    Args:
                        grid_thw: Iterable of ``(t, h, w)`` triples (one per image/video).

                    Returns:
                        A list with one ``(t * h * w, 2)`` tensor per entry of ``grid_thw``.
                    """
                    merge = self.spatial_merge_size

                    def _to_window_order(ids, rows, cols):
                        # Split both axes into (block, offset-within-block) and bring the two
                        # block axes to the front so each merge window is laid out contiguously.
                        windowed = ids.reshape(rows // merge, merge, cols // merge, merge)
                        return windowed.permute(0, 2, 1, 3).flatten()

                    per_item = []
                    for frames, rows, cols in grid_thw:
                        row_ids = torch.arange(rows).unsqueeze(1).expand(-1, cols)
                        row_ids = _to_window_order(row_ids, rows, cols)

                        col_ids = torch.arange(cols).unsqueeze(0).expand(rows, -1)
                        col_ids = _to_window_order(col_ids, rows, cols)

                        # Every frame reuses the same spatial coordinates.
                        coords = torch.stack([row_ids, col_ids], dim=-1)
                        per_item.append(coords.repeat(frames, 1))

                    return per_item
         
     | 
| 483 | 
         
            +
             
     | 
| 484 | 
         
            +
                def rot_pos_emb(self, grid_thw):
                    """Look up rotary embeddings for every position described by ``grid_thw``.

                    The (row, column) ids from ``get_pos_ids_by_grid`` are concatenated over all
                    entries and used to index the table returned by ``self.rotary_pos_emb``,
                    which is requested for the largest height/width in ``grid_thw``. The row
                    and column embeddings of each position are then flattened together.

                    Args:
                        grid_thw: 2-D tensor whose rows are ``(t, h, w)``.

                    Returns:
                        A 2-D tensor with one row per position.
                    """
                    coords = torch.cat(self.get_pos_ids_by_grid(grid_thw), dim=0)
                    # The table only has to cover the largest h or w seen in this batch.
                    largest_side = grid_thw[:, 1:].max()
                    freq_table = self.rotary_pos_emb(largest_side)
                    return freq_table[coords].flatten(1)
         
     | 
| 491 | 
         
            +
             
     | 
| 492 | 
         
            +
                def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, bf16=True) -> torch.Tensor:
                    """Run the vision tower: patch embedding, transformer blocks, optional norm, merger.

                    Args:
                        hidden_states: Input tensor handed to ``self.patch_embed``.
                        grid_thw: 2-D integer tensor whose rows are ``(t, h, w)``.
                        bf16: When true, ``hidden_states`` is cast to bfloat16 before the
                            patch embedding.

                    Returns:
                        The output of ``self.merger`` applied to the final hidden states.
                    """
                    if bf16:
                        hidden_states = hidden_states.bfloat16()
                    hidden_states = self.patch_embed(hidden_states, grid_thw)

                    rope = self.rot_pos_emb(grid_thw)

                    # Cumulative sequence boundaries: each of the t frames of an entry
                    # contributes h * w tokens; a leading 0 is prepended.
                    tokens_per_frame = grid_thw[:, 1] * grid_thw[:, 2]
                    frame_lengths = torch.repeat_interleave(tokens_per_frame, grid_thw[:, 0])
                    boundary_dtype = grid_thw.dtype if torch.jit.is_tracing() else torch.int32
                    cu_seqlens = F.pad(frame_lengths.cumsum(dim=0, dtype=boundary_dtype), (1, 0), value=0)

                    for blk in self.blocks:
                        if not (self.gradient_checkpointing and self.training):
                            hidden_states = blk(hidden_states, cu_seqlens=cu_seqlens, rotary_pos_emb=rope)
                            continue
                        # Checkpointed path passes the block inputs positionally.
                        hidden_states = self._gradient_checkpointing_func(
                            blk.__call__,
                            hidden_states,
                            cu_seqlens,
                            rope,
                        )

                    if self.config.post_norm:
                        hidden_states = self.post_trunk_norm(hidden_states)

                    return self.merger(hidden_states)
         
     | 
    	
        special_tokens_map.json
    ADDED
    
    | 
         @@ -0,0 +1,31 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            {
         
     | 
| 2 | 
         
            +
              "additional_special_tokens": [
         
     | 
| 3 | 
         
            +
                "<|im_start|>",
         
     | 
| 4 | 
         
            +
                "<|im_end|>",
         
     | 
| 5 | 
         
            +
                "<|object_ref_start|>",
         
     | 
| 6 | 
         
            +
                "<|object_ref_end|>",
         
     | 
| 7 | 
         
            +
                "<|box_start|>",
         
     | 
| 8 | 
         
            +
                "<|box_end|>",
         
     | 
| 9 | 
         
            +
                "<|quad_start|>",
         
     | 
| 10 | 
         
            +
                "<|quad_end|>",
         
     | 
| 11 | 
         
            +
                "<|vision_start|>",
         
     | 
| 12 | 
         
            +
                "<|vision_end|>",
         
     | 
| 13 | 
         
            +
                "<|vision_pad|>",
         
     | 
| 14 | 
         
            +
                "<|image_pad|>",
         
     | 
| 15 | 
         
            +
                "<|video_pad|>"
         
     | 
| 16 | 
         
            +
              ],
         
     | 
| 17 | 
         
            +
              "eos_token": {
         
     | 
| 18 | 
         
            +
                "content": "<|endoftext|>",
         
     | 
| 19 | 
         
            +
                "lstrip": false,
         
     | 
| 20 | 
         
            +
                "normalized": false,
         
     | 
| 21 | 
         
            +
                "rstrip": false,
         
     | 
| 22 | 
         
            +
                "single_word": false
         
     | 
| 23 | 
         
            +
              },
         
     | 
| 24 | 
         
            +
              "pad_token": {
         
     | 
| 25 | 
         
            +
                "content": "[PAD]",
         
     | 
| 26 | 
         
            +
                "lstrip": false,
         
     | 
| 27 | 
         
            +
                "normalized": false,
         
     | 
| 28 | 
         
            +
                "rstrip": false,
         
     | 
| 29 | 
         
            +
                "single_word": false
         
     | 
| 30 | 
         
            +
              }
         
     | 
| 31 | 
         
            +
            }
         
     | 
    	
        tokenizer.json
    ADDED
    
    | 
         @@ -0,0 +1,3 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:904d81ff0cfa066dbc0b6a21e10ded6ebb7c2d8df14100d851f90bb7878bd5de
         
     | 
| 3 | 
         
            +
            size 11426251
         
     | 
    	
        tokenizer_config.json
    ADDED
    
    | 
         @@ -0,0 +1,395 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            {
         
     | 
| 2 | 
         
            +
              "add_bos_token": false,
         
     | 
| 3 | 
         
            +
              "add_prefix_space": false,
         
     | 
| 4 | 
         
            +
              "added_tokens_decoder": {
         
     | 
| 5 | 
         
            +
                "151643": {
         
     | 
| 6 | 
         
            +
                  "content": "<|endoftext|>",
         
     | 
| 7 | 
         
            +
                  "lstrip": false,
         
     | 
| 8 | 
         
            +
                  "normalized": false,
         
     | 
| 9 | 
         
            +
                  "rstrip": false,
         
     | 
| 10 | 
         
            +
                  "single_word": false,
         
     | 
| 11 | 
         
            +
                  "special": true
         
     | 
| 12 | 
         
            +
                },
         
     | 
| 13 | 
         
            +
                "151644": {
         
     | 
| 14 | 
         
            +
                  "content": "<|im_start|>",
         
     | 
| 15 | 
         
            +
                  "lstrip": false,
         
     | 
| 16 | 
         
            +
                  "normalized": false,
         
     | 
| 17 | 
         
            +
                  "rstrip": false,
         
     | 
| 18 | 
         
            +
                  "single_word": false,
         
     | 
| 19 | 
         
            +
                  "special": true
         
     | 
| 20 | 
         
            +
                },
         
     | 
| 21 | 
         
            +
                "151645": {
         
     | 
| 22 | 
         
            +
                  "content": "<|im_end|>",
         
     | 
| 23 | 
         
            +
                  "lstrip": false,
         
     | 
| 24 | 
         
            +
                  "normalized": false,
         
     | 
| 25 | 
         
            +
                  "rstrip": false,
         
     | 
| 26 | 
         
            +
                  "single_word": false,
         
     | 
| 27 | 
         
            +
                  "special": true
         
     | 
| 28 | 
         
            +
                },
         
     | 
| 29 | 
         
            +
                "151646": {
         
     | 
| 30 | 
         
            +
                  "content": "<|object_ref_start|>",
         
     | 
| 31 | 
         
            +
                  "lstrip": false,
         
     | 
| 32 | 
         
            +
                  "normalized": false,
         
     | 
| 33 | 
         
            +
                  "rstrip": false,
         
     | 
| 34 | 
         
            +
                  "single_word": false,
         
     | 
| 35 | 
         
            +
                  "special": true
         
     | 
| 36 | 
         
            +
                },
         
     | 
| 37 | 
         
            +
                "151647": {
         
     | 
| 38 | 
         
            +
                  "content": "<|object_ref_end|>",
         
     | 
| 39 | 
         
            +
                  "lstrip": false,
         
     | 
| 40 | 
         
            +
                  "normalized": false,
         
     | 
| 41 | 
         
            +
                  "rstrip": false,
         
     | 
| 42 | 
         
            +
                  "single_word": false,
         
     | 
| 43 | 
         
            +
                  "special": true
         
     | 
| 44 | 
         
            +
                },
         
     | 
| 45 | 
         
            +
                "151648": {
         
     | 
| 46 | 
         
            +
                  "content": "<|box_start|>",
         
     | 
| 47 | 
         
            +
                  "lstrip": false,
         
     | 
| 48 | 
         
            +
                  "normalized": false,
         
     | 
| 49 | 
         
            +
                  "rstrip": false,
         
     | 
| 50 | 
         
            +
                  "single_word": false,
         
     | 
| 51 | 
         
            +
                  "special": true
         
     | 
| 52 | 
         
            +
                },
         
     | 
| 53 | 
         
            +
                "151649": {
         
     | 
| 54 | 
         
            +
                  "content": "<|box_end|>",
         
     | 
| 55 | 
         
            +
                  "lstrip": false,
         
     | 
| 56 | 
         
            +
                  "normalized": false,
         
     | 
| 57 | 
         
            +
                  "rstrip": false,
         
     | 
| 58 | 
         
            +
                  "single_word": false,
         
     | 
| 59 | 
         
            +
                  "special": true
         
     | 
| 60 | 
         
            +
                },
         
     | 
| 61 | 
         
            +
                "151650": {
         
     | 
| 62 | 
         
            +
                  "content": "<|quad_start|>",
         
     | 
| 63 | 
         
            +
                  "lstrip": false,
         
     | 
| 64 | 
         
            +
                  "normalized": false,
         
     | 
| 65 | 
         
            +
                  "rstrip": false,
         
     | 
| 66 | 
         
            +
                  "single_word": false,
         
     | 
| 67 | 
         
            +
                  "special": true
         
     | 
| 68 | 
         
            +
                },
         
     | 
| 69 | 
         
            +
                "151651": {
         
     | 
| 70 | 
         
            +
                  "content": "<|quad_end|>",
         
     | 
| 71 | 
         
            +
                  "lstrip": false,
         
     | 
| 72 | 
         
            +
                  "normalized": false,
         
     | 
| 73 | 
         
            +
                  "rstrip": false,
         
     | 
| 74 | 
         
            +
                  "single_word": false,
         
     | 
| 75 | 
         
            +
                  "special": true
         
     | 
| 76 | 
         
            +
                },
         
     | 
| 77 | 
         
            +
                "151652": {
         
     | 
| 78 | 
         
            +
                  "content": "<|vision_start|>",
         
     | 
| 79 | 
         
            +
                  "lstrip": false,
         
     | 
| 80 | 
         
            +
                  "normalized": false,
         
     | 
| 81 | 
         
            +
                  "rstrip": false,
         
     | 
| 82 | 
         
            +
                  "single_word": false,
         
     | 
| 83 | 
         
            +
                  "special": true
         
     | 
| 84 | 
         
            +
                },
         
     | 
| 85 | 
         
            +
                "151653": {
         
     | 
| 86 | 
         
            +
                  "content": "<|vision_end|>",
         
     | 
| 87 | 
         
            +
                  "lstrip": false,
         
     | 
| 88 | 
         
            +
                  "normalized": false,
         
     | 
| 89 | 
         
            +
                  "rstrip": false,
         
     | 
| 90 | 
         
            +
                  "single_word": false,
         
     | 
| 91 | 
         
            +
                  "special": true
         
     | 
| 92 | 
         
            +
                },
         
     | 
| 93 | 
         
            +
                "151654": {
         
     | 
| 94 | 
         
            +
                  "content": "<|vision_pad|>",
         
     | 
| 95 | 
         
            +
                  "lstrip": false,
         
     | 
| 96 | 
         
            +
                  "normalized": false,
         
     | 
| 97 | 
         
            +
                  "rstrip": false,
         
     | 
| 98 | 
         
            +
                  "single_word": false,
         
     | 
| 99 | 
         
            +
                  "special": true
         
     | 
| 100 | 
         
            +
                },
         
     | 
| 101 | 
         
            +
                "151655": {
         
     | 
| 102 | 
         
            +
                  "content": "<|image_pad|>",
         
     | 
| 103 | 
         
            +
                  "lstrip": false,
         
     | 
| 104 | 
         
            +
                  "normalized": false,
         
     | 
| 105 | 
         
            +
                  "rstrip": false,
         
     | 
| 106 | 
         
            +
                  "single_word": false,
         
     | 
| 107 | 
         
            +
                  "special": true
         
     | 
| 108 | 
         
            +
                },
         
     | 
| 109 | 
         
            +
                "151656": {
         
     | 
| 110 | 
         
            +
                  "content": "<|video_pad|>",
         
     | 
| 111 | 
         
            +
                  "lstrip": false,
         
     | 
| 112 | 
         
            +
                  "normalized": false,
         
     | 
| 113 | 
         
            +
                  "rstrip": false,
         
     | 
| 114 | 
         
            +
                  "single_word": false,
         
     | 
| 115 | 
         
            +
                  "special": true
         
     | 
| 116 | 
         
            +
                },
         
     | 
| 117 | 
         
            +
                "151657": {
         
     | 
| 118 | 
         
            +
                  "content": "<tool_call>",
         
     | 
| 119 | 
         
            +
                  "lstrip": false,
         
     | 
| 120 | 
         
            +
                  "normalized": false,
         
     | 
| 121 | 
         
            +
                  "rstrip": false,
         
     | 
| 122 | 
         
            +
                  "single_word": false,
         
     | 
| 123 | 
         
            +
                  "special": false
         
     | 
| 124 | 
         
            +
                },
         
     | 
| 125 | 
         
            +
                "151658": {
         
     | 
| 126 | 
         
            +
                  "content": "</tool_call>",
         
     | 
| 127 | 
         
            +
                  "lstrip": false,
         
     | 
| 128 | 
         
            +
                  "normalized": false,
         
     | 
| 129 | 
         
            +
                  "rstrip": false,
         
     | 
| 130 | 
         
            +
                  "single_word": false,
         
     | 
| 131 | 
         
            +
                  "special": false
         
     | 
| 132 | 
         
            +
                },
         
     | 
| 133 | 
         
            +
                "151659": {
         
     | 
| 134 | 
         
            +
                  "content": "<|fim_prefix|>",
         
     | 
| 135 | 
         
            +
                  "lstrip": false,
         
     | 
| 136 | 
         
            +
                  "normalized": false,
         
     | 
| 137 | 
         
            +
                  "rstrip": false,
         
     | 
| 138 | 
         
            +
                  "single_word": false,
         
     | 
| 139 | 
         
            +
                  "special": false
         
     | 
| 140 | 
         
            +
                },
         
     | 
| 141 | 
         
            +
                "151660": {
         
     | 
| 142 | 
         
            +
                  "content": "<|fim_middle|>",
         
     | 
| 143 | 
         
            +
                  "lstrip": false,
         
     | 
| 144 | 
         
            +
                  "normalized": false,
         
     | 
| 145 | 
         
            +
                  "rstrip": false,
         
     | 
| 146 | 
         
            +
                  "single_word": false,
         
     | 
| 147 | 
         
            +
                  "special": false
         
     | 
| 148 | 
         
            +
                },
         
     | 
| 149 | 
         
            +
                "151661": {
         
     | 
| 150 | 
         
            +
                  "content": "<|fim_suffix|>",
         
     | 
| 151 | 
         
            +
                  "lstrip": false,
         
     | 
| 152 | 
         
            +
                  "normalized": false,
         
     | 
| 153 | 
         
            +
                  "rstrip": false,
         
     | 
| 154 | 
         
            +
                  "single_word": false,
         
     | 
| 155 | 
         
            +
                  "special": false
         
     | 
| 156 | 
         
            +
                },
         
     | 
| 157 | 
         
            +
                "151662": {
         
     | 
| 158 | 
         
            +
                  "content": "<|fim_pad|>",
         
     | 
| 159 | 
         
            +
                  "lstrip": false,
         
     | 
| 160 | 
         
            +
                  "normalized": false,
         
     | 
| 161 | 
         
            +
                  "rstrip": false,
         
     | 
| 162 | 
         
            +
                  "single_word": false,
         
     | 
| 163 | 
         
            +
                  "special": false
         
     | 
| 164 | 
         
            +
                },
         
     | 
| 165 | 
         
            +
                "151663": {
         
     | 
| 166 | 
         
            +
                  "content": "<|repo_name|>",
         
     | 
| 167 | 
         
            +
                  "lstrip": false,
         
     | 
| 168 | 
         
            +
                  "normalized": false,
         
     | 
| 169 | 
         
            +
                  "rstrip": false,
         
     | 
| 170 | 
         
            +
                  "single_word": false,
         
     | 
| 171 | 
         
            +
                  "special": false
         
     | 
| 172 | 
         
            +
                },
         
     | 
| 173 | 
         
            +
                "151664": {
         
     | 
| 174 | 
         
            +
                  "content": "<|file_sep|>",
         
     | 
| 175 | 
         
            +
                  "lstrip": false,
         
     | 
| 176 | 
         
            +
                  "normalized": false,
         
     | 
| 177 | 
         
            +
                  "rstrip": false,
         
     | 
| 178 | 
         
            +
                  "single_word": false,
         
     | 
| 179 | 
         
            +
                  "special": false
         
     | 
| 180 | 
         
            +
                },
         
     | 
| 181 | 
         
            +
                "151665": {
         
     | 
| 182 | 
         
            +
                  "content": "<|imgpad|>",
         
     | 
| 183 | 
         
            +
                  "lstrip": false,
         
     | 
| 184 | 
         
            +
                  "normalized": false,
         
     | 
| 185 | 
         
            +
                  "rstrip": false,
         
     | 
| 186 | 
         
            +
                  "single_word": false,
         
     | 
| 187 | 
         
            +
                  "special": true
         
     | 
| 188 | 
         
            +
                },
         
     | 
| 189 | 
         
            +
                "151666": {
         
     | 
| 190 | 
         
            +
                  "content": "<|img|>",
         
     | 
| 191 | 
         
            +
                  "lstrip": false,
         
     | 
| 192 | 
         
            +
                  "normalized": false,
         
     | 
| 193 | 
         
            +
                  "rstrip": false,
         
     | 
| 194 | 
         
            +
                  "single_word": false,
         
     | 
| 195 | 
         
            +
                  "special": true
         
     | 
| 196 | 
         
            +
                },
         
     | 
| 197 | 
         
            +
                "151667": {
         
     | 
| 198 | 
         
            +
                  "content": "<|endofimg|>",
         
     | 
| 199 | 
         
            +
                  "lstrip": false,
         
     | 
| 200 | 
         
            +
                  "normalized": false,
         
     | 
| 201 | 
         
            +
                  "rstrip": false,
         
     | 
| 202 | 
         
            +
                  "single_word": false,
         
     | 
| 203 | 
         
            +
                  "special": true
         
     | 
| 204 | 
         
            +
                },
         
     | 
| 205 | 
         
            +
                "151668": {
         
     | 
| 206 | 
         
            +
                  "content": "<|systemprompt|>",
         
     | 
| 207 | 
         
            +
                  "lstrip": false,
         
     | 
| 208 | 
         
            +
                  "normalized": false,
         
     | 
| 209 | 
         
            +
                  "rstrip": false,
         
     | 
| 210 | 
         
            +
                  "single_word": false,
         
     | 
| 211 | 
         
            +
                  "special": true
         
     | 
| 212 | 
         
            +
                },
         
     | 
| 213 | 
         
            +
                "151669": {
         
     | 
| 214 | 
         
            +
                  "content": "<|endofsystemprompt|>",
         
     | 
| 215 | 
         
            +
                  "lstrip": false,
         
     | 
| 216 | 
         
            +
                  "normalized": false,
         
     | 
| 217 | 
         
            +
                  "rstrip": false,
         
     | 
| 218 | 
         
            +
                  "single_word": false,
         
     | 
| 219 | 
         
            +
                  "special": true
         
     | 
| 220 | 
         
            +
                },
         
     | 
| 221 | 
         
            +
                "151670": {
         
     | 
| 222 | 
         
            +
                  "content": "<|user|>",
         
     | 
| 223 | 
         
            +
                  "lstrip": false,
         
     | 
| 224 | 
         
            +
                  "normalized": false,
         
     | 
| 225 | 
         
            +
                  "rstrip": false,
         
     | 
| 226 | 
         
            +
                  "single_word": false,
         
     | 
| 227 | 
         
            +
                  "special": true
         
     | 
| 228 | 
         
            +
                },
         
     | 
| 229 | 
         
            +
                "151671": {
         
     | 
| 230 | 
         
            +
                  "content": "<|endofuser|>",
         
     | 
| 231 | 
         
            +
                  "lstrip": false,
         
     | 
| 232 | 
         
            +
                  "normalized": false,
         
     | 
| 233 | 
         
            +
                  "rstrip": false,
         
     | 
| 234 | 
         
            +
                  "single_word": false,
         
     | 
| 235 | 
         
            +
                  "special": true
         
     | 
| 236 | 
         
            +
                },
         
     | 
| 237 | 
         
            +
                "151672": {
         
     | 
| 238 | 
         
            +
                  "content": "<|assistant|>",
         
     | 
| 239 | 
         
            +
                  "lstrip": false,
         
     | 
| 240 | 
         
            +
                  "normalized": false,
         
     | 
| 241 | 
         
            +
                  "rstrip": false,
         
     | 
| 242 | 
         
            +
                  "single_word": false,
         
     | 
| 243 | 
         
            +
                  "special": true
         
     | 
| 244 | 
         
            +
                },
         
     | 
| 245 | 
         
            +
                "151673": {
         
     | 
| 246 | 
         
            +
                  "content": "<|endofassistant|>",
         
     | 
| 247 | 
         
            +
                  "lstrip": false,
         
     | 
| 248 | 
         
            +
                  "normalized": false,
         
     | 
| 249 | 
         
            +
                  "rstrip": false,
         
     | 
| 250 | 
         
            +
                  "single_word": false,
         
     | 
| 251 | 
         
            +
                  "special": true
         
     | 
| 252 | 
         
            +
                },
         
     | 
| 253 | 
         
            +
                "151674": {
         
     | 
| 254 | 
         
            +
                  "content": "<|ref_start|>",
         
     | 
| 255 | 
         
            +
                  "lstrip": false,
         
     | 
| 256 | 
         
            +
                  "normalized": false,
         
     | 
| 257 | 
         
            +
                  "rstrip": false,
         
     | 
| 258 | 
         
            +
                  "single_word": false,
         
     | 
| 259 | 
         
            +
                  "special": true
         
     | 
| 260 | 
         
            +
                },
         
     | 
| 261 | 
         
            +
                "151675": {
         
     | 
| 262 | 
         
            +
                  "content": "<|ref_end|>",
         
     | 
| 263 | 
         
            +
                  "lstrip": false,
         
     | 
| 264 | 
         
            +
                  "normalized": false,
         
     | 
| 265 | 
         
            +
                  "rstrip": false,
         
     | 
| 266 | 
         
            +
                  "single_word": false,
         
     | 
| 267 | 
         
            +
                  "special": true
         
     | 
| 268 | 
         
            +
                },
         
     | 
| 269 | 
         
            +
                "151676": {
         
     | 
| 270 | 
         
            +
                  "content": "[SEP]",
         
     | 
| 271 | 
         
            +
                  "lstrip": false,
         
     | 
| 272 | 
         
            +
                  "normalized": false,
         
     | 
| 273 | 
         
            +
                  "rstrip": false,
         
     | 
| 274 | 
         
            +
                  "single_word": false,
         
     | 
| 275 | 
         
            +
                  "special": true
         
     | 
| 276 | 
         
            +
                },
         
     | 
| 277 | 
         
            +
                "151677": {
         
     | 
| 278 | 
         
            +
                  "content": "<|pic|>",
         
     | 
| 279 | 
         
            +
                  "lstrip": false,
         
     | 
| 280 | 
         
            +
                  "normalized": false,
         
     | 
| 281 | 
         
            +
                  "rstrip": false,
         
     | 
| 282 | 
         
            +
                  "single_word": false,
         
     | 
| 283 | 
         
            +
                  "special": true
         
     | 
| 284 | 
         
            +
                },
         
     | 
| 285 | 
         
            +
                "151678": {
         
     | 
| 286 | 
         
            +
                  "content": "<|text|>",
         
     | 
| 287 | 
         
            +
                  "lstrip": false,
         
     | 
| 288 | 
         
            +
                  "normalized": false,
         
     | 
| 289 | 
         
            +
                  "rstrip": false,
         
     | 
| 290 | 
         
            +
                  "single_word": false,
         
     | 
| 291 | 
         
            +
                  "special": true
         
     | 
| 292 | 
         
            +
                },
         
     | 
| 293 | 
         
            +
                "151679": {
         
     | 
| 294 | 
         
            +
                  "content": "<|pictotext|>",
         
     | 
| 295 | 
         
            +
                  "lstrip": false,
         
     | 
| 296 | 
         
            +
                  "normalized": false,
         
     | 
| 297 | 
         
            +
                  "rstrip": false,
         
     | 
| 298 | 
         
            +
                  "single_word": false,
         
     | 
| 299 | 
         
            +
                  "special": true
         
     | 
| 300 | 
         
            +
                },
         
     | 
| 301 | 
         
            +
                "151680": {
         
     | 
| 302 | 
         
            +
                  "content": "[PAD]",
         
     | 
| 303 | 
         
            +
                  "lstrip": false,
         
     | 
| 304 | 
         
            +
                  "normalized": false,
         
     | 
| 305 | 
         
            +
                  "rstrip": false,
         
     | 
| 306 | 
         
            +
                  "single_word": false,
         
     | 
| 307 | 
         
            +
                  "special": true
         
     | 
| 308 | 
         
            +
                },
         
     | 
| 309 | 
         
            +
                "151681": {
         
     | 
| 310 | 
         
            +
                  "content": "<|slice|>",
         
     | 
| 311 | 
         
            +
                  "lstrip": false,
         
     | 
| 312 | 
         
            +
                  "normalized": false,
         
     | 
| 313 | 
         
            +
                  "rstrip": false,
         
     | 
| 314 | 
         
            +
                  "single_word": false,
         
     | 
| 315 | 
         
            +
                  "special": true
         
     | 
| 316 | 
         
            +
                },
         
     | 
| 317 | 
         
            +
                "151682": {
         
     | 
| 318 | 
         
            +
                  "content": "<|endofslice|>",
         
     | 
| 319 | 
         
            +
                  "lstrip": false,
         
     | 
| 320 | 
         
            +
                  "normalized": false,
         
     | 
| 321 | 
         
            +
                  "rstrip": false,
         
     | 
| 322 | 
         
            +
                  "single_word": false,
         
     | 
| 323 | 
         
            +
                  "special": true
         
     | 
| 324 | 
         
            +
                },
         
     | 
| 325 | 
         
            +
                "151683": {
         
     | 
| 326 | 
         
            +
                  "content": "<|imgrowend|>",
         
     | 
| 327 | 
         
            +
                  "lstrip": false,
         
     | 
| 328 | 
         
            +
                  "normalized": false,
         
     | 
| 329 | 
         
            +
                  "rstrip": false,
         
     | 
| 330 | 
         
            +
                  "single_word": false,
         
     | 
| 331 | 
         
            +
                  "special": true
         
     | 
| 332 | 
         
            +
                },
         
     | 
| 333 | 
         
            +
                "151684": {
         
     | 
| 334 | 
         
            +
                  "content": "<|polygon_start|>",
         
     | 
| 335 | 
         
            +
                  "lstrip": false,
         
     | 
| 336 | 
         
            +
                  "normalized": false,
         
     | 
| 337 | 
         
            +
                  "rstrip": false,
         
     | 
| 338 | 
         
            +
                  "single_word": false,
         
     | 
| 339 | 
         
            +
                  "special": true
         
     | 
| 340 | 
         
            +
                },
         
     | 
| 341 | 
         
            +
                "151685": {
         
     | 
| 342 | 
         
            +
                  "content": "<|polygon_end|>",
         
     | 
| 343 | 
         
            +
                  "lstrip": false,
         
     | 
| 344 | 
         
            +
                  "normalized": false,
         
     | 
| 345 | 
         
            +
                  "rstrip": false,
         
     | 
| 346 | 
         
            +
                  "single_word": false,
         
     | 
| 347 | 
         
            +
                  "special": true
         
     | 
| 348 | 
         
            +
                },
         
     | 
| 349 | 
         
            +
                "151686": {
         
     | 
| 350 | 
         
            +
                  "content": "<|image_gen_start|>",
         
     | 
| 351 | 
         
            +
                  "lstrip": false,
         
     | 
| 352 | 
         
            +
                  "normalized": false,
         
     | 
| 353 | 
         
            +
                  "rstrip": false,
         
     | 
| 354 | 
         
            +
                  "single_word": false,
         
     | 
| 355 | 
         
            +
                  "special": true
         
     | 
| 356 | 
         
            +
                },
         
     | 
| 357 | 
         
            +
                "151687": {
         
     | 
| 358 | 
         
            +
                  "content": "<|image_gen_end|>",
         
     | 
| 359 | 
         
            +
                  "lstrip": false,
         
     | 
| 360 | 
         
            +
                  "normalized": false,
         
     | 
| 361 | 
         
            +
                  "rstrip": false,
         
     | 
| 362 | 
         
            +
                  "single_word": false,
         
     | 
| 363 | 
         
            +
                  "special": true
         
     | 
| 364 | 
         
            +
                }
         
     | 
| 365 | 
         
            +
              },
         
     | 
| 366 | 
         
            +
              "additional_special_tokens": [
         
     | 
| 367 | 
         
            +
                "<|im_start|>",
         
     | 
| 368 | 
         
            +
                "<|im_end|>",
         
     | 
| 369 | 
         
            +
                "<|object_ref_start|>",
         
     | 
| 370 | 
         
            +
                "<|object_ref_end|>",
         
     | 
| 371 | 
         
            +
                "<|box_start|>",
         
     | 
| 372 | 
         
            +
                "<|box_end|>",
         
     | 
| 373 | 
         
            +
                "<|quad_start|>",
         
     | 
| 374 | 
         
            +
                "<|quad_end|>",
         
     | 
| 375 | 
         
            +
                "<|vision_start|>",
         
     | 
| 376 | 
         
            +
                "<|vision_end|>",
         
     | 
| 377 | 
         
            +
                "<|vision_pad|>",
         
     | 
| 378 | 
         
            +
                "<|image_pad|>",
         
     | 
| 379 | 
         
            +
                "<|video_pad|>"
         
     | 
| 380 | 
         
            +
              ],
         
     | 
| 381 | 
         
            +
              "auto_map": {
         
     | 
| 382 | 
         
            +
                "AutoProcessor": "configuration_dots.DotsVLProcessor"
         
     | 
| 383 | 
         
            +
              },
         
     | 
| 384 | 
         
            +
              "bos_token": null,
         
     | 
| 385 | 
         
            +
              "clean_up_tokenization_spaces": false,
         
     | 
| 386 | 
         
            +
              "eos_token": "<|endoftext|>",
         
     | 
| 387 | 
         
            +
              "errors": "replace",
         
     | 
| 388 | 
         
            +
              "extra_special_tokens": {},
         
     | 
| 389 | 
         
            +
              "model_max_length": 131072,
         
     | 
| 390 | 
         
            +
              "pad_token": "[PAD]",
         
     | 
| 391 | 
         
            +
              "processor_class": "DotsVLProcessor",
         
     | 
| 392 | 
         
            +
              "split_special_tokens": false,
         
     | 
| 393 | 
         
            +
              "tokenizer_class": "Qwen2Tokenizer",
         
     | 
| 394 | 
         
            +
              "unk_token": null
         
     | 
| 395 | 
         
            +
            }
         
     | 
    	
        vocab.json
    ADDED
    
    | 
         The diff for this file is too large to render. 
		See raw diff 
     | 
| 
         |