Upload tokenizer
Browse files- special_tokens_map.json +21 -3
- tokenizer.json +60 -3
- tokenizer_config.json +0 -0
special_tokens_map.json
CHANGED
|
@@ -101,7 +101,25 @@
|
|
| 101 |
"<extra_id_98>",
|
| 102 |
"<extra_id_99>"
|
| 103 |
],
|
| 104 |
-
"eos_token":
|
| 105 |
-
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
}
|
|
|
|
| 101 |
"<extra_id_98>",
|
| 102 |
"<extra_id_99>"
|
| 103 |
],
|
| 104 |
+
"eos_token": {
|
| 105 |
+
"content": "</s>",
|
| 106 |
+
"lstrip": false,
|
| 107 |
+
"normalized": false,
|
| 108 |
+
"rstrip": false,
|
| 109 |
+
"single_word": false
|
| 110 |
+
},
|
| 111 |
+
"pad_token": {
|
| 112 |
+
"content": "<pad>",
|
| 113 |
+
"lstrip": false,
|
| 114 |
+
"normalized": false,
|
| 115 |
+
"rstrip": false,
|
| 116 |
+
"single_word": false
|
| 117 |
+
},
|
| 118 |
+
"unk_token": {
|
| 119 |
+
"content": "<unk>",
|
| 120 |
+
"lstrip": false,
|
| 121 |
+
"normalized": false,
|
| 122 |
+
"rstrip": false,
|
| 123 |
+
"single_word": false
|
| 124 |
+
}
|
| 125 |
}
|
tokenizer.json
CHANGED
|
@@ -35,6 +35,24 @@
|
|
| 35 |
"normalized": false,
|
| 36 |
"special": true
|
| 37 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
{
|
| 39 |
"id": 32000,
|
| 40 |
"content": "<extra_id_99>",
|
|
@@ -5776,6 +5794,42 @@
|
|
| 5776 |
"rstrip": false,
|
| 5777 |
"normalized": true,
|
| 5778 |
"special": false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5779 |
}
|
| 5780 |
],
|
| 5781 |
"normalizer": {
|
|
@@ -5791,7 +5845,8 @@
|
|
| 5791 |
{
|
| 5792 |
"type": "Metaspace",
|
| 5793 |
"replacement": "▁",
|
| 5794 |
-
"
|
|
|
|
| 5795 |
}
|
| 5796 |
]
|
| 5797 |
},
|
|
@@ -5852,7 +5907,8 @@
|
|
| 5852 |
"decoder": {
|
| 5853 |
"type": "Metaspace",
|
| 5854 |
"replacement": "▁",
|
| 5855 |
-
"
|
|
|
|
| 5856 |
},
|
| 5857 |
"model": {
|
| 5858 |
"type": "Unigram",
|
|
@@ -134258,6 +134314,7 @@
|
|
| 134258 |
"<extra_id_0>",
|
| 134259 |
0.0
|
| 134260 |
]
|
| 134261 |
-
]
|
|
|
|
| 134262 |
}
|
| 134263 |
}
|
|
|
|
| 35 |
"normalized": false,
|
| 36 |
"special": true
|
| 37 |
},
|
| 38 |
+
{
|
| 39 |
+
"id": 834,
|
| 40 |
+
"content": "_",
|
| 41 |
+
"single_word": false,
|
| 42 |
+
"lstrip": false,
|
| 43 |
+
"rstrip": false,
|
| 44 |
+
"normalized": true,
|
| 45 |
+
"special": false
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"id": 3229,
|
| 49 |
+
"content": "$",
|
| 50 |
+
"single_word": false,
|
| 51 |
+
"lstrip": false,
|
| 52 |
+
"rstrip": false,
|
| 53 |
+
"normalized": true,
|
| 54 |
+
"special": false
|
| 55 |
+
},
|
| 56 |
{
|
| 57 |
"id": 32000,
|
| 58 |
"content": "<extra_id_99>",
|
|
|
|
| 5794 |
"rstrip": false,
|
| 5795 |
"normalized": true,
|
| 5796 |
"special": false
|
| 5797 |
+
},
|
| 5798 |
+
{
|
| 5799 |
+
"id": 32638,
|
| 5800 |
+
"content": "\\",
|
| 5801 |
+
"single_word": false,
|
| 5802 |
+
"lstrip": false,
|
| 5803 |
+
"rstrip": false,
|
| 5804 |
+
"normalized": true,
|
| 5805 |
+
"special": false
|
| 5806 |
+
},
|
| 5807 |
+
{
|
| 5808 |
+
"id": 32639,
|
| 5809 |
+
"content": "^",
|
| 5810 |
+
"single_word": false,
|
| 5811 |
+
"lstrip": false,
|
| 5812 |
+
"rstrip": false,
|
| 5813 |
+
"normalized": true,
|
| 5814 |
+
"special": false
|
| 5815 |
+
},
|
| 5816 |
+
{
|
| 5817 |
+
"id": 32640,
|
| 5818 |
+
"content": "{",
|
| 5819 |
+
"single_word": false,
|
| 5820 |
+
"lstrip": false,
|
| 5821 |
+
"rstrip": false,
|
| 5822 |
+
"normalized": true,
|
| 5823 |
+
"special": false
|
| 5824 |
+
},
|
| 5825 |
+
{
|
| 5826 |
+
"id": 32641,
|
| 5827 |
+
"content": "}",
|
| 5828 |
+
"single_word": false,
|
| 5829 |
+
"lstrip": false,
|
| 5830 |
+
"rstrip": false,
|
| 5831 |
+
"normalized": true,
|
| 5832 |
+
"special": false
|
| 5833 |
}
|
| 5834 |
],
|
| 5835 |
"normalizer": {
|
|
|
|
| 5845 |
{
|
| 5846 |
"type": "Metaspace",
|
| 5847 |
"replacement": "▁",
|
| 5848 |
+
"prepend_scheme": "always",
|
| 5849 |
+
"split": true
|
| 5850 |
}
|
| 5851 |
]
|
| 5852 |
},
|
|
|
|
| 5907 |
"decoder": {
|
| 5908 |
"type": "Metaspace",
|
| 5909 |
"replacement": "▁",
|
| 5910 |
+
"prepend_scheme": "always",
|
| 5911 |
+
"split": true
|
| 5912 |
},
|
| 5913 |
"model": {
|
| 5914 |
"type": "Unigram",
|
|
|
|
| 134314 |
"<extra_id_0>",
|
| 134315 |
0.0
|
| 134316 |
]
|
| 134317 |
+
],
|
| 134318 |
+
"byte_fallback": false
|
| 134319 |
}
|
| 134320 |
}
|
tokenizer_config.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|