Update tokenization_small100.py
tokenization_small100.py  CHANGED  (+16 -15)
@@ -145,19 +145,6 @@ class SMALL100Tokenizer(PreTrainedTokenizer):
             if self.get_lang_token(lang_code) not in kwargs["additional_special_tokens"]
         ]
 
-        super().__init__(
-            tgt_lang=tgt_lang,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            sep_token=sep_token,
-            unk_token=unk_token,
-            pad_token=pad_token,
-            language_codes=language_codes,
-            sp_model_kwargs=self.sp_model_kwargs,
-            num_madeup_words=num_madeup_words,
-            **kwargs,
-        )
-
         self.vocab_file = vocab_file
         self.encoder = load_json(vocab_file)
         self.decoder = {v: k for k, v in self.encoder.items()}
@@ -174,9 +161,23 @@ class SMALL100Tokenizer(PreTrainedTokenizer):
 
         self._tgt_lang = tgt_lang if tgt_lang is not None else "en"
         self.cur_lang_id = self.get_lang_id(self._tgt_lang)
+        self.num_madeup_words = num_madeup_words
+
+        super().__init__(
+            tgt_lang=tgt_lang,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            sep_token=sep_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            language_codes=language_codes,
+            sp_model_kwargs=self.sp_model_kwargs,
+            num_madeup_words=num_madeup_words,
+            **kwargs,
+        )
+
         self.set_lang_special_tokens(self._tgt_lang)
 
-        self.num_madeup_words = num_madeup_words
 
     @property
     def vocab_size(self) -> int:
@@ -361,4 +362,4 @@ def load_json(path: str) -> Union[Dict, List]:
 
 def save_json(data, path: str) -> None:
     with open(path, "w") as f:
-        json.dump(data, f, indent=2)
+        json.dump(data, f, indent=2)
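
The commit moves the super().__init__() call from the top of SMALL100Tokenizer.__init__ to after the vocabulary, language, and num_madeup_words attributes are populated (the final save_json hunk appears to be a whitespace-only change). The likely motivation, stated here as a hedged reading rather than a confirmed one: recent transformers releases have PreTrainedTokenizer.__init__ call back into subclass hooks (e.g. to resolve special tokens against the vocab), which raises AttributeError if attributes like self.encoder do not exist yet. A minimal toy sketch of that failure mode, using a stand-in base class rather than the real transformers code:

from typing import Dict


class ToyBase:
    """Stands in for PreTrainedTokenizer: its __init__ calls a subclass hook."""

    def __init__(self, **kwargs):
        # Like recent PreTrainedTokenizer.__init__, this needs subclass state.
        self.vocab = self.get_vocab()

    def get_vocab(self) -> Dict[str, int]:
        raise NotImplementedError


class BrokenTokenizer(ToyBase):
    def __init__(self):
        super().__init__()              # too early: hook runs before encoder exists
        self.encoder = {"hello": 0}

    def get_vocab(self) -> Dict[str, int]:
        return dict(self.encoder)


class FixedTokenizer(ToyBase):
    def __init__(self):
        self.encoder = {"hello": 0}     # set state first, as the commit does
        super().__init__()              # safe: the hook finds a populated encoder

    def get_vocab(self) -> Dict[str, int]:
        return dict(self.encoder)


print(FixedTokenizer().vocab)           # {'hello': 0}
try:
    BrokenTokenizer()
except AttributeError as e:
    print("broken:", e)                 # no attribute 'encoder'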
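A quick smoke test after applying the patch. This is a usage sketch, not part of the commit: the repo id "alirezamsh/small100" and the French target language are illustrative assumptions, and it relies on SMALL-100 reusing the M2M100 architecture so the model side loads with M2M100ForConditionalGeneration.

from transformers import M2M100ForConditionalGeneration
from tokenization_small100 import SMALL100Tokenizer  # the patched file, alongside this script

model = M2M100ForConditionalGeneration.from_pretrained("alirezamsh/small100")
tokenizer = SMALL100Tokenizer.from_pretrained("alirezamsh/small100", tgt_lang="fr")

# With the reordered __init__, constructing the tokenizer no longer trips over
# missing attributes when the base class registers special tokens.
encoded = tokenizer("Life is like a box of chocolates.", return_tensors="pt")
generated_tokens = model.generate(**encoded)
print(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True))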