Upload tokenization_chatglm.py
tokenization_chatglm.py CHANGED (+7, -14)
@@ -66,7 +66,6 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
     model_input_names = ["input_ids", "attention_mask", "position_ids"]
 
     def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs):
-        super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)
         self.name = "GLMTokenizer"
 
         self.vocab_file = vocab_file
@@ -76,6 +75,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
             "<eos>": self.tokenizer.eos_id,
             "<pad>": self.tokenizer.pad_id
         }
+        super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)
 
     def get_command(self, token):
         if token in self.special_tokens:
@@ -225,7 +225,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
             (optional) Set to False to avoid returning attention mask (default: set to model specifics)
         """
         # Load from model defaults
-
+        assert self.padding_side == "left"
 
         required_input = encoded_inputs[self.model_input_names[0]]
         seq_length = len(required_input)
@@ -248,17 +248,10 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         if needs_to_be_padded:
             difference = max_length - len(required_input)
 
-            if self.padding_side == "left":
-                if "attention_mask" in encoded_inputs:
-                    encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
-                if "position_ids" in encoded_inputs:
-                    encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
-                encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
-            else:
-                if "attention_mask" in encoded_inputs:
-                    encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
-                if "position_ids" in encoded_inputs:
-                    encoded_inputs["position_ids"] = encoded_inputs["position_ids"] + [0] * difference
-                encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
+            if "attention_mask" in encoded_inputs:
+                encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
+            if "position_ids" in encoded_inputs:
+                encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
+            encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
 
         return encoded_inputs
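The `_pad` change drops the old two-branch implementation (left or right padding) in favor of a left-only version guarded by `assert self.padding_side == "left"`. As a standalone illustration of the resulting behavior (a re-implementation sketch, not the ChatGLM source; `pad_left` is a hypothetical helper), padding now always prepends pad ids and zeroed `attention_mask`/`position_ids` entries:

# Hedged sketch of the left-only padding the new _pad enforces
# (illustrative re-implementation, not the ChatGLM source).
def pad_left(encoded, max_length, pad_token_id=0):
    """Left-pad input_ids and, when present, attention_mask / position_ids."""
    required = encoded["input_ids"]
    difference = max_length - len(required)
    if difference <= 0:
        return encoded  # already long enough, nothing to pad
    if "attention_mask" in encoded:
        # padding positions are masked out with zeros on the left
        encoded["attention_mask"] = [0] * difference + encoded["attention_mask"]
    if "position_ids" in encoded:
        encoded["position_ids"] = [0] * difference + encoded["position_ids"]
    # pad ids are prepended, keeping the real tokens right-aligned
    encoded["input_ids"] = [pad_token_id] * difference + required
    return encoded

# Example: a 3-token sequence padded to length 5.
batch = {"input_ids": [5, 6, 7], "attention_mask": [1, 1, 1], "position_ids": [0, 1, 2]}
print(pad_left(batch, max_length=5))
# {'input_ids': [0, 0, 5, 6, 7], 'attention_mask': [0, 0, 1, 1, 1], 'position_ids': [0, 0, 0, 1, 2]}

Left padding is the natural choice for batched generation with a decoder-only model: the real tokens stay right-aligned, so the final position of every row feeds the next-token prediction.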
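The other change moves `super().__init__(...)` from the top of `__init__` to after `self.tokenizer` and `self.special_tokens` are populated. A likely motivation (an assumption; the commit message doesn't say): recent `transformers` releases invoke subclass methods such as `get_vocab()` from within `PreTrainedTokenizer.__init__`, so any state those methods read must exist before the base class runs. A minimal standalone sketch of the pattern (`Base` and `ToyTokenizer` are hypothetical stand-ins, not the real classes):

class Base:
    def __init__(self, **kwargs):
        # Stand-in for PreTrainedTokenizer.__init__, which may touch
        # subclass state (e.g. via get_vocab()) during construction.
        self.vocab_size_at_init = len(self.get_vocab())

class ToyTokenizer(Base):
    def __init__(self, tokens, **kwargs):
        # Set up the backing vocabulary first ...
        self._vocab = {tok: i for i, tok in enumerate(tokens)}
        # ... then defer to the base class, which can now safely
        # call get_vocab() on this half-constructed instance.
        super().__init__(**kwargs)

    def get_vocab(self):
        return dict(self._vocab)

print(ToyTokenizer(["<bos>", "<eos>", "hello"]).vocab_size_at_init)  # 3

With the old ordering, the base-class `__init__` would call `get_vocab()` before `_vocab` existed and raise `AttributeError`; deferring `super().__init__` avoids that while leaving the tokenizer's behavior unchanged.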