| """ | |
| ## tiktoken版本的 unicode 错误,什么原因? | |
| - gpt-4o: {"id": 189, "token": "\u0001", "token_decode": "\u0001", "token_dumps": "\"\\u0001\"", "token_unicode": "\\u0001", "token_unicode_decimal": [1], "token_len": 1} 错误的 | |
| - gpt-oss-20b: {"id": 189, "token": "ā", "token_decode": "\u0001", "token_dumps": "\"\\u0101\"", "token_unicode": "\\u0101", "token_unicode_decimal": [257], "token_len": 1} 正确的 | |
| """ | |
import sys
import pdb

sys.path.append("../../")
from vocab import tokenizer_factory
from character_util import _decode_bytes_to_str

tokenizer1 = tokenizer_factory.get_tokenizer("openai/gpt-oss-20b")
tokenizer2 = tokenizer_factory.get_tokenizer("openai/gpt-4o")

vocab_1 = {v: k for k, v in tokenizer1.get_vocab().items()}
vocab_2 = {v: k for k, v in tokenizer2.get_vocab().items()}
min_vocab_size = min(len(vocab_1), len(vocab_2))
for i in range(min_vocab_size):
    if i == 188:
        # Stop one id before the problematic id 189 (see docstring) to step through it.
        pdb.set_trace()
    print(i)
    token_str1 = tokenizer1.convert_ids_to_tokens([i], skip_special_tokens=False)[0]
    token_str2 = tokenizer2.convert_ids_to_tokens([i], skip_special_tokens=False)[0]
    # Normalize the gpt-4o token to a displayable string before comparing.
    token_str2 = _decode_bytes_to_str(token_str2)
    if token_str1 != token_str2:
        pdb.set_trace()
        print(i, token_str1, token_str2)
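
# Follow-up spot check (illustrative): inspect only id 189 from the docstring.
# Expected, per the dumps above: tokenizer1 (gpt-oss-20b) shows "ā" (U+0101),
# while tokenizer2 (gpt-4o, tiktoken-backed) yields the raw control char "\u0001".
tok1_189 = tokenizer1.convert_ids_to_tokens([189], skip_special_tokens=False)[0]
tok2_189 = tokenizer2.convert_ids_to_tokens([189], skip_special_tokens=False)[0]
print(repr(tok1_189), repr(tok2_189), repr(_decode_bytes_to_str(tok2_189)))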