wikitext-2-raw-v1 / tokenizer.json
goabonga's picture
Upload tokenizer files (vocab, config, README)
0a5a926 verified
{
"version": "1.0",
"added_tokens": [
{
"id": 0,
"content": "<pad>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<bos>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "<eos>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "Sequence",
"normalizers": [
{
"type": "NFC"
}
]
},
"pre_tokenizer": {
"type": "Sequence",
"pretokenizers": [
{
"type": "Whitespace"
},
{
"type": "Punctuation",
"behavior": "Isolated"
}
]
},
"decoder": {
"type": "WordLevel"
},
"special": [
0,
1,
2,
3
],
"model": {
"type": "WordLevel",
"vocab": {
"<pad>": 0,
"<unk>": 1,
"<bos>": 2,
"<eos>": 3,
"the": 4,
",": 5,
".": 6,
"of": 7,
"and": 8,
"@": 9,
"in": 10,
"to": 11,
"a": 12,
"=": 13,
"was": 14,
"\"": 15,
"-": 16,
"The": 17,
"'": 18,
"for": 19,
"with": 20,
"that": 21,
"as": 22,
"s": 23,
"were": 24,
"by": 25,
")": 26,
"(": 27,
"on": 28,
"is": 29,
"from": 30,
"at": 31,
"are": 32,
"gods": 33,
";": 34,
"an": 35,
"which": 36,
"be": 37,
"their": 38,
"his": 39,
"had": 40,
"her": 41,
"In": 42,
"it": 43,
"or": 44,
"\u2013": 45,
"not": 46,
"deities": 47,
"one": 48,
"but": 49,
"also": 50,
"he": 51,
"who": 52,
"other": 53,
"GA": 54,
"has": 55,
"first": 56,
"game": 57,
"aircraft": 58,
"its": 59,
"two": 60,
"have": 61,
"this": 62,
":": 63,
"she": 64,
"god": 65,
"all": 66,
"more": 67,
"they": 68,
"Egyptian": 69,
"time": 70,
"1": 71,
"She": 72,
"into": 73,
"team": 74,
"after": 75,
"He": 76,
"could": 77,
"year": 78,
"than": 79,
"been": 80,
"these": 81,
"season": 82,
"A": 83,
"It": 84,
"Valkyria": 85,
"during": 86,
"only": 87,
"2": 88,
"some": 89,
"being": 90,
"Little": 91,
"over": 92,
"when": 93,
"many": 94,
"such": 95,
"would": 96,
"most": 97,
"divine": 98,
"them": 99,
"Rock": 100,
"four": 101,
"Cullen": 102,
"up": 103,
"made": 104,
"Barker": 105,
"album": 106,
"4": 107,
"between": 108,
"Blue": 109,
"Jackets": 110,
"UK": 111,
"Fernandez": 112,
"about": 113,
"before": 114,
"work": 115,
"while": 116,
"I": 117,
"gold": 118,
"Chronicles": 119,
"can": 120,
"out": 121,
"world": 122,
"Columbus": 123,
"ship": 124,
"games": 125,
"used": 126,
"three": 127,
"000": 128,
"operations": 129,
"both": 130,
"like": 131,
"NHL": 132,
"through": 133,
"under": 134,
"\u2014": 135,
"no": 136,
"part": 137,
"This": 138,
"played": 139,
"3": 140,
"same": 141,
"7": 142,
"new": 143,
"form": 144,
"5": 145,
"dollar": 146,
"where": 147,
"There": 148,
"so": 149,
"him": 150,
"there": 151,
"war": 152,
"number": 153,
"said": 154,
"well": 155,
"Blackie": 156,
"human": 157,
"Kingdom": 158,
"Slayer": 159,
"each": 160,
"They": 161,
"became": 162,
"built": 163,
"called": 164,
"deity": 165,
"against": 166,
"although": 167,
"later": 168,
"South": 169,
"6": 170,
"early": 171,
"Arkansas": 172,
"including": 173,
"often": 174,
"N": 175,
"air": 176,
"British": 177,
"Egyptians": 178,
"points": 179,
"John": 180,
"years": 181,
"any": 182,
"long": 183,
"/": 184,
"small": 185,
"per": 186,
"second": 187,
"flying": 188,
"After": 189,
"around": 190,
"single": 191,
"class": 192,
"place": 193,
"following": 194,
"several": 195,
"United": 196,
"St": 197,
"New": 198,
"Atlanta": 199,
"down": 200,
"end": 201,
"Tower": 202,
"tower": 203,
"may": 204,
"commercial": 205,
"These": 206,
"important": 207,
"Egypt": 208,
"role": 209,
"began": 210,
"development": 211,
"along": 212,
"character": 213,
"within": 214,
"then": 215,
"did": 216,
"building": 217,
"state": 218,
"Navy": 219,
"record": 220,
"ships": 221,
"pilots": 222,
"aerodromes": 223,
"Zr\u00ednyi": 224,
"military": 225,
"received": 226,
"November": 227,
"As": 228,
"order": 229,
"day": 230,
"Flower": 231,
"Arsenal": 232,
"name": 233,
"women": 234,
"described": 235,
"given": 236,
"national": 237,
"power": 238,
"Austro": 239,
"Hungarian": 240,
"Heaven": 241,
"aviation": 242,
"2011": 243,
"series": 244,
"May": 245,
"original": 246,
"main": 247,
"found": 248,
"last": 249,
"just": 250,
"took": 251,
"live": 252,
"On": 253,
"life": 254,
"12": 255,
"species": 256,
"believed": 257,
"transport": 258,
"CAA": 259,
"player": 260,
"off": 261,
"m": 262,
"another": 263,
"history": 264,
"Fairies": 265,
"t": 266,
"family": 267,
"career": 268,
"Erzherzog": 269,
"temples": 270,
"Amun": 271,
"different": 272,
"known": 273,
"due": 274,
"song": 275,
"February": 276,
"arsenal": 277,
"activities": 278,
"back": 279,
"religious": 280,
"mm": 281,
"religion": 282,
"War": 283,
"While": 284,
"previous": 285,
"based": 286,
"present": 287,
"stated": 288,
"light": 289,
"much": 290,
"World": 291,
"10": 292,
"way": 293,
"local": 294,
"But": 295,
"15": 296,
"even": 297,
"Gambia": 298,
"trade": 299,
"inch": 300,
"Some": 301,
"Ra": 302,
"third": 303,
"take": 304,
"those": 305,
"system": 306,
"make": 307,
"until": 308,
"events": 309,
"Building": 310,
"came": 311,
"8": 312,
"use": 313,
"guns": 314,
"Union": 315,
"training": 316,
"contract": 317,
"track": 318,
"humans": 319,
"carbonaria": 320,
"III": 321,
"released": 322,
"During": 323,
"names": 324,
"do": 325,
"considered": 326,
"designed": 327,
"particular": 328,
"include": 329,
"named": 330,
"James": 331,
"States": 332,
"troops": 333,
"gun": 334,
"19": 335,
"sister": 336,
"9": 337,
"start": 338,
"represented": 339,
"Italian": 340,
"Ferdinand": 341,
"Horus": 342,
"Aviation": 343,
"II": 344,
"theme": 345,
"franchise": 346,
"13": 347,
"own": 348,
"continued": 349,
"forms": 350,
"others": 351,
"since": 352,
"$": 353,
"0": 354,
"American": 355,
"people": 356,
"[": 357,
"]": 358,
"All": 359,
"km": 360,
"mother": 361,
"private": 362,
"Association": 363,
"nature": 364,
"SMS": 365,
"Max": 366,
"maat": 367,
"goddess": 368,
"texts": 369,
"fungus": 370,
"outside": 371,
"large": 372,
"characters": 373,
"members": 374,
"carried": 375,
"if": 376,
"throughout": 377,
"five": 378,
"home": 379,
"times": 380,
"General": 381,
"support": 382,
"fire": 383,
"required": 384,
"best": 385,
"play": 386,
"When": 387,
"produced": 388,
"leading": 389,
"2012": 390,
"June": 391,
"established": 392,
"sent": 393,
"away": 394,
"signed": 395,
"Most": 396,
"common": 397,
"20": 398,
"remained": 399,
"National": 400,
"Her": 401,
"\u2019": 402,
"late": 403,
"ft": 404,
"feet": 405,
"win": 406,
"Although": 407,
"appeared": 408,
"draft": 409,
"However": 410,
"88": 411,
"rituals": 412,
"Isis": 413,
"cent": 414,
"airspace": 415,
"cup": 416,
"January": 417,
"story": 418,
"returned": 419,
"release": 420,
"control": 421,
"specific": 422,
"elements": 423,
"limited": 424,
"At": 425,
"position": 426,
"next": 427,
"written": 428,
"involved": 429,
"music": 430,
"felt": 431,
"public": 432,
"still": 433,
"23": 434,
"Club": 435,
"men": 436,
"held": 437,
"18": 438,
"River": 439,
"south": 440,
"now": 441,
"band": 442,
"Croydon": 443,
"death": 444,
"24": 445,
"conducted": 446,
"football": 447,
"maskray": 448,
"cm": 449,
"group": 450,
"high": 451,
"Roman": 452,
"sun": 453,
"Carey": 454,
"port": 455,
"struck": 456,
"India": 457,
"roles": 458,
"film": 459,
"CAT": 460,
"flight": 461,
"creation": 462,
"sector": 463,
"airports": 464,
"pilot": 465,
"noise": 466,
"coin": 467,
"developed": 468,
"praised": 469,
"players": 470,
"With": 471,
"city": 472,
"major": 473,
"action": 474,
"recorded": 475,
"general": 476,
"art": 477,
"set": 478,
"half": 479,
"goals": 480,
"For": 481,
"giving": 482,
"started": 483,
"included": 484,
"month": 485,
"less": 486,
"father": 487,
"area": 488,
"goal": 489,
"published": 490,
"Christian": 491,
"sometimes": 492,
"never": 493,
"17": 494,
"across": 495,
"final": 496,
"range": 497,
"Nash": 498,
"loss": 499,
"2013": 500,
"Got": 501,
"inches": 502,
"industry": 503,
"Lightning": 504,
"cult": 505,
"images": 506,
"temple": 507,
"EASA": 508,
"aerodrome": 509,
"Mint": 510,
"Nameless": 511,
"black": 512,
"2010": 513,
"forces": 514,
"using": 515,
"top": 516,
"once": 517,
"either": 518,
"without": 519,
"become": 520,
"numbers": 521,
"works": 522,
"force": 523,
"however": 524,
"full": 525,
"popular": 526,
"design": 527,
"G": 528,
"originally": 529,
"issue": 530,
"August": 531,
"point": 532,
"behind": 533,
"C": 534,
"further": 535,
"country": 536,
"performed": 537,
"11": 538,
"North": 539,
"larger": 540,
"Book": 541,
"Society": 542,
"children": 543,
"planning": 544,
"though": 545,
"2006": 546,
"round": 547,
"scoring": 548,
"terms": 549,
"Penguins": 550,
"observatory": 551,
"Be": 552,
"Way": 553,
"Savannah": 554,
"civil": 555,
"performance": 556,
"business": 557,
"fleet": 558,
"operation": 559,
"focus": 560,
"kings": 561,
"creator": 562,
"Osiris": 563,
"King": 564,
"flights": 565,
"airfields": 566,
"growth": 567,
"battle": 568,
"To": 569,
"certain": 570,
"themselves": 571,
"command": 572,
"lives": 573,
"addition": 574,
"despite": 575,
"thought": 576,
"By": 577,
"review": 578,
"songs": 579,
"Two": 580,
"six": 581,
"Civil": 582,
"arms": 583,
"State": 584,
"allowed": 585,
"personal": 586,
"various": 587,
"1862": 588,
"near": 589,
"strong": 590,
"shot": 591,
"Sanford": 592,
"above": 593,
"purposes": 594,
"moved": 595,
"few": 596,
"child": 597,
"level": 598,
"influence": 599,
"things": 600,
"society": 601,
"cover": 602,
"international": 603,
"hours": 604,
"plain": 605,
"complex": 606,
"size": 607,
"Hockey": 608,
"deal": 609,
"York": 610,
"former": 611,
"BC": 612,
"metres": 613,
"Fingal": 614,
"speed": 615,
"connected": 616,
"difficult": 617,
"Sri": 618,
"Ancona": 619,
"goddesses": 620,
"worship": 621,
"traditional": 622,
"Private": 623,
"standards": 624,
"playing": 625,
"video": 626,
"gameplay": 627,
"making": 628,
"opening": 629,
"help": 630,
"always": 631,
"move": 632,
"foreign": 633,
"great": 634,
"combined": 635,
"created": 636,
"seven": 637,
"available": 638,
"April": 639,
"overall": 640,
"500": 641,
"particularly": 642,
"generally": 643,
"provided": 644,
"noted": 645,
"period": 646,
"Museum": 647,
"how": 648,
"self": 649,
"served": 650,
"hold": 651,
"28": 652,
"me": 653,
"official": 654,
"March": 655,
"head": 656,
"ordnance": 657,
"consisted": 658,
"Confederate": 659,
"activity": 660,
"naval": 661,
"surrounding": 662,
"significant": 663,
"finally": 664,
"16": 665,
"1923": 666,
"competition": 667,
"1918": 668,
"FIFA": 669,
"1998": 670,
"Cup": 671,
"scored": 672,
"greater": 673,
"21": 674,
"estimated": 675,
"term": 676,
"Star": 677,
"Carter": 678,
"clock": 679,
"million": 680,
"Mason": 681,
"line": 682,
"streak": 683,
"reached": 684,
"road": 685,
"put": 686,
"Pope": 687,
"result": 688,
"according": 689,
"monitors": 690,
"myth": 691,
"king": 692,
"sky": 693,
"gliders": 694,
"Geopyxis": 695,
"Patterson": 696,
"PlayStation": 697,
"real": 698,
"low": 699,
"missions": 700,
"minor": 701,
"Each": 702,
"heavy": 703,
"types": 704,
"Army": 705,
"meaning": 706,
"himself": 707,
"weapons": 708,
"young": 709,
"mostly": 710,
"proposed": 711,
"again": 712,
"beginning": 713,
"upon": 714,
"Its": 715,
"worked": 716,
"similar": 717,
"September": 718,
"appearance": 719,
"too": 720,
"left": 721,
"His": 722,
"located": 723,
"MacArthur": 724,
"US": 725,
"newly": 726,
"B": 727,
"club": 728,
"officers": 729,
"rumors": 730,
"rifles": 731,
"caliber": 732,
"old": 733,
"nearly": 734,
"among": 735,
"lead": 736,
"placed": 737,
"&": 738,
"continue": 739,
"Frederick": 740,
"ones": 741,
"professional": 742,
"themed": 743,
"wife": 744,
"little": 745,
"25": 746,
"wrote": 747,
"closed": 748,
"services": 749,
"Old": 750,
"2007": 751,
"2008": 752,
"access": 753,
"2009": 754,
"usually": 755,
"1990": 756,
"League": 757,
"Draft": 758,
"Kings": 759,
"fans": 760,
"free": 761,
"goaltender": 762,
"losing": 763,
"debut": 764,
"seen": 765,
"regulation": 766,
"silver": 767,
"fixed": 768,
"blockade": 769,
"crew": 770,
"controlled": 771,
"armor": 772,
"kg": 773,
"means": 774,
"e": 775,
"formed": 776,
"battleships": 777,
"offerings": 778,
"myths": 779,
"represent": 780,
"example": 781,
"Blood": 782,
"fly": 783,
"engaged": 784,
"Pilot": 785,
"glider": 786,
"European": 787,
"gliding": 788,
"Aircraft": 789,
"recreational": 790,
"fruitbodies": 791,
"Longacre": 792,
"nation": 793,
"Second": 794,
"multiple": 795,
"2014": 796,
"return": 797,
"route": 798,
"rest": 799,
"units": 800,
"having": 801,
"act": 802,
"turns": 803,
"movement": 804,
"Gallian": 805,
"soldiers": 806,
"One": 807,
"towards": 808,
"ancient": 809,
"chance": 810,
"gave": 811,
"majority": 812,
"material": 813,
"An": 814,
"guitar": 815,
"modern": 816,
"instruments": 817,
"pre": 818,
"People": 819,
"owned": 820,
"aspects": 821,
"featured": 822,
"damaged": 823,
"S": 824,
"\u00c6sthetic": 825,
"remaining": 826,
"14": 827,
"ammunition": 828,
"instead": 829,
"Totten": 830,
"1861": 831,
"completed": 832,
"because": 833,
"That": 834,
"issued": 835,
"oil": 836,
"miles": 837,
"attempted": 838,
"brought": 839,
"days": 840,
"Dunnington": 841,
"bill": 842,
"Children": 843,
"Mary": 844,
"Dorothy": 845,
"amateur": 846,
"spent": 847,
"won": 848,
"give": 849,
"attack": 850,
"Set": 851,
"shaped": 852,
"disc": 853,
"base": 854,
"close": 855,
"region": 856,
"resulted": 857,
"pick": 858,
"fired": 859,
"Rick": 860,
"Howson": 861,
"prior": 862,
"50": 863,
"tied": 864,
"playoff": 865,
"might": 866,
"100": 867,
"break": 868,
"earlier": 869,
"Vatican": 870,
"observations": 871,
"stage": 872,
"says": 873,
"71": 874,
"acting": 875,
"Bulloch": 876,
"primary": 877,
"pound": 878,
"success": 879,
"ground": 880,
"opposite": 881,
"cancer": 882,
"introduced": 883,
"Karl": 884,
"bombardment": 885,
"wing": 886,
"rarely": 887,
"beliefs": 888,
"characteristics": 889,
"combinations": 890,
"Atenism": 891,
"depicted": 892,
"belief": 893,
"earth": 894,
"chaos": 895,
"Hathor": 896,
"ba": 897,
"increasingly": 898,
"Reign": 899,
"albums": 900,
"registered": 901,
"Licence": 902,
"facilities": 903,
"coins": 904,
"McKay": 905,
"unit": 906,
"features": 907,
"positive": 908,
"very": 909,
"turn": 910,
"assigned": 911,
"something": 912,
"remain": 913,
"entire": 914,
"composed": 915,
"becomes": 916,
"evidence": 917,
"short": 918,
"armed": 919,
"separate": 920,
"ways": 921,
"staff": 922,
"response": 923,
"what": 924,
"parts": 925,
"body": 926,
"needed": 927,
"essential": 928,
"engine": 929,
"anime": 930,
"Music": 931,
"production": 932,
"need": 933,
"initially": 934,
"lyrics": 935,
"According": 936,
"style": 937,
"latter": 938,
"returning": 939,
"issues": 940,
"planned": 941,
"title": 942,
"eventually": 943,
"History": 944,
"1997": 945,
"commander": 946,
"Governor": 947,
"Major": 948,
"increased": 949,
"30": 950,
"federal": 951,
"government": 952,
"transferred": 953,
"Captain": 954,
"should": 955,
"necessary": 956,
"subject": 957,
"morning": 958,
"percussion": 959,
"supply": 960,
"turned": 961,
"opened": 962,
"remainder": 963,
"W": 964,
"ironclad": 965,
"scope": 966,
"briefly": 967,
"added": 968,
"smaller": 969,
"October": 970,
"side": 971,
"Great": 972,
"century": 973,
"Later": 974,
"shell": 975,
"association": 976,
"designated": 977,
"Art": 978,
"Spring": 979,
"collaboration": 980,
"died": 981,
"critic": 982,
"show": 983,
"Following": 984,
"schools": 985,
"\u00a3": 986,
"studio": 987,
"Andrew": 988,
"enough": 989,
"sacred": 990,
"taking": 991,
"Warne": 992,
"comes": 993,
"1988": 994,
"Louis": 995,
"regional": 996,
"Another": 997,
"levels": 998,
"Australia": 999
},
"unk_token": "<unk>"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": 2,
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": 3,
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": 2,
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 0
}
},
{
"SpecialToken": {
"id": 3,
"type_id": 0
}
}
],
"special_tokens": {
"<pad>": {
"id": 0,
"ids": [
0
],
"tokens": [
"<pad>"
]
},
"<unk>": {
"id": 1,
"ids": [
1
],
"tokens": [
"<unk>"
]
},
"<bos>": {
"id": 2,
"ids": [
2
],
"tokens": [
"<bos>"
]
},
"<eos>": {
"id": 3,
"ids": [
3
],
"tokens": [
"<eos>"
]
}
}
}
}