Spaces:

xu-song
/

tokenizer-arena

Running

App Files Files Community

tokenizer-arena / config.py

xu-song

update

171654c 3 months ago

raw

history blame contribute delete

4.87 kB

	"""

	## characters

	- alphanumeric characters
	- numeric characters
	- special characters: A special character is a character that is not an alphabetic or numeric character.
	- ASCII control characters
	- punctuation marks
	- accent marks
	- 数学符号
	- whitespace:
	- https://en.wikipedia.org/wiki/Whitespace_character
	- https://emptycharacter.com/


	https://www.computerhope.com/jargon/s/specchar.htm
	"""

	import random
	from datasets import load_dataset

	default_user_input = """\
	Replace this text in the input field to see how tokenization works.
	Buenos días!
	Tokenizer 是自然语言处理（NLP）中的一个关键组件，它的主要作用是将人类语言文本转换为计算机可以理解的数字表示形式。
	ラグビーワールドカップ2023フランス"""
	# default_tokenizer_name_1 = "Meta/llama3"
	# default_tokenizer_name_1 = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"
	default_tokenizer_name_1 = "deepseek-ai/DeepSeek-R1"
	default_tokenizer_name_2 = "openai/gpt-4o"


	def get_sample_input():
	default_inputs = {
	"en": "Replace this text in the input field to see how tokenization works.",
	"zh-Hans": "",
	"es": "",
	"de": "",
	}
	random.seed(10) # For reproducibility
	lines = []
	for lang in default_inputs.keys():
	dataset = load_dataset("eson/cc100-samples", lang, split="train")
	print(dataset)
	print(1)
	return default_inputs


	examples = {
	"en": [
	["number: (10086 + 98) = 100184", "huggyllama/llama-7b", "bigscience/bloom"], #
	[
	"whitespace: 2spaces 8spaces\t1tab\t\t2tab\n1newline",
	"huggyllama/llama-7b",
	"google-bert/bert-base-cased",
	], # chatglm 有blank_n, bert丢掉了空格，
	# ！？｡＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
	[
	'punctuation: ,.:/?+="，。！？；【】〔〕〖〗',
	"google/gemma-7b",
	"huggyllama/llama-7b",
	], # llama词典有点小
	[
	"symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤",
	"baichuan-inc/Baichuan-7B",
	"huggyllama/llama-7b",
	],
	# ["special: [PAD] [UNK] [CLS] [SEP] [MASK] <\|system\|> <\|user\|> <\|assistant\|> <\|endoftext\|>", "", ""],
	],
	"zh": [
	[
	"空格测试： 2个空格 8个空格",
	"llama",
	"chatglm2_6b",
	], # chatglm 有blank_n,
	["标点测试：，。！？；", "baichuan_7b", "llama"],
	[
	"符号测试：🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤",
	"baichuan_7b",
	"llama",
	],
	["数字测试：(10086 + 98) = 100184", "baichuan_7b", "llama"],
	["中文简体：宽带，繁体：樂來", "baichuan_7b", "llama"],
	],
	}


	more_examples = [
	# bert系列
	(
	"google-bert/bert-base-cased",
	"google-bert/bert-base-uncased",
	"",
	"",
	), # # clue VS kplug， bert VS clue
	("bert-base-cased", "clue", "", "增加了[]()"),
	("roberta-chinese-clue", "kplug", "", ""),
	# llama系列 (基于sentencepiece)
	(
	"baichuan",
	"baichuan2",
	"baichuan2支持多空格，多个换行\n\n\n，do not add dummy prefix as Baichuan1",
	),
	("llama", "baichuan2", "baichuan2支持多空格，多个换行\n\n"),
	("llama", "chinese-llama-2-7b", ""),
	("llama", "llama3", "扩充词典"),
	("chinese-llama-lora-7b", "chinese-llama-2-7b", ""),
	# glm系列（基于sentencepiece）
	("glm", "chatglm1", ""),
	("chatglm1", "chatglm2", ""),
	# gpt2系列
	("gpt2", "moss", ""),
	("", "", ""),
	# openai系列（tiktoken）
	("qwen", "gpt_35_turbo", ""),
	("gpt4", "gpt-4o", "gpt35_turbo和gpt4 词典大小只有10万，gpt-4o有20万"),
	]

	lang = "en"

	example_types = [t[0].split(":")[0] for t in examples[lang]]


	def example_fn(example_idx):
	return examples[lang][example_idx]


	def get_more_example():
	import urllib.parse

	url_prefix = "https://huggingface.co/spaces/eson/tokenizer-arena"
	for tokenizer1, tokenizer2, text, comment in more_examples:
	full_url = f"{url_prefix}?tokenizer1={tokenizer1}&tokenizer2={tokenizer2}&text={urllib.parse.quote(text)}"
	print(full_url)


	if __name__ == "__main__":
	get_more_example()