Spaces:
Sleeping
Sleeping
George Pantazopoulos
committed on
Commit
·
b60238d
1
Parent(s):
79660f1
feat: update number examples
Browse files
- playground_examples.py +53 -5
- playground_tokenizers.py +1 -0
playground_examples.py
CHANGED
|
@@ -1,12 +1,13 @@
|
|
| 1 |
-
default_user_input =
|
|
|
|
|
|
|
| 2 |
default_tokenizer_name_1 = "openai/gpt-4o"
|
| 3 |
default_tokenizer_name_2 = "Qwen/Qwen2.5-72B"
|
| 4 |
|
| 5 |
|
| 6 |
-
number_example = """127+677=804
|
| 7 |
-
127 + 677 = 804
|
| 8 |
-
|
| 9 |
-
1275 + 6773 = 8048"""
|
| 10 |
|
| 11 |
code_example = """for i in range(1, 101):
|
| 12 |
if i % 3 == 0 and i % 5 == 0:
|
|
@@ -23,6 +24,48 @@ spelling_example = """How do you spell "accommodate"?
|
|
| 23 |
How many letters are in the word "accommodate"?
|
| 24 |
How many r's are in the word strawberry?"""
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
examples = {
|
| 27 |
"number": {
|
| 28 |
"text": number_example,
|
|
@@ -39,4 +82,9 @@ examples = {
|
|
| 39 |
"tokenizer_1": default_tokenizer_name_1,
|
| 40 |
"tokenizer_2": default_tokenizer_name_2,
|
| 41 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
}
|
|
|
|
| 1 |
+
default_user_input = (
|
| 2 |
+
"""Replace this text in the input field to see how tokenization works."""
|
| 3 |
+
)
|
| 4 |
default_tokenizer_name_1 = "openai/gpt-4o"
|
| 5 |
default_tokenizer_name_2 = "Qwen/Qwen2.5-72B"
|
| 6 |
|
| 7 |
|
| 8 |
+
number_example = """127+677=804\n
|
| 9 |
+
127 + 677 = 804
|
| 10 |
+
"""
|
|
|
|
| 11 |
|
| 12 |
code_example = """for i in range(1, 101):
|
| 13 |
if i % 3 == 0 and i % 5 == 0:
|
|
|
|
| 24 |
How many letters are in the word "accommodate"?
|
| 25 |
How many r's are in the word strawberry?"""
|
| 26 |
|
| 27 |
+
|
| 28 |
+
greek_example = """
|
| 29 |
+
# Both mean 'I am sorry' though the latter one contains accent mark or stress mark
|
| 30 |
+
Συγνωμη
|
| 31 |
+
Συγνώμη
|
| 32 |
+
|
| 33 |
+
# Both refer to "bean"
|
| 34 |
+
Φασόλι
|
| 35 |
+
Φασούλι
|
| 36 |
+
|
| 37 |
+
# Both refer to "Saturday"
|
| 38 |
+
Σάββατο
|
| 39 |
+
Σάβατο
|
| 40 |
+
|
| 41 |
+
# Both translate to 'egg'
|
| 42 |
+
Αυγό
|
| 43 |
+
Αγβό
|
| 44 |
+
|
| 45 |
+
# They both translate to grandfather, though the latter is mostly used in Corfu Island
|
| 46 |
+
Παππούς
|
| 47 |
+
Πάπους
|
| 48 |
+
|
| 49 |
+
# They mean two completely different things!
|
| 50 |
+
Νόνα # refers to grandmother commonly observed in Ionion pelagos
|
| 51 |
+
Νονά # refers to godmother in Christianity
|
| 52 |
+
|
| 53 |
+
# Both refer to something new
|
| 54 |
+
καινούριος
|
| 55 |
+
καινούργιος
|
| 56 |
+
|
| 57 |
+
# Both refer to tomato
|
| 58 |
+
ντοματα
|
| 59 |
+
τοματα
|
| 60 |
+
|
| 61 |
+
τρενο
|
| 62 |
+
τραινο
|
| 63 |
+
|
| 64 |
+
# Singular / Plural versions of something 'innate'
|
| 65 |
+
εγγενής
|
| 66 |
+
εγγενείς
|
| 67 |
+
"""
|
| 68 |
+
|
| 69 |
examples = {
|
| 70 |
"number": {
|
| 71 |
"text": number_example,
|
|
|
|
| 82 |
"tokenizer_1": default_tokenizer_name_1,
|
| 83 |
"tokenizer_2": default_tokenizer_name_2,
|
| 84 |
},
|
| 85 |
+
"greek": {
|
| 86 |
+
"text": greek_example,
|
| 87 |
+
"tokenizer_1": default_tokenizer_name_1,
|
| 88 |
+
"tokenizer_2": "ilsp/Llama-Krikri-8B-Base",
|
| 89 |
+
},
|
| 90 |
}
|
playground_tokenizers.py
CHANGED
|
@@ -96,6 +96,7 @@ tokenizer_configs = [
|
|
| 96 |
TokenizerConfig("google/mt5-large", org="Google"),
|
| 97 |
TokenizerConfig("deepseek-ai/deepseek-coder-33b-instruct", org="DeepSeek"),
|
| 98 |
TokenizerConfig("deepseek-ai/DeepSeek-V3", org="DeepSeek"),
|
|
|
|
| 99 |
]
|
| 100 |
|
| 101 |
assert len(set([config.name_display for config in tokenizer_configs])) == len(
|
|
|
|
| 96 |
TokenizerConfig("google/mt5-large", org="Google"),
|
| 97 |
TokenizerConfig("deepseek-ai/deepseek-coder-33b-instruct", org="DeepSeek"),
|
| 98 |
TokenizerConfig("deepseek-ai/DeepSeek-V3", org="DeepSeek"),
|
| 99 |
+
TokenizerConfig("ilsp/Llama-Krikri-8B-Base", org="ILSP"),
|
| 100 |
]
|
| 101 |
|
| 102 |
assert len(set([config.name_display for config in tokenizer_configs])) == len(
|