Commit d9779a0 by Gül Sena Altıntaş
Parent(s): f58b113

Added support for showing newlines

- TODO: add toggle button to include newlines in the tokenization
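The TODO above is not implemented in this commit. A minimal sketch of how such a toggle could be wired up in Gradio follows; the checkbox, the `include_newlines` flag, and the `tokenize_text` stub are all hypothetical and not code from this repository.

```python
import gradio as gr

def tokenize_text(text: str, include_newlines: bool) -> str:
    # Hypothetical stand-in for the real tokenization + HTML rendering in app.py.
    if not include_newlines:
        text = text.replace("\n", " ")
    return f"tokenizing {len(text)} characters (newlines {'kept' if include_newlines else 'stripped'})"

with gr.Blocks() as demo:
    text_in = gr.Textbox(lines=4, label="Text")
    include_newlines = gr.Checkbox(value=True, label="Include newlines in tokenization")
    out = gr.HTML()
    # Re-run the (stubbed) tokenization whenever the text or the toggle changes.
    text_in.change(tokenize_text, [text_in, include_newlines], out)
    include_newlines.change(tokenize_text, [text_in, include_newlines], out)

demo.launch()
```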
README.md CHANGED
@@ -11,3 +11,6 @@ license: apache-2.0
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+- [x] next up i want to add some sample texts that are interesting
+- [x] normalization of the tokenization
app.py CHANGED
@@ -228,6 +228,9 @@ def generate_interactive_tokenization(results):
         for i, token in enumerate(result["tokens"]):
             token_text = token["text"]
             display_text = token_text if token_text.strip() else "·"
+            if token_text == "<newline>":
+                html_parts.append("<br>")
+                continue

             # Determine token class
             token_class = f"token token-{token['type']}"
@@ -243,13 +246,17 @@ def generate_interactive_tokenization(results):
                 token_text.replace("\\", "\\\\")
                 .replace("'", "\\'")
                 .replace('"', '\\"')
-                .replace("\n", "\\n")
                 .replace("\r", "\\r")
+                .replace("\n", "\\n")
             )

-            escaped_display =
+            escaped_display = (
+                display_text.replace('"', "&quot;")
+                .replace("'", "&#39;")
+                .replace("\r", "\n")
+            )

-            # Use inline event handlers that
+            # Use inline event handlers that work in Gradio
             html_parts.append(f"""<span class="{token_class}"
                 id="{token_id}"
                 data-text="{token_text.replace('"', "&quot;").replace("'", "&#39;")}"
@@ -312,11 +319,6 @@ def generate_token_ids_display(results):
             f"**Stats**: {len(token_ids)} total tokens, {unique_ids} unique IDs"
         )

-        # Show ID ranges
-        id_values = [token["id"] for token in result["tokens"]]
-        if id_values:
-            output.append(f"**ID Range**: {min(id_values)} - {max(id_values)}")
-
     return "\n".join(output)


@@ -663,7 +665,6 @@ with gr.Blocks(
         norm_eff, norm_html, norm_ids = generate_basic_comparison(
             normalized_results
         )
-        print(normalized_text)

         # Combine or show separately
         combined_html = f"<h3>Normalized Text: {normalized_text}</h3>{norm_html}\n<h2>Original</h2>{orig_html}"
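The app.py change boils down to: emit a `<br>` for the `<newline>` sentinel and an escaped `<span>` for every other token. A standalone sketch of that rendering logic is shown below (simplified; `render_tokens` and its input dicts are illustrative, not the actual app.py function, and `html.escape` stands in for the manual entity replacements above).

```python
import html

def render_tokens(tokens):
    """Render token dicts (like those built in utils.py) into HTML.

    Tokens whose text is the "<newline>" sentinel become <br>; all other
    token text is HTML-escaped into a span, with whitespace-only tokens
    displayed as a middle dot so they stay visible.
    """
    parts = []
    for token in tokens:
        text = token["text"]
        if text == "<newline>":
            parts.append("<br>")
            continue
        display = text if text.strip() else "·"
        parts.append(
            f'<span class="token token-{token["type"]}" '
            f'data-text="{html.escape(text, quote=True)}">{html.escape(display)}</span>'
        )
    return "".join(parts)

print(render_tokens([
    {"text": "Hello", "type": "word"},
    {"text": "<newline>", "type": "special"},
    {"text": " world", "type": "word"},
]))
```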
utils.py CHANGED
@@ -1,7 +1,7 @@
 import os
 import re
-import unicodedata
 import traceback
+import unicodedata

 import tiktoken
 from transformers import AutoTokenizer
@@ -12,16 +12,17 @@ from mappings import MODEL_MAP, TOKENIZER_INFO
 class TokenMonsterTokenizer:
     def __init__(self, name):
         import tokenmonster
+
         self.name = name
         self.vocab = tokenmonster.load(name.split("/")[-1])
-
+
     def __call__(self, text, **kwargs):
         ids = list(self.vocab.tokenize(text))
         return {"input_ids": ids}
-
+
     def convert_ids_to_tokens(self, ids):
         return [self.vocab.decode(id_) for id_ in ids]
-
+

 def get_token_type(token_text):
     if re.match(r"^\s+$", token_text):
@@ -73,27 +74,37 @@ def is_subword(token_text, model, is_first):
 def tokenize_with_tiktoken(text, model):
     encoding = "cl100k_base" if model == "gpt-4" else "gpt2"
     enc = tiktoken.get_encoding(encoding)
-    tokens = enc.encode(text)

     token_data = []
     current_pos = 0
+    for text_ in text.split("\n"):
+        tokens = enc.encode(text_ + "\n")

-    for i, token_id in enumerate(tokens):
-        token_text = enc.decode([token_id])
-        token_type = get_token_type(token_text)
-        subword = is_subword(token_text, model, i == 0)
+        for i, token_id in enumerate(tokens):
+            token_text = enc.decode([token_id])
+            token_type = get_token_type(token_text)
+            subword = is_subword(token_text, model, i == 0)

+            token_data.append(
+                {
+                    "text": token_text,
+                    "id": int(token_id),
+                    "type": token_type,
+                    "is_subword": subword,
+                    "bytes": len(token_text.encode("utf-8")),
+                    "position": i,
+                }
+            )
+            current_pos += len(token_text)
         token_data.append(
             {
-                "text": token_text,
-                "id": int(token_id),
-                "type": token_type,
-                "is_subword": subword,
-                "bytes": len(token_text.encode("utf-8")),
-                "position": i,
+                "text": "<newline>",
+                "id": 0,
+                "type": "special",
+                "is_subword": False,
+                "position": len(token_data),
             }
         )
-        current_pos += len(token_text)

     return {
         "model": TOKENIZER_INFO[model]["name"],
@@ -117,37 +128,50 @@ def tokenize_with_hf(text, model):
             "tokens": [],
             "error": "HF_TOKEN not found in environment. Please add your HuggingFace token to Space secrets.",
         }
-
+
     if "tokenmonster" in model_name:
         tokenizer = TokenMonsterTokenizer("englishcode-32000-consistent-v1")
     else:
         tokenizer = AutoTokenizer.from_pretrained(
-
-
+            model_name, token=hf_token, trust_remote_code=True
+        )
     token_data = []
-    encoding = tokenizer(
-        text,
-        return_offsets_mapping=False,
-        return_tensors=None,
-        add_special_tokens=False,
-    )
-
-    token_ids = encoding["input_ids"]
-    tokens = tokenizer.convert_ids_to_tokens(token_ids)
-    # print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
-
-    for i, (token_id, token_text) in enumerate(zip(token_ids, tokens)):
-        token_type = get_token_type(token_text)
-        subword = is_subword(token_text, model, i == 0)
+    for text_ in text.split("\n"):
+        text_ = text_ + "\n"
+
+        encoding = tokenizer(
+            text_,
+            return_offsets_mapping=False,
+            return_tensors=None,
+            add_special_tokens=False,
+        )

+        token_ids = encoding["input_ids"]
+        tokens = tokenizer.convert_ids_to_tokens(token_ids)
+        # print(model_name, text, "\n", tokens, token_ids)
+        # print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
+
+        for i, (token_id, token_text) in enumerate(zip(token_ids, tokens)):
+            token_type = get_token_type(token_text)
+            subword = is_subword(token_text, model, i == 0)
+
+            token_data.append(
+                {
+                    "text": token_text,
+                    "id": token_id,  # int(token_id),
+                    "type": token_type,
+                    "is_subword": subword,
+                    "bytes": len(token_text.encode("utf-8")),
+                    "position": i,
+                }
+            )
         token_data.append(
             {
-                "text": token_text,
-                "id": int(token_id),
-                "type": token_type,
-                "is_subword": subword,
-                "bytes": len(token_text.encode("utf-8")),
-                "position": i,
+                "text": "<newline>",
+                "id": 0,
+                "type": "special",
+                "is_subword": False,
+                "position": len(token_data),
             }
         )

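The utils.py change applies the same idea to both tokenizer backends: split the input on "\n", tokenize each line with its newline re-attached, and append a `<newline>` sentinel entry after every line. A self-contained sketch of that loop using tiktoken's gpt2 encoding is below (the helper name `tokenize_lines` is illustrative, not a function from this repository).

```python
import tiktoken

def tokenize_lines(text: str) -> list[dict]:
    """Tokenize line by line, inserting a "<newline>" sentinel after each line,
    mirroring the structure built in tokenize_with_tiktoken above."""
    enc = tiktoken.get_encoding("gpt2")
    token_data = []
    for line in text.split("\n"):
        for i, token_id in enumerate(enc.encode(line + "\n")):
            token_text = enc.decode([token_id])
            token_data.append({"text": token_text, "id": int(token_id), "position": i})
        # Sentinel consumed by the HTML renderer in app.py (shown as <br>).
        token_data.append({"text": "<newline>", "id": 0, "type": "special"})
    return token_data

print(tokenize_lines("Hello world\nSecond line"))
```

As in the diff, a trailing "\n" is appended to every line before encoding, including the last one, so the resulting token stream can differ slightly from tokenizing the raw text in a single pass.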