Gül Sena Altıntaş committed
Commit ce07484 · 1 Parent(s): 44cdae3

Further improvements

Files changed (4)
  1. app.py +307 -74
  2. mappings.py +11 -1
  3. requirements.txt +4 -1
  4. utils.py +536 -70
app.py CHANGED
@@ -1,4 +1,5 @@
 from collections import Counter
+from pathlib import Path
 
 import gradio as gr
 import pandas as pd
@@ -6,12 +7,44 @@ import plotly.express as px
 import plotly.graph_objects as go
 
 from utils import (
+    clean_token_display,
     get_normalization_methods,
     normalize_text,
+    tokenize_w_tekken,
+    tokenize_with_byt5,
     tokenize_with_hf,
     tokenize_with_tiktoken,
 )
 
+TIKTOKENS = ["gpt-4o", "gpt-2"]
+HF = ["llama-3", "gemma-2", "qwen3", "mbert", "phi-3", "xglm", "bloom",
+      "aya-expanse", "comma", "tokenmonster", "byt5"]
+available_tokenizers = TIKTOKENS + HF + ["tekken"]
+pre_selected_tokenizers = []
+OUT_FILE = Path("paper-outs.txt")
+if not OUT_FILE.exists():
+    OUT_FILE.touch()
+
+
+def tokenize(model, text):
+    if model in ["gpt-4", "gpt-2", "gpt-4o"]:
+        toks = tokenize_with_tiktoken(text, model)
+    elif model in ["tekken"]:
+        toks = tokenize_w_tekken(text, model)
+    elif "byt5" in model:
+        toks = tokenize_with_byt5(text, model)
+    else:
+        toks = tokenize_with_hf(text, model)
+    # Log every run for later inspection; UTF-8 so non-ASCII tokens survive
+    with open(OUT_FILE, "a", encoding="utf-8") as file:
+        file.write(toks["model"] + "\n")
+        file.write(f"Text: {text}\n")
+        file.write(",".join(str(t["text"]) for t in toks["tokens"]) + "\n")
+        file.write("\n")
+    return toks
+
 
 def compare_tokenizers(text, selected_models, show_details=False):
     if not text.strip():
@@ -20,11 +53,7 @@ def compare_tokenizers(text, selected_models, show_details=False):
     results = {}
 
     for model in selected_models:
-        if model in ["gpt-4", "gpt-2"]:
-            results[model] = tokenize_with_tiktoken(text, model)
-        else:
-            results[model] = tokenize_with_hf(text, model)
-
+        results[model] = tokenize(model, text)
+
     # Generate outputs
     efficiency_output, tokenization_html, token_ids_output = generate_basic_comparison(
        results
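Each call to the tokenize dispatcher above appends a three-line record to paper-outs.txt (display name, input text, comma-joined token texts) followed by a blank line. A minimal sketch of reading the log back, assuming exactly that layout; parse_paper_outs is a hypothetical helper, and the comma join is lossy for tokens that themselves contain commas:

    from pathlib import Path

    def parse_paper_outs(path="paper-outs.txt"):
        """Yield (model, text, tokens) triples from the append-only log."""
        records = Path(path).read_text(encoding="utf-8").split("\n\n")
        for rec in records:
            lines = rec.splitlines()
            if len(lines) < 3:
                continue  # skip trailing/partial records
            model = lines[0]
            text = lines[1].removeprefix("Text: ")
            tokens = lines[2].split(",")  # lossy if a token contains a comma
            yield model, text, tokens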
@@ -73,6 +102,7 @@ def generate_basic_comparison(results):
 
 
 def generate_interactive_tokenization(results):
+    # TODO: main visualization
     """Generate HTML with working hover highlighting across tokenizers"""
     if not results:
         return "<p>No tokenization results to display.</p>"
@@ -170,6 +200,125 @@ def generate_interactive_tokenization(results):
         display: inline-block;
         justify-content: space-between;
     }
+
+    /* Multi-token span styles */
+    .token-span-container {
+        display: inline-flex;
+        margin: 2px;
+        cursor: pointer;
+    }
+
+    .token-multi-span {
+        /* Distinctive background pattern for multi-token spans */
+        background: repeating-linear-gradient(
+            45deg,
+            transparent,
+            transparent 2px,
+            rgba(0,0,0,0.1) 2px,
+            rgba(0,0,0,0.1) 4px
+        );
+    }
+
+    .token-span-part {
+        margin: 0 !important;
+        border-radius: 0 !important;
+        border-right: none !important;
+        position: relative;
+        padding: 4px 6px;
+        border: 1px dashed rgba(0,0,0,0.3) !important;
+        pointer-events: none; /* Prevent individual box clicks */
+        min-width: 20px;
+        text-align: center;
+        font-size: 11px;
+    }
+
+    .token-span-first {
+        border-radius: 4px 0 0 4px !important;
+    }
+
+    .token-span-last {
+        border-radius: 0 4px 4px 0 !important;
+        border-right: 1px solid !important;
+    }
+
+    /* Connecting lines between boxes */
+    .token-span-part:not(.token-span-last)::after {
+        content: '';
+        position: absolute;
+        top: 0;
+        right: -1px;
+        width: 1px;
+        height: 100%;
+        background: rgba(0,0,0,0.3);
+        z-index: 1;
+    }
+
+    /* Hover effect for the entire multi-token span */
+    .token-span-container:hover .token-span-part {
+        transform: scale(1.05);
+        box-shadow: 0 2px 8px rgba(0,0,0,0.2);
+    }
+
+    .token-span-container.highlighted .token-span-part {
+        background: #ff6b6b !important;
+        border-color: #e55353 !important;
+        color: white !important;
+        box-shadow: 0 0 10px rgba(255, 107, 107, 0.5) !important;
+        transform: scale(1.1) !important;
+        z-index: 100 !important;
+    }
+
+    /* Different patterns for different token types when multi-span */
+    .token-multi-span.token-word .token-span-part {
+        background: #e8f5e8;
+        border-color: #4caf50;
+        color: #2e7d32;
+    }
+    .token-multi-span.token-number .token-span-part {
+        background: #f3e5f5;
+        border-color: #9c27b0;
+        color: #7b1fa2;
+    }
+    .token-multi-span.token-punctuation .token-span-part {
+        background: #ffebee;
+        border-color: #f44336;
+        color: #c62828;
+    }
 </style>
 
 <div class="highlight-info" id="highlight-info"></div>
@@ -208,6 +357,40 @@ def generate_interactive_tokenization(results):
             info.style.display = 'none';
         }
     }
+
+    function highlightTokens(targetText) {
+        // Clear all previous highlights
+        document.querySelectorAll('.token, .token-span-container').forEach(function(element) {
+            element.classList.remove('highlighted');
+        });
+
+        // Highlight matching tokens and spans
+        let count = 0;
+
+        // Single tokens
+        document.querySelectorAll('.token').forEach(function(token) {
+            if (token.getAttribute('data-text') === targetText) {
+                token.classList.add('highlighted');
+                count++;
+            }
+        });
+
+        // Multi-token spans
+        document.querySelectorAll('.token-span-container').forEach(function(span) {
+            if (span.getAttribute('data-text') === targetText) {
+                span.classList.add('highlighted');
+                count++;
+            }
+        });
+
+        // Show info
+        const info = document.getElementById('highlight-info');
+        if (info) {
+            const displayText = targetText === ' ' ? '(space)' : targetText;
+            info.textContent = '"' + displayText + '" appears in ' + count + ' positions';
+            info.style.display = 'block';
+        }
+    }
 </script>
 """)
 
@@ -239,10 +422,14 @@ def generate_interactive_tokenization(results):
         subword_count = 0
         for i, token in enumerate(result["tokens"]):
             token_text = token["text"]
+            token_text = clean_token_display(token_text)
             display_text = token_text if token_text.strip() else "·"
             if token_text == "<newline>":
                 html_parts.append("<br>")
                 continue
+            # Check if this token spans multiple token IDs
+            token_ids = token["id"] if isinstance(token["id"], list) else [token["id"]]
+            is_multi_token = len(token_ids) > 1
 
             # Determine token class
             token_class = f"token token-{token['type']}"
@@ -268,22 +455,72 @@ def generate_interactive_tokenization(results):
                 .replace("\r", "\n")
             )
 
-            # Use inline event handlers that work in Gradio
-            html_parts.append(f"""<span class="{token_class}"
-                id="{token_id}"
-                data-text="{token_text.replace('"', "&quot;").replace("'", "&#39;")}"
-                data-id="{token["id"]}"
-                data-position="{i}"
-                data-model="{model}"
-                title="Text: '{token_text}' | ID: {token["id"]} | Type: {token["type"]} | Subword: {token["is_subword"]}"
-                onmouseover="highlightTokens('{escaped_text}')"
-                onmouseout="clearHighlights()"
-                onclick="alert('Token: \\'{escaped_text}\\'\\nID: {token["id"]}\\nModel: {model}')">{escaped_display}</span>""")
+            if is_multi_token:
+                # Create a container for the multi-token span
+                span_id = f"span_{model}_{i}"
+                token_ids_str = ",".join(map(str, token_ids))
+
+                html_parts.append(f"""<span class="token-span-container"
+                    id="{span_id}_container"
+                    data-text="{token_text.replace('"', "&quot;").replace("'", "&#39;")}"
+                    data-ids="{token_ids_str}"
+                    data-position="{i}"
+                    data-model="{model}"
+                    onmouseover="highlightTokens('{escaped_text}')"
+                    onmouseout="clearHighlights()"
+                    onclick="alert('Token: \\'{escaped_text}\\'\\nIDs: [{token_ids_str}]\\nModel: {model}\\nSpans {len(token_ids)} token IDs')"
+                    title="Text: '{token_text}' | IDs: [{token_ids_str}] | Type: {token["type"]} | Subword: {token["is_subword"]}">""")
+
+                # Create individual boxes for each token ID, acting as one unit
+                for j, tid in enumerate(token_ids):
+                    token_id = f"token_{model}_{i}_{j}"
+                    box_class = f"{token_class} token-span-part"
+                    box_content = ""
+
+                    # Add position indicators for styling
+                    if j == 0:
+                        box_class += " token-span-first"
+                        box_content = escaped_display
+                    elif j == len(token_ids) - 1:
+                        box_class += " token-span-last"
+                    else:
+                        box_class += " token-span-middle"
+
+                    # Only the first box carries the visible text
+                    html_parts.append(f"""<span class="{box_class}"
+                        id="{token_id}"
+                        data-token-id="{tid}">{box_content}</span>""")
+
+                html_parts.append("</span>")
+            else:
+                # Single token: original behavior
+                token_id = f"token_{model}_{i}"
+                html_parts.append(f"""<span class="{token_class}"
+                    id="{token_id}"
+                    data-text="{token_text.replace('"', "&quot;").replace("'", "&#39;")}"
+                    data-id="{token_ids[0]}"
+                    data-position="{i}"
+                    data-model="{model}"
+                    title="Text: '{token_text}' | ID: {token_ids[0]} | Type: {token["type"]} | Subword: {token["is_subword"]}"
+                    onmouseover="highlightTokens('{escaped_text}')"
+                    onmouseout="clearHighlights()"
+                    onclick="alert('Token: \\'{escaped_text}\\'\\nID: {token_ids[0]}\\nModel: {model}')">{escaped_display}</span>""")
 
         html_parts.append(f"""
             </div>
             <div style="margin-top: 8px; font-size: 12px; color: #666;">
-                Subwords: {subword_count}/{len(result["tokens"])}
+                Subwords: {subword_count}/{sum(len(t["id"]) if isinstance(t["id"], list) else 1 for t in result["tokens"])}
                 ({subword_count / len(result["tokens"]) * 100:.1f}%)
             </div>
         </div>
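The renderer above consumes plain dicts from the tokenize_* helpers; "id" holds a single int for ordinary tokens and a list when several IDs were grouped into one visible span. A small sketch of both shapes (the field values are illustrative, not real vocabulary IDs):

    # Illustrative token entries; "id" is a list when several token IDs
    # map to one visible span (e.g. an emoji split across byte pieces).
    single = {"text": " fox", "id": 21831, "type": "word",
              "is_subword": False, "bytes": 4, "position": 3}
    multi = {"text": "🤖", "id": [9468, 97, 244], "type": "word",
             "is_subword": False, "bytes": 4, "position": 7}

    for token in (single, multi):
        ids = token["id"] if isinstance(token["id"], list) else [token["id"]]
        print(token["text"], ids, "multi" if len(ids) > 1 else "single")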
@@ -348,17 +585,9 @@ def compare_with_normalization(
     normalized_results = {}
 
     for model in selected_models:
-        if model in ["gpt-4", "gpt-2"]:
-            original_results[model] = tokenize_with_tiktoken(text, model)
-            if normalization_method != "none":
-                normalized_results[model] = tokenize_with_tiktoken(
-                    normalized_text, model
-                )
-        else:
-            original_results[model] = tokenize_with_hf(text, model)
-            if normalization_method != "none":
-                normalized_results[model] = tokenize_with_hf(normalized_text, model)
-
+        original_results[model] = tokenize(model, text)
+        if normalization_method != "none":
+            normalized_results[model] = tokenize(model, normalized_text)
+
     return original_results, normalized_results, normalized_text
 
 
@@ -523,29 +752,52 @@ with gr.Blocks(
     with gr.Row():
         with gr.Column(scale=2):
             # Sample texts dropdown
+            pre_choices = [
+                "Custom text (enter below)",
+                """
+                ᴾʸᵗʰᵒⁿ
+                ₚᵧₜₕₒₙ
+                P̲y̲t̲h̲o̲n̲
+                P̄ȳt̄h̄ōn̄
+                P̅y̅t̅h̅o̅n̅
+                ⓅⓎⓉⒽⓄⓃ
+                ⒫⒴⒯⒣⒪⒩
+                🄿🅈🅃🄷🄾🄽
+                ⓅⓎⓉⒽⓄⓃ
+                Python
+                Pʎʇɥou
+                Pyʇɥou
+                P̊ẙt̊h̊o̊n̊
+                Pëthøñ
+                P̶y̶t̶h̶o̶n̶
+                P̸y̸t̸h̸o̸n̸
+                P̷y̷t̷h̷o̷n̷
+                P̴y̴t̴h̴o̴n̴
+                𝒫𝓎𝓉𝒽𝑜𝓃
+                ℙ𝕪𝕥𝕙𝕠𝕟
+                """,
+                "english: The quick brown fox jumps over the lazy dog. It's 1234.56 and costs $789.",
+                "french: Le renard brun rapide saute par-dessus le chien paresseux. C'est 1234,56 et coûte 789€.",
+                "german: Der schnelle braune Fuchs springt über den faulen Hund. Es ist 1234,56 und kostet 789€.",
+                "turkish: Hızlı kahverengi tilki tembel köpeğin üstünden atlar. 1234.56'dır ve 789$ tutar.",
+                "chinese: 快速的棕色狐狸跳过懒狗。它是1234.56,价格为789美元。",
+                "arabic: الثعلب البني السريع يقفز فوق الكلب الكسول. إنه 1234.56 ويكلف 789 دولارًا.",
+                "hindi: तेज भूरी लोमड़ी आलसी कुत्ते पर कूदती है। यह 1234.56 है और 789 डॉलर की कीमत है।",
+                "code: def calculate_sum(a, b):\n    return a + b\n\nresult = calculate_sum(123, 456)",
+                "mixed: English text with numbers 12345 and special chars !@#$%, plus some code: x = f(y)",
+                "numbers: The price is $123.45 (20% off) = $98.76 savings 1 12 123 1234 12345 123456 1234567 12345678 123456789",
+                "Mixed languages: Hello! 你好! こんにちは! Bonjour! Hola! مرحبا!",
+                "Subword challenge: antidisestablishmentarianism pseudopseudohypoparathyroidism",
+                "Special characters: @user123 #AI #NLP https://example.com/api?q=tokenization&limit=100",
+                "Scientific text: The mitochondria (powerhouse of the cell) produces ATP through oxidative phosphorylation.",
+                "Technical jargon: The RESTful API endpoint /users/{id}/preferences supports GET/POST/PUT/DELETE operations.",
+                "Emoji & Unicode: I love AI! 🤖✨ The café naïve résumé 北京大学 العربية😀 👍 🚀 🌍 🎉 💡 🔥 🎵 🏆 🌈",
+                "Long compound words (German): Donaudampfschifffahrtselektrizitätenhauptbetriebswerkbauunterbeamtengesellschaft",
+                'JSON data: {"name": "John Doe", "age": 30, "skills": ["Python", "JavaScript", "AI/ML"]}',
+                "Medical terminology: Pneumonoultramicroscopicsilicovolcanoconiosisdiagnosis requires thorough radiological examination.",
+            ]
             sample_texts = gr.Dropdown(
-                choices=[
-                    "Custom text (enter below)",
-                    "english: The quick brown fox jumps over the lazy dog. It's 1234.56 and costs $789.",
-                    "french: Le renard brun rapide saute par-dessus le chien paresseux. C'est 1234,56 et coûte 789€.",
-                    "german: Der schnelle braune Fuchs springt über den faulen Hund. Es ist 1234,56 und kostet 789€.",
-                    "turkish: Hızlı kahverengi tilki tembel köpeğin üstunden atlar. 1234.56'dır ve 789$ tutar.",
-                    "chinese: 快速的棕色狐狸跳过懒狗。它是1234.56,价格为789美元。",
-                    "arabic: الثعلب البني السريع يقفز فوق الكلب الكسول. إنه 1234.56 ويكلف 789 دولارًا.",
-                    "hindi: तेज भूरी लोमड़ी आलसी कुत्ते पर कूदती है। यह 1234.56 है और 789 डॉलर की कीमत है।",
-                    "code: def calculate_sum(a, b):\n    return a + b\n\nresult = calculate_sum(123, 456)",
-                    "mixed: English text with numbers 12345 and special chars !@#$%, plus some code: x = f(y)",
-                    "numbers: The price is $123.45 (20% off) = $98.76 savings 1 12 123 1234 12345 123456 1234567 12345678 123456789",
-                    "Mixed languages: Hello! 你好! こんにちは! Bonjour! Hola! مرحبا!",
-                    "Subword challenge: antidisestablishmentarianism pseudopseudohypoparathyroidism",
-                    "Special characters: @user123 #AI #NLP https://example.com/api?q=tokenization&limit=100",
-                    "Scientific text: The mitochondria (powerhouse of the cell) produces ATP through oxidative phosphorylation.",
-                    "Technical jargon: The RESTful API endpoint /users/{id}/preferences supports GET/POST/PUT/DELETE operations.",
-                    "Emoji & Unicode: I love AI! 🤖✨ The café naïve résumé 北京大学 العربية😀 👍 🚀 🌍 🎉 💡 🔥 🎵 🏆 🌈",
-                    "Long compound words (German): Donaudampfschifffahrtselektrizitätenhauptbetriebswerkbauunterbeamtengesellschaft",
-                    'JSON data: {"name": "John Doe", "age": 30, "skills": ["Python", "JavaScript", "AI/ML"]}',
-                    "Medical terminology: Pneumonoultramicroscopicsilicovolcanoconiosisdiagnosis requires thorough radiological examination.",
-                ],
+                choices=pre_choices,
                 value="Custom text (enter below)",
                 label="Choose a sample text or enter your own",
                 interactive=True,
@@ -555,35 +807,16 @@ with gr.Blocks(
                 label="Text to tokenize",
                 placeholder="Enter your text here or select a sample above...",
                 lines=4,
-                value="Hello world! This is a test with some subwords and punctuation.",
+                value=pre_choices[1],
             )
         with gr.Column(scale=1):
             with gr.Tabs():
                 with gr.TabItem("Models"):
                     model_selector = gr.CheckboxGroup(
-                        choices=[
-                            "gpt-4",
-                            "gpt-2",
-                            "llama-2",
-                            "llama-3",
-                            "gemma-2",
-                            "qwen3",
-                            "qwen2.5",
-                            "bert",
-                            "bloom",
-                            "aya-expanse",
-                            "comma",
-                            "tokenmonster",
-                            "byt5",
-                        ],
-                        value=[
-                            "gpt-4",
-                            "llama-3",
-                            "gemma-2",
-                            "qwen2.5",
-                            "tokenmonster",
-                        ],
-                        label="Select tokenizers to compare",
+                        choices=available_tokenizers,
+                        value=pre_selected_tokenizers,
+                        label="Select tokenizers to compare...",
                     )
                     show_details = gr.Checkbox(
                         label="Show detailed analysis", value=False
mappings.py CHANGED
@@ -9,14 +9,20 @@ MODEL_MAP = {
     "bloom": "bigscience/bloom-560m",
     "aya-expanse": "CohereForAI/aya-expanse-8b",
     "comma": "common-pile/comma-v0.1-2t",
-    "byte-level": "google/byt5-small",
     "tokenmonster": "alasdairforsythe/tokenmonster",
     "byt5": "google/byt5-small",
+    "phi-3": "microsoft/Phi-3-mini-4k-instruct",
+    "xglm": "facebook/xglm-564M",
+    "tekken": "mistralai/tekken",
+    "mbert": "google-bert/bert-base-multilingual-cased",
 }
 
 
 TOKENIZER_INFO = {
     "gpt-4": {"name": "GPT-4", "vocab_size": 100277, "encoding": "BPE"},
+    "gpt-4o": {"name": "GPT-4o", "vocab_size": 199997, "encoding": "BPE"},
     "gpt-2": {"name": "GPT-2", "vocab_size": 50257, "encoding": "BPE"},
     "llama-2": {"name": "LLaMA-2", "vocab_size": 32000, "encoding": "SentencePiece"},
     "llama-3": {"name": "LLaMA-3", "vocab_size": 128000, "encoding": "SentencePiece"},
@@ -34,4 +40,8 @@ TOKENIZER_INFO = {
     "byte-level": {"name": "Byte-Level BPE", "vocab_size": 50000, "encoding": "BPE"},
     "tokenmonster": {"name": "TokenMonster", "vocab_size": 32000, "encoding": ""},
     "byt5": {"name": "ByT5", "vocab_size": 50000, "encoding": "BPE"},
+    "phi-3": {"name": "Phi-3", "vocab_size": 32064, "encoding": "BPE"},
+    "xglm": {"name": "XGLM", "vocab_size": 256008, "encoding": "BPE"},
+    "tekken": {"name": "Tekken", "vocab_size": 32768, "encoding": "BPE"},
+    "mbert": {"name": "mBERT", "vocab_size": 119547, "encoding": "WordPiece"},
 }
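get_tokenizer in utils.py resolves models through MODEL_MAP, while the tokenize_* helpers read display metadata from TOKENIZER_INFO, so every MODEL_MAP key needs a matching TOKENIZER_INFO entry. A quick consistency check along these lines, assuming both dicts import as defined above:

    from mappings import MODEL_MAP, TOKENIZER_INFO

    # Every selectable model must have display metadata, otherwise
    # TOKENIZER_INFO[model] raises KeyError mid-render.
    missing = sorted(set(MODEL_MAP) - set(TOKENIZER_INFO))
    assert not missing, f"TOKENIZER_INFO is missing entries for: {missing}"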
requirements.txt CHANGED
@@ -4,4 +4,7 @@ transformers
 torch
 pandas
 plotly
-tokenmonster
+tokenmonster
+mistral_common
+protobuf
+sentencepiece
utils.py CHANGED
@@ -4,7 +4,7 @@ import traceback
 import unicodedata
 
 import tiktoken
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, XGLMTokenizerFast
 
 from mappings import MODEL_MAP, TOKENIZER_INFO
 
@@ -74,40 +74,155 @@ def is_subword(token_text, model, is_first):
 
 
 def tokenize_with_tiktoken(text, model):
-    encoding = "cl100k_base" if model == "gpt-4" else "gpt2"
-    enc = tiktoken.get_encoding(encoding)
-
-    token_data = []
-    current_pos = 0
-    for text_ in text.split("\n"):
-        tokens = enc.encode(text_ + "\n")
-
-        for i, token_id in enumerate(tokens):
-            token_text = enc.decode([token_id])
-            token_type = get_token_type(token_text)
-            subword = is_subword(token_text, model, i == 0)
-            token_data.append(
-                {
-                    "text": token_text,
-                    "id": int(token_id),
-                    "type": token_type,
-                    "is_subword": subword,
-                    "bytes": len(token_text.encode("utf-8")),
-                    "position": i,
-                }
-            )
-            current_pos += len(token_text)
-        token_data.append(
-            {
-                "text": "<newline>",
-                "id": 0,
-                "type": "special",
-                "is_subword": False,
-                "position": len(token_data),
-            }
-        )
+    enc = tiktoken.encoding_for_model(model)
+
+    # Process the entire text at once, not line by line
+    token_ids = enc.encode(text)
+
+    token_data = []
+    current_text_pos = 0
+
+    # Build character-to-token mapping
+    char_to_tokens = {}
+
+    # Decode each token and find its position in the original text
+    for i, token_id in enumerate(token_ids):
+        token_text = enc.decode([token_id])
+
+        # Find where this token appears in the remaining text
+        remaining_text = text[current_text_pos:]
+
+        if token_text in remaining_text:
+            # Find the position of this token in the original text
+            local_pos = remaining_text.find(token_text)
+            actual_start = current_text_pos + local_pos
+            actual_end = actual_start + len(token_text)
+
+            # Map each character position to this token
+            for char_pos in range(actual_start, actual_end):
+                if char_pos not in char_to_tokens:
+                    char_to_tokens[char_pos] = []
+                char_to_tokens[char_pos].append(token_id)
+
+            current_text_pos = actual_end
+
+    # Group consecutive characters that have the same token ID sets
+    processed_chars = set()
+    text_pos = 0
+
+    while text_pos < len(text):
+        if text_pos in processed_chars:
+            text_pos += 1
+            continue
+
+        # Get tokens for current character
+        current_tokens = char_to_tokens.get(text_pos, [])
+
+        if not current_tokens:
+            # Handle characters not covered by any token
+            token_data.append(
+                {
+                    "text": text[text_pos],
+                    "id": None,
+                    "type": get_token_type(text[text_pos]),
+                    "is_subword": False,
+                    "bytes": len(text[text_pos].encode("utf-8")),
+                    "position": len(token_data),
+                }
+            )
+            processed_chars.add(text_pos)
+            text_pos += 1
+            continue
+
+        # Find the span of characters that share the same token ID set
+        span_start = text_pos
+        span_end = text_pos + 1
+
+        # Extend span while characters have the same token set
+        while (
+            span_end < len(text)
+            and span_end in char_to_tokens
+            and char_to_tokens[span_end] == current_tokens
+        ):
+            span_end += 1
+
+        # Get the text for this span
+        span_text = text[span_start:span_end]
+
+        # Create token data entry
+        token_data.append(
+            {
+                "text": span_text,
+                "id": current_tokens if len(current_tokens) > 1 else current_tokens[0],
+                "type": get_token_type(span_text),
+                "is_subword": is_subword(span_text, model, len(token_data) == 0),
+                "bytes": len(span_text.encode("utf-8")),
+                "position": len(token_data),
+            }
+        )
+
+        # Mark all characters in this span as processed
+        for pos in range(span_start, span_end):
+            processed_chars.add(pos)
+
+        text_pos = span_end
+
+    return {
+        "model": TOKENIZER_INFO[model]["name"],
+        "token_count": len(token_ids),
+        "tokens": token_data,
+        "compression_ratio": len(text) / len(token_data) if token_data else 0,
+        "encoding": TOKENIZER_INFO[model]["encoding"],
+        "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
+    }
+
+
+# Legacy line-by-line implementation, superseded by the span-aware version
+# above. Kept for reference only: it is unused and not runnable as-is,
+# since it mixes tiktoken with HF-style offset mappings.
+def tokenize_with_tiktoken_legacy(text, model):
+    encoding = "cl100k_base" if model == "gpt-4" else "gpt2"
+    enc = tiktoken.get_encoding(encoding)
+
+    token_data = []
+    text_ = text
+    for text in text_.split("\n"):
+        tokens = enc.encode(text + "\n")
+
+        token_ids = encoding["input_ids"]  # BUG: `encoding` is a string here
+        # offset in the text for each token, i.e. token i covers
+        # text[offsets[i][0]:offsets[i][1]]
+        offsets = encoding.get("offset_mapping", [])
+
+        token_data = []
+        curr_tok_id = 0
+        current_text_pos = 0
+        token_id = []
+        while curr_tok_id < len(token_ids) and curr_tok_id < len(tokens):
+            if offsets and curr_tok_id < len(offsets):
+                start, end = offsets[curr_tok_id]
+                actual_text = text[start:end]
+                if current_text_pos == end:
+                    token_id.append(token_ids[curr_tok_id])
+                else:
+                    token_id = [token_ids[curr_tok_id]]
+                token_type = get_token_type(actual_text)
+                subword = is_subword(actual_text, model, curr_tok_id == 0)
+                if current_text_pos != end:
+                    token_data.append(
+                        {
+                            "text": actual_text,
+                            "id": token_id,
+                            "type": token_type,
+                            "is_subword": subword,
+                            "bytes": len(actual_text.encode("utf-8")),
+                            "position": curr_tok_id,
+                        }
+                    )
+                current_text_pos = end
+            curr_tok_id += 1
 
     return {
         "model": TOKENIZER_INFO[model]["name"],
         "token_count": len(token_data),
@@ -142,81 +257,402 @@ def get_hf_tokenizer(model):
     return tokenizer
 
 
-def tokenize_with_hf(text, model):
-    try:
-        tokenizer = get_hf_tokenizer(model)
-        token_data = []
-        for text_ in text.split("\n"):
-            text_ = text_ + "\n"
-            encoding = tokenizer(
-                text_,
-                return_offsets_mapping=False,
-                return_tensors=None,
-                add_special_tokens=False,
-            )
-
-            token_ids = encoding["input_ids"]
-            tokens = tokenizer.convert_ids_to_tokens(token_ids)
-
-            for i, (token_id, token_text) in enumerate(zip(token_ids, tokens)):
-                token_type = get_token_type(token_text)
-                subword = is_subword(token_text, model, i == 0)
-
-                token_data.append(
-                    {
-                        "text": token_text,
-                        "id": token_id,
-                        "type": token_type,
-                        "is_subword": subword,
-                        "bytes": len(token_text.encode("utf-8")),
-                        "position": i,
-                    }
-                )
-            token_data.append(
-                {
-                    "text": "<newline>",
-                    "id": 0,
-                    "type": "special",
-                    "is_subword": False,
-                    "position": len(token_data),
-                }
-            )
-
-        return {
-            "model": TOKENIZER_INFO[model]["name"],
-            "token_count": len(token_data),
-            "tokens": token_data,
-            "compression_ratio": len(text) / len(token_data) if token_data else 0,
-            "encoding": TOKENIZER_INFO[model]["encoding"],
-            "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
-        }
-    except Exception as e:
-        error_msg = str(e)
-        print(f"DEBUG: Error: {error_msg}")
-        print(traceback.format_exc())
-
-        # Provide helpful error messages
-        if "gated repo" in error_msg.lower():
-            error_msg = f"Model is gated. Request access at https://huggingface.co/{model_name} and ensure HF_TOKEN is set."
-        elif "401" in error_msg:
-            error_msg = "Authentication failed. Check your HF_TOKEN in Space secrets."
-        elif "not found" in error_msg.lower():
-            error_msg = (
-                f"Model {model_name} not found. It may have been moved or renamed."
-            )
-
-        return {
-            "model": TOKENIZER_INFO[model]["name"],
-            "token_count": 0,
-            "tokens": [],
-            "compression_ratio": 0,
-            "encoding": "Error",
-            "vocab_size": 0,
-            "error": error_msg,
-        }
+def get_tokenizer(model):
+    model_name = MODEL_MAP.get(model, None)
+    if model_name is None:
+        raise ValueError(f"Unknown tokenizer code {model}")
+    if model_name in TOKENIZER_CACHE:
+        return TOKENIZER_CACHE[model_name]
+
+    # Get token from environment
+    hf_token = os.getenv("HF_TOKEN")
+    if not hf_token:
+        # Callers expect a tokenizer object, so fail loudly instead of
+        # returning an error dict they cannot use.
+        raise RuntimeError(
+            "HF_TOKEN not found in environment. Please add your HuggingFace "
+            "token to Space secrets."
+        )
+    if "tekken" in model_name:
+        from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
+
+        tok = MistralTokenizer.v3(is_tekken=True)
+        tokenizer = tok.instruct_tokenizer.tokenizer
+    elif "tokenmonster" in model_name:
+        tokenizer = TokenMonsterTokenizer("englishcode-32000-consistent-v1")
+    elif "xglm" in model_name.lower():
+        tokenizer = XGLMTokenizerFast.from_pretrained(
+            model_name, token=hf_token, trust_remote_code=True
+        )
+    else:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_name, token=hf_token, trust_remote_code=True
+        )
+    TOKENIZER_CACHE[model_name] = tokenizer
+    return tokenizer
+
+
+def tokenize_w_tekken(text, model):
+    tokenizer = get_tokenizer(model)
+
+    # Process the entire text at once, not line by line
+    token_ids = tokenizer.encode(text, bos=False, eos=False)
+
+    token_data = []
+    current_text_pos = 0
+
+    # Build character-to-token mapping (same span-grouping approach as
+    # tokenize_with_tiktoken above)
+    char_to_tokens = {}
+
+    # Decode each token and find its position in the original text
+    for i, token_id in enumerate(token_ids):
+        token_text = tokenizer.decode([token_id])
+
+        # Find where this token appears in the remaining text
+        remaining_text = text[current_text_pos:]
+
+        if token_text in remaining_text:
+            local_pos = remaining_text.find(token_text)
+            actual_start = current_text_pos + local_pos
+            actual_end = actual_start + len(token_text)
+
+            # Map each character position to this token
+            for char_pos in range(actual_start, actual_end):
+                if char_pos not in char_to_tokens:
+                    char_to_tokens[char_pos] = []
+                char_to_tokens[char_pos].append(token_id)
+
+            current_text_pos = actual_end
+
+    # Group consecutive characters that have the same token ID sets
+    processed_chars = set()
+    text_pos = 0
+
+    while text_pos < len(text):
+        if text_pos in processed_chars:
+            text_pos += 1
+            continue
+
+        current_tokens = char_to_tokens.get(text_pos, [])
+
+        if not current_tokens:
+            # Handle characters not covered by any token
+            token_data.append(
+                {
+                    "text": text[text_pos],
+                    "id": None,
+                    "type": get_token_type(text[text_pos]),
+                    "is_subword": False,
+                    "bytes": len(text[text_pos].encode("utf-8")),
+                    "position": len(token_data),
+                }
+            )
+            processed_chars.add(text_pos)
+            text_pos += 1
+            continue
+
+        # Find the span of characters that share the same token ID set
+        span_start = text_pos
+        span_end = text_pos + 1
+        while (
+            span_end < len(text)
+            and span_end in char_to_tokens
+            and char_to_tokens[span_end] == current_tokens
+        ):
+            span_end += 1
+
+        span_text = text[span_start:span_end]
+        token_data.append(
+            {
+                "text": span_text,
+                "id": current_tokens if len(current_tokens) > 1 else current_tokens[0],
+                "type": get_token_type(span_text),
+                "is_subword": is_subword(span_text, model, len(token_data) == 0),
+                "bytes": len(span_text.encode("utf-8")),
+                "position": len(token_data),
+            }
+        )
+
+        for pos in range(span_start, span_end):
+            processed_chars.add(pos)
+
+        text_pos = span_end
+
+    return {
+        "model": TOKENIZER_INFO[model]["name"],
+        "token_count": len(token_ids),
+        "tokens": token_data,
+        "compression_ratio": len(text) / len(token_data) if token_data else 0,
+        "encoding": TOKENIZER_INFO[model]["encoding"],
+        "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
+    }
+
+
+# Legacy line-by-line tekken implementation; unused, kept for reference.
+def tokenize_w_tekken_legacy(text, model):
+    try:
+        tokenizer = get_tokenizer(model)
+
+        index = 0
+        token_data = []
+        for text_ in text.split("\n"):
+            text_ += "\n"
+            token_ids = tokenizer.encode(text_, bos=False, eos=False)
+            tokens = [tokenizer.decode([tok]) for tok in token_ids]
+            for i, tok in enumerate(tokens):
+                tok = tok[0].encode("utf-8")
+                token_type = None
+                subword = False
+                token_data.append(
+                    {
+                        "text": tok,
+                        "id": token_ids[i],
+                        "type": token_type,
+                        "is_subword": subword,
+                        "bytes": len(tok),
+                        "position": index,
+                    }
+                )
+                index += 1
+
+        return {
+            "model": TOKENIZER_INFO[model]["name"],
+            "token_count": index,
+            "tokens": token_data,
+            "compression_ratio": len(text) / len(token_data) if token_data else 0,
+            "encoding": TOKENIZER_INFO[model]["encoding"],
+            "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
+        }
+
+    except Exception as e:
+        print(f"Error: {e}")
+
+
+def tokenize_with_hf(text, model):
+    # Processes line by line and accumulates results, so the explicit
+    # newline handling in the UI keeps working.
+    try:
+        tokenizer = get_tokenizer(model)
+
+        all_token_data = []
+        global_position = 0
+        text_offset = 0
+
+        # Process line by line but accumulate results
+        for line in text.split("\n"):
+            line_with_newline = line + "\n"
+
+            encoding = tokenizer(
+                line_with_newline,
+                return_offsets_mapping=True,
+                return_tensors=None,
+                add_special_tokens=False,
+            )
+            token_ids = encoding["input_ids"]
+            tokens = tokenizer.convert_ids_to_tokens(token_ids)
+            offsets = encoding.get("offset_mapping", [])
+
+            # Process tokens for this line
+            for i in range(len(token_ids)):
+                if i < len(offsets) and offsets[i] is not None:
+                    start, end = offsets[i]
+                    actual_text = line_with_newline[start:end]
+                else:
+                    actual_text = tokens[i] if i < len(tokens) else ""
+
+                if not actual_text:
+                    continue
+
+                token_type = get_token_type(actual_text)
+                subword = is_subword(actual_text, model, global_position == 0)
+
+                all_token_data.append({
+                    # Raw token string (Ġ/▁ markers intact); the UI strips
+                    # them via clean_token_display
+                    "text": tokens[i],
+                    "id": [token_ids[i]],
+                    "type": token_type,
+                    "is_subword": subword,
+                    "bytes": len(actual_text.encode("utf-8")),
+                    "position": global_position,
+                })
+                global_position += 1
+
+            text_offset += len(line_with_newline)
+
+        # Total token count from tokenizing the full text in one pass
+        total_tokens = len(
+            tokenizer(text, return_tensors=None, add_special_tokens=False)["input_ids"]
+        )
+
+        return {
+            "model": TOKENIZER_INFO[model]["name"],
+            "token_count": total_tokens,
+            "tokens": all_token_data,
+            "compression_ratio": len(text) / len(all_token_data) if all_token_data else 0,
+            "encoding": TOKENIZER_INFO[model]["encoding"],
+            "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
+        }
+
+    except Exception as e:
+        print(f"Error: {e}")
+        traceback.print_exc()
+        return None
+
+
+# Legacy whole-text HF implementation; unused, kept for reference.
+def tokenize_with_hf_legacy(text, model):
+    try:
+        tokenizer = get_hf_tokenizer(model)
+
+        token_data = []
+        for text_ in text.split("\n"):
+            text_ += "\n"
+            encoding = tokenizer(
+                text,  # NOTE: tokenizes the full text on every loop iteration
+                return_offsets_mapping=True,
+                return_tensors=None,
+                add_special_tokens=False,
+            )
+            token_ids = encoding["input_ids"]
+            tokens = tokenizer.convert_ids_to_tokens(token_ids)
+            # offset in the text for each token, i.e. token i covers
+            # text[offsets[i][0]:offsets[i][1]]
+            offsets = encoding.get("offset_mapping", [])
+
+            curr_tok_id = 0
+            current_text_pos = 0
+            token_id = []
+            while curr_tok_id < len(token_ids) and curr_tok_id < len(tokens):
+                if offsets and curr_tok_id < len(offsets):
+                    start, end = offsets[curr_tok_id]
+                    actual_text = text[start:end]
+                    if current_text_pos == end:
+                        token_id.append(token_ids[curr_tok_id])
+                    else:
+                        token_id = [token_ids[curr_tok_id]]
+                    token_type = get_token_type(actual_text)
+                    subword = is_subword(actual_text, model, curr_tok_id == 0)
+                    if current_text_pos != end:
+                        token_data.append(
+                            {
+                                "text": actual_text,
+                                "id": token_id,
+                                "type": token_type,
+                                "is_subword": subword,
+                                "bytes": len(actual_text.encode("utf-8")),
+                                "position": curr_tok_id,
+                            }
+                        )
+                    current_text_pos = end
+                else:
+                    token_data.append(
+                        {
+                            "text": tokens[curr_tok_id],
+                            "id": [token_ids[curr_tok_id]],
+                            "type": get_token_type(tokens[curr_tok_id]),
+                            "is_subword": is_subword(
+                                tokens[curr_tok_id], model, curr_tok_id == 0
+                            ),
+                            "bytes": len(tokens[curr_tok_id].encode("utf-8")),
+                            "position": curr_tok_id,
+                        }
+                    )
+                curr_tok_id += 1
+
+        return {
+            "model": TOKENIZER_INFO[model]["name"],
+            "token_count": len(token_ids),
+            "tokens": token_data,
+            "compression_ratio": len(text) / len(token_data) if token_data else 0,
+            "encoding": TOKENIZER_INFO[model]["encoding"],
+            "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
+        }
+
+    except Exception as e:
+        print(f"Error: {e}")
+
+
+def tokenize_with_byt5(text, model):
+    """Special handling for ByT5's byte-level tokenizer"""
+    try:
+        tokenizer = get_hf_tokenizer(model)
+        # ByT5 doesn't support offset_mapping, so we handle it differently
+        encoding = tokenizer(
+            text,
+            return_tensors=None,
+            add_special_tokens=False,
+        )
+        token_ids = encoding["input_ids"]
+
+        # For ByT5, each token represents a byte
+        text_bytes = text.encode('utf-8')
+        token_data = []
+
+        for i, token_id in enumerate(token_ids):
+            # Decode individual token
+            try:
+                token_text = tokenizer.decode([token_id])
+
+                # For ByT5, tokens correspond to individual bytes in order
+                if i < len(text_bytes):
+                    byte_val = text_bytes[i]
+                    actual_char = chr(byte_val) if byte_val < 128 else text_bytes[i:i+1].decode('utf-8', errors='replace')
+                else:
+                    actual_char = token_text
+
+                token_type = get_token_type(actual_char)
+                subword = is_subword(actual_char, model, i == 0)
+
+                token_data.append({
+                    "text": actual_char,
+                    "id": [token_id],
+                    "type": token_type,
+                    "is_subword": subword,
+                    "bytes": len(actual_char.encode("utf-8")),
+                    "position": i,
+                })
+
+            except Exception:
+                # Handle special tokens or decoding issues
+                token_data.append({
+                    "text": f"<special_token_{token_id}>",
+                    "id": [token_id],
+                    "type": "special",
+                    "is_subword": False,
+                    "bytes": 0,
+                    "position": i,
+                })
+
+        return {
+            "model": TOKENIZER_INFO[model]["name"],
+            "token_count": len(token_ids),
+            "tokens": token_data,
+            "compression_ratio": len(text) / len(token_data) if token_data else 0,
+            "encoding": TOKENIZER_INFO[model]["encoding"],
+            "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
+        }
+
+    except Exception as e:
+        print(f"Error in ByT5 tokenization: {e}")
+        return None
 
 
 def normalize_text(text, method):
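The byte-for-byte alignment in tokenize_with_byt5 works because ByT5 IDs are UTF-8 byte values shifted by three reserved specials (0 = pad, 1 = eos, 2 = unk). A quick check, assuming google/byt5-small can be downloaded:

    from transformers import AutoTokenizer

    # "A" is byte 65, so its id is 68; "é" is two UTF-8 bytes (195, 169),
    # so it becomes two ids (198, 172).
    tok = AutoTokenizer.from_pretrained("google/byt5-small")
    ids = tok("Aé", add_special_tokens=False)["input_ids"]
    print(ids)                   # [68, 198, 172]
    print([i - 3 for i in ids])  # the raw UTF-8 bytes: [65, 195, 169]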
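Among the normalization options below, NFKC is the one that folds compatibility characters, which is why several of the stylized "Python" samples in app.py collapse to plain ASCII:

    import unicodedata

    # Superscript, circled, and mathematical-alphabet letters all carry
    # compatibility decompositions; plain diacritics do not change.
    for s in ["ᴾʸᵗʰᵒⁿ", "ⓅⓎⓉⒽⓄⓃ", "𝒫𝓎𝓉𝒽𝑜𝓃", "Pëthøñ"]:
        print(s, "->", unicodedata.normalize("NFKC", s))
    # ᴾʸᵗʰᵒⁿ -> Python, ⓅⓎⓉⒽⓄⓃ -> PYTHON, 𝒫𝓎𝓉𝒽𝑜𝓃 -> Python, Pëthøñ unchanged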
@@ -253,9 +691,37 @@ def get_normalization_methods():
         ("lowercase", "Lowercase"),
         ("nfc", "Unicode NFC (Canonical)"),
         ("nfd", "Unicode NFD (Decomposed)"),
         ("nfkc", "Unicode NFKC (Compatible)"),
         ("nfkd", "Unicode NFKD (Compatible Decomposed)"),
         ("strip_accents", "Remove Accents"),
         ("strip_punctuation", "Remove Punctuation"),
         ("whitespace_normalize", "Normalize Whitespace"),
     ]
+
+
+def clean_token_display(token_text, tokenizer=None):
+    """Clean up token display to avoid ? characters"""
+    if token_text == "\n" or token_text == "<newline> ":
+        return "<newline>"
+    # Handle common prefixes
+    if token_text.startswith("Ġ"):  # GPT-2 style
+        return " " + token_text[1:]
+    elif token_text.startswith("▁"):  # SentencePiece style
+        return " " + token_text[1:]
+
+    # Handle byte-level representations such as <0x0A>
+    if token_text.startswith("<0x") and token_text.endswith(">"):
+        try:
+            # Convert hex byte to character
+            hex_val = token_text[3:-1]
+            byte_val = int(hex_val, 16)
+            return chr(byte_val) if 32 <= byte_val <= 126 else f"[{hex_val}]"
+        except ValueError:
+            return token_text
+
+    # Handle other special cases
+    if "�" in token_text:  # Unicode replacement character
+        return token_text.replace("�", "?")
+
+    return token_text
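A few input/output pairs showing what clean_token_display does with the common marker and byte-fallback forms:

    # Marker prefixes become leading spaces; printable byte tokens become
    # their character; non-printable bytes keep a bracketed hex form.
    print(repr(clean_token_display("Ġworld")))  # ' world'
    print(repr(clean_token_display("▁dünya")))  # ' dünya'
    print(repr(clean_token_display("<0x41>")))  # 'A'
    print(repr(clean_token_display("<0x0A>")))  # '[0A]'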