Spaces:

gsaltintas
/

tokenizer-comparison

Running

App Files Files Community

Gül Sena Altıntaş committed on Sep 20

Commit

ce07484

1 Parent(s): 44cdae3

Further improvements

Browse files

Files changed (4) hide show

app.py +307 -74
mappings.py +11 -1
requirements.txt +4 -1
utils.py +536 -70

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from collections import Counter
 import gradio as gr
 import pandas as pd
@@ -6,12 +7,44 @@ import plotly.express as px
 import plotly.graph_objects as go
 from utils import (
     get_normalization_methods,
     normalize_text,
     tokenize_with_hf,
     tokenize_with_tiktoken,
 )
 def compare_tokenizers(text, selected_models, show_details=False):
     if not text.strip():
@@ -20,11 +53,7 @@ def compare_tokenizers(text, selected_models, show_details=False):
     results = {}
     for model in selected_models:
-        if model in ["gpt-4", "gpt-2"]:
-            results[model] = tokenize_with_tiktoken(text, model)
-        else:
-            results[model] = tokenize_with_hf(text, model)
     # Generate outputs
     efficiency_output, tokenization_html, token_ids_output = generate_basic_comparison(
         results
@@ -73,6 +102,7 @@ def generate_basic_comparison(results):
 def generate_interactive_tokenization(results):
     """Generate HTML with working hover highlighting across tokenizers"""
     if not results:
         return "<p>No tokenization results to display.</p>"
@@ -170,6 +200,125 @@ def generate_interactive_tokenization(results):
         display: inline-block;
         justify-content: space-between;
     }
     </style>
     <div class="highlight-info" id="highlight-info"></div>
@@ -208,6 +357,40 @@ def generate_interactive_tokenization(results):
             info.style.display = 'none';
         }
     }
     </script>
     """)
@@ -239,10 +422,14 @@ def generate_interactive_tokenization(results):
         subword_count = 0
         for i, token in enumerate(result["tokens"]):
             token_text = token["text"]
             display_text = token_text if token_text.strip() else "·"
             if token_text == "<newline>":
                 html_parts.append("<br>")
                 continue
             # Determine token class
             token_class = f"token token-{token['type']}"
@@ -268,22 +455,72 @@ def generate_interactive_tokenization(results):
                 .replace("\r", "\n")
             )
-            # Use inline event handlers that work in Gradio
-            html_parts.append(f"""<span class="{token_class}"
-                      id="{token_id}"
-                      data-text="{token_text.replace('"', "&quot;").replace("'", "&#39;")}"
-                      data-id="{token["id"]}"
-                      data-position="{i}"
-                      data-model="{model}"
-                      title="Text: '{token_text}' | ID: {token["id"]} | Type: {token["type"]} | Subword: {token["is_subword"]}"
-                      onmouseover="highlightTokens('{escaped_text}')"
-                      onmouseout="clearHighlights()"
-                      onclick="alert('Token: \\'{escaped_text}\\'\\nID: {token["id"]}\\nModel: {model}')">{escaped_display}</span>""")
         html_parts.append(f"""
             </div>
             <div style="margin-top: 8px; font-size: 12px; color: #666;">
-                Subwords: {subword_count}/{len(result["tokens"])}
                 ({subword_count / len(result["tokens"]) * 100:.1f}%)
             </div>
         </div>
@@ -348,17 +585,9 @@ def compare_with_normalization(
     normalized_results = {}
     for model in selected_models:
-        if model in ["gpt-4", "gpt-2"]:
-            original_results[model] = tokenize_with_tiktoken(text, model)
-            if normalization_method != "none":
-                normalized_results[model] = tokenize_with_tiktoken(
-                    normalized_text, model
-                )
-        else:
-            original_results[model] = tokenize_with_hf(text, model)
-            if normalization_method != "none":
-                normalized_results[model] = tokenize_with_hf(normalized_text, model)
     return original_results, normalized_results, normalized_text
@@ -523,29 +752,52 @@ with gr.Blocks(
     with gr.Row():
         with gr.Column(scale=2):
             # Sample texts dropdown
             sample_texts = gr.Dropdown(
-                choices=[
-                    "Custom text (enter below)",
-                    "english: The quick brown fox jumps over the lazy dog. It's 1234.56 and costs $789.",
-                    "french: Le renard brun rapide saute par-dessus le chien paresseux. C'est 1234,56 et coûte 789€.",
-                    "german: Der schnelle braune Fuchs springt über den faulen Hund. Es ist 1234,56 und kostet 789€.",
-                    "turkish: Hızlı kahverengi tilki tembel köpeğin üstunden atlar. 1234.56'dır ve 789$ tutar.",
-                    "chinese: 快速的棕色狐狸跳过懒狗。它是1234.56，价格为789美元。",
-                    "arabic: الثعلب البني السريع يقفز فوق الكلب الكسول. إنه 1234.56 ويكلف 789 دولارًا.",
-                    "hindi: तेज भूरी लोमड़ी आलसी कुत्ते पर कूदती है। यह 1234.56 है और 789 डॉलर की कीमत है।",
-                    "code: def calculate_sum(a, b):\n    return a + b\n\nresult = calculate_sum(123, 456)",
-                    "mixed: English text with numbers 12345 and special chars !@#$%, plus some code: x = f(y)",
-                    "numbers: The price is $123.45 (20% off) = $98.76 savings 1 12 123 1234 12345 123456 1234567 12345678 123456789",
-                    "Mixed languages: Hello! 你好! こんにちは! Bonjour! Hola! مرحبا!",
-                    "Subword challenge: antidisestablishmentarianism pseudopseudohypoparathyroidism",
-                    "Special characters: @user123 #AI #NLP https://example.com/api?q=tokenization&limit=100",
-                    "Scientific text: The mitochondria (powerhouse of the cell) produces ATP through oxidative phosphorylation.",
-                    "Technical jargon: The RESTful API endpoint /users/{id}/preferences supports GET/POST/PUT/DELETE operations.",
-                    "Emoji & Unicode: I love AI! 🤖✨ The café naïve résumé 北京大学 العربية😀 👍 🚀 🌍 🎉 💡 🔥 🎵 🏆 🌈",
-                    "Long compound words (German): Donaudampfschifffahrtselektrizitätenhauptbetriebswerkbauunterbeamtengesellschaft",
-                    'JSON data: {"name": "John Doe", "age": 30, "skills": ["Python", "JavaScript", "AI/ML"]}',
-                    "Medical terminology: Pneumonoultramicroscopicsilicovolcanoconiosisdiagnosis requires thorough radiological examination.",
-                ],
                 value="Custom text (enter below)",
                 label="Choose a sample text or enter your own",
                 interactive=True,
@@ -555,35 +807,16 @@ with gr.Blocks(
                 label="Text to tokenize",
                 placeholder="Enter your text here or select a sample above...",
                 lines=4,
-                value="Hello world! This is a test with some subwords and punctuation.",
             )
         with gr.Column(scale=1):
             with gr.Tabs():
                 with gr.TabItem("Models"):
                     model_selector = gr.CheckboxGroup(
-                        choices=[
-                            "gpt-4",
-                            "gpt-2",
-                            "llama-2",
-                            "llama-3",
-                            "gemma-2",
-                            "qwen3",
-                            "qwen2.5",
-                            "bert",
-                            "bloom",
-                            "aya-expanse",
-                            "comma",
-                            "tokenmonster",
-                            "byt5",
-                        ],
-                        value=[
-                            "gpt-4",
-                            "llama-3",
-                            "gemma-2",
-                            "qwen2.5",
-                            "tokenmonster",
-                        ],
-                        label="Select tokenizers to compare",
                     )
                     show_details = gr.Checkbox(
                         label="Show detailed analysis", value=False

 from collections import Counter
+from pathlib import Path
 import gradio as gr
 import pandas as pd
 import plotly.graph_objects as go
 from utils import (
+    clean_token_display,
     get_normalization_methods,
     normalize_text,
+    tokenize_w_tekken,
+    tokenize_with_byt5,
     tokenize_with_hf,
     tokenize_with_tiktoken,
 )
+# Tokenizer names that tokenize() can route to tokenize_with_tiktoken.
+TIKTOKENS = ["gpt-4o", "gpt-2"]
+# Names that tokenize() sends to tokenize_with_hf, except "byt5", which has
+# its own branch there.
+HF = [
+    "llama-3",
+    "gemma-2",
+    "qwen3",
+    "mbert",
+    "phi-3",
+    "xglm",
+    "bloom",
+    "aya-expanse",
+    "comma",
+    "tokenmonster",
+    "byt5",
+]
+# Every tokenizer offered in the model selector.
+available_tokenizers = TIKTOKENS + HF + ["tekken"]
+# Nothing is ticked by default. This name used to be assigned ["xglm"], then
+# the full list, then []; only the final empty list ever took effect.
+pre_selected_tokenizers = []
+# Plain-text log that tokenize() appends to.
+OUT_FILE = Path("paper-outs.txt")
+# Create the log once up front. Path.touch() does not leave an open file
+# handle behind the way a bare open(..., "w") does.
+if not OUT_FILE.exists():
+    OUT_FILE.touch()
+def tokenize(model, text):
+    """Tokenize ``text`` with the tokenizer selected by ``model`` and log it.
+
+    Dispatch: "gpt-4", "gpt-2" and "gpt-4o" go to ``tokenize_with_tiktoken``,
+    "tekken" goes to ``tokenize_w_tekken``, any name containing "byt5" goes to
+    ``tokenize_with_byt5``, and everything else goes to ``tokenize_with_hf``.
+
+    Side effect: appends the result's model name, the input text and the
+    comma-joined token texts to ``OUT_FILE`` (UTF-8), followed by a blank line.
+
+    Returns the result dict from the selected tokenizer function, unchanged.
+    """
+    if model in ("gpt-4", "gpt-2", "gpt-4o"):
+        result = tokenize_with_tiktoken(text, model)
+    elif model == "tekken":
+        result = tokenize_w_tekken(text, model)
+    elif "byt5" in model:
+        result = tokenize_with_byt5(text, model)
+    else:
+        result = tokenize_with_hf(text, model)
+
+    # Append in UTF-8 so non-ASCII token texts are logged as-is.
+    with open(OUT_FILE, "a", encoding="utf-8") as log:
+        log.write(result["model"] + "\n")
+        log.write(f"Text: {text}\n")
+        joined = ",".join(str(tok["text"]) for tok in result["tokens"])
+        log.write(joined + "\n")
+        log.write("\n")
+    return result
 def compare_tokenizers(text, selected_models, show_details=False):
     if not text.strip():
     results = {}
     for model in selected_models:
+        results[model] = tokenize(model, text)
     # Generate outputs
     efficiency_output, tokenization_html, token_ids_output = generate_basic_comparison(
         results
 def generate_interactive_tokenization(results):
+    ##todo main vis
     """Generate HTML with working hover highlighting across tokenizers"""
     if not results:
         return "<p>No tokenization results to display.</p>"
         display: inline-block;
         justify-content: space-between;
     }
+                      /* Multi-token span styles */
+.token-span-container {
+    display: inline-flex;
+    margin: 2px;
+}
+.token-multi-span {
+    background: linear-gradient(45deg, #e8f5e8 25%, #f3e5f5 25%, #f3e5f5 50%, #e8f5e8 50%, #e8f5e8 75%, #f3e5f5 75%);
+    background-size: 8px 8px;
+}
+.token-span-part {
+    margin: 0 !important;
+    border-radius: 0 !important;
+    border-right: none !important;
+    position: relative;
+    min-width: 20px;
+    text-align: center;
+    font-size: 11px;
+}
+/* Hover effect for multi-token spans */
+.token-span-container:hover .token-span-part {
+    transform: scale(1.02);
+    box-shadow: 0 2px 8px rgba(0,0,0,0.15);
+}
+/* Different visual for multi-token spans */
+.token-multi-span.token-word {
+    background: repeating-linear-gradient(45deg, #e8f5e8, #e8f5e8 4px, #d4edda 4px, #d4edda 8px);
+}
+.token-multi-span.token-number {
+    background: repeating-linear-gradient(45deg, #f3e5f5, #f3e5f5 4px, #e1bee7 4px, #e1bee7 8px);
+}
+.token-multi-span.token-punctuation {
+    background: repeating-linear-gradient(45deg, #ffebee, #ffebee 4px, #ffcdd2 4px, #ffcdd2 8px);
+}
+                      /* Multi-token span styles */
+.token-span-container {
+    display: inline-flex;
+    margin: 2px;
+    cursor: pointer;
+}
+.token-multi-span {
+    /* Distinctive background pattern for multi-token spans */
+    background: repeating-linear-gradient(
+        45deg,
+        transparent,
+        transparent 2px,
+        rgba(0,0,0,0.1) 2px,
+        rgba(0,0,0,0.1) 4px
+    );
+}
+.token-span-part {
+    margin: 0 !important;
+    border-radius: 0 !important;
+    border-right: none !important;
+    position: relative;
+    padding: 4px 6px;
+    border: 1px dashed rgba(0,0,0,0.3) !important;
+    pointer-events: none; /* Prevent individual box clicks */
+}
+.token-span-first {
+    border-radius: 4px 0 0 4px !important;
+}
+.token-span-last {
+    border-radius: 0 4px 4px 0 !important;
+    border-right: 1px solid !important;
+}
+/* Connecting lines between boxes */
+.token-span-part:not(.token-span-last)::after {
+    content: '';
+    position: absolute;
+    top: 0;
+    right: -1px;
+    width: 1px;
+    height: 100%;
+    background: rgba(0,0,0,0.3);
+    z-index: 1;
+}
+/* Hover effect for entire multi-token span */
+.token-span-container:hover .token-span-part {
+    transform: scale(1.05);
+    box-shadow: 0 2px 8px rgba(0,0,0,0.2);
+}
+.token-span-container.highlighted .token-span-part {
+    background: #ff6b6b !important;
+    border-color: #e55353 !important;
+    color: white !important;
+    box-shadow: 0 0 10px rgba(255, 107, 107, 0.5) !important;
+    transform: scale(1.1) !important;
+    z-index: 100 !important;
+}
+/* Different patterns for different token types when multi-span */
+.token-multi-span.token-word .token-span-part {
+    background: #e8f5e8;
+    border-color: #4caf50;
+    color: #2e7d32;
+}
+.token-multi-span.token-number .token-span-part {
+    background: #f3e5f5;
+    border-color: #9c27b0;
+    color: #7b1fa2;
+}
+.token-multi-span.token-punctuation .token-span-part {
+    background: #ffebee;
+    border-color: #f44336;
+    color: #c62828;
+}
     </style>
     <div class="highlight-info" id="highlight-info"></div>
             info.style.display = 'none';
         }
     }
+                      function highlightTokens(targetText) {
+    // Clear all highlights
+    document.querySelectorAll('.token, .token-span-container').forEach(function(element) {
+        element.classList.remove('highlighted');
+    });
+    // Highlight matching tokens and spans
+    let count = 0;
+    // Single tokens
+    document.querySelectorAll('.token').forEach(function(token) {
+        if (token.getAttribute('data-text') === targetText) {
+            token.classList.add('highlighted');
+            count++;
+        }
+    });
+    // Multi-token spans
+    document.querySelectorAll('.token-span-container').forEach(function(span) {
+        if (span.getAttribute('data-text') === targetText) {
+            span.classList.add('highlighted');
+            count++;
+        }
+    });
+    // Show info
+    const info = document.getElementById('highlight-info');
+    if (info) {
+        const displayText = targetText === ' ' ? '(space)' : targetText;
+        info.textContent = '"' + displayText + '" appears in ' + count + ' positions';
+        info.style.display = 'block';
+    }
+}
     </script>
     """)
         subword_count = 0
         for i, token in enumerate(result["tokens"]):
             token_text = token["text"]
+            token_text = clean_token_display(token_text)
             display_text = token_text if token_text.strip() else "·"
             if token_text == "<newline>":
                 html_parts.append("<br>")
                 continue
+            # Check if this token spans multiple token IDs
+            token_ids = token["id"] if isinstance(token["id"], list) else [token["id"]]
+            is_multi_token = len(token_ids) > 1
             # Determine token class
             token_class = f"token token-{token['type']}"
                 .replace("\r", "\n")
             )
+            if is_multi_token:
+                # Create a container for the multi-token span
+                span_id = f"span_{model}_{i}"
+                token_ids_str = ",".join(map(str, token_ids))
+                html_parts.append(f"""<span class="token-span-container"
+                        id="{span_id}_container"
+                        data-text="{token_text.replace('"', "&quot;").replace("'", "&#39;")}"
+                        data-ids="{token_ids_str}"
+                        data-position="{i}"
+                        data-model="{model}"
+                        onmouseover="highlightTokens('{escaped_text}')"
+                        onmouseout="clearHighlights()"
+                        onclick="alert('Token: \\'{escaped_text}\\'\\nIDs: [{token_ids_str}]\\nModel: {model}\\nSpans {len(token_ids)} token IDs')"
+                        title="Text: '{token_text}' | IDs: [{token_ids_str}] | Type: {token["type"]} | Subword: {token["is_subword"]}">""")
+                # Create individual boxes for each token ID - but they act as one unit
+                for j, tid in enumerate(token_ids):
+                    token_id = f"token_{model}_{i}_{j}"
+                    box_class = f"{token_class} token-span-part"
+                    box_content = ""
+                    # Add position indicators for styling
+                    if j == 0:
+                        box_class += " token-span-first"
+                        box_content = escaped_display
+                    elif j == len(token_ids) - 1:
+                        box_class += " token-span-last"
+                    else:
+                        box_class += " token-span-middle"
+                    # Each box shows the same text (the combined character/text)
+                    html_parts.append(f"""<span class="{box_class}"
+                            id="{token_id}"
+                            data-token-id="{tid}">{box_content}</span>""")
+                html_parts.append("</span>")
+            else:
+                # Single token - original behavior
+                token_id = f"token_{model}_{i}"
+                html_parts.append(f"""<span class="{token_class}"
+                        id="{token_id}"
+                        data-text="{token_text.replace('"', "&quot;").replace("'", "&#39;")}"
+                        data-id="{token_ids[0]}"
+                        data-position="{i}"
+                        data-model="{model}"
+                        title="Text: '{token_text}' | ID: {token_ids[0]} | Type: {token["type"]} | Subword: {token["is_subword"]}"
+                        onmouseover="highlightTokens('{escaped_text}')"
+                        onmouseout="clearHighlights()"
+                        onclick="alert('Token: \\'{escaped_text}\\'\\nID: {token_ids[0]}\\nModel: {model}')">{escaped_display}</span>""")
+            # # Use inline event handlers that work in Gradio
+            # html_parts.append(f"""<span class="{token_class}"
+            #           id="{token_id}"
+            #           data-text="{token_text.replace('"', "&quot;").replace("'", "&#39;")}"
+            #           data-id="{token["id"]}"
+            #           data-position="{i}"
+            #           data-model="{model}"
+            #           title="Text: '{token_text}' | ID: {token["id"]} | Type: {token["type"]} | Subword: {token["is_subword"]}"
+            #           onmouseover="highlightTokens('{escaped_text}')"
+            #           onmouseout="clearHighlights()"
+            #           onclick="alert('Token: \\'{escaped_text}\\'\\nID: {token["id"]}\\nModel: {model}')">{escaped_display}</span>""")
         html_parts.append(f"""
             </div>
             <div style="margin-top: 8px; font-size: 12px; color: #666;">
+                Subwords: {subword_count}/{sum([len(t) for t in result["tokens"]])}
                 ({subword_count / len(result["tokens"]) * 100:.1f}%)
             </div>
         </div>
     normalized_results = {}
     for model in selected_models:
+        original_results[model] = tokenize(model, text)
+        if normalization_method != "none":
+            # Tokenize the normalized string, not the raw input; otherwise the
+            # "normalized" results are always identical to the original ones.
+            normalized_results[model] = tokenize(model, normalized_text)
     return original_results, normalized_results, normalized_text
     with gr.Row():
         with gr.Column(scale=2):
             # Sample texts dropdown
+            pre_choices = [
+                "Custom text (enter below)",
+                """
+ᴾʸᵗʰᵒⁿ
+ₚᵧₜₕₒₙ
+P̲y̲t̲h̲o̲n̲
+P̄ȳt̄h̄ōn̄
+P̅y̅t̅h̅o̅n̅
+ⓅⓎⓉⒽⓄⓃ
+⒫⒴⒯⒣⒪⒩
+🄿🅈🅃🄷🄾🄽
+ⓅⓎⓉⒽⓄⓃ
+Ｐｙｔｈｏｎ
+Pʎʇɥou
+Pyʇɥou
+P̊ẙt̊h̊o̊n̊
+Pëthøñ
+P̶y̶t̶h̶o̶n̶
+P̸y̸t̸h̸o̸n̸
+P̷y̷t̷h̷o̷n̷
+P̴y̴t̴h̴o̴n̴
+𝒫𝓎𝓉𝒽𝑜𝓃
+ℙ𝕪𝕥𝕙𝕠𝕟
+                    """,
+                "english: The quick brown fox jumps over the lazy dog. It's 1234.56 and costs $789.",
+                "french: Le renard brun rapide saute par-dessus le chien paresseux. C'est 1234,56 et coûte 789€.",
+                "german: Der schnelle braune Fuchs springt über den faulen Hund. Es ist 1234,56 und kostet 789€.",
+                "turkish: Hızlı kahverengi tilki tembel köpeğin üstunden atlar. 1234.56'dır ve 789$ tutar.",
+                "chinese: 快速的棕色狐狸跳过懒狗。它是1234.56，价格为789美元。",
+                "arabic: الثعلب البني السريع يقفز فوق الكلب الكسول. إنه 1234.56 ويكلف 789 دولارًا.",
+                "hindi: तेज भूरी लोमड़ी आलसी कुत्ते पर कूदती है। यह 1234.56 है और 789 डॉलर की कीमत है।",
+                "code: def calculate_sum(a, b):\n    return a + b\n\nresult = calculate_sum(123, 456)",
+                "mixed: English text with numbers 12345 and special chars !@#$%, plus some code: x = f(y)",
+                "numbers: The price is $123.45 (20% off) = $98.76 savings 1 12 123 1234 12345 123456 1234567 12345678 123456789",
+                "Mixed languages: Hello! 你好! こんにちは! Bonjour! Hola! مرحبا!",
+                "Subword challenge: antidisestablishmentarianism pseudopseudohypoparathyroidism",
+                "Special characters: @user123 #AI #NLP https://example.com/api?q=tokenization&limit=100",
+                "Scientific text: The mitochondria (powerhouse of the cell) produces ATP through oxidative phosphorylation.",
+                "Technical jargon: The RESTful API endpoint /users/{id}/preferences supports GET/POST/PUT/DELETE operations.",
+                "Emoji & Unicode: I love AI! 🤖✨ The café naïve résumé 北京大学 العربية😀 👍 🚀 🌍 🎉 💡 🔥 🎵 🏆 🌈",
+                "Long compound words (German): Donaudampfschifffahrtselektrizitätenhauptbetriebswerkbauunterbeamtengesellschaft",
+                'JSON data: {"name": "John Doe", "age": 30, "skills": ["Python", "JavaScript", "AI/ML"]}',
+                "Medical terminology: Pneumonoultramicroscopicsilicovolcanoconiosisdiagnosis requires thorough radiological examination.",
+            ]
             sample_texts = gr.Dropdown(
+                choices=pre_choices,
                 value="Custom text (enter below)",
                 label="Choose a sample text or enter your own",
                 interactive=True,
                 label="Text to tokenize",
                 placeholder="Enter your text here or select a sample above...",
                 lines=4,
+                value=pre_choices[1],
             )
         with gr.Column(scale=1):
             with gr.Tabs():
                 with gr.TabItem("Models"):
                     model_selector = gr.CheckboxGroup(
+                        choices=available_tokenizers,
+                        value=pre_selected_tokenizers,
+                        label="Select tokenizers to compare...",
                     )
                     show_details = gr.Checkbox(
                         label="Show detailed analysis", value=False

mappings.py CHANGED Viewed

@@ -9,14 +9,20 @@ MODEL_MAP = {
     "bloom": "bigscience/bloom-560m",
     "aya-expanse": "CohereForAI/aya-expanse-8b",
     "comma": "common-pile/comma-v0.1-2t",
-    "byte-level": "google/byt5-small",
     "tokenmonster": "alasdairforsythe/tokenmonster",
     "byt5": "google/byt5-small",
 }
 TOKENIZER_INFO = {
     "gpt-4": {"name": "GPT-4", "vocab_size": 100277, "encoding": "BPE"},
     "gpt-2": {"name": "GPT-2", "vocab_size": 50257, "encoding": "BPE"},
     "llama-2": {"name": "LLaMA-2", "vocab_size": 32000, "encoding": "SentencePiece"},
     "llama-3": {"name": "LLaMA-3", "vocab_size": 128000, "encoding": "SentencePiece"},
@@ -34,4 +40,8 @@ TOKENIZER_INFO = {
     "byte-level": {"name": "Byte-Level BPE", "vocab_size": 50000, "encoding": "BPE"},
     "tokenmonster": {"name": "TokenMonster", "vocab_size": 32000, "encoding": ""},
     "byt5": {"name": "Byt5", "vocab_size": 50000, "encoding": "BPE"},
 }

     "bloom": "bigscience/bloom-560m",
     "aya-expanse": "CohereForAI/aya-expanse-8b",
     "comma": "common-pile/comma-v0.1-2t",
     "tokenmonster": "alasdairforsythe/tokenmonster",
     "byt5": "google/byt5-small",
+    "phi-3": "microsoft/Phi-3-mini-4k-instruct",
+    "xglm": "facebook/xglm-564M",
+    "tekken": "mistralai/tekken",
+    "mbert":  "google-bert/bert-base-multilingual-cased" ,
 }
+# "microsoft/Phi-3-mini-4k-instruct" "mistralai/tekken"  "facebook/xglm-564M"  "google-bert/bert-base-multilingual-cased"
 TOKENIZER_INFO = {
     "gpt-4": {"name": "GPT-4", "vocab_size": 100277, "encoding": "BPE"},
+    "gpt-4o": {"name": "GPT-4o", "vocab_size": 199997, "encoding": "BPE"},
     "gpt-2": {"name": "GPT-2", "vocab_size": 50257, "encoding": "BPE"},
     "llama-2": {"name": "LLaMA-2", "vocab_size": 32000, "encoding": "SentencePiece"},
     "llama-3": {"name": "LLaMA-3", "vocab_size": 128000, "encoding": "SentencePiece"},
     "byte-level": {"name": "Byte-Level BPE", "vocab_size": 50000, "encoding": "BPE"},
     "tokenmonster": {"name": "TokenMonster", "vocab_size": 32000, "encoding": ""},
     "byt5": {"name": "Byt5", "vocab_size": 50000, "encoding": "BPE"},
+    "phi-3": {"name": "Phi-3", "vocab_size": 32064, "encoding": "BPE"},
+    "xglm": {"name": "XGLM", "vocab_size": 256008, "encoding": "BPE"},
+    "tekken": {"name": "Tekken", "vocab_size": 32768, "encoding": "BPE"},
+    "mbert": {"name": "mBERT", "vocab_size": 119547, "encoding": "WordPiece"}
 }

requirements.txt CHANGED Viewed

@@ -4,4 +4,7 @@ transformers
 torch
 pandas
 plotly
-tokenmonster

 torch
 pandas
 plotly
+tokenmonster
+mistral_common
+protobuf
+sentencepiece

utils.py CHANGED Viewed

@@ -4,7 +4,7 @@ import traceback
 import unicodedata
 import tiktoken
-from transformers import AutoTokenizer
 from mappings import MODEL_MAP, TOKENIZER_INFO
@@ -74,40 +74,155 @@ def is_subword(token_text, model, is_first):
 def tokenize_with_tiktoken(text, model):
-    encoding = "cl100k_base" if model == "gpt-4" else "gpt2"
-    enc = tiktoken.get_encoding(encoding)
     token_data = []
-    current_pos = 0
-    for text_ in text.split("\n"):
-        tokens = enc.encode(text_ + "\n")
-        for i, token_id in enumerate(tokens):
-            token_text = enc.decode([token_id])
-            token_type = get_token_type(token_text)
-            subword = is_subword(token_text, model, i == 0)
             token_data.append(
                 {
-                    "text": token_text,
-                    "id": int(token_id),
-                    "type": token_type,
-                    "is_subword": subword,
-                    "bytes": len(token_text.encode("utf-8")),
-                    "position": i,
                 }
             )
-            current_pos += len(token_text)
         token_data.append(
             {
-                "text": "<newline>",
-                "id": 0,
-                "type": "special",
-                "is_subword": False,
                 "position": len(token_data),
             }
         )
     return {
         "model": TOKENIZER_INFO[model]["name"],
         "token_count": len(token_data),
@@ -142,81 +257,402 @@ def get_hf_tokenizer(model):
     return tokenizer
-def tokenize_with_hf(text, model):
     try:
-        tokenizer = get_hf_tokenizer(model)
         token_data = []
         for text_ in text.split("\n"):
-            text_ = text_ + "\n"
             encoding = tokenizer(
-                text_,
-                return_offsets_mapping=False,
                 return_tensors=None,
                 add_special_tokens=False,
             )
             token_ids = encoding["input_ids"]
             tokens = tokenizer.convert_ids_to_tokens(token_ids)
-            # print(model_name, text, "\n", tokens, token_ids)
-            # print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
-            for i, (token_id, token_text) in enumerate(zip(token_ids, tokens)):
-                token_type = get_token_type(token_text)
-                subword = is_subword(token_text, model, i == 0)
-                token_data.append(
-                    {
-                        "text": token_text,
-                        "id": token_id,  # int(token_id),
-                        "type": token_type,
-                        "is_subword": subword,
-                        "bytes": len(token_text.encode("utf-8")),
-                        "position": i,
-                    }
-                )
-            token_data.append(
-                {
-                    "text": "<newline>",
-                    "id": 0,
-                    "type": "special",
-                    "is_subword": False,
-                    "position": len(token_data),
-                }
             )
         return {
             "model": TOKENIZER_INFO[model]["name"],
-            "token_count": len(token_data),
             "tokens": token_data,
             "compression_ratio": len(text) / len(token_data) if token_data else 0,
             "encoding": TOKENIZER_INFO[model]["encoding"],
             "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
         }
     except Exception as e:
-        error_msg = str(e)
-        print(f"DEBUG: Error: {error_msg}")
-        print(traceback.format_exc())
-        # Provide helpful error messages
-        if "gated repo" in error_msg.lower():
-            error_msg = f"Model is gated. Request access at https://huggingface.co/{model_name} and ensure HF_TOKEN is set."
-        elif "401" in error_msg:
-            error_msg = "Authentication failed. Check your HF_TOKEN in Space secrets."
-        elif "not found" in error_msg.lower():
-            error_msg = (
-                f"Model {model_name} not found. It may have been moved or renamed."
-            )
         return {
             "model": TOKENIZER_INFO[model]["name"],
-            "token_count": 0,
-            "tokens": [],
-            "compression_ratio": 0,
-            "encoding": "Error",
-            "vocab_size": 0,
-            "error": error_msg,
         }
 def normalize_text(text, method):
@@ -229,6 +665,8 @@ def normalize_text(text, method):
         return unicodedata.normalize("NFC", text)
     elif method == "nfd":
         return unicodedata.normalize("NFD", text)
     elif method == "nfkc":
         return unicodedata.normalize("NFKC", text)
     elif method == "nfkd":
@@ -253,9 +691,37 @@ def get_normalization_methods():
         ("lowercase", "Lowercase"),
         ("nfc", "Unicode NFC (Canonical)"),
         ("nfd", "Unicode NFD (Decomposed)"),
         ("nfkc", "Unicode NFKC (Compatible)"),
         ("nfkd", "Unicode NFKD (Compatible Decomposed)"),
         ("strip_accents", "Remove Accents"),
         ("strip_punctuation", "Remove Punctuation"),
         ("whitespace_normalize", "Normalize Whitespace"),
     ]

 import unicodedata
 import tiktoken
+from transformers import AutoTokenizer, XGLMTokenizerFast
 from mappings import MODEL_MAP, TOKENIZER_INFO
 def tokenize_with_tiktoken(text, model):
+    enc = tiktoken.encoding_for_model(model)
+    # Process the entire text at once, not line by line
+    token_ids = enc.encode(text)
     token_data = []
+    current_text_pos = 0
+    # Build character-to-token mapping
+    char_to_tokens = {}
+    # Decode each token and find its position in the original text
+    for i, token_id in enumerate(token_ids):
+        token_text = enc.decode([token_id])
+        # Find where this token appears in the remaining text
+        remaining_text = text[current_text_pos:]
+        if token_text in remaining_text:
+            # Find the position of this token in the original text
+            local_pos = remaining_text.find(token_text)
+            actual_start = current_text_pos + local_pos
+            actual_end = actual_start + len(token_text)
+            # Map each character position to this token
+            for char_pos in range(actual_start, actual_end):
+                if char_pos not in char_to_tokens:
+                    char_to_tokens[char_pos] = []
+                char_to_tokens[char_pos].append(token_id)
+            current_text_pos = actual_end
+    # Group consecutive characters that have the same token ID sets
+    processed_chars = set()
+    text_pos = 0
+    while text_pos < len(text):
+        if text_pos in processed_chars:
+            text_pos += 1
+            continue
+        # Get tokens for current character
+        current_tokens = char_to_tokens.get(text_pos, [])
+        if not current_tokens:
+            # Handle characters not covered by any token
             token_data.append(
                 {
+                    "text": text[text_pos],
+                    "id": None,
+                    "type": get_token_type(text[text_pos]),
+                    "is_subword": False,
+                    "bytes": len(text[text_pos].encode("utf-8")),
+                    "position": len(token_data),
                 }
             )
+            processed_chars.add(text_pos)
+            text_pos += 1
+            continue
+        # Find the span of characters that share the same token ID set
+        span_start = text_pos
+        span_end = text_pos + 1
+        # Extend span while characters have the same token set
+        while (
+            span_end < len(text)
+            and span_end in char_to_tokens
+            and char_to_tokens[span_end] == current_tokens
+        ):
+            span_end += 1
+        # Get the text for this span
+        span_text = text[span_start:span_end]
+        # Create token data entry
         token_data.append(
             {
+                "text": span_text,
+                "id": current_tokens if len(current_tokens) > 1 else current_tokens[0],
+                "type": get_token_type(span_text),
+                "is_subword": is_subword(span_text, model, len(token_data) == 0),
+                "bytes": len(span_text.encode("utf-8")),
                 "position": len(token_data),
             }
         )
+        # Mark all characters in this span as processed
+        for pos in range(span_start, span_end):
+            processed_chars.add(pos)
+        text_pos = span_end
+    return {
+        "model": TOKENIZER_INFO[model]["name"],
+        "token_count": len(token_ids),
+        "tokens": token_data,
+        "compression_ratio": len(text) / len(token_data) if token_data else 0,
+        "encoding": TOKENIZER_INFO[model]["encoding"],
+        "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
+    }
+def tokenize_with_tiktoke1n(text, model):
+    encoding = "cl100k_base" if model == "gpt-4" else "gpt2"
+    enc = tiktoken.get_encoding(encoding)
+    token_data = []
+    current_pos = 0
+    text_ = text
+    for text in text_.split("\n"):
+        tokens = enc.encode(text + "\n")
+        #         token_text = enc.decode([token_id])
+        #         token_type = get_token_type(token_text)
+        #         subword = is_subword(token_text, model, i == 0)
+        token_ids = encoding["input_ids"]
+        ## offset in the text for each token, i.e. token i covers text[offsets[i][0]:offsets[i][1]]
+        offsets = encoding.get("offset_mapping", [])
+        token_data = []
+        curr_tok_id = 0
+        current_text_pos = 0
+        token_id = []
+        while curr_tok_id < len(token_ids) and curr_tok_id < len(tokens):
+            if offsets and curr_tok_id < len(offsets):
+                start, end = offsets[curr_tok_id]
+                actual_text = text[start:end]
+                if current_text_pos == end:
+                    token_id.append(token_ids[curr_tok_id])
+                else:
+                    token_id = [token_ids[curr_tok_id]]
+                token_type = get_token_type(actual_text)
+                subword = is_subword(actual_text, model, curr_tok_id == 0)
+                if current_text_pos != end:
+                    token_data.append(
+                        {
+                            "text": actual_text,
+                            "id": token_id,
+                            "type": token_type,
+                            "is_subword": subword,
+                            "bytes": len(actual_text.encode("utf-8")),
+                            "position": curr_tok_id,
+                        }
+                    )
+                curr_tok_id += 1
+                current_text_pos = end
     return {
         "model": TOKENIZER_INFO[model]["name"],
         "token_count": len(token_data),
     return tokenizer
def get_tokenizer(model):
    """Return the tokenizer for the short tokenizer code ``model``.

    The code is resolved to a repository/model name through ``MODEL_MAP``.
    Loaded tokenizers are stored in ``TOKENIZER_CACHE`` keyed by that name,
    so each one is constructed at most once.

    Args:
        model: Key into ``MODEL_MAP``.

    Returns:
        The tokenizer object: a Mistral "tekken" tokenizer, a
        ``TokenMonsterTokenizer``, an ``XGLMTokenizerFast`` or whatever
        ``AutoTokenizer.from_pretrained`` returns, depending on the name.

    Raises:
        ValueError: If ``model`` is not a key of ``MODEL_MAP``.
        RuntimeError: If the ``HF_TOKEN`` environment variable is unset or
            empty.
    """
    model_name = MODEL_MAP.get(model)
    if model_name is None:
        # Report the code the caller passed; model_name is always None here.
        raise ValueError(f"Unknown tokenizer code {model}")
    print(model_name)
    if model_name in TOKENIZER_CACHE:
        return TOKENIZER_CACHE[model_name]

    # Get token from environment
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        # Raise instead of returning an error dict: callers use the return
        # value as a tokenizer, so a dict only failed later with an
        # unrelated-looking error.
        raise RuntimeError(
            "HF_TOKEN not found in environment. Please add your HuggingFace token to Space secrets."
        )

    if "tekken" in model_name:
        from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

        tok = MistralTokenizer.v3(is_tekken=True)
        tokenizer = tok.instruct_tokenizer.tokenizer
    elif "tokenmonster" in model_name:
        tokenizer = TokenMonsterTokenizer("englishcode-32000-consistent-v1")
    elif "xglm" in model_name.lower():
        tokenizer = XGLMTokenizerFast.from_pretrained(
            model_name, token=hf_token, trust_remote_code=True
        )
    else:
        # NOTE(review): trust_remote_code=True lets the repository run its
        # own code while loading; confirm every MODEL_MAP entry is trusted.
        tokenizer = AutoTokenizer.from_pretrained(
            model_name, token=hf_token, trust_remote_code=True
        )

    TOKENIZER_CACHE[model_name] = tokenizer
    return tokenizer
def tokenize_w_tekken(text, model):
    """Tokenize ``text`` with the tokenizer returned by ``get_tokenizer(model)``.

    The tokenizer is called as ``encode(text, bos=False, eos=False)`` and
    ``decode([token_id])``. The whole text is encoded in a single call; each
    token is then decoded on its own and located in ``text`` (searching
    forward from the end of the previous match) so that every reported entry
    carries the exact slice of the input it covers.

    Args:
        text: The input string to tokenize.
        model: Tokenizer code; passed to ``get_tokenizer`` and used to index
            ``TOKENIZER_INFO``.

    Returns:
        A dict with keys ``model``, ``token_count`` (number of token ids
        produced by the encoder), ``tokens`` (list of per-span dicts),
        ``compression_ratio`` (characters per entry in ``tokens``),
        ``encoding`` and ``vocab_size``.
    """
    tokenizer = get_tokenizer(model)
    # Encode the entire text at once, not line by line.
    token_ids = tokenizer.encode(text, bos=False, eos=False)

    # Map each character position to the *index* of the token covering it.
    # Keying on the index (not the token id) keeps two adjacent occurrences
    # of the same token id as two separate entries.
    char_owner = {}
    cursor = 0
    for index, token_id in enumerate(token_ids):
        token_text = tokenizer.decode([token_id])
        if not token_text:
            continue
        found_at = text.find(token_text, cursor)
        if found_at == -1:
            # The decoded token does not occur in the remaining text; the
            # characters it stands for stay unmapped.
            continue
        cursor = found_at + len(token_text)
        for char_pos in range(found_at, cursor):
            char_owner[char_pos] = index

    token_data = []
    text_len = len(text)
    pos = 0
    while pos < text_len:
        owner = char_owner.get(pos)
        if owner is None:
            # Character not covered by any located token: emit it alone,
            # without an id.
            char = text[pos]
            token_data.append(
                {
                    "text": char,
                    "id": None,
                    "type": get_token_type(char),
                    "is_subword": False,
                    "bytes": len(char.encode("utf-8")),
                    "position": len(token_data),
                }
            )
            pos += 1
            continue

        # Extend the span over every following character owned by the same
        # token occurrence.
        span_end = pos + 1
        while span_end < text_len and char_owner.get(span_end) == owner:
            span_end += 1
        span_text = text[pos:span_end]
        token_data.append(
            {
                "text": span_text,
                "id": token_ids[owner],
                "type": get_token_type(span_text),
                "is_subword": is_subword(span_text, model, len(token_data) == 0),
                "bytes": len(span_text.encode("utf-8")),
                "position": len(token_data),
            }
        )
        pos = span_end

    return {
        "model": TOKENIZER_INFO[model]["name"],
        "token_count": len(token_ids),
        "tokens": token_data,
        "compression_ratio": len(text) / len(token_data) if token_data else 0,
        "encoding": TOKENIZER_INFO[model]["encoding"],
        "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
    }
def tokenize_w_tekken1(text, model):
    """Line-by-line variant of the tekken tokenization.

    Splits ``text`` on "\\n", re-appends a newline to every piece and encodes
    each piece separately with ``encode(..., bos=False, eos=False)``. For
    every token only the first character of its decoded text is kept, stored
    as UTF-8 ``bytes`` under ``"text"``; ``"type"`` is always ``None`` and
    ``"is_subword"`` always ``False``.

    Returns:
        The usual result dict (``model``, ``token_count``, ``tokens``,
        ``compression_ratio``, ``encoding``, ``vocab_size``), or ``None``
        after printing the error if anything raises.
    """
    try:
        tokenizer = get_tokenizer(model)
        records = []
        for line in text.split("\n"):
            line_ids = tokenizer.encode(line + "\n", bos=False, eos=False)
            decoded = [tokenizer.decode([token_id]) for token_id in line_ids]
            for token_id, piece in zip(line_ids, decoded):
                # Only the first character of the decoded token is retained.
                first_char = piece[0].encode("utf-8")
                records.append(
                    {
                        "text": first_char,
                        "id": token_id,
                        "type": None,
                        "is_subword": False,
                        "bytes": len(first_char),
                        "position": len(records),
                    }
                )
        ratio = len(text) / len(records) if records else 0
        info = TOKENIZER_INFO[model]
        return {
            "model": info["name"],
            "token_count": len(records),
            "tokens": records,
            "compression_ratio": ratio,
            "encoding": info["encoding"],
            "vocab_size": info["vocab_size"],
        }
    except Exception as exc:
        print(f"Error: {exc}")
        return None
+# Alternative version if you really need line-by-line processing:
def tokenize_with_hf(text, model):
    """Tokenize ``text`` line by line with the tokenizer for ``model``.

    The text is split on "\\n". Every line except the last is tokenized
    together with its terminating newline; the last line is tokenized as-is
    (and skipped when empty), so no newline token is reported that is not
    present in the input.

    Args:
        text: The input string to tokenize.
        model: Tokenizer code; passed to ``get_tokenizer`` and used to index
            ``TOKENIZER_INFO``.

    Returns:
        A dict with keys ``model``, ``token_count``, ``tokens``,
        ``compression_ratio``, ``encoding`` and ``vocab_size``, or ``None``
        if any step raises (the error and traceback are printed).
        ``token_count`` comes from tokenizing the whole text in one call and
        may therefore differ from ``len(tokens)``, which is built per line.
    """
    try:
        tokenizer = get_tokenizer(model)
        all_token_data = []

        lines = text.split("\n")
        last_index = len(lines) - 1
        for line_index, line in enumerate(lines):
            # Restore the newline that split() removed, but do not invent
            # one after the final line.
            segment = line + "\n" if line_index < last_index else line
            if not segment:
                continue

            encoding = tokenizer(
                segment,
                return_offsets_mapping=True,
                return_tensors=None,
                add_special_tokens=False,
            )
            token_ids = encoding["input_ids"]
            tokens = tokenizer.convert_ids_to_tokens(token_ids)
            offsets = encoding.get("offset_mapping", [])

            for i, (token_id, token) in enumerate(zip(token_ids, tokens)):
                if i < len(offsets) and offsets[i] is not None:
                    start, end = offsets[i]
                    actual_text = segment[start:end]
                else:
                    # No offset available: fall back to the vocabulary form.
                    actual_text = token
                if not actual_text:
                    continue

                position = len(all_token_data)
                all_token_data.append(
                    {
                        # The vocabulary form of the token is displayed;
                        # type, subword flag and byte count are derived from
                        # the input slice it covers.
                        "text": token,
                        "id": [token_id],
                        "type": get_token_type(actual_text),
                        "is_subword": is_subword(actual_text, model, position == 0),
                        "bytes": len(actual_text.encode("utf-8")),
                        "position": position,
                    }
                )

        # Total token count from a single pass over the whole text.
        whole_encoding = tokenizer(text, return_tensors=None, add_special_tokens=False)
        total_tokens = len(whole_encoding["input_ids"])

        return {
            "model": TOKENIZER_INFO[model]["name"],
            "token_count": total_tokens,
            "tokens": all_token_data,
            "compression_ratio": len(text) / len(all_token_data) if all_token_data else 0,
            "encoding": TOKENIZER_INFO[model]["encoding"],
            "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
        }
    except Exception as e:
        print(f"Error: {e}")
        import traceback

        traceback.print_exc()
        return None
def tokenize_with_hfold(text, model):
    """Older whole-text tokenization using ``get_hf_tokenizer(model)``.

    The entire ``text`` is tokenized exactly once with offset mapping
    requested. A token whose end offset equals the end of the previously
    emitted span does not get its own entry; its id is appended to the id
    list of that previous entry instead.

    Args:
        text: The input string to tokenize.
        model: Tokenizer code; passed to ``get_hf_tokenizer`` and used to
            index ``TOKENIZER_INFO``.

    Returns:
        A dict with keys ``model``, ``token_count``, ``tokens``,
        ``compression_ratio``, ``encoding`` and ``vocab_size``, or ``None``
        after printing the error if anything raises.
    """
    try:
        tokenizer = get_hf_tokenizer(model)
        # Tokenize the ENTIRE text once. Doing this inside a per-line loop
        # would append the same tokens again for every line.
        encoding = tokenizer(
            text,
            return_offsets_mapping=True,
            return_tensors=None,
            add_special_tokens=False,
        )
        token_ids = encoding["input_ids"]
        tokens = tokenizer.convert_ids_to_tokens(token_ids)
        # Token i covers text[offsets[i][0]:offsets[i][1]].
        offsets = encoding.get("offset_mapping", [])

        token_data = []
        last_end = 0
        shared_ids = []
        for index, (token_id, token) in enumerate(zip(token_ids, tokens)):
            if offsets and index < len(offsets):
                start, end = offsets[index]
                if end == last_end:
                    # Same end offset as the previous entry: record the id on
                    # that entry (the list object is shared with it).
                    shared_ids.append(token_id)
                    continue
                shared_ids = [token_id]
                actual_text = text[start:end]
                token_data.append(
                    {
                        "text": actual_text,
                        "id": shared_ids,
                        "type": get_token_type(actual_text),
                        "is_subword": is_subword(actual_text, model, index == 0),
                        "bytes": len(actual_text.encode("utf-8")),
                        "position": index,
                    }
                )
                last_end = end
            else:
                # No offsets available: fall back to the vocabulary form.
                token_data.append(
                    {
                        "text": token,
                        "id": [token_id],
                        "type": get_token_type(token),
                        "is_subword": is_subword(token, model, index == 0),
                        "bytes": len(token.encode("utf-8")),
                        "position": index,
                    }
                )

        return {
            "model": TOKENIZER_INFO[model]["name"],
            "token_count": len(token_ids),
            "tokens": token_data,
            "compression_ratio": len(text) / len(token_data) if token_data else 0,
            "encoding": TOKENIZER_INFO[model]["encoding"],
            "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
        }
    except Exception as e:
        print(f"Error: {e}")
        return None
def tokenize_with_byt5(text, model):
    """Special handling for the ByT5 byte-level tokenizer.

    The text is tokenized without offset mapping. Token ``i`` is paired with
    byte ``i`` of the UTF-8 encoding of ``text``; this assumes the tokenizer
    emits one token per input byte when no special tokens are added.

    Args:
        text: The input string to tokenize.
        model: Tokenizer code; passed to ``get_hf_tokenizer`` and used to
            index ``TOKENIZER_INFO``.

    Returns:
        A dict with keys ``model``, ``token_count``, ``tokens``,
        ``compression_ratio``, ``encoding`` and ``vocab_size``, or ``None``
        after printing the error if tokenization as a whole fails.
    """
    try:
        tokenizer = get_hf_tokenizer(model)

        # Offsets are not requested here, so positions are derived from the
        # byte index instead.
        encoding = tokenizer(
            text,
            return_tensors=None,
            add_special_tokens=False,
        )
        token_ids = encoding["input_ids"]
        text_bytes = text.encode("utf-8")

        token_data = []
        for i, token_id in enumerate(token_ids):
            try:
                token_text = tokenizer.decode([token_id])

                if i < len(text_bytes):
                    byte_val = text_bytes[i]
                    if byte_val < 128:
                        actual_char = chr(byte_val)
                    else:
                        # A lone byte of a multi-byte sequence is not valid
                        # UTF-8, so it is shown as the replacement character.
                        actual_char = text_bytes[i:i + 1].decode("utf-8", errors="replace")
                    # The token stands for exactly one input byte, whatever
                    # placeholder is used to display it.
                    byte_count = 1
                else:
                    actual_char = token_text
                    byte_count = len(actual_char.encode("utf-8"))

                token_data.append(
                    {
                        "text": actual_char,
                        "id": [token_id],
                        "type": get_token_type(actual_char),
                        "is_subword": is_subword(actual_char, model, i == 0),
                        "bytes": byte_count,
                        "position": i,
                    }
                )
            except Exception:
                # Decoding or classification failed for this token: record a
                # placeholder so positions stay aligned with token_ids.
                token_data.append(
                    {
                        "text": f"<special_token_{token_id}>",
                        "id": [token_id],
                        "type": "special",
                        "is_subword": False,
                        "bytes": 0,
                        "position": i,
                    }
                )

        return {
            "model": TOKENIZER_INFO[model]["name"],
            "token_count": len(token_ids),
            "tokens": token_data,
            "compression_ratio": len(text) / len(token_data) if token_data else 0,
            "encoding": TOKENIZER_INFO[model]["encoding"],
            "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
        }
    except Exception as e:
        print(f"Error in ByT5 tokenization: {e}")
        return None
 def normalize_text(text, method):
         return unicodedata.normalize("NFC", text)
     elif method == "nfd":
         return unicodedata.normalize("NFD", text)
+    elif method == "nfk":
+        return unicodedata.normalize("NFK", text)
     elif method == "nfkc":
         return unicodedata.normalize("NFKC", text)
     elif method == "nfkd":
         ("lowercase", "Lowercase"),
         ("nfc", "Unicode NFC (Canonical)"),
         ("nfd", "Unicode NFD (Decomposed)"),
+        ("nfk", ""),
         ("nfkc", "Unicode NFKC (Compatible)"),
         ("nfkd", "Unicode NFKD (Compatible Decomposed)"),
         ("strip_accents", "Remove Accents"),
         ("strip_punctuation", "Remove Punctuation"),
         ("whitespace_normalize", "Normalize Whitespace"),
     ]
def clean_token_display(token_text, tokenizer=None):
    """Return a display-friendly form of a raw token string.

    Rules, applied in order (the first match wins):

    * A newline (or the padded "<newline>   " marker) becomes "<newline>".
    * A leading "Ġ" (GPT-2 style) or "▁" (SentencePiece style) word-boundary
      marker becomes a single space.
    * A byte token written as "<0xNN>" becomes the corresponding character
      when it is printable ASCII (32..126), otherwise "[NN]"; if NN is not
      valid hexadecimal the token is returned unchanged.
    * Any Unicode replacement character (U+FFFD) becomes "?".

    Args:
        token_text: The raw token string.
        tokenizer: Unused; accepted for call-site compatibility.

    Returns:
        The cleaned string.
    """
    if token_text in ("\n", "<newline>   "):
        return "<newline>"

    # Word-boundary prefixes used by byte-level BPE and SentencePiece vocabularies.
    if token_text.startswith(("Ġ", "▁")):
        return " " + token_text[1:]

    # Byte-level representations such as "<0x0A>".
    if token_text.startswith("<0x") and token_text.endswith(">"):
        hex_val = token_text[3:-1]
        try:
            byte_val = int(hex_val, 16)
        except ValueError:
            # Not a hexadecimal payload after all; leave the token as it is.
            return token_text
        return chr(byte_val) if 32 <= byte_val <= 126 else f"[{hex_val}]"

    # Replace the Unicode replacement character, if any.
    return token_text.replace("\ufffd", "?")