Commit c02e89e · Parent(s): 3a08f05
Committed by Gül Sena Altıntaş

Refactoring, and visual improvements

Changed files:
- .gitignore +7 -0
- app.py +245 -196
- mappings.py +36 -0
- utils.py +136 -0
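
The commit splits the former single-file app into three modules. A minimal sketch of the resulting layout, using only names that appear in the diffs below (assumes the refactored files are on the import path):

```python
# Post-refactor layout implied by the diffs below:
#   mappings.py - static tables MODEL_MAP and TOKENIZER_INFO
#   utils.py    - tokenize_with_tiktoken / tokenize_with_hf (imports mappings)
#   app.py      - Gradio UI only (imports utils)

from utils import tokenize_with_tiktoken

result = tokenize_with_tiktoken("Hello, world!", "gpt-4")
print(result["token_count"], f"{result['compression_ratio']:.2f}x compression")
```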
    	
.gitignore ADDED

@@ -0,0 +1,7 @@
+*.pyc
+*.pyo
+*.pyd
+*.pyw
+*.pyz
+*.pywz
+*.pyzw
    	
app.py CHANGED

@@ -1,160 +1,16 @@
-import json
-import os
 from collections import Counter
 
 import gradio as gr
 import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
-import tiktoken
-from transformers import AutoTokenizer
-
-# Model mappings
-MODEL_MAP = {
-    "llama-2": "meta-llama/Llama-2-7b-hf",
-    "llama-3": "meta-llama/Llama-3.2-1B",
-    "gemma-2": "google/gemma-2-2b",
-    "qwen3": "Qwen/Qwen3-0.6B",
-    "qwen2.5": "Qwen/Qwen2.5-0.5B",
-    "bert": "bert-base-uncased",
-    "bloom": "bigscience/bloom-560m",
-    "aya-expanse": "CohereForAI/aya-expanse-8b",
-    "comma": "common-pile/comma-v0.1-2tgpt2",
-    "byte-level": "google/byt5-small",
-    "tokenmonster": "alasdairforsythe/tokenmonster",
-}
-
-TOKENIZER_INFO = {
-    "gpt-4": {"name": "GPT-4", "vocab_size": 100277, "encoding": "BPE"},
-    "gpt-2": {"name": "GPT-2", "vocab_size": 50257, "encoding": "BPE"},
-    "llama-2": {"name": "LLaMA-2", "vocab_size": 32000, "encoding": "SentencePiece"},
-    "llama-3": {"name": "LLaMA-3", "vocab_size": 128000, "encoding": "SentencePiece"},
-    "gemma-2": {"name": "Gemma-2", "vocab_size": 256000, "encoding": "SentencePiece"},
-    "qwen3": {"name": "Qwen3", "vocab_size": 151936, "encoding": "BPE"},
-    "qwen2.5": {"name": "Qwen2.5", "vocab_size": 151936, "encoding": "BPE"},
-    "bert": {"name": "BERT", "vocab_size": 30522, "encoding": "WordPiece"},
-    "bloom": {"name": "BLOOM", "vocab_size": 250680, "encoding": "BPE"},
-    "aya-expanse": {
-        "name": "Aya Expanse",
-        "vocab_size": 256000,
-        "encoding": "SentencePiece",
-    },
-    "comma": {"name": "Comma AI", "vocab_size": 50257, "encoding": ""},
-    "byte-level": {"name": "Byte-Level BPE", "vocab_size": 50000, "encoding": "BPE"},
-    "tokenmonster": {"name": "TokenMonster", "vocab_size": 32000, "encoding": ""},
-}
 
-
-def get_token_type(token_text):
-    import re
-
-    if re.match(r"^\s+$", token_text):
-        return "whitespace"
-    elif re.match(r"^[a-zA-Z]+$", token_text):
-        return "word"
-    elif re.match(r"^\d+$", token_text):
-        return "number"
-    elif re.match(r"^[^\w\s]+$", token_text):
-        return "punctuation"
-    elif token_text.startswith("<") and token_text.endswith(">"):
-        return "special"
-    else:
-        return "mixed"
-
-
-def is_subword(token_text, model, is_first):
-    if model in ["llama-2", "llama-3", "qwen3"]:
-        return not token_text.startswith("▁") and not is_first
-    elif model == "bert":
-        return token_text.startswith("##")
-    else:  # BPE models
-        return not token_text.startswith(" ") and not is_first and len(token_text) > 0
-
-
-def tokenize_with_tiktoken(text, model):
-    encoding = "cl100k_base" if model == "gpt-4" else "gpt2"
-    enc = tiktoken.get_encoding(encoding)
-    tokens = enc.encode(text)
-
-    token_data = []
-    current_pos = 0
-
-    for i, token_id in enumerate(tokens):
-        token_text = enc.decode([token_id])
-        token_type = get_token_type(token_text)
-        subword = is_subword(token_text, model, i == 0)
-
-        token_data.append(
-            {
-                "text": token_text,
-                "id": int(token_id),
-                "type": token_type,
-                "is_subword": subword,
-                "bytes": len(token_text.encode("utf-8")),
-                "position": i,
-            }
-        )
-        current_pos += len(token_text)
-
-    return {
-        "model": TOKENIZER_INFO[model]["name"],
-        "token_count": len(tokens),
-        "tokens": token_data,
-        "compression_ratio": len(text) / len(tokens) if tokens else 0,
-        "encoding": TOKENIZER_INFO[model]["encoding"],
-        "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
-    }
-
-
-def tokenize_with_hf(text, model):
-    try:
-        model_name = MODEL_MAP.get(model, "gpt2")
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_name, token=os.getenv("HF_TOKEN"), trust_remote_code=True
-        )
-
-        tokens = tokenizer.encode(text)
-        token_data = []
-
-        for i, token_id in enumerate(tokens):
-            token_text = tokenizer.decode([token_id], skip_special_tokens=False)
-            token_type = get_token_type(token_text)
-            subword = is_subword(token_text, model, i == 0)
-
-            token_data.append(
-                {
-                    "text": token_text,
-                    "id": int(token_id),
-                    "type": token_type,
-                    "is_subword": subword,
-                    "bytes": len(token_text.encode("utf-8")),
-                    "position": i,
-                }
-            )
-
-        return {
-            "model": TOKENIZER_INFO[model]["name"],
-            "token_count": len(tokens),
-            "tokens": token_data,
-            "compression_ratio": len(text) / len(tokens) if tokens else 0,
-            "encoding": TOKENIZER_INFO[model]["encoding"],
-            "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
-        }
-    except Exception as e:
-        return {
-            "model": TOKENIZER_INFO[model]["name"],
-            "token_count": 0,
-            "tokens": [],
-            "compression_ratio": 0,
-            "encoding": "Error",
-            "vocab_size": 0,
-            "error": str(e),
-        }
+from utils import tokenize_with_hf, tokenize_with_tiktoken
 
 
 def compare_tokenizers(text, selected_models, show_details=False):
     if not text.strip():
-        return "Please enter some text to tokenize.", "", None, None
+        return "Please enter some text to tokenize.", "", "", "", None, None
 
     results = {}
 
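The tokenize_with_tiktoken body removed here (it reappears in utils.py below) recovers each token's surface text by decoding ids one at a time. A standalone sketch of that round trip, assuming tiktoken is installed:

```python
import tiktoken

# Encode once, then decode each id individually to get per-token text,
# as the removed tokenize_with_tiktoken did.
enc = tiktoken.get_encoding("cl100k_base")  # the "gpt-4" branch above
ids = enc.encode("Tokenizers split text unevenly.")
pieces = [enc.decode([token_id]) for token_id in ids]

# For plain ASCII input the per-token pieces concatenate back losslessly.
assert "".join(pieces) == "Tokenizers split text unevenly."
print(len(ids), pieces)
```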
@@ -165,77 +21,252 @@ def compare_tokenizers(text, selected_models, show_details=False):
             results[model] = tokenize_with_hf(text, model)
 
     # Generate outputs
-    …
+    efficiency_output, tokenization_html, token_ids_output = generate_basic_comparison(
+        results
+    )
     detailed_output = generate_detailed_analysis(results) if show_details else ""
     efficiency_chart = create_efficiency_chart(results)
     token_distribution_chart = create_token_distribution_chart(results)
 
-    return …
+    return (
+        efficiency_output,
+        tokenization_html,
+        token_ids_output,
+        detailed_output,
+        efficiency_chart,
+        token_distribution_chart,
+    )
 
 
 def generate_basic_comparison(results):
     if not results:
-        return "No results to display."
-
-    output = []
+        return "No results to display.", "", ""
 
     # Efficiency ranking
     sorted_models = sorted(results.items(), key=lambda x: x[1]["token_count"])
 
-    output.append("## 🏆 Efficiency Ranking (Fewer tokens = more efficient)")
+    ranking_output = []
+    ranking_output.append("## 🏆 Efficiency Ranking (Fewer tokens = more efficient)")
     for i, (model, result) in enumerate(sorted_models):
         if "error" in result:
-            output.append(
+            ranking_output.append(
                 f"{i + 1}. **{result['model']}**: ❌ Error - {result['error']}"
             )
         else:
-            output.append(
+            ranking_output.append(
                 f"{i + 1}. **{result['model']}**: {result['token_count']} tokens "
                 f"({result['compression_ratio']:.2f}x compression)"
             )
 
-    …
+    # Generate interactive tokenization display
+    tokenization_html = generate_interactive_tokenization(results)
+
+    # Generate token ID tables
+    token_ids_display = generate_token_ids_display(results)
+
+    return "\n".join(ranking_output), tokenization_html, token_ids_display
+
+
+def generate_interactive_tokenization(results):
+    """Generate HTML with hover highlighting across tokenizers"""
+    if not results:
+        return "<p>No tokenization results to display.</p>"
+
+    html_parts = []
+    html_parts.append("""
+    <style>
+    .tokenizer-container {
+        margin-bottom: 20px;
+        border: 1px solid #e0e0e0;
+        border-radius: 8px;
+        padding: 15px;
+        background: white;
+    }
+    .tokenizer-header {
+        font-weight: bold;
+        font-size: 18px;
+        margin-bottom: 10px;
+        color: #2c3e50;
+    }
+    .token-display {
+        font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
+        line-height: 1.8;
+        word-wrap: break-word;
+    }
+    .token {
+        display: inline-block;
+        margin: 2px;
+        padding: 4px 8px;
+        border-radius: 4px;
+        border: 1px solid;
+        cursor: pointer;
+        transition: all 0.2s ease;
+        position: relative;
+        font-size: 14px;
+    }
+    .token:hover {
+        transform: scale(1.1);
+        z-index: 10;
+        box-shadow: 0 2px 8px rgba(0,0,0,0.2);
+    }
+    .token.highlighted {
+        background: #ff6b6b !important;
+        border-color: #e55353 !important;
+        color: white !important;
+        box-shadow: 0 0 10px rgba(255, 107, 107, 0.5);
+    }
+    .token-word { background: #e8f5e8; border-color: #4caf50; color: #2e7d32; }
+    .token-number { background: #f3e5f5; border-color: #9c27b0; color: #7b1fa2; }
+    .token-punctuation { background: #ffebee; border-color: #f44336; color: #c62828; }
+    .token-whitespace { background: #f5f5f5; border-color: #9e9e9e; color: #616161; }
+    .token-special { background: #fff3e0; border-color: #ff9800; color: #ef6c00; }
+    .token-mixed { background: #e3f2fd; border-color: #2196f3; color: #1565c0; }
+    .token-subword {
+        background: #fff8e1 !important;
+        border-color: #ffc107 !important;
+        border-style: dashed !important;
+    }
+    .token-stats {
+        display: inline-block;
+        margin-left: 10px;
+        padding: 2px 6px;
+        background: #f8f9fa;
+        border-radius: 3px;
+        font-size: 12px;
+        color: #666;
+    }
+    </style>
+
+    <script>
+    function highlightToken(text, allTokenizers) {
+        // Remove existing highlights
+        document.querySelectorAll('.token').forEach(token => {
+            token.classList.remove('highlighted');
+        });
+
+        // Highlight tokens with same text across all tokenizers
+        document.querySelectorAll('.token').forEach(token => {
+            if (token.dataset.text === text) {
+                token.classList.add('highlighted');
+            }
+        });
+    }
+
+    function clearHighlights() {
+        document.querySelectorAll('.token').forEach(token => {
+            token.classList.remove('highlighted');
+        });
+    }
+    </script>
+    """)
 
     for model, result in results.items():
         if "error" in result:
-            …
+            html_parts.append(f"""
+            <div class="tokenizer-container">
+                <div class="tokenizer-header">{result["model"]} ❌</div>
+                <div style="color: #d32f2f; font-style: italic;">Error: {result["error"]}</div>
+            </div>
+            """)
             continue
 
-        …
+        html_parts.append(f"""
+        <div class="tokenizer-container">
+            <div class="tokenizer-header">
+                {result["model"]}
+                <span class="token-stats">
+                    {result["token_count"]} tokens |
+                    {result["encoding"]} |
+                    {result["compression_ratio"]:.2f}x compression
+                </span>
+            </div>
+            <div class="token-display">
+        """)
+
+        # Add tokens with hover functionality
         subword_count = 0
-
-        for token in result["tokens"][:20]:
+        for i, token in enumerate(result["tokens"]):
             token_text = token["text"]
-            …
-                token_text …
-            …
-                token_text = "⎵"  # Empty token indicator
+            display_text = (
+                token_text if token_text.strip() else "·"
+            )  # Show space as dot
 
-            # …
+            # Determine token class
+            token_class = f"token token-{token['type']}"
             if token["is_subword"]:
-                …
+                token_class += " token-subword"
                 subword_count += 1
-            elif token["type"] == "word":
-                tokens_display.append(f"🔤`{token_text}`")
-            elif token["type"] == "number":
-                tokens_display.append(f"🔢`{token_text}`")
-            elif token["type"] == "punctuation":
-                tokens_display.append(f"❗`{token_text}`")
-            else:
-                tokens_display.append(f"`{token_text}`")
 
-        …
+            # Escape text for HTML
+            escaped_text = token_text.replace('"', "&quot;").replace("'", "&#39;")
+            escaped_display = display_text.replace('"', "&quot;").replace("'", "&#39;")
+
+            html_parts.append(f"""
+                <span class="{token_class}"
+                      data-text="{escaped_text}"
+                      data-id="{token["id"]}"
+                      data-position="{i}"
+                      title="Text: '{token_text}' | ID: {token["id"]} | Type: {token["type"]} | Subword: {token["is_subword"]}"
+                      onmouseover="highlightToken('{escaped_text}', true)"
+                      onmouseout="clearHighlights()">
+                {escaped_display}
+            </span>
+            """)
+
+        html_parts.append(f"""
+            </div>
+            <div style="margin-top: 8px; font-size: 12px; color: #666;">
+                Subwords: {subword_count}/{len(result["tokens"])}
+                ({subword_count / len(result["tokens"]) * 100:.1f}%)
+            </div>
+        </div>
+        """)
 
-    …
-    …
+    return "".join(html_parts)
+
+
+def generate_token_ids_display(results):
+    """Generate a clean display of token IDs for each tokenizer"""
+    if not results:
+        return "No token IDs to display."
 
+    output = []
+    output.append("## 🔢 Token IDs by Tokenizer")
+
+    for model, result in results.items():
+        if "error" in result:
+            output.append(f"\n### {result['model']} ❌")
+            output.append(f"Error: {result['error']}")
+            continue
+
+        output.append(f"\n### {result['model']}")
+        output.append(
+            f"**Vocab Size**: {result['vocab_size']:,} | **Encoding**: {result['encoding']}"
+        )
+
+        # Display token IDs in a readable format
+        token_ids = [str(token["id"]) for token in result["tokens"]]
+
+        # Group IDs for better readability (10 per line)
+        lines = []
+        for i in range(0, len(token_ids), 10):
+            line_ids = token_ids[i : i + 10]
+            lines.append(" ".join(line_ids))
+
+        output.append("```")
+        output.append("\n".join(lines))
+        output.append("```")
+
+        # Add some statistics
+        unique_ids = len(set(token_ids))
+        output.append(
+            f"**Stats**: {len(token_ids)} total tokens, {unique_ids} unique IDs"
+        )
+
+        # Show ID ranges
+        id_values = [token["id"] for token in result["tokens"]]
+        if id_values:
+            output.append(f"**ID Range**: {min(id_values)} - {max(id_values)}")
 
     return "\n".join(output)
 
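The escaping step in the hunk above covers only double and single quotes. The standard library's html.escape also handles &, <, and >, which occur in special tokens such as <s>; a sketch of a possible hardening (not what the commit does):

```python
import html

def escape_token(token_text: str) -> str:
    # quote=True escapes " and ' in addition to &, <, and >.
    return html.escape(token_text, quote=True)

print(escape_token("<s>"))       # -> &lt;s&gt;
print(escape_token('say "hi"'))  # -> say &quot;hi&quot;
```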
@@ -414,8 +445,10 @@ with gr.Blocks(
                 "bloom",
                 "aya-expanse",
                 "comma",
-                "byte-level",
+                "roberta",
+                "distilbert",
                 "tokenmonster",
+                "byt5",
             ],
             value=["gpt-4", "llama-3", "gpt-2"],
             label="Select tokenizers to compare",
@@ -425,9 +458,22 @@ with gr.Blocks(
 
     with gr.Row():
         with gr.Column():
-            …
-                label="…",
-                value="Enter text above to see …",
+            efficiency_output = gr.Markdown(
+                label="Efficiency Ranking",
+                value="Enter text above to see efficiency comparison...",
+            )
+
+    with gr.Row():
+        with gr.Column():
+            tokenization_display = gr.HTML(
+                label="Interactive Tokenization (Hover to highlight across tokenizers)",
+                value="<p>Enter text above to see interactive tokenization...</p>",
+            )
+
+    with gr.Row():
+        with gr.Column():
+            token_ids_output = gr.Markdown(
+                label="Token IDs", value="Token IDs will appear here..."
             )
 
     with gr.Row():
@@ -448,10 +494,10 @@ with gr.Blocks(
 
     # Main comparison function
     def update_comparison(text, models, details):
-        …
-            text, models, details
+        efficiency, tokenization_html, token_ids, detailed, eff_chart, dist_chart = (
+            compare_tokenizers(text, models, details)
         )
-        return …
+        return efficiency, tokenization_html, token_ids, detailed, eff_chart, dist_chart
 
     # Auto-update on changes
     for component in [text_input, model_selector, show_details]:
@@ -459,7 +505,9 @@ with gr.Blocks(
             fn=update_comparison,
             inputs=[text_input, model_selector, show_details],
             outputs=[
-                …
+                efficiency_output,
+                tokenization_display,
+                token_ids_output,
                 detailed_output,
                 efficiency_chart,
                 distribution_chart,
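The loop in the hunk above re-binds one callback to the change event of every input component. A condensed, self-contained sketch of that wiring pattern (hypothetical component names; assumes gradio is installed):

```python
import gradio as gr

with gr.Blocks() as demo:
    text = gr.Textbox(label="Text")
    details = gr.Checkbox(label="Show details")
    out = gr.Markdown()

    def update(text_value, details_value):
        return f"{len(text_value)} characters, details={details_value}"

    # Same pattern as the diff: every input triggers the same update.
    for component in [text, details]:
        component.change(fn=update, inputs=[text, details], outputs=[out])

demo.launch()
```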
@@ -474,12 +522,11 @@ with gr.Blocks(
     - **LLaMA-2/3**: Meta's models using SentencePiece
     - **Gemma-2**: Google's model with SentencePiece
     - **Qwen3/2.5**: Alibaba's models with BPE
-    - **BERT**: Google's …
+    - **BERT/DistilBERT**: Google's models with WordPiece
+    - **RoBERTa**: Facebook's model with BPE
     - **BLOOM**: BigScience's multilingual model with BPE
     - **Aya Expanse**: Cohere's multilingual model with SentencePiece
-    - **Comma …
-    - **Byte-Level**: Byte-level BPE tokenizer
-    - **TokenMonster**: Optimized tokenizer with BPE
+    - **Comma (Common Pile)**: Common Pile's model with BPE
 
     ### Features
     - **Efficiency Ranking**: Compare token counts across models
| 491 |  | 
| 492 | 
             
            if __name__ == "__main__":
         | 
| 493 | 
             
                demo.launch()
         | 
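generate_token_ids_display, added earlier in app.py, lays out ids ten per line with a stride-10 range. The same chunking in isolation, with stand-in ids:

```python
# Group token ids into lines of ten, as generate_token_ids_display does.
token_ids = [str(n) for n in range(23)]  # stand-in ids

lines = []
for i in range(0, len(token_ids), 10):
    lines.append(" ".join(token_ids[i : i + 10]))

print("\n".join(lines))  # three lines: 10, 10, and 3 ids
```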
    	
mappings.py ADDED

@@ -0,0 +1,36 @@
+# Model mappings
+MODEL_MAP = {
+    "llama-2": "meta-llama/Llama-2-7b-hf",
+    "llama-3": "meta-llama/Llama-3.2-1B",
+    "gemma-2": "google/gemma-2-2b",
+    "qwen3": "Qwen/Qwen3-0.6B",
+    "qwen2.5": "Qwen/Qwen2.5-0.5B",
+    "bert": "bert-base-uncased",
+    "bloom": "bigscience/bloom-560m",
+    "aya-expanse": "CohereForAI/aya-expanse-8b",
+    "comma": "common-pile/comma-v0.1-2t",
+    "byte-level": "google/byt5-small",
+    "tokenmonster": "alasdairforsythe/tokenmonster",
+    "byt5": "google/byt5-small",
+}
+
+TOKENIZER_INFO = {
+    "gpt-4": {"name": "GPT-4", "vocab_size": 100277, "encoding": "BPE"},
+    "gpt-2": {"name": "GPT-2", "vocab_size": 50257, "encoding": "BPE"},
+    "llama-2": {"name": "LLaMA-2", "vocab_size": 32000, "encoding": "SentencePiece"},
+    "llama-3": {"name": "LLaMA-3", "vocab_size": 128000, "encoding": "SentencePiece"},
+    "gemma-2": {"name": "Gemma-2", "vocab_size": 256000, "encoding": "SentencePiece"},
+    "qwen3": {"name": "Qwen3", "vocab_size": 151936, "encoding": "BPE"},
+    "qwen2.5": {"name": "Qwen2.5", "vocab_size": 151936, "encoding": "BPE"},
+    "bert": {"name": "BERT", "vocab_size": 30522, "encoding": "WordPiece"},
+    "bloom": {"name": "BLOOM", "vocab_size": 250680, "encoding": "BPE"},
+    "aya-expanse": {
+        "name": "Aya Expanse",
+        "vocab_size": 256000,
+        "encoding": "SentencePiece",
+    },
+    "comma": {"name": "Comma AI", "vocab_size": 50257, "encoding": ""},
+    "byte-level": {"name": "Byte-Level BPE", "vocab_size": 50000, "encoding": "BPE"},
+    "tokenmonster": {"name": "TokenMonster", "vocab_size": 32000, "encoding": ""},
+    "byt5": {"name": "Byt5", "vocab_size": 50000, "encoding": "BPE"},
+}
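The two tables are meant to be consumed together: MODEL_MAP resolves a short key to a Hugging Face repo id, and TOKENIZER_INFO carries display metadata. A small usage sketch; the "gpt2" fallback mirrors tokenize_with_hf in utils.py below:

```python
from mappings import MODEL_MAP, TOKENIZER_INFO

model = "qwen3"
repo_id = MODEL_MAP.get(model, "gpt2")  # same fallback as tokenize_with_hf
info = TOKENIZER_INFO[model]

print(f"{info['name']}: {repo_id}, vocab={info['vocab_size']:,}, {info['encoding']}")
```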
    	
        utils.py
    ADDED
    
    | @@ -0,0 +1,136 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import os
         | 
| 2 | 
            +
            import re
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            import tiktoken
         | 
| 5 | 
            +
            from transformers import AutoTokenizer
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            from mappings import MODEL_MAP, TOKENIZER_INFO
         | 
| 8 | 
            +
             | 
| 9 | 
            +
             | 
| 10 | 
            +
            def get_token_type(token_text):
         | 
| 11 | 
            +
                if re.match(r"^\s+$", token_text):
         | 
| 12 | 
            +
                    return "whitespace"
         | 
| 13 | 
            +
                elif re.match(r"^[a-zA-Z]+$", token_text):
         | 
| 14 | 
            +
                    return "word"
         | 
| 15 | 
            +
                elif re.match(r"^\d+$", token_text):
         | 
| 16 | 
            +
                    return "number"
         | 
| 17 | 
            +
                elif re.match(r"^[^\w\s]+$", token_text):
         | 
| 18 | 
            +
                    return "punctuation"
         | 
| 19 | 
            +
                elif token_text.startswith("<") and token_text.endswith(">"):
         | 
| 20 | 
            +
                    return "special"
         | 
| 21 | 
            +
                else:
         | 
| 22 | 
            +
                    return "mixed"
         | 
| 23 | 
            +
             | 
| 24 | 
            +
             | 
def is_subword(token_text, model, is_first):
    # Word-boundary conventions differ per tokenizer family: SentencePiece
    # marks word starts with "▁", BERT marks continuations with "##", and
    # byte-level BPE marks word starts with a leading space.
    # Caveat: tokenizer.decode() replaces "▁" with a space, so the
    # SentencePiece branch expects raw vocabulary strings.
    if model in ["llama-2", "llama-3", "qwen3"]:
        return not token_text.startswith("▁") and not is_first
    elif model == "bert":
        return token_text.startswith("##")
    else:  # BPE models
        return not token_text.startswith(" ") and not is_first and len(token_text) > 0

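To make the conventions concrete, a few hedged examples (the token strings are raw vocabulary forms, chosen purely for illustration):

    print(is_subword("▁Hello", "llama-2", is_first=False))  # False: "▁" marks a word start
    print(is_subword("ing", "llama-2", is_first=False))     # True: continuation piece
    print(is_subword("##ing", "bert", is_first=False))      # True: "##" continuation marker
    print(is_subword(" Hello", "gpt-4", is_first=False))    # False: leading space = word start
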
def tokenize_with_tiktoken(text, model):
    # gpt-4 uses the cl100k_base encoding; everything else falls back to gpt2.
    encoding = "cl100k_base" if model == "gpt-4" else "gpt2"
    enc = tiktoken.get_encoding(encoding)
    tokens = enc.encode(text)

    token_data = []

    for i, token_id in enumerate(tokens):
        token_text = enc.decode([token_id])
        token_type = get_token_type(token_text)
        subword = is_subword(token_text, model, i == 0)

        token_data.append(
            {
                "text": token_text,
                "id": int(token_id),
                "type": token_type,
                "is_subword": subword,
                "bytes": len(token_text.encode("utf-8")),
                "position": i,
            }
        )

    return {
        "model": TOKENIZER_INFO[model]["name"],
        "token_count": len(tokens),
        "tokens": token_data,
        "compression_ratio": len(text) / len(tokens) if tokens else 0,
        "encoding": TOKENIZER_INFO[model]["encoding"],
        "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
    }

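For example, a minimal call against the function above (requires the tiktoken package; the sample sentence is arbitrary):

    result = tokenize_with_tiktoken("Tokenization is tricky.", "gpt-4")
    print(result["token_count"], f"{result['compression_ratio']:.2f}")
    for tok in result["tokens"]:
        print(repr(tok["text"]), tok["id"], tok["type"], tok["is_subword"])
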
def tokenize_with_hf(text, model):
    try:
        model_name = MODEL_MAP.get(model, "gpt2")

        # Gated checkpoints (e.g. Llama, Gemma) need an authenticated download.
        hf_token = os.getenv("HF_TOKEN")
        if not hf_token:
            return {
                "model": TOKENIZER_INFO[model]["name"],
                "token_count": 0,
                "tokens": [],
                "compression_ratio": 0,
                "encoding": "Error",
                "vocab_size": 0,
                "error": "HF_TOKEN not found in environment. Please add your HuggingFace token to Space secrets.",
            }

        print(f"DEBUG: Loading model {model_name} with token")
        tokenizer = AutoTokenizer.from_pretrained(
            model_name, token=hf_token, trust_remote_code=True
        )

        tokens = tokenizer.encode(text)
        token_data = []

        for i, token_id in enumerate(tokens):
            token_text = tokenizer.decode([token_id], skip_special_tokens=False)
            token_type = get_token_type(token_text)
            subword = is_subword(token_text, model, i == 0)

            token_data.append(
                {
                    "text": token_text,
                    "id": int(token_id),
                    "type": token_type,
                    "is_subword": subword,
                    "bytes": len(token_text.encode("utf-8")),
                    "position": i,
                }
            )

        return {
            "model": TOKENIZER_INFO[model]["name"],
            "token_count": len(tokens),
            "tokens": token_data,
            "compression_ratio": len(text) / len(tokens) if tokens else 0,
            "encoding": TOKENIZER_INFO[model]["encoding"],
            "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
        }
    except Exception as e:
        error_msg = str(e)

        # Translate common failure modes into actionable messages.
        if "gated repo" in error_msg.lower():
            error_msg = f"Model is gated. Request access at https://huggingface.co/{model_name} and ensure HF_TOKEN is set."
        elif "401" in error_msg:
            error_msg = "Authentication failed. Check your HF_TOKEN in Space secrets."
        elif "not found" in error_msg.lower():
            error_msg = (
                f"Model {model_name} not found. It may have been moved or renamed."
            )

        return {
            "model": TOKENIZER_INFO[model]["name"],
            "token_count": 0,
            "tokens": [],
            "compression_ratio": 0,
            "encoding": "Error",
            "vocab_size": 0,
            "error": error_msg,
        }
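A sketch of how the two entry points might be dispatched from app.py; the tokenize wrapper below is hypothetical, since app.py's actual wiring sits outside this hunk:

    def tokenize(text, model):
        # Hypothetical dispatcher: tiktoken covers the OpenAI encodings,
        # every other model key goes through the HuggingFace path.
        if model in ("gpt-4", "gpt-2"):
            return tokenize_with_tiktoken(text, model)
        return tokenize_with_hf(text, model)

    result = tokenize("Merhaba dünya!", "llama-3")
    if "error" in result:
        print(result["error"])
    else:
        print(f"{result['model']}: {result['token_count']} tokens, "
              f"{result['compression_ratio']:.2f} chars/token")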