salmankhanpm committed on
Commit
a92795e
·
verified ·
1 Parent(s): 5ccf71d

Upload 2 files

Browse files
Files changed (2) hide show
  1. _app.py +79 -146
  2. requirements.txt +68 -1
_app.py CHANGED
@@ -1,150 +1,83 @@
1
- import logging
2
-
3
  import tiktoken
4
  from transformers import AutoTokenizer
5
 
6
- import gradio as gr
7
-
8
- logger = logging.getLogger(__name__) # noqa
9
-
10
-
11
def load_test_phrases(filename):
    """Read a phrase file from the local ./data directory.

    Args:
        filename: Name of the file under ./data (e.g. 'multilingualphrases01.txt').

    Returns:
        list[str]: One entry per line, with line endings stripped.

    Raises:
        FileNotFoundError: If the file does not exist under ./data.
    """
    # Bug fix: the path previously ignored the `filename` argument entirely,
    # so every call opened the same (nonexistent) literal path.
    with open(f"./data/{filename}", "r", encoding="utf-8") as file:
        return file.read().splitlines()
14
-
15
-
16
# Tokenizers under comparison: Hugging Face checkpoints plus OpenAI's
# gpt-4o, which is routed through tiktoken instead of AutoTokenizer.
models = [
    "HuggingFaceTB/SmolLM2-135M-Instruct",           # SmolLM2
    "meta-llama/Llama-3.2-3B-Instruct",              # LLAMA-3
    "Telugu-LLM-Labs/Telugu-Llama2-7B-v0-Instruct",  # LLama 2 Finetuned for Improving Telugu
    "CohereForAI/aya-23-8B",                         # AYA
    "google/gemma-3-4b-it",                          # GEMMA 3
    "sarvamai/sarvam-1",                             # SarvamAI
    "gpt-4o",                                        # GPT4o
    "Qwen/Qwen3-4B",                                 # Qwen
    "TWO/sutra-mlt256-v2",                           # SUTRA
]

# Single-sentence example used by the tokenizer-inspector tab.
test_phrase_set = [
    "ఐదు వాక్యాలలో న్యూట్రాన్ స్కాటరింగ్ గురించి నాకు వివరణ ఇవ్వండి",  # Telugu
]

# Longer multilingual paragraph sets used as examples in the paragraph tab.
test_phrase_set_long_1 = load_test_phrases('multilingualphrases01.txt')
test_phrase_set_long_2 = load_test_phrases('multilingualphrases02.txt')
test_phrase_set_long_3 = load_test_phrases('multilingualphrases03.txt')
33
-
34
-
35
def generate_tokens_as_table(text):
    """Tokenize `text` with every model in `models`.

    Returns:
        list[list[str]]: One row per model, shaped
        [model_name, decoded_token_0, decoded_token_1, ...].
    """
    rows = []
    for model in models:
        # 'gpt' models are OpenAI — tiktoken handles those; everything
        # else is a Hugging Face checkpoint.
        if 'gpt' in model:
            encoder = tiktoken.encoding_for_model(model)
            ids = encoder.encode(text)
        else:
            encoder = AutoTokenizer.from_pretrained(model)
            ids = encoder.encode(text, add_special_tokens=False)
        # Decode each id individually so the row shows the token pieces.
        rows.append([model] + [encoder.decode([tid]) for tid in ids])
    return rows
47
-
48
-
49
def generate_tokenizer_table(text):
    """Compute per-model tokenization statistics for `text`.

    Returns:
        list[list]: One row per model:
        [model, vocab_size, word_count, token_count, tokens_per_word],
        or [] when `text` is empty.
    """
    if not text:
        return []

    # Word count uses a plain space split, matching the ratio's denominator.
    word_count = len(text.split(' '))

    rows = []
    for model in models:
        if 'gpt' in model:
            tok = tiktoken.encoding_for_model(model)
            vocab = tok.n_vocab
        else:
            tok = AutoTokenizer.from_pretrained(model)
            vocab = tok.vocab_size
        n_tokens = len(tok.encode(text))
        rows.append([model, vocab, word_count, n_tokens, f"{n_tokens / word_count:0.2f}"])
    return rows
74
-
75
-
76
def generate_split_token_table(text):
    """Render generate_tokenizer_table() as a gr.Dataframe.

    Args:
        text: The paragraph(s) to tokenize; empty input yields an empty
            Dataframe rather than running every tokenizer.

    Returns:
        gr.Dataframe: One row per model with vocab/word/token statistics.
    """
    if not text:
        return gr.Dataframe()

    table = generate_tokenizer_table(text)
    return gr.Dataframe(
        table,
        headers=['tokenizer', 'v size', '#word', '#token', '#tokens/word'],
        # Bug fix: datatype previously listed 3 entries for 5 columns —
        # one entry per column, matching the row shape produced above.
        datatype=["str", "number", "number", "number", "str"],
        row_count=len(models),
        col_count=(5, "fixed"),
    )
88
-
89
-
90
# UI tab: paragraph-level statistics — vocab size, word/token counts and
# tokens-per-word ratio for each model over the submitted text.
with gr.Blocks() as sutra_token_count:
    gr.Markdown(
        """
        # SUTRA Multilingual Tokenizer Specs & Stats.
        ## Tokenize paragraphs in multiple languages and compare token counts.
        """)
    textbox = gr.Textbox(label="Input Text")
    submit_button = gr.Button("Submit")
    output = gr.Dataframe()
    # Each clickable example is one long paragraph built by joining a
    # multilingual phrase file loaded at module import time.
    examples = [
        [' '.join(test_phrase_set_long_1)],
        [' '.join(test_phrase_set_long_2)],
        [' '.join(test_phrase_set_long_3)],
    ]
    gr.Examples(examples=examples, inputs=[textbox])
    submit_button.click(generate_split_token_table, inputs=[textbox], outputs=[output])
106
-
107
-
108
def generate_tokens_table(text):
    """Render generate_tokens_as_table() as a gr.Dataframe.

    Args:
        text: Sentence to tokenize; columns are the decoded token pieces
            per model.

    Returns:
        gr.Dataframe: One row per model; empty Dataframe for empty input.
    """
    # Robustness/consistency fix: guard empty input like
    # generate_split_token_table does, instead of loading every tokenizer
    # just to produce single-cell rows.
    if not text:
        return gr.Dataframe()

    table = generate_tokens_as_table(text)
    cols = len(table[0])
    return gr.Dataframe(
        table,
        headers=['model'] + [str(i) for i in range(cols - 1)],
        row_count=2,
        col_count=(cols, "fixed"),
    )
117
-
118
-
119
# UI tab: sentence inspector — shows how each model's tokenizer breaks a
# single input sentence into token pieces.
with gr.Blocks() as sutra_tokenize:
    gr.Markdown(
        """
        # SUTRA Multilingual Tokenizer Sentence Inspector.
        ## Tokenize a sentence with various tokenizers and inspect how it's broken down.
        """)
    textbox = gr.Textbox(label="Input Text")
    submit_button = gr.Button("Submit")
    output = gr.Dataframe()
    # Short single-sentence examples defined at module level.
    examples = test_phrase_set
    gr.Examples(examples=examples, inputs=[textbox])
    submit_button.click(generate_tokens_table, inputs=[textbox], outputs=[output])
131
-
132
-
133
if __name__ == '__main__':
    # Compose the two tabs into a single app and serve it.
    with gr.Blocks(analytics_enabled=False) as demo:
        with gr.Row():
            gr.Markdown(
                """
                ## <img src="https://playground.two.ai/sutra.svg" height="20"/>
                """
            )
        with gr.Row():
            gr.TabbedInterface(
                interface_list=[sutra_tokenize, sutra_token_count],
                tab_names=["Tokenize Text", "Tokenize Paragraphs"]
            )

    # Queue caps concurrent tokenization jobs; bind on all interfaces so
    # the app is reachable inside a container.
    demo.queue(default_concurrency_limit=5).launch(
        server_name="0.0.0.0",
        allowed_paths=["/"],
    )
 
 
 
1
  import tiktoken
2
  from transformers import AutoTokenizer
3
 
4
+ # ... existing code ...
5
+ def analyze_tokens_detailed(text, model):
6
+ """
7
+ For a given text and model, returns a list of dicts with details for each token:
8
+ - token string
9
+ - token id
10
+ - decoded value
11
+ - token length
12
+ - NSL value (token length / max token length in sequence)
13
+ - subword fertility (number of tokens per word)
14
+ Also returns the decoded output for the entire sequence.
15
+ """
16
+ # Tokenize
17
+ if 'gpt' in model:
18
+ tokenizer = tiktoken.encoding_for_model(model)
19
+ token_ids = tokenizer.encode(text)
20
+ tokens = [tokenizer.decode([tid]) for tid in token_ids]
21
+ else:
22
+ tokenizer = AutoTokenizer.from_pretrained(model)
23
+ token_ids = tokenizer.encode(text, add_special_tokens=False)
24
+ tokens = [tokenizer.decode([tid]) for tid in token_ids]
25
+
26
+ # Decoded output for the entire sequence
27
+ if 'gpt' in model:
28
+ decoded_output = tokenizer.decode(token_ids)
29
+ else:
30
+ decoded_output = tokenizer.decode(token_ids)
31
+
32
+ # Token lengths
33
+ token_lengths = [len(t) for t in tokens]
34
+ max_token_length = max(token_lengths) if token_lengths else 1
35
+ nsl_values = [l / max_token_length for l in token_lengths]
36
+
37
+ # Subword fertility: number of tokens per word
38
+ # Map each token to its originating word (approximate)
39
+ words = text.split()
40
+ word_token_counts = []
41
+ if len(words) > 0:
42
+ # Use a simple greedy approach: assign tokens to words in order
43
+ import re
44
+ text_pointer = 0
45
+ word_idx = 0
46
+ token_word_map = []
47
+ for token in tokens:
48
+ # Find the next word that matches the start of the token
49
+ while word_idx < len(words) and not text[text_pointer:].startswith(words[word_idx]):
50
+ text_pointer += 1
51
+ if word_idx < len(words):
52
+ token_word_map.append(word_idx)
53
+ text_pointer += len(token)
54
+ if text_pointer >= len(text) or (word_idx + 1 < len(words) and text[text_pointer:].startswith(words[word_idx + 1])):
55
+ word_idx += 1
56
+ else:
57
+ token_word_map.append(-1)
58
+ # Count tokens per word
59
+ from collections import Counter
60
+ fertility_counter = Counter(token_word_map)
61
+ subword_fertility = [fertility_counter[i] for i in range(len(words))]
62
+ # Assign fertility to each token
63
+ token_fertility = [fertility_counter[idx] if idx >= 0 else 0 for idx in token_word_map]
64
+ else:
65
+ token_fertility = [1 for _ in tokens]
66
+
67
+ # Build table
68
  table = []
69
+ for i, (token, tid, decoded, length, nsl, fert) in enumerate(zip(tokens, token_ids, tokens, token_lengths, nsl_values, token_fertility)):
70
+ table.append({
71
+ 'token': token,
72
+ 'token_id': tid,
73
+ 'decoded': decoded,
74
+ 'token_length': length,
75
+ 'nsl': nsl,
76
+ 'subword_fertility': fert
77
+ })
78
+ return {
79
+ 'model': model,
80
+ 'decoded_output': decoded_output,
81
+ 'tokens': table
82
+ }
83
+ # ... existing code ...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,4 +1,71 @@
1
  transformers
2
  tiktoken
3
  gradio
4
- sentencepiece
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  transformers
2
  tiktoken
3
  gradio
4
+ aiofiles==24.1.0
5
+ annotated-types==0.7.0
6
+ anyio==4.9.0
7
+ brotli==1.1.0
8
+ certifi==2025.7.14
9
+ charset-normalizer==3.4.2
10
+ click==8.2.1
11
+ dotenv==0.9.9
12
+ fastapi==0.116.1
13
+ ffmpy==0.6.1
14
+ filelock==3.18.0
15
+ fsspec==2025.7.0
16
+ gradio==5.38.2
17
+ gradio-client==1.11.0
18
+ groovy==0.1.2
19
+ h11==0.16.0
20
+ hf-xet==1.1.5
21
+ httpcore==1.0.9
22
+ httpx==0.28.1
23
+ huggingface-hub==0.34.1
24
+ idna==3.10
25
+ inquirerpy==0.3.4
26
+ jinja2==3.1.6
27
+ markdown-it-py==3.0.0
28
+ markupsafe==3.0.2
29
+ mdurl==0.1.2
30
+ numpy==2.3.2
31
+ orjson==3.11.1
32
+ packaging==25.0
33
+ pandas==2.3.1
34
+ pfzy==0.3.4
35
+ pillow==11.3.0
36
+ prompt-toolkit==3.0.51
37
+ protobuf==6.31.1
38
+ pydantic==2.11.7
39
+ pydantic-core==2.33.2
40
+ pydub==0.25.1
41
+ pygments==2.19.2
42
+ python-dateutil==2.9.0.post0
43
+ python-dotenv==1.1.1
44
+ python-multipart==0.0.20
45
+ pytz==2025.2
46
+ pyyaml==6.0.2
47
+ regex==2024.11.6
48
+ requests==2.32.4
49
+ rich==14.1.0
50
+ ruff==0.12.5
51
+ safehttpx==0.1.6
52
+ safetensors==0.5.3
53
+ semantic-version==2.10.0
54
+ sentencepiece==0.2.0
55
+ shellingham==1.5.4
56
+ six==1.17.0
57
+ sniffio==1.3.1
58
+ starlette==0.47.2
59
+ tiktoken==0.9.0
60
+ tokenizers==0.21.2
61
+ tomlkit==0.13.3
62
+ tqdm==4.67.1
63
+ transformers==4.54.0
64
+ typer==0.16.0
65
+ typing-extensions==4.14.1
66
+ typing-inspection==0.4.1
67
+ tzdata==2025.2
68
+ urllib3==2.5.0
69
+ uvicorn==0.35.0
70
+ wcwidth==0.2.13
71
+ websockets==15.0.1