Spaces:

freddyaboulton
/

gpt-oss-tokenizer-playground

Sleeping

App Files Files Community

freddyaboulton HF Staff committed on Aug 8

Commit

4a104a5

verified ·

1 Parent(s): 52971db

Create app.py

Browse files

Files changed (1) hide show

app.py +141 -0

app.py ADDED Viewed

	@@ -0,0 +1,141 @@

+import gradio as gr
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")
+def tokenize_dialogue(dialogue_data):
+    """
+    Tokenize a dialogue with the GPT-OSS chat template and colour each token.
+
+    Args:
+        dialogue_data: Iterable of dicts. "speaker" (default "user") selects
+            the chat role and "text" (default "") is the message content.
+            Entries whose speaker is not "system", "user" or "assistant" are
+            skipped.
+
+    Returns:
+        A tuple ``(highlighted, count)``: a ``gr.HighlightedText`` carrying one
+        ``(decoded_token, token_id_as_str)`` pair per token plus the colour
+        map, and the total number of tokens.
+
+    Raises:
+        ValueError: If the module-level ``tokenizer`` is ``None``.
+    """
+    if tokenizer is None:
+        raise ValueError("Tokenizer not loaded. Please check your installation.")
+    known_roles = ("system", "user", "assistant")
+    messages = []
+    for message in dialogue_data:
+        role = message.get("speaker", "user")
+        if role in known_roles:
+            messages.append({"role": role, "content": message.get("text", "")})
+    formatted_input = tokenizer.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        return_tensors="np"
+    )
+    # The template output is batched; row 0 is the single conversation.
+    token_ids = formatted_input[0].tolist()
+    decoded_text = []
+    colors = ["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4", "#FFEAA7"]
+    color_map = {}
+    for i, token_id in enumerate(token_ids):
+        # Labels (and therefore colour-map keys) are the token id as a string.
+        label = str(token_id)
+        # Look up by the same string key that is stored, so a repeated token
+        # keeps the colour assigned at its first occurrence.
+        color_map.setdefault(label, colors[i % len(colors)])
+        decoded_text.append((tokenizer.decode([token_id]), label))
+    return gr.HighlightedText(value=decoded_text, color_map=color_map), len(token_ids)
+def create_sample_dialogue():
+    """
+    Build the canned four-turn conversation used as demo input.
+
+    Returns a new list of ``{"speaker": ..., "text": ...}`` dicts on every call.
+    """
+    turns = (
+        ("system", "You are a helpful assistant."),
+        ("user", "Hello! How are you today?"),
+        ("assistant", "I'm doing well, thank you for asking! How can I help you today?"),
+        ("user", "Can you explain what MXFP4 quantization is?"),
+    )
+    return [{"speaker": who, "text": said} for who, said in turns]
+# Page layout: title, a two-column row (input | results), and a help accordion.
+with gr.Blocks(title="GPT-OSS Tokenizer Explorer") as demo:
+    gr.Markdown("# GPT-OSS Tokenizer Explorer")
+    gr.Markdown("Enter a dialogue and see how the GPT-OSS tokenizer processes it. Use the format `speaker: message` in the dialogue component.")
+    with gr.Row():
+        # Left column: dialogue entry plus the sample/clear buttons.
+        with gr.Column(scale=1):
+            gr.Markdown("### Input Dialogue")
+            # The component's own submit button is the only tokenize trigger
+            # in this app; there is no separate "tokenize" button.
+            dialogue_input = gr.Dialogue(
+                speakers=["system", "user", "assistant"],
+                label="Enter your dialogue",
+                placeholder="Type 'system:', 'user:', or 'assistant:' followed by your message",
+                show_submit_button=True,
+                show_copy_button=True,
+                type="dialogue",
+                ui_mode="dialogue-only",
+            )
+            with gr.Row():
+                sample_btn = gr.Button("Load Sample", variant="secondary")
+                clear_btn = gr.Button("Clear", variant="secondary")
+        # Right column: per-token highlighting and the running token total.
+        with gr.Column(scale=1):
+            gr.Markdown("### Tokenization Results")
+            highlighted_output = gr.HighlightedText(
+                label="Tokenized Output",
+                show_inline_category=False
+            )
+            token_count = gr.Label(
+                value="Total Tokens: 0",
+                label="Token Count"
+            )
+    with gr.Accordion("How to use", open=False):
+        gr.Markdown("""
+        ### Instructions:
+        1. **Enter dialogue**: Use the dialogue component to enter conversations
+        2. **Speaker format**: Type `system:`, `user:`, or `assistant:` followed by your message
+        3. **Submit**: Click the submit button in the dialogue box to tokenize the conversation
+        4. **View results**: See the tokenization details in the output area
+        ### Example:
+        ```
+        system: You are a helpful assistant.
+        user: Hello! How are you today?
+        assistant: I'm doing well, thank you for asking!
+        ```
+        ### What you'll see:
+        - **Total tokens**: Number of tokens in the conversation
+        - **Tokenized output**: How the tokenizer formats the conversation
+        """)
+    def process_dialogue(dialogue):
+        """
+        Submit handler: tokenize the dialogue for the two result components.
+
+        Returns a ``(highlighted_value, token_count_text)`` pair. For empty
+        input the highlighted value is a single unlabelled span holding a
+        prompt message and the count text reads "Total Tokens: 0".
+        """
+        if not dialogue:
+            # Exactly two values, matching the two outputs this handler feeds;
+            # the message is wrapped as one (text, no-label) span so the
+            # highlighted-text component receives its usual list-of-pairs shape.
+            return [("Please enter some dialogue first.", None)], "Total Tokens: 0"
+        result_text, token_count_val = tokenize_dialogue(dialogue)
+        return result_text, f"Total Tokens: {token_count_val}"
+    def clear_dialogue():
+        """Return reset values: empty dialogue input, no highlights, zero count."""
+        cleared_input = None
+        cleared_highlights = []
+        return cleared_input, cleared_highlights, "Total Tokens: 0"
+    # "Load Sample" fills the dialogue box with the canned conversation.
+    sample_btn.click(fn=create_sample_dialogue, outputs=[dialogue_input])
+    # "Clear" resets the input, the highlighted tokens and the counter together.
+    clear_btn.click(
+        fn=clear_dialogue,
+        outputs=[dialogue_input, highlighted_output, token_count],
+    )
+    # Submitting the dialogue tokenizes it and refreshes both result components.
+    dialogue_input.submit(
+        fn=process_dialogue,
+        inputs=[dialogue_input],
+        outputs=[highlighted_output, token_count],
+    )
+# Start the Gradio server only when this file is run as a script, not on import.
+if __name__ == "__main__":
+    demo.launch()