Spaces:
Running
Running
fix typo
Browse files — compression_app.py (+2 −2)
compression_app.py
CHANGED
|
@@ -28,7 +28,7 @@ from compression_util import get_compression_leaderboard, common_corpuses
|
|
| 28 |
docs = """## 📖 What is a good tokenizer?
|
| 29 |
|
| 30 |
From a compression perspective, a good tokenizer should be lossless,
|
| 31 |
-
and keep high compression rate (less tokens for given text).
|
| 32 |
The encoding and decoding process can be formulated as
|
| 33 |
```python
|
| 34 |
token_ids = tokenizer.encode(input_text) # compressed tokens
|
|
@@ -144,7 +144,7 @@ with gr.Blocks(theme=theme) as demo:
|
|
| 144 |
|
| 145 |
gr.Markdown("## 🏆 Compression Rate Leaderboard\n"
|
| 146 |
"This leaderboard aims to evaluate tokenizer performance on different languages.\n"
|
| 147 |
-
"Lower `oov_ratio` refers to less out-of-vocabulary tokens.\n"
|
| 148 |
"Lower `char/token` means more words might be segmented into subwords."
|
| 149 |
)
|
| 150 |
search_bar = gr.Textbox(
|
|
|
|
| 28 |
docs = """## 📖 What is a good tokenizer?
|
| 29 |
|
| 30 |
From a compression perspective, a good tokenizer should be lossless,
|
| 31 |
+
and keep high compression rate (fewer tokens for given text).
|
| 32 |
The encoding and decoding process can be formulated as
|
| 33 |
```python
|
| 34 |
token_ids = tokenizer.encode(input_text) # compressed tokens
|
|
|
|
| 144 |
|
| 145 |
gr.Markdown("## 🏆 Compression Rate Leaderboard\n"
|
| 146 |
"This leaderboard aims to evaluate tokenizer performance on different languages.\n"
|
| 147 |
+
"Lower `oov_ratio` refers to fewer out-of-vocabulary tokens.\n"
|
| 148 |
"Lower `char/token` means more words might be segmented into subwords."
|
| 149 |
)
|
| 150 |
search_bar = gr.Textbox(
|