|
|
from sentence_transformers import SentenceTransformer |
|
|
from tokenizers import Encoding, Tokenizer |
|
|
from torch.nn import EmbeddingBag |
|
|
import torch |
|
|
|
|
|
|
|
|
def test_tokenizer():
    """Print the tokens produced for a fixed set of multilingual samples.

    Loads a tokenizer from the relative path ``js/tokenizer.json``, encodes
    every sample string with it, and prints the string, its token list and
    an empty line. The function makes no assertions; it only prints.
    """
    tokenizer: Tokenizer = Tokenizer.from_file("js/tokenizer.json")

    samples = (
        # Latin script, plain and accented
        "This is an example of encoding",
        "The quick brown fox jumps over the lazy dog.",
        "Curaçao, naïve fiancé, jalapeño, déjà vu.",
        # Cyrillic and Greek scripts
        "Привет, как дела?",
        "Бързата кафява лисица прескача мързеливото куче.",
        "Γρήγορη καφέ αλεπού πηδάει πάνω από τον τεμπέλη σκύλο.",
        # Arabic script
        "اللغة العربية جميلة وغنية بالتاريخ.",
        "مرحبا بالعالم!",
        # Han, kana and Hangul
        "Simplified: 快速的棕色狐狸跳过懒狗。",
        "Traditional: 快速的棕色狐狸跳過懶狗。",
        "素早い茶色の狐が怠け者の犬を飛び越える。",
        "コンピュータープログラミング",
        "빠른 갈색 여우가 게으른 개를 뛰어넘습니다.",
        # Devanagari, Bengali, Tamil, Thai and Ethiopic scripts
        "तेज़ भूरी लोमड़ी आलसी कुत्ते के ऊपर कूदती है।",
        "দ্রুত বাদামী শিয়াল অলস কুকুরের উপর দিয়ে লাফ দেয়।",
        "வேகமான பழுப்பு நரி சோம்பேறி நாயின் மேல் குதிக்கிறது.",
        "สุนัขจิ้งจอกสีน้ำตาลกระโดดข้ามสุนัขขี้เกียจ.",
        "ብሩክ ቡናማ ቀበሮ ሰነፍ ውሻን ተዘልሏል።",
        # Several scripts mixed in one string
        "Hello 世界 مرحبا 🌍",
        "123, αβγ, абв, العربية, 中文, हिन्दी.",
    )

    for text in samples:
        encoded: Encoding = tokenizer.encode(text)
        # One call emits the same three lines as separate prints would:
        # the input, its tokens, and a blank separator line.
        print(text, encoded.tokens, "", sep="\n")
|
|
|
|
|
|
|
|
|
|
|
# Load the sentence-transformers model on CPU and take the weight tensor of
# the `embedding` attribute on its first module.
model = SentenceTransformer(
    "sentence-transformers/static-similarity-mrl-multilingual-v1", device="cpu"
)
embedding_bag: EmbeddingBag = model[0].embedding
embeddings = torch.Tensor(embedding_bag.weight)

# Show the table shape and check it against the expected size.
print(embeddings.shape)
assert embeddings.shape == torch.Size([105879, 1024])

# For each element width (4 or 2 bytes) and each dimension count, print the
# size in MiB of a table with the same number of rows.
for dtype_name, bytes_per_value in (("float32", 4), ("float16", 2)):
    print(dtype_name)
    for dim in (1024, 512, 256):
        size_mib = embeddings.shape[0] * dim * bytes_per_value / 1024 / 1024
        print(f" {dim} dim - {size_mib:,.1f} MiB")
|
|
|