File size: 2,930 Bytes
f7fef32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from sentence_transformers import SentenceTransformer
from tokenizers import Encoding, Tokenizer
from torch.nn import EmbeddingBag
import torch


def test_tokenizer():
    """Print the tokenization of a set of multilingual sample sentences.

    The tokenizer definition is read from ``js/tokenizer.json`` (relative to
    the current working directory). For every sample the original text, the
    list of produced tokens, and an empty separator line are written to
    stdout. Nothing is asserted; the output is meant for manual inspection.
    """
    samples = (
        "This is an example of encoding",
        "The quick brown fox jumps over the lazy dog.",
        "Curaçao, naïve fiancé, jalapeño, déjà vu.",
        "Привет, как дела?",
        "Бързата кафява лисица прескача мързеливото куче.",
        "Γρήγορη καφέ αλεπού πηδάει πάνω από τον τεμπέλη σκύλο.",
        "اللغة العربية جميلة وغنية بالتاريخ.",
        "مرحبا بالعالم!",
        "Simplified: 快速的棕色狐狸跳过懒狗。",
        "Traditional: 快速的棕色狐狸跳過懶狗。",
        "素早い茶色の狐が怠け者の犬を飛び越える。",
        "コンピュータープログラミング",
        "빠른 갈색 여우가 게으른 개를 뛰어넘습니다.",
        "तेज़ भूरी लोमड़ी आलसी कुत्ते के ऊपर कूदती है।",
        "দ্রুত বাদামী শিয়াল অলস কুকুরের উপর দিয়ে লাফ দেয়।",
        "வேகமான பழுப்பு நரி சோம்பேறி நாயின் மேல் குதிக்கிறது.",
        "สุนัขจิ้งจอกสีน้ำตาลกระโดดข้ามสุนัขขี้เกียจ.",
        "ብሩክ ቡናማ ቀበሮ ሰነፍ ውሻን ተዘልሏል።",
        "Hello 世界 مرحبا 🌍",
        "123, αβγ, абв, العربية, 中文, हिन्दी.",
    )

    tok: Tokenizer = Tokenizer.from_file("js/tokenizer.json")

    for text in samples:
        encoded: Encoding = tok.encode(text)
        # One call emits the same three lines: text, token list, blank line.
        print(text, encoded.tokens, "", sep="\n")


# Load the static embedding model on CPU and report how much memory its
# embedding table would occupy at several widths and precisions.
# https://huggingface.co/sentence-transformers/static-similarity-mrl-multilingual-v1
model = SentenceTransformer(
    "sentence-transformers/static-similarity-mrl-multilingual-v1",
    device="cpu",
)

# The first module of the model exposes its lookup table as `.embedding`.
embedding_bag: EmbeddingBag = model[0].embedding  # type: ignore
embeddings = torch.Tensor(embedding_bag.weight)

print(embeddings.shape)
# Sanity check: the script expects exactly this table shape.
assert embeddings.shape == torch.Size([105879, 1024])

# Size in MiB = rows * columns * bytes-per-value / 1024 / 1024, printed for
# 4-byte (float32) and 2-byte (float16) values at 1024, 512 and 256 columns.
for _label, _bytes_per_value in (("float32", 4), ("float16", 2)):
    print(_label)
    for _dim in (1024, 512, 256):
        _mib = embeddings.shape[0] * _dim * _bytes_per_value / 1024 / 1024
        # Right-align the width to four characters so the columns line up.
        print(f"  {_dim:>4} dim - {_mib:,.1f} MiB")