from dataclasses import dataclass
import shutil
from textwrap import dedent, indent
from typing import Any
import numpy as np
from zstandard import ZstdCompressor
from pathlib import Path
import io
from sentence_transformers import SentenceTransformer
from torch.nn import EmbeddingBag
import torch
from model2vec import StaticModel
from tokenizers import Encoding, Tokenizer
models_path = Path("models")
@dataclass
class ModelCard:
owner: str
repo: str
    # The dimensions that Matryoshka loss was applied to during training.
matroyshka_dims: list[int]
description: str
license: str
def name(self):
return f"{self.owner}/{self.repo}"
def path(self):
return models_path / self.owner / self.repo
def get_description(self):
return dedent(self.description).strip()
def zst_compress_file(input: Path):
cctx = ZstdCompressor()
output = input.parent / f"{input.name}.zst"
print(f"Compressing {output}")
with open(input, "rb") as fin, open(output, "wb") as fout:
cctx.copy_stream(fin, fout)
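

# A possible counterpart for consumers of the exported .zst artifacts. This is an
# illustrative sketch only (it is not called anywhere in this script) and assumes the
# consumer wants the decompressed file written next to the compressed one.
def zst_decompress_file(input: Path) -> Path:
    # Local import so the sketch stays self-contained.
    from zstandard import ZstdDecompressor

    dctx = ZstdDecompressor()
    # e.g. "fp16.d256.npy.zst" -> "fp16.d256.npy": drop only the trailing .zst suffix.
    output = input.parent / input.stem
    print(f"Decompressing {output}")
    with open(input, "rb") as fin, open(output, "wb") as fout:
        dctx.copy_stream(fin, fout)
    return output
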
def save_data(path: Path, tensor: torch.Tensor):
"""Writes out the static embeddings to a .npy and .npy.zst file"""
buffer = io.BytesIO()
if tensor.dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
# Store as the raw bytes.
np.save(buffer, tensor.detach().view(torch.uint8).numpy())
else:
np.save(buffer, tensor.detach().numpy())
print(f"Saving {path}")
    with open(path, "wb") as outfile:
outfile.write(buffer.getvalue())
zst_compress_file(path)
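

# Illustrative sketch of how a consumer might load the exported .npy files back into
# torch tensors. It is not used by this script, and the dtype handling is an assumption
# based on how save_data stores the fp8 variants as raw uint8 bytes. For example:
# load_embeddings(Path("models/minishlab/potion-retrieval-32M/fp8_e5m2.d256.npy"), torch.float8_e5m2)
def load_embeddings(path: Path, dtype: torch.dtype = torch.float32) -> torch.Tensor:
    array = np.load(path)
    tensor = torch.from_numpy(array)
    if dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
        # The fp8 files hold raw bytes, so reinterpret the uint8 values as float8.
        tensor = tensor.view(dtype)
    return tensor
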
def quantization_loss_mse(tensor: torch.Tensor, dtype: torch.dtype):
"""
Compute reconstruction loss when converting embeddings to a datatype and back using
the mean squared error, which punishes big errors more than small ones.
"""
# Original → quantize → dequantize
roundtrip = tensor.detach().to(dtype).to(tensor.dtype)
# Mean squared error
return torch.mean((tensor - roundtrip) ** 2).item()
def quantization_loss_mae(tensor: torch.Tensor, dtype: torch.dtype):
"""
Compute reconstruction loss when converting embeddings to a datatype and back using
the mean absolute error, which is less sensitive to outliers than MSE.
"""
# Original → quantize → dequantize
roundtrip = tensor.detach().to(dtype).to(tensor.dtype)
# Mean absolute error
return torch.mean(torch.abs(tensor - roundtrip)).item()
def quantization_loss_cosine(tensor: torch.Tensor, dtype: torch.dtype):
"""
Compute reconstruction loss when converting embeddings to a datatype and back using
cosine similarity. This measures whether the embedding directions are preserved
after quantization, independent of their magnitudes.
"""
# Original → quantize → dequantize
roundtrip = tensor.detach().to(dtype).to(tensor.dtype)
# Flatten both to 2D (num_vectors, dimensions) in case tensor is 1D or higher-D
if tensor.ndim == 1:
orig = tensor.unsqueeze(0)
recon = roundtrip.unsqueeze(0)
else:
orig = tensor.view(tensor.shape[0], -1)
recon = roundtrip.view(roundtrip.shape[0], -1)
# Cosine similarity per vector, then average
cos = torch.nn.functional.cosine_similarity(orig, recon, dim=1)
return cos.mean().item()
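

# A minimal usage sketch comparing the three per-vector metrics above across the exported
# precisions, e.g. print_quantization_losses(torch.randn(1000, 256)). It is illustrative
# only and is not wired into the export pipeline below.
def print_quantization_losses(tensor: torch.Tensor) -> None:
    for dtype in (torch.float16, torch.float8_e4m3fn, torch.float8_e5m2):
        print(
            f"{dtype}: "
            f"cosine={quantization_loss_cosine(tensor, dtype):.5f} "
            f"mse={quantization_loss_mse(tensor, dtype):.5f} "
            f"mae={quantization_loss_mae(tensor, dtype):.5f}"
        )
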
def export_embeddings(model_card: ModelCard, embeddings: torch.Tensor) -> None:
vocab_size, dimensions = embeddings.shape
    # This logic can be adjusted if a model ships embeddings with a different shape or dtype.
assert (
embeddings.dtype == torch.float32
), f"The embeddings {embeddings.dtype} are assumed to be float32."
for dim in model_card.matroyshka_dims:
assert (
dim <= dimensions
), f"The Matroyshka dimensions {dim} were bigger than the models dimensions of {dimensions}"
truncated = embeddings[:, :dim]
assert truncated.shape == torch.Size([vocab_size, dim])
save_data(model_card.path() / f"fp32.d{dim}.npy", truncated)
save_data(
model_card.path() / f"fp16.d{dim}.npy",
truncated.to(dtype=torch.float16),
)
save_data(
model_card.path() / f"fp8_e5m2.d{dim}.npy",
truncated.to(dtype=torch.float8_e5m2),
)
save_data(
model_card.path() / f"fp8_e4m3.d{dim}.npy",
truncated.to(dtype=torch.float8_e4m3fn),
)
def normalized_mean_pooling(x: torch.Tensor) -> torch.Tensor:
pooled = x.mean(dim=0)
normalized = torch.nn.functional.normalize(pooled, dim=0)
return normalized
def export_readme(
model_card: ModelCard,
embeddings: torch.Tensor,
tokenizer: Tokenizer,
):
vocab_size, dimensions = embeddings.shape
norms = torch.norm(embeddings, dim=1) # shape: [vocab_size]
phrases = [
"The committee approved the proposal after hours of heated discussion and several last-minute amendments."
"When training large neural networks, careful tuning of hyperparameters can significantly affect performance and stability."
"Despite the heavy rain, the concert continued as planned and the crowd stayed enthusiastic until the final encore."
"In ancient mythology, heroes often embarked on perilous journeys to discover hidden truths about themselves and their world."
"The new smartphone model features an improved camera system, faster processing, and extended battery life compared to its predecessor."
"He tried to explain the concept using simple analogies, but the underlying mathematics remained difficult to grasp for most listeners."
"After weeks of negotiations, the two countries signed a historic trade agreement aimed at reducing tariffs and boosting cooperation."
"She paused for a moment before answering, choosing her words carefully to avoid misunderstanding in such a delicate situation."
"The detective pieced together the timeline of events, realizing that the key witness had provided a contradictory statement."
"Remote work has changed the way teams collaborate, with online tools replacing traditional office routines and in-person meetings."
]
cosine_similarity = {
torch.float16: [],
torch.float8_e4m3fn: [],
torch.float8_e5m2: [],
}
for phrase in phrases:
encoding: Encoding = tokenizer.encode(phrase)
embedded_phrase = embeddings[torch.tensor(encoding.ids, dtype=torch.long)]
for dtype in cosine_similarity.keys():
pooling_unquantized = normalized_mean_pooling(embedded_phrase)
pooling_roundtrip = normalized_mean_pooling(
embedded_phrase.to(dtype).to(torch.float32)
)
cosine = torch.dot(pooling_unquantized, pooling_roundtrip).item()
cosine_similarity[dtype].append(cosine)
avg_cosine_similarity = {
dtype: sum(values) / len(values) for dtype, values in cosine_similarity.items()
}
tokenizer_examples = []
for text in [
"This is an example of encoding",
"The quick brown fox jumps over the lazy dog.",
"Curaçao, naïve fiancé, jalapeño, déjà vu.",
"Привет, как дела?",
"Бързата кафява лисица прескача мързеливото куче.",
"Γρήγορη καφέ αλεπού πηδάει πάνω από τον τεμπέλη σκύλο.",
"اللغة العربية جميلة وغنية بالتاريخ.",
"مرحبا بالعالم!",
"Simplified: 快速的棕色狐狸跳过懒狗。",
"Traditional: 快速的棕色狐狸跳過懶狗。",
"素早い茶色の狐が怠け者の犬を飛び越える。",
"コンピュータープログラミング",
"빠른 갈색 여우가 게으른 개를 뛰어넘습니다.",
"तेज़ भूरी लोमड़ी आलसी कुत्ते के ऊपर कूदती है।",
"দ্রুত বাদামী শিয়াল অলস কুকুরের উপর দিয়ে লাফ দেয়।",
"வேகமான பழுப்பு நரி சோம்பேறி நாயின் மேல் குதிக்கிறது.",
"สุนัขจิ้งจอกสีน้ำตาลกระโดดข้ามสุนัขขี้เกียจ.",
"ብሩክ ቡናማ ቀበሮ ሰነፍ ውሻን ተዘልሏል።",
"Hello 世界 مرحبا 🌍",
"123, αβγ, абв, العربية, 中文, हिन्दी.",
]:
encoding = tokenizer.encode(text)
tokens = [f"`{token}`" for token in encoding.tokens]
tokenizer_examples.append(f"**Input:** {text}<br/>")
tokenizer_examples.append(f"**Tokens**: {' '.join(tokens)}")
tokenizer_examples.append("")
tokenizer_output = "\n".join(tokenizer_examples)
with (model_card.path() / "README.md").open("wt") as file:
prefix = " "
file.write(
dedent(
f"""
# [{model_card.name()}](https://huggingface.co/{model_card.name()})
License: [{model_card.license}](https://choosealicense.com/licenses/{model_card.license}/)
{indent(model_card.get_description(), prefix).strip()}
## Model Stats
Stats that describe the embeddings tensor shapes and value distribution.
| item | metric | value |
| --------------| ----------------------- | ----- |
| vocab | size | {vocab_size:,.0f} |
| embedding | dimensions | {dimensions:,.0f} |
| vector length | mean | {norms.mean().item():.2f} |
| vector length | median | {norms.median().item():.2f} |
| vector length | stddev | {norms.std().item():.2f} |
| values | mean | {embeddings.mean().item():.2f} |
| values | median | {embeddings.median().item():.2f} |
| values | stddev | {embeddings.std().item():.2f} |
## Mean Pooled Quantization Loss
This test roundtrips the vectors through quantization, but performs the
mean pooling arithmetic in float32 space. The quantized and unquantized
mean pooled vectors are compared to each other to determine their cosine
similarity, to show how much the meaning of the vector has changed due
to quantization.
| Precision | Cosine Similarity |
| ------------- | ----------------- |
| fp16 | {avg_cosine_similarity[torch.float16]:.5f} |
| fp8 e4m3 | {avg_cosine_similarity[torch.float8_e4m3fn]:.5f} |
| fp8 e5m2 | {avg_cosine_similarity[torch.float8_e5m2]:.5f} |
## Quantization Loss Per Vector
While ultimately the embedding vectors will be mean pooled together, it's
still useful to look at the loss per-vector in the embedding table to see
which quantization strategies retain the most vector meaning.
- **Cosine Similarity** — measures how well the *direction* of embedding vectors
is preserved after quantization, independent of scale. This is especially
relevant when embeddings are used for similarity search or retrieval.
- **MSE (Mean Squared Error)** — emphasizes large errors by squaring the
differences. Useful for detecting whether any values are badly distorted.
- **MAE (Mean Absolute Error)** — the average absolute difference between
original and quantized values. Easier to interpret, less sensitive to outliers.
| Precision | Metric | Value |
| ------------- | ------ | ----- |
| fp16 | cosine similarity | {quantization_loss_cosine(embeddings, torch.float16):.5f} |
| fp8 e4m3 | cosine similarity | {quantization_loss_cosine(embeddings, torch.float8_e4m3fn):.5f} |
| fp8 e5m2 | cosine similarity | {quantization_loss_cosine(embeddings, torch.float8_e5m2):.5f} |
| fp16 | MSE | {quantization_loss_mse(embeddings, torch.float16):.5f} |
| fp8 e4m3 | MSE | {quantization_loss_mse(embeddings, torch.float8_e4m3fn):.5f} |
| fp8 e5m2 | MSE | {quantization_loss_mse(embeddings, torch.float8_e5m2):.5f} |
| fp16 | MAE | {quantization_loss_mae(embeddings, torch.float16):.5f} |
| fp8 e4m3 | MAE | {quantization_loss_mae(embeddings, torch.float8_e4m3fn):.5f} |
| fp8 e5m2 | MAE | {quantization_loss_mae(embeddings, torch.float8_e5m2):.5f} |
## Tokenizer Examples
{indent(tokenizer_output, prefix).strip()}
"""
).strip()
)
def export_tokenizer(model_card: ModelCard, tokenizer: Tokenizer) -> None:
tokenizer_path = model_card.path() / "tokenizer.json"
print(f"Exporting tokenizer: {tokenizer_path}")
tokenizer.save(str(tokenizer_path))
zst_compress_file(tokenizer_path)
def export_sentence_transformers(model_card: ModelCard) -> None:
"""Extract the embeddings and tokenizer from SentenceTransformers"""
print("Processing", model_card.name())
model = SentenceTransformer(model_card.name(), device="cpu")
embedding_bag: EmbeddingBag = model[0].embedding # type: ignore
model_card.path().mkdir(exist_ok=True, parents=True)
    embeddings = embedding_bag.weight.detach().clone()
export_embeddings(model_card, embeddings)
export_tokenizer(model_card, model.tokenizer)
export_readme(model_card, embeddings, model.tokenizer)
def export_model2vec(model_card: ModelCard) -> None:
"""Extract the embeddings and tokenizer from model2vec"""
print("Processing", model_card.name())
model = StaticModel.from_pretrained(model_card.name())
model_card.path().mkdir(exist_ok=True, parents=True)
embeddings = torch.from_numpy(model.embedding)
export_embeddings(model_card, embeddings)
export_tokenizer(model_card, model.tokenizer)
export_readme(model_card, embeddings, model.tokenizer)
def main() -> None:
# Static embedders that use sentence_transformers models.
sentence_transformers_models = [
ModelCard(
owner="sentence-transformers",
repo="static-similarity-mrl-multilingual-v1",
description="""
            Multilingual similarity embeddings that were trained with Matryoshka loss,
            which allows for more effective truncation of the embedding vectors. The
            model was trained on multilingual datasets from a variety of domains.
            It's a general-purpose model that can be used for semantic textual similarity,
            paraphrase mining, text classification, clustering, and more.
""",
matroyshka_dims=[32, 64, 128, 256, 512, 1024],
license="apache-2.0",
),
ModelCard(
owner="sentence-transformers",
repo="static-retrieval-mrl-en-v1",
description="""
            English-only uncased similarity embeddings that were trained with Matryoshka
            loss, which allows for more effective truncation of the embedding vectors.
            The model was trained on monolingual datasets from a variety of domains and
            was designed specifically for similarity retrieval.
""",
matroyshka_dims=[32, 64, 128, 256, 512, 1024],
license="apache-2.0",
),
]
# Static embedders that use model2vec.
model2vec_models = [
ModelCard(
owner="minishlab",
repo="potion-multilingual-128M",
            # These are assumed, as there is no Python reference implementation:
matroyshka_dims=[32, 64, 128, 256],
description="""
            A multilingual embedder. The details of how it was trained are a bit scant,
            as there is no source code for it. However, its architecture is likely close
            to the potion-retrieval-32M model, but trained on Common Crawl data.
            The 128M refers to the number of parameters in the embeddings:
            256 dimensions * 500,353 vocab.
""",
license="mit",
),
ModelCard(
owner="minishlab",
repo="potion-retrieval-32M",
matroyshka_dims=[32, 64, 128, 256, 512],
description="""
            The token embeddings from a monolingual English 32M parameter model that was
            distilled from embeddings initialized from the multi-domain
            [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model.
            The 32M refers to the number of parameters in the embeddings:
            512 dimensions * 63,091 vocab.
""",
license="mit",
),
]
if models_path.exists():
print(f"Removing the old models folder: {models_path}")
shutil.rmtree(models_path)
models_path.mkdir()
for model_card in sentence_transformers_models:
export_sentence_transformers(model_card)
for model_card in model2vec_models:
export_model2vec(model_card)
if __name__ == "__main__":
main()