from dataclasses import dataclass import shutil from textwrap import dedent, indent from typing import Any import numpy as np from zstandard import ZstdCompressor from pathlib import Path import io from sentence_transformers import SentenceTransformer from torch.nn import EmbeddingBag import torch from model2vec import StaticModel from tokenizers import Encoding, Tokenizer models_path = Path("models") @dataclass class ModelCard: owner: str repo: str # The dimensions that were applied with Matroyshka Loss. matroyshka_dims: list[int] description: str license: str def name(self): return f"{self.owner}/{self.repo}" def path(self): return models_path / self.owner / self.repo def get_description(self): return dedent(self.description).strip() def zst_compress_file(input: Path): cctx = ZstdCompressor() output = input.parent / f"{input.name}.zst" print(f"Compressing {output}") with open(input, "rb") as fin, open(output, "wb") as fout: cctx.copy_stream(fin, fout) def save_data(path: Path, tensor: torch.Tensor): """Writes out the static embeddings to a .npy and .npy.zst file""" buffer = io.BytesIO() if tensor.dtype in (torch.float8_e4m3fn, torch.float8_e5m2): # Store as the raw bytes. np.save(buffer, tensor.detach().view(torch.uint8).numpy()) else: np.save(buffer, tensor.detach().numpy()) print(f"Saving {path}") with (open(path, "wb") as outfile,): outfile.write(buffer.getvalue()) zst_compress_file(path) def quantization_loss_mse(tensor: torch.Tensor, dtype: torch.dtype): """ Compute reconstruction loss when converting embeddings to a datatype and back using the mean squared error, which punishes big errors more than small ones. """ # Original → quantize → dequantize roundtrip = tensor.detach().to(dtype).to(tensor.dtype) # Mean squared error return torch.mean((tensor - roundtrip) ** 2).item() def quantization_loss_mae(tensor: torch.Tensor, dtype: torch.dtype): """ Compute reconstruction loss when converting embeddings to a datatype and back using the mean absolute error, which is less sensitive to outliers than MSE. """ # Original → quantize → dequantize roundtrip = tensor.detach().to(dtype).to(tensor.dtype) # Mean absolute error return torch.mean(torch.abs(tensor - roundtrip)).item() def quantization_loss_cosine(tensor: torch.Tensor, dtype: torch.dtype): """ Compute reconstruction loss when converting embeddings to a datatype and back using cosine similarity. This measures whether the embedding directions are preserved after quantization, independent of their magnitudes. """ # Original → quantize → dequantize roundtrip = tensor.detach().to(dtype).to(tensor.dtype) # Flatten both to 2D (num_vectors, dimensions) in case tensor is 1D or higher-D if tensor.ndim == 1: orig = tensor.unsqueeze(0) recon = roundtrip.unsqueeze(0) else: orig = tensor.view(tensor.shape[0], -1) recon = roundtrip.view(roundtrip.shape[0], -1) # Cosine similarity per vector, then average cos = torch.nn.functional.cosine_similarity(orig, recon, dim=1) return cos.mean().item() def export_embeddings(model_card: ModelCard, embeddings: torch.Tensor) -> None: vocab_size, dimensions = embeddings.shape # This logic can always be adjusted for models with different shapes. assert ( embeddings.dtype == torch.float32 ), f"The embeddings {embeddings.dtype} are assumed to be float32." for dim in model_card.matroyshka_dims: assert ( dim <= dimensions ), f"The Matroyshka dimensions {dim} were bigger than the models dimensions of {dimensions}" truncated = embeddings[:, :dim] assert truncated.shape == torch.Size([vocab_size, dim]) save_data(model_card.path() / f"fp32.d{dim}.npy", truncated) save_data( model_card.path() / f"fp16.d{dim}.npy", truncated.to(dtype=torch.float16), ) save_data( model_card.path() / f"fp8_e5m2.d{dim}.npy", truncated.to(dtype=torch.float8_e5m2), ) save_data( model_card.path() / f"fp8_e4m3.d{dim}.npy", truncated.to(dtype=torch.float8_e4m3fn), ) def normalized_mean_pooling(x: torch.Tensor) -> torch.Tensor: pooled = x.mean(dim=0) normalized = torch.nn.functional.normalize(pooled, dim=0) return normalized def export_readme( model_card: ModelCard, embeddings: torch.Tensor, tokenizer: Tokenizer, ): vocab_size, dimensions = embeddings.shape norms = torch.norm(embeddings, dim=1) # shape: [vocab_size] phrases = [ "The committee approved the proposal after hours of heated discussion and several last-minute amendments." "When training large neural networks, careful tuning of hyperparameters can significantly affect performance and stability." "Despite the heavy rain, the concert continued as planned and the crowd stayed enthusiastic until the final encore." "In ancient mythology, heroes often embarked on perilous journeys to discover hidden truths about themselves and their world." "The new smartphone model features an improved camera system, faster processing, and extended battery life compared to its predecessor." "He tried to explain the concept using simple analogies, but the underlying mathematics remained difficult to grasp for most listeners." "After weeks of negotiations, the two countries signed a historic trade agreement aimed at reducing tariffs and boosting cooperation." "She paused for a moment before answering, choosing her words carefully to avoid misunderstanding in such a delicate situation." "The detective pieced together the timeline of events, realizing that the key witness had provided a contradictory statement." "Remote work has changed the way teams collaborate, with online tools replacing traditional office routines and in-person meetings." ] cosine_similarity = { torch.float16: [], torch.float8_e4m3fn: [], torch.float8_e5m2: [], } for phrase in phrases: encoding: Encoding = tokenizer.encode(phrase) embedded_phrase = embeddings[torch.tensor(encoding.ids, dtype=torch.long)] for dtype in cosine_similarity.keys(): pooling_unquantized = normalized_mean_pooling(embedded_phrase) pooling_roundtrip = normalized_mean_pooling( embedded_phrase.to(dtype).to(torch.float32) ) cosine = torch.dot(pooling_unquantized, pooling_roundtrip).item() cosine_similarity[dtype].append(cosine) avg_cosine_similarity = { dtype: sum(values) / len(values) for dtype, values in cosine_similarity.items() } tokenizer_examples = [] for text in [ "This is an example of encoding", "The quick brown fox jumps over the lazy dog.", "Curaçao, naïve fiancé, jalapeño, déjà vu.", "Привет, как дела?", "Бързата кафява лисица прескача мързеливото куче.", "Γρήγορη καφέ αλεπού πηδάει πάνω από τον τεμπέλη σκύλο.", "اللغة العربية جميلة وغنية بالتاريخ.", "مرحبا بالعالم!", "Simplified: 快速的棕色狐狸跳过懒狗。", "Traditional: 快速的棕色狐狸跳過懶狗。", "素早い茶色の狐が怠け者の犬を飛び越える。", "コンピュータープログラミング", "빠른 갈색 여우가 게으른 개를 뛰어넘습니다.", "तेज़ भूरी लोमड़ी आलसी कुत्ते के ऊपर कूदती है।", "দ্রুত বাদামী শিয়াল অলস কুকুরের উপর দিয়ে লাফ দেয়।", "வேகமான பழுப்பு நரி சோம்பேறி நாயின் மேல் குதிக்கிறது.", "สุนัขจิ้งจอกสีน้ำตาลกระโดดข้ามสุนัขขี้เกียจ.", "ብሩክ ቡናማ ቀበሮ ሰነፍ ውሻን ተዘልሏል።", "Hello 世界 مرحبا 🌍", "123, αβγ, абв, العربية, 中文, हिन्दी.", ]: encoding = tokenizer.encode(text) tokens = [f"`{token}`" for token in encoding.tokens] tokenizer_examples.append(f"**Input:** {text}
") tokenizer_examples.append(f"**Tokens**: {' '.join(tokens)}") tokenizer_examples.append("") tokenizer_output = "\n".join(tokenizer_examples) with (model_card.path() / "README.md").open("wt") as file: prefix = " " file.write( dedent( f""" # [{model_card.name()}](https://huggingface.co/{model_card.name()}) License: [{model_card.license}](https://choosealicense.com/licenses/{model_card.license}/) {indent(model_card.get_description(), prefix).strip()} ## Model Stats Stats that describe the embeddings tensor shapes and value distribution. | item | metric | value | | --------------| ----------------------- | ----- | | vocab | size | {vocab_size:,.0f} | | embedding | dimensions | {dimensions:,.0f} | | vector length | mean | {norms.mean().item():.2f} | | vector length | median | {norms.median().item():.2f} | | vector length | stddev | {norms.std().item():.2f} | | values | mean | {embeddings.mean().item():.2f} | | values | median | {embeddings.median().item():.2f} | | values | stddev | {embeddings.std().item():.2f} | ## Mean Pooled Quantization Loss This test roundtrips the vectors through quantization, but performs the mean pooling arithmetic in float32 space. The quantized and unquantized mean pooled vectors are compared to each other to determine their cosine similarity, to show how much the meaning of the vector has changed due to quantization. | Precision | Cosine Similarity | | ------------- | ----------------- | | fp16 | {avg_cosine_similarity[torch.float16]:.5f} | | fp8 e4m3 | {avg_cosine_similarity[torch.float8_e4m3fn]:.5f} | | fp8 e5m2 | {avg_cosine_similarity[torch.float8_e5m2]:.5f} | ## Quantization Loss Per Vector While ultimately the embedding vectors will be mean pooled together, it's still useful to look at the loss per-vector in the embedding table to see which quantization strategies retain the most vector meaning. - **Cosine Similarity** — measures how well the *direction* of embedding vectors is preserved after quantization, independent of scale. This is especially relevant when embeddings are used for similarity search or retrieval. - **MSE (Mean Squared Error)** — emphasizes large errors by squaring the differences. Useful for detecting whether any values are badly distorted. - **MAE (Mean Absolute Error)** — the average absolute difference between original and quantized values. Easier to interpret, less sensitive to outliers. | Precision | Metric | Value | | ------------- | ------ | ----- | | fp16 | cosine similarity | {quantization_loss_cosine(embeddings, torch.float16):.5f} | | fp8 e4m3 | cosine similarity | {quantization_loss_cosine(embeddings, torch.float8_e4m3fn):.5f} | | fp8 e5m2 | cosine similarity | {quantization_loss_cosine(embeddings, torch.float8_e5m2):.5f} | | fp16 | MSE | {quantization_loss_mse(embeddings, torch.float16):.5f} | | fp8 e4m3 | MSE | {quantization_loss_mse(embeddings, torch.float8_e4m3fn):.5f} | | fp8 e5m2 | MSE | {quantization_loss_mse(embeddings, torch.float8_e5m2):.5f} | | fp16 | MAE | {quantization_loss_mae(embeddings, torch.float16):.5f} | | fp8 e4m3 | MAE | {quantization_loss_mae(embeddings, torch.float8_e4m3fn):.5f} | | fp8 e5m2 | MAE | {quantization_loss_mae(embeddings, torch.float8_e5m2):.5f} | ## Tokenizer Examples {indent(tokenizer_output, prefix).strip()} """ ).strip() ) def export_tokenizer(model_card: ModelCard, tokenizer: Tokenizer) -> None: tokenizer_path = model_card.path() / "tokenizer.json" print(f"Exporting tokenizer: {tokenizer_path}") tokenizer.save(str(tokenizer_path)) zst_compress_file(tokenizer_path) def export_sentence_transformers(model_card: ModelCard) -> None: """Extract the embeddings and tokenizer from SentenceTransformers""" print("Processing", model_card.name()) model = SentenceTransformer(model_card.name(), device="cpu") embedding_bag: EmbeddingBag = model[0].embedding # type: ignore model_card.path().mkdir(exist_ok=True, parents=True) embeddings = torch.Tensor(embedding_bag.weight) export_embeddings(model_card, embeddings) export_tokenizer(model_card, model.tokenizer) export_readme(model_card, embeddings, model.tokenizer) def export_model2vec(model_card: ModelCard) -> None: """Extract the embeddings and tokenizer from model2vec""" print("Processing", model_card.name()) model = StaticModel.from_pretrained(model_card.name()) model_card.path().mkdir(exist_ok=True, parents=True) embeddings = torch.from_numpy(model.embedding) export_embeddings(model_card, embeddings) export_tokenizer(model_card, model.tokenizer) export_readme(model_card, embeddings, model.tokenizer) def main() -> None: # Static embedders that use sentence_transformers models. sentence_transformers_models = [ ModelCard( owner="sentence-transformers", repo="static-similarity-mrl-multilingual-v1", description=""" Multi-lingual similarity embeddings that were trained with Matroyshka loss that allows for more effective truncation of the embedding vectors. It was trained on a variety of domains of multilingual datasets. It's a general purpose model that can be used for semantic textual similarity, paraphrase mining, text classification, clustering, and more """, matroyshka_dims=[32, 64, 128, 256, 512, 1024], license="apache-2.0", ), ModelCard( owner="sentence-transformers", repo="static-retrieval-mrl-en-v1", description=""" English-only uncased similarity embeddings that were trained with Matroyshka loss that allows for more effective truncation of the embedding vectors. It was trained on a variety of domains of monolingual datasets. I was designed specifically for similarity retrieval. """, matroyshka_dims=[32, 64, 128, 256, 512, 1024], license="apache-2.0", ), ] # Static embedders that use model2vec. model2vec_models = [ ModelCard( owner="minishlab", repo="potion-multilingual-128M", # These are assumed as their is no python reference implementation: matroyshka_dims=[32, 64, 128, 256], description=""" A multilingual embedder. The details are a bit scant on how it's trained as there is no source code for it. However, it's likely a close architecture to the potion-retrieval-32M model, but trained on Common Crawl data. The 128M references the number of parameters in the embeddings: 256 dimensions * 500,353 vocab. """, license="mit", ), ModelCard( owner="minishlab", repo="potion-retrieval-32M", matroyshka_dims=[32, 64, 128, 256, 512], description=""" The token embeddings from a monolingual English 32M parameter model that was distilled from embeddings that were initialized from the the multi-domain [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) The 32M references the number of parameters in the embeddings: 512 dimension * 63,091 vocab. """, license="mit", ), ] if models_path.exists(): print(f"Removing the old models folder: {models_path}") shutil.rmtree(models_path) models_path.mkdir() for model_card in sentence_transformers_models: export_sentence_transformers(model_card) for model_card in model2vec_models: export_model2vec(model_card) if __name__ == "__main__": main()