static-embeddings / example.mjs
gregtatum's picture
Add the initial models
f7fef32
import { pipeline, AutoTokenizer, AutoModel, TokenizerModel, PreTrainedTokenizer } from '@huggingface/transformers';
import fs from 'node:fs/promises';
import { constants } from 'node:fs';
import path from 'path';
import { fileURLToPath } from 'url';
// Directory that contains this module file, derived from the module's own URL.
const moduleFilePath = fileURLToPath(import.meta.url);
const DIR = path.dirname(moduleFilePath);

// Kick off the example as soon as the module is evaluated.
await main();
/**
 * Tokenize a list of sample sentences in many scripts and print the tokens.
 *
 * Obtains the tokenizer definition through ensureTokenizerJson, constructs a
 * PreTrainedTokenizer from it (with an empty object as the second constructor
 * argument), and logs the result of `tokenizer.tokenize` for every sample.
 */
async function main() {
  const tokenizerUrl = "https://huggingface.co/sentence-transformers/static-similarity-mrl-multilingual-v1/resolve/main/0_StaticEmbedding/tokenizer.json";
  const tokenizerJson = await ensureTokenizerJson(tokenizerUrl);
  const tokenizer = new PreTrainedTokenizer(tokenizerJson, {});

  const samples = [
    "This is an example of encoding",
    "The quick brown fox jumps over the lazy dog.",
    "Curaçao, naïve fiancé, jalapeño, déjà vu.",
    "Привет, как дела?",
    "Бързата кафява лисица прескача мързеливото куче.",
    "Γρήγορη καφέ αλεπού πηδάει πάνω από τον τεμπέλη σκύλο.",
    "اللغة العربية جميلة وغنية بالتاريخ.",
    "مرحبا بالعالم!",
    "Simplified: 快速的棕色狐狸跳过懒狗。",
    "Traditional: 快速的棕色狐狸跳過懶狗。",
    "素早い茶色の狐が怠け者の犬を飛び越える。",
    "コンピュータープログラミング",
    "빠른 갈색 여우가 게으른 개를 뛰어넘습니다.",
    "तेज़ भूरी लोमड़ी आलसी कुत्ते के ऊपर कूदती है।",
    "দ্রুত বাদামী শিয়াল অলস কুকুরের উপর দিয়ে লাফ দেয়।",
    "வேகமான பழுப்பு நரி சோம்பேறி நாயின் மேல் குதிக்கிறது.",
    "สุนัขจิ้งจอกสีน้ำตาลกระโดดข้ามสุนัขขี้เกียจ.",
    "ብሩክ ቡናማ ቀበሮ ሰነፍ ውሻን ተዘልሏል።",
    // Mixed scripts:
    "Hello 世界 مرحبا 🌍",
    "123, αβγ, абв, العربية, 中文, हिन्दी.",
  ];

  // Print the token list for each sample, in the order listed above.
  samples.forEach((sample) => {
    console.log(tokenizer.tokenize(sample));
  });
}
/**
 * Read a file as UTF-8 text and parse it as JSON.
 *
 * @param {string} path - Location of the JSON file on disk.
 * @returns {Promise<any>} The value produced by parsing the file's contents
 *   (not the raw text).
 * @throws Rejects with the `fs.readFile` error when the file cannot be read,
 *   or with a `SyntaxError` when the contents are not valid JSON.
 */
async function loadJSON(path) {
  const text = await fs.readFile(path, { encoding: 'utf8' });
  return JSON.parse(text);
}
/**
 * Return the parsed tokenizer.json, downloading and caching it on first use.
 *
 * The file is cached as `tokenizer.json` in DIR. When it already exists it is
 * read from disk; otherwise it is fetched from `url`, checked to be valid
 * JSON, written to the cache, and returned.
 *
 * @param {string} url - The URL to download tokenizer.json from
 * @returns {Promise<any>} - The parsed contents of tokenizer.json
 * @throws {Error} When the download responds with a non-2xx HTTP status.
 * @throws {SyntaxError} When the cached or downloaded content is not valid JSON.
 */
export async function ensureTokenizerJson(url) {
  const tokenizerPath = path.join(DIR, 'tokenizer.json');

  let isCached = true;
  try {
    await fs.access(tokenizerPath, constants.F_OK);
  } catch (err) {
    // Only a missing file means "not downloaded yet"; any other failure
    // (permissions, I/O error, ...) is a real problem and must not be hidden.
    if (err?.code !== 'ENOENT') {
      throw err;
    }
    isCached = false;
  }

  if (isCached) {
    console.log('Using', tokenizerPath);
    return loadJSON(tokenizerPath);
  }

  console.log("Downloading", url);
  const response = await fetch(url);
  if (!response.ok) {
    // Without this check an HTTP error page would be cached as tokenizer.json
    // and every later run would fail while parsing it.
    throw new Error(
      `Failed to download ${url}: ${response.status} ${response.statusText}`,
    );
  }

  const data = Buffer.from(await response.arrayBuffer());
  // Parse before writing so an invalid payload never ends up in the cache.
  const tokenizerJson = JSON.parse(data.toString('utf8'));
  await fs.writeFile(tokenizerPath, data);
  return tokenizerJson;
}