// Requires Node 18+ (built-in fetch) and ES-module mode (top-level await, import.meta.url).
import { pipeline, AutoTokenizer, AutoModel, TokenizerModel, PreTrainedTokenizer } from '@huggingface/transformers';
import fs from 'node:fs/promises';
import { constants } from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';

// Directory containing this module; the downloaded tokenizer.json is cached alongside it.
const DIR = path.dirname(fileURLToPath(import.meta.url));

await main();
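// Tokenize a set of multilingual example sentences using a tokenizer built
// directly from the model's tokenizer.json.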
async function main() {
  const url = "https://huggingface.co/sentence-transformers/static-similarity-mrl-multilingual-v1/resolve/main/0_StaticEmbedding/tokenizer.json";
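  // Fetch tokenizer.json (downloading and caching it on first run) and build a tokenizer from it.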
  const config = await ensureTokenizerJson(url);
  const tokenizer = new PreTrainedTokenizer(config, {});
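  // Example sentences spanning Latin, Cyrillic, Greek, Arabic, CJK, Korean,
  // Indic, Thai, and Ethiopic scripts, plus mixed-script input.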
  const examples = [
    "This is an example of encoding",
    "The quick brown fox jumps over the lazy dog.",
    "Curaçao, naïve fiancé, jalapeño, déjà vu.",
    "Привет, как дела?",
    "Бързата кафява лисица прескача мързеливото куче.",
    "Γρήγορη καφέ αλεπού πηδάει πάνω από τον τεμπέλη σκύλο.",
    "اللغة العربية جميلة وغنية بالتاريخ.",
    "مرحبا بالعالم!",
    "Simplified: 快速的棕色狐狸跳过懒狗。",
    "Traditional: 快速的棕色狐狸跳過懶狗。",
    "素早い茶色の狐が怠け者の犬を飛び越える。",
    "コンピュータープログラミング",
    "빠른 갈색 여우가 게으른 개를 뛰어넘습니다.",
    "तेज़ भूरी लोमड़ी आलसी कुत्ते के ऊपर कूदती है।",
    "দ্রুত বাদামী শিয়াল অলস কুকুরের উপর দিয়ে লাফ দেয়।",
    "வேகமான பழுப்பு நரி சோம்பேறி நாயின் மேல் குதிக்கிறது.",
    "สุนัขจิ้งจอกสีน้ำตาลกระโดดข้ามสุนัขขี้เกียจ.",
    "ብሩክ ቡናማ ቀበሮ ሰነፍ ውሻን ተዘልሏል።",

    "Hello 世界 مرحبا 🌍",
    "123, αβγ, абв, العربية, 中文, हिन्दी.",
  ];
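  // Print the token sequence produced for each example.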
  for (const example of examples) {
    console.log(tokenizer.tokenize(example));
  }
}
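// Read a file from disk and parse it as JSON.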
async function loadJSON(filePath) {
  return JSON.parse(await fs.readFile(filePath, { encoding: 'utf8' }));
}
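// Return the parsed tokenizer.json, downloading it next to this module on first use
// and reusing the cached copy afterwards.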
export async function ensureTokenizerJson(url) {
  const tokenizerPath = path.join(DIR, 'tokenizer.json');

  // Use the cached copy if it already exists.
  try {
    await fs.access(tokenizerPath, constants.F_OK);
    console.log('Using', tokenizerPath);
    return loadJSON(tokenizerPath);
  } catch {}

  // Otherwise download the file, cache it next to this module, and parse the cached copy.
  console.log("Downloading", url);
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to download ${url}: ${response.status} ${response.statusText}`);
  }
  const data = Buffer.from(await response.arrayBuffer());
  await fs.writeFile(tokenizerPath, data);

  return loadJSON(tokenizerPath);
}