Spaces:
Running
Running
File size: 7,160 Bytes
8f5ac4a af2d08f 8f5ac4a af2d08f 8f5ac4a af2d08f 8f5ac4a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 |
"""
Unit tests for the icons embeddings module.
"""
import importlib
import sys
from pathlib import Path
from types import SimpleNamespace
from typing import Any
import numpy as np
def _reload_module_with_dummies(monkeypatch: Any, emb_dim: int = 4):
    """
    Reload the icons_embeddings module after monkeypatching the
    Transformers constructors to return lightweight dummy objects.

    This prevents network/download or heavy model initialization during
    tests and allows deterministic embeddings.

    Args:
        monkeypatch: The pytest monkeypatch fixture.
        emb_dim: The embedding dimensionality that the dummy model
            should produce.

    Returns:
        The reloaded (or freshly imported) module object.
    """
    class DummyTokenizer:
        # Mimics the HF tokenizer call signature; accepts and ignores the
        # usual keyword arguments and passes the texts through untouched.
        def __call__(self, texts, return_tensors=None, padding=None,
                     max_length=None, truncation=None, **kwargs):
            # Normalize a single string to a one-element list, mirroring
            # how HF tokenizers accept either form.
            if isinstance(texts, str):
                texts_list = [texts]
            else:
                texts_list = list(texts)
            return {'texts': texts_list}

    class DummyTensor:
        # Minimal torch.Tensor stand-in backed by a numpy array.
        def __init__(self, arr: np.ndarray) -> None:
            self.arr = arr

        def mean(self, dim: int) -> 'DummyTensor':
            # Take numpy mean along the requested axis to emulate PyTorch.
            return DummyTensor(self.arr.mean(axis=dim))

        def detach(self) -> 'DummyTensor':
            return self

        def numpy(self) -> np.ndarray:
            return self.arr

    class DummyModel:
        # Produces a deterministic (n, seq_len, emb_dim) "hidden state"
        # filled with a simple range so tests can predict exact values.
        def __call__(self, **inputs: Any) -> SimpleNamespace:
            texts = inputs.get('texts', [])
            n = len(texts)
            seq_len = 3
            arr = np.arange(n * seq_len * emb_dim, dtype=float)
            arr = arr.reshape((n, seq_len, emb_dim))
            return SimpleNamespace(last_hidden_state=DummyTensor(arr))

    # Accept any call signature: the real from_pretrained takes many
    # optional arguments, and a bare `lambda name:` would raise TypeError
    # if the module under test ever passes extras.
    monkeypatch.setattr(
        'transformers.BertTokenizer.from_pretrained',
        lambda *args, **kwargs: DummyTokenizer(),
    )
    monkeypatch.setattr(
        'transformers.BertModel.from_pretrained',
        lambda *args, **kwargs: DummyModel(),
    )
    # Reload (not just import) so the module re-runs its top-level
    # constructor calls against the patched transformers entry points.
    if 'slidedeckai.helpers.icons_embeddings' in sys.modules:
        mod = importlib.reload(sys.modules['slidedeckai.helpers.icons_embeddings'])
    else:
        mod = importlib.import_module('slidedeckai.helpers.icons_embeddings')
    return mod
def test_get_icons_list(tmp_path: Path, monkeypatch: Any) -> None:
    """
    Verify that get_icons_list reports the stems of every PNG file in
    the configured icons directory, ignoring non-PNG files.
    """
    mod = _reload_module_with_dummies(monkeypatch)
    # Build a scratch icons directory holding two PNGs and one decoy.
    icons_dir = tmp_path / 'icons'
    icons_dir.mkdir()
    for fname, body in (
        ('apple.png', 'x'),
        ('banana.png', 'y'),
        ('not_an_icon.txt', 'z'),
    ):
        (icons_dir / fname).write_text(body)
    monkeypatch.setattr(mod.GlobalConfig, 'ICONS_DIR', icons_dir)
    assert set(mod.get_icons_list()) == {'apple', 'banana'}
def test_get_embeddings_single_and_list(monkeypatch: Any) -> None:
    """
    get_embeddings must return numpy arrays with the expected shapes for
    single string and list inputs.
    """
    emb_dim = 5
    mod = _reload_module_with_dummies(monkeypatch, emb_dim=emb_dim)

    # A bare string should be treated as a batch of one.
    single = mod.get_embeddings('hello')
    assert isinstance(single, np.ndarray)
    assert single.shape == (1, emb_dim)

    # A three-element list yields three embedding rows.
    batch = mod.get_embeddings(['a', 'b', 'c'])
    assert batch.shape == (3, emb_dim)

    # The dummy model fills the hidden state with a simple range, so the
    # first row after the seq_len mean is fully predictable.
    first_row = np.arange(3 * emb_dim).reshape((3, emb_dim)).mean(axis=0)
    assert np.allclose(batch[0], first_row)
def test_save_and_load_embeddings(tmp_path: Path, monkeypatch: Any) -> None:
    """
    save_icons_embeddings should write embeddings and file names to the
    configured paths and load_saved_embeddings should read them back.
    """
    mod = _reload_module_with_dummies(monkeypatch, emb_dim=6)

    # Populate a fresh icons directory with two PNG files.
    icons_dir = tmp_path / 'icons2'
    icons_dir.mkdir()
    (icons_dir / 'one.png').write_text('1')
    (icons_dir / 'two.png').write_text('2')

    # Point every relevant config entry at temp locations.
    emb_file = tmp_path / 'emb.npy'
    names_file = tmp_path / 'names.npy'
    monkeypatch.setattr(mod.GlobalConfig, 'ICONS_DIR', icons_dir)
    monkeypatch.setattr(mod.GlobalConfig, 'EMBEDDINGS_FILE_NAME', str(emb_file))
    monkeypatch.setattr(mod.GlobalConfig, 'ICONS_FILE_NAME', str(names_file))

    # The save step runs through the dummy tokenizer/model.
    mod.save_icons_embeddings()
    assert emb_file.exists()
    assert names_file.exists()

    # Round-trip: arrays come back and stay aligned row-for-row.
    loaded_emb, loaded_names = mod.load_saved_embeddings()
    assert isinstance(loaded_emb, np.ndarray)
    assert isinstance(loaded_names, np.ndarray)
    assert loaded_emb.shape[0] == len(loaded_names)
def test_find_icons(monkeypatch: Any, tmp_path: Path) -> None:
    """
    find_icons should map keywords to the most similar icon filenames
    based on cosine similarity against pre-saved embeddings.
    """
    # Load with the dummy model, then override get_embeddings below so
    # keyword vectors are controlled exactly.
    mod = _reload_module_with_dummies(monkeypatch, emb_dim=3)

    # Persist two orthogonal icon embeddings to disk.
    emb_file = tmp_path / 'emb_s.npy'
    names_file = tmp_path / 'names_s.npy'
    np.save(str(emb_file), np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]))
    np.save(str(names_file), np.array(['a_icon', 'b_icon']))
    monkeypatch.setattr(mod.GlobalConfig, 'EMBEDDINGS_FILE_NAME', str(emb_file))
    monkeypatch.setattr(mod.GlobalConfig, 'ICONS_FILE_NAME', str(names_file))

    # Keyword embeddings chosen to align exactly with one saved icon each.
    def fake_get_embeddings(keywords: list[str]) -> np.ndarray:
        rows = [
            [1.0, 0.0, 0.0] if kw == 'match_a' else [0.0, 1.0, 0.0]
            for kw in keywords
        ]
        return np.array(rows)

    monkeypatch.setattr(mod, 'get_embeddings', fake_get_embeddings)
    res = mod.find_icons(['match_a', 'other'])
    assert list(res) == ['a_icon', 'b_icon']
def test_main_calls_and_prints(monkeypatch: Any, capsys: Any) -> None:
    """
    main should call save_icons_embeddings and find_icons and print the
    zipped results. We monkeypatch the heavy functions to keep it fast.
    """
    mod = _reload_module_with_dummies(monkeypatch)

    # Record which of the two heavy entry points main() actually hit.
    calls: dict[str, bool] = {}

    def stub_save() -> None:
        calls['saved'] = True

    def stub_find(keywords: list[str]) -> list[str]:
        calls['found'] = True
        return ['x'] * len(keywords)

    monkeypatch.setattr(mod, 'save_icons_embeddings', stub_save)
    monkeypatch.setattr(mod, 'find_icons', stub_find)

    mod.main()

    out = capsys.readouterr().out
    assert 'The relevant icon files are' in out
    assert calls.get('saved') is True
    assert calls.get('found') is True
|