File size: 7,160 Bytes
8f5ac4a
 
 
 
 
 
 
af2d08f
8f5ac4a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
af2d08f
8f5ac4a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
af2d08f
8f5ac4a
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
"""
Unit tests for the icons embeddings module.
"""
import importlib
import sys
from pathlib import Path
from types import SimpleNamespace
from typing import Any

import numpy as np


def _reload_module_with_dummies(monkeypatch: Any, emb_dim: int = 4):
    """
    Reload the icons_embeddings module after monkeypatching the
    Transformers constructors to return lightweight dummy objects.

    This prevents network/download or heavy model initialization during
    tests and allows deterministic embeddings.

    Args:
        monkeypatch: The pytest monkeypatch fixture.
        emb_dim: The embedding dimensionality that the dummy model
            should produce.

    Returns:
        The reloaded module object.
    """
    class DummyTokenizer:
        def __call__(self, texts, return_tensors=None, padding=None,
                     max_length=None, truncation=None):
            if isinstance(texts, str):
                texts_list = [texts]
            else:
                texts_list = list(texts)

            return {'texts': texts_list}


    class DummyTensor:
        def __init__(self, arr: np.ndarray) -> None:
            self.arr = arr

        def mean(self, dim: int) -> 'DummyTensor':
            # Take numpy mean along the requested axis to emulate PyTorch.
            return DummyTensor(self.arr.mean(axis=dim))

        def detach(self) -> 'DummyTensor':
            return self

        def numpy(self) -> np.ndarray:
            return self.arr


    class DummyModel:
        def __call__(self, **inputs: Any) -> SimpleNamespace:
            texts = inputs.get('texts', [])
            n = len(texts)
            seq_len = 3
            arr = np.arange(n * seq_len * emb_dim, dtype=float)
            arr = arr.reshape((n, seq_len, emb_dim))
            return SimpleNamespace(last_hidden_state=DummyTensor(arr))

    monkeypatch.setattr(
        'transformers.BertTokenizer.from_pretrained',
        lambda name: DummyTokenizer(),
    )
    monkeypatch.setattr(
        'transformers.BertModel.from_pretrained',
        lambda name: DummyModel(),
    )

    if 'slidedeckai.helpers.icons_embeddings' in sys.modules:
        mod = importlib.reload(sys.modules['slidedeckai.helpers.icons_embeddings'])
    else:
        mod = importlib.import_module('slidedeckai.helpers.icons_embeddings')

    return mod


def test_get_icons_list(tmp_path: Path, monkeypatch: Any) -> None:
    """
    get_icons_list should return the stems of PNG files in the
    configured icons directory.
    """
    mod = _reload_module_with_dummies(monkeypatch)

    # Prepare a temporary icons directory with some files.
    icons_dir = tmp_path / 'icons'
    icons_dir.mkdir()
    (icons_dir / 'apple.png').write_text('x')
    (icons_dir / 'banana.png').write_text('y')
    (icons_dir / 'not_an_icon.txt').write_text('z')

    monkeypatch.setattr(mod.GlobalConfig, 'ICONS_DIR', icons_dir)

    icons = mod.get_icons_list()
    assert set(icons) == {'apple', 'banana'}


def test_get_embeddings_single_and_list(monkeypatch: Any) -> None:
    """
    get_embeddings must return numpy arrays with the expected shapes for
    single string and list inputs.
    """
    emb_dim = 5
    mod = _reload_module_with_dummies(monkeypatch, emb_dim=emb_dim)

    # Single string -> shape (1, emb_dim)
    arr1 = mod.get_embeddings('hello')
    assert isinstance(arr1, np.ndarray)
    assert arr1.shape == (1, emb_dim)

    # List of strings -> shape (3, emb_dim)
    arr2 = mod.get_embeddings(['a', 'b', 'c'])
    assert arr2.shape == (3, emb_dim)

    # Verify determinism from our dummy model for the first row.
    # The dummy model fills values with a range; mean over axis=1 reduces
    # the seq_len dimension.
    expected_first_row = np.arange(3 * emb_dim).reshape((3, emb_dim)).mean(axis=0)
    assert np.allclose(arr2[0], expected_first_row)


def test_save_and_load_embeddings(tmp_path: Path, monkeypatch: Any) -> None:
    """
    save_icons_embeddings should write embeddings and file names to the
    configured paths and load_saved_embeddings should read them back.
    """
    emb_dim = 6
    mod = _reload_module_with_dummies(monkeypatch, emb_dim=emb_dim)

    # Create icons dir with files.
    icons_dir = tmp_path / 'icons2'
    icons_dir.mkdir()
    (icons_dir / 'one.png').write_text('1')
    (icons_dir / 'two.png').write_text('2')

    monkeypatch.setattr(mod.GlobalConfig, 'ICONS_DIR', icons_dir)
    emb_file = tmp_path / 'emb.npy'
    names_file = tmp_path / 'names.npy'
    monkeypatch.setattr(mod.GlobalConfig, 'EMBEDDINGS_FILE_NAME', str(emb_file))
    monkeypatch.setattr(mod.GlobalConfig, 'ICONS_FILE_NAME', str(names_file))

    # Run save which uses the dummy tokenizer/model to create embeddings.
    mod.save_icons_embeddings()

    assert emb_file.exists()
    assert names_file.exists()

    loaded_emb, loaded_names = mod.load_saved_embeddings()
    assert isinstance(loaded_emb, np.ndarray)
    assert isinstance(loaded_names, np.ndarray)
    assert loaded_emb.shape[0] == len(loaded_names)


def test_find_icons(monkeypatch: Any, tmp_path: Path) -> None:
    """
    find_icons should map keywords to the most similar icon filenames
    based on cosine similarity against pre-saved embeddings.
    """
    # Reload module with dummy model but we will monkeypatch get_embeddings
    # to control keyword embeddings precisely.
    mod = _reload_module_with_dummies(monkeypatch, emb_dim=3)

    # Prepare saved embeddings with two icons.
    emb = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])
    names = np.array(['a_icon', 'b_icon'])

    emb_file = tmp_path / 'emb_s.npy'
    names_file = tmp_path / 'names_s.npy'
    np.save(str(emb_file), emb)
    np.save(str(names_file), names)

    monkeypatch.setattr(mod.GlobalConfig, 'EMBEDDINGS_FILE_NAME', str(emb_file))
    monkeypatch.setattr(mod.GlobalConfig, 'ICONS_FILE_NAME', str(names_file))

    # Make keyword embeddings match each saved one.
    def fake_get_embeddings(keywords: list[str]) -> np.ndarray:
        out = []
        for kw in keywords:
            if kw == 'match_a':
                out.append([1.0, 0.0, 0.0])
            else:
                out.append([0.0, 1.0, 0.0])
        return np.array(out)

    monkeypatch.setattr(mod, 'get_embeddings', fake_get_embeddings)

    res = mod.find_icons(['match_a', 'other'])
    assert list(res) == ['a_icon', 'b_icon']


def test_main_calls_and_prints(monkeypatch: Any, capsys: Any) -> None:
    """
    main should call save_icons_embeddings and find_icons and print the
    zipped results. We monkeypatch the heavy functions to keep it fast.
    """
    mod = _reload_module_with_dummies(monkeypatch)
    called = {}

    def fake_save():
        called['saved'] = True


    def fake_find(keywords: list[str]) -> list[str]:
        called['found'] = True
        return ['x' for _ in keywords]


    monkeypatch.setattr(mod, 'save_icons_embeddings', fake_save)
    monkeypatch.setattr(mod, 'find_icons', fake_find)

    mod.main()

    captured = capsys.readouterr()
    assert 'The relevant icon files are' in captured.out
    assert called.get('saved') is True
    assert called.get('found') is True