Fix embed_mask for texts without separator (use entire text, not empty)
modeling_llm2vec4cxr.py  CHANGED  (+6 -1)
@@ -88,11 +88,13 @@ class LLM2Vec4CXRModel(PreTrainedModel):
     def _build_separator_inputs(self, texts, max_length: int, separator: str):
         tok = self._get_tokenizer()
         # Split into [instruction | text]; we embed only the trailing "text" part.
+        # If no separator, embed the entire text.
         parts_after_sep = []
         original = []
         for t in texts:
             parts = t.split(separator)
-            parts_after_sep.append(parts[1] if len(parts) > 1 else "")
+            # If no separator found, use the entire text (not empty string)
+            parts_after_sep.append(parts[1] if len(parts) > 1 else parts[0])
             original.append("".join(parts))
 
         tokenized = tok(original, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
@@ -103,6 +105,9 @@ class LLM2Vec4CXRModel(PreTrainedModel):
             m = torch.zeros_like(tokenized["attention_mask"][i])
             if len(sub["input_ids"][0]) > 0:
                 m[-len(sub["input_ids"][0]):] = 1
+            else:
+                # If tokenization resulted in 0 tokens, use attention_mask (embed everything)
+                m = tokenized["attention_mask"][i].clone()
             embed_mask = m.unsqueeze(0) if embed_mask is None else torch.cat([embed_mask, m.unsqueeze(0)], dim=0)
 
         tokenized["embed_mask"] = embed_mask
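For illustration only, a rough sketch of how the corrected fallback behaves. The separator string, the example sentences, and the word-count stand-in for the tokenizer are assumptions made for this sketch, not taken from the model code:

import torch

# Assumed separator and example texts (hypothetical, for illustration only).
separator = "!@#$%^&*()"
texts = [
    "Determine the change of pleural effusion." + separator + "No effusion is seen.",
    "No effusion is seen.",  # no separator present in the text
]

for t in texts:
    parts = t.split(separator)
    # Fixed behavior: fall back to the whole text when the separator is absent
    # (previously this produced "" and therefore an all-zero mask).
    trailing = parts[1] if len(parts) > 1 else parts[0]

    # Word counts stand in for token counts, just to show the mask shape.
    full_len = len("".join(parts).split())
    tail_len = len(trailing.split())

    m = torch.zeros(full_len, dtype=torch.long)
    if tail_len > 0:
        m[-tail_len:] = 1   # mark only the trailing "text" part
    else:
        m = torch.ones(full_len, dtype=torch.long)  # mirrors the new zero-token fallback
    print(m.tolist())

With the old "" fallback, a text without the separator produced an all-zero embed_mask, presumably leaving pooling with no tokens to embed; after this change the mask covers the entire input, and the second hunk adds the same safety net when the trailing part tokenizes to zero tokens.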