Upload LLM2Vec4CXR fine-tuned model
Files changed:
- README.md (+5 -0)
- usage_example.py (+4 -0)

README.md
@@ -82,6 +82,9 @@ tokenizer.padding_side = 'left'
 # Example usage for chest X-ray report analysis
 def encode_text(text):
     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    # IMPORTANT: Add embed_mask for proper model functioning
+    # For simple text encoding, embed_mask is the same as attention_mask
+    inputs["embed_mask"] = inputs["attention_mask"].clone()
     with torch.no_grad():
         embeddings = model(inputs)
     return embeddings
@@ -91,6 +94,8 @@ report = "There is a small increase in the left-sided effusion. There continues
 embedding = encode_text(report)
 ```
 
+**Note**: The model requires an `embed_mask` input. For simple text encoding, set `embed_mask` equal to `attention_mask`. For instruction-following tasks, use the separator-based tokenization shown below.
+
 ### Advanced Usage with Separator-based Processing
 
 The model supports special separator-based processing for instruction-following tasks:
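For reference, the hunks above assemble into the following simple-encoding snippet. This is a minimal sketch, assuming `model` and `tokenizer` are already loaded and configured (with `padding_side = 'left'`) as described earlier in the README; the loading code itself is not part of this diff.

```python
import torch

def encode_text(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True,
                       truncation=True, max_length=512)
    # embed_mask is required by the model; for plain text encoding it is
    # simply a copy of attention_mask.
    inputs["embed_mask"] = inputs["attention_mask"].clone()
    with torch.no_grad():
        embeddings = model(inputs)
    return embeddings

report = "There is a small increase in the left-sided effusion. There continues to be volume loss at both bases."
embedding = encode_text(report)
```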
usage_example.py
@@ -162,6 +162,8 @@ def main():
     report = "There is a small increase in the left-sided effusion. There continues to be volume loss at both bases."
 
     inputs = tokenizer(report, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    # Add embed_mask (same as attention_mask for simple text encoding)
+    inputs["embed_mask"] = inputs["attention_mask"].clone()
     inputs = inputs.to(device)
 
     with torch.no_grad():
@@ -223,6 +225,8 @@ def main():
 
     print("Computing embeddings for multiple reports...")
     inputs = tokenizer(reports, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    # Add embed_mask (same as attention_mask for simple text encoding)
+    inputs["embed_mask"] = inputs["attention_mask"].clone()
     inputs = inputs.to(device)
 
     with torch.no_grad():
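A sketch of the batch path touched by the second hunk, assuming `model`, `tokenizer`, and `device` are set up earlier in `usage_example.py` as in the original script. The example report strings and the cosine-similarity step at the end are illustrative only and are not part of this commit.

```python
import torch
import torch.nn.functional as F

# Hypothetical example reports; any list of report strings works here.
reports = [
    "There is a small increase in the left-sided effusion.",
    "No acute cardiopulmonary abnormality.",
]

# Batch-encode several reports; embed_mask mirrors attention_mask here too.
inputs = tokenizer(reports, return_tensors="pt", padding=True,
                   truncation=True, max_length=512)
inputs["embed_mask"] = inputs["attention_mask"].clone()
inputs = inputs.to(device)

with torch.no_grad():
    embeddings = model(inputs)

# Illustrative only: compare the two report embeddings by cosine similarity.
similarity = F.cosine_similarity(embeddings[0], embeddings[1], dim=0)
print(f"Similarity: {similarity.item():.4f}")
```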