hackergeek committed
Commit ab6a1b4 · verified · 1 Parent(s): 664d0b1

Upload folder using huggingface_hub

Files changed (5)
  1. README.md +146 -3
  2. config.json +11 -0
  3. generation_config.json +1 -0
  4. pytorch_model.bin +3 -0
  5. vocab.json +0 -0
README.md CHANGED
@@ -1,3 +1,146 @@
- ---
- license: apache-2.0
- ---
+
+ # hackergeek/RADIOCAP13
+
+ **ROCO Radiology Image Captioning Model**
+
+ This model is a medical image captioning system for radiology images. A frozen ViT encoder extracts image features, and a custom decoder trained on those features generates the captions. The model was trained on the full ROCO-radiology dataset.
+
+ - **Encoder**: `google/vit-base-patch16-224-in21k` (frozen, features cached; see the caching sketch after this list)
+ - **Decoder**: trained on the **full ROCO dataset** (~81k samples) for **3 epochs**
+ - **Trainable parameters**: decoder + ViT biases only
+ - **Vocab size**: 75460
+ - **Sequence length**: 32
+ - **Generation**: beam search (beam size = 3)
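+
+ The feature caching mentioned above is not shipped with this repo. The snippet below is only a minimal sketch of how the frozen ViT features could be pre-computed once and reused while training the decoder; it assumes the same 224x224 resize/ToTensor preprocessing as the Usage section and `pooler_output` features, and the function name and output file are hypothetical.
+
+ ```python
+ # Hypothetical sketch: cache frozen ViT features for decoder training.
+ # Not the exact script used to train this model.
+ import torch
+ from PIL import Image
+ from torchvision import transforms
+ from transformers import ViTModel
+
+ transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])
+
+ @torch.no_grad()
+ def cache_features(image_paths, batch_size=64, device="cuda"):
+     vit = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k").to(device).eval()
+     feats = []
+     for i in range(0, len(image_paths), batch_size):
+         imgs = [Image.open(p).convert("RGB") for p in image_paths[i:i + batch_size]]
+         batch = torch.stack([transform(im) for im in imgs]).to(device)
+         feats.append(vit(pixel_values=batch).pooler_output.cpu())  # (B, 768)
+     return torch.cat(feats)
+
+ # torch.save(cache_features(train_image_paths), "vit_features_train.pt")
+ ```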
+
+ ---
+
+ ## Usage
+
+ ```python
+ from transformers import ViTModel
+ import torch
+ from PIL import Image
+ from torchvision import transforms
+ import json
+
+ # SimpleTokenizer and BiasDecoder come from the training script; their
+ # definitions are repeated below so this example is self-contained.
+
+ IMG_SIZE = 224
+ SEQ_LEN = 32
+ VOCAB_SIZE = 75460
+
+ transform = transforms.Compose([
+     transforms.Resize((IMG_SIZE, IMG_SIZE)),
+     transforms.ToTensor(),
+ ])
+
+ def preprocess_image(img):
+     if img is None:
+         raise ValueError("Image is None")
+     if not isinstance(img, Image.Image):
+         img = Image.fromarray(img)
+     if img.mode != "RGB":
+         img = img.convert("RGB")
+     return transform(img)
+
+ # SimpleTokenizer (as in the training notebook): a minimal whitespace
+ # tokenizer backed by vocab.json.
+ class SimpleTokenizer:
+     def __init__(self, word2idx=None):
+         self.word2idx = word2idx if word2idx is not None else {}
+         self.idx2word = {v: k for k, v in self.word2idx.items()}
+
+     def encode(self, text, max_len=SEQ_LEN):
+         # Out-of-vocabulary words fall back to the <PAD> index.
+         tokens = [self.word2idx.get(w, self.word2idx["<PAD>"]) for w in text.lower().split()]
+         tokens = [self.word2idx["<SOS>"]] + tokens[:max_len - 2] + [self.word2idx["<EOS>"]]
+         tokens += [self.word2idx["<PAD>"]] * (max_len - len(tokens))
+         return torch.tensor(tokens, dtype=torch.long)
+
+     def decode(self, tokens):
+         special = {self.word2idx["<PAD>"], self.word2idx["<SOS>"], self.word2idx["<EOS>"]}
+         return " ".join(self.idx2word.get(t.item(), "<UNK>") for t in tokens if t.item() not in special)
+
+     @classmethod
+     def load(cls, path):
+         with open(f"{path}/vocab.json", "r") as f:
+             word2idx = json.load(f)
+         return cls(word2idx)
+
+ # BiasDecoder (as in the training notebook): token + position embeddings,
+ # conditioned on the pooled ViT image feature.
+ class BiasDecoder(torch.nn.Module):
+     def __init__(self, feature_dim=768, vocab_size=VOCAB_SIZE):
+         super().__init__()
+         self.token_emb = torch.nn.Embedding(vocab_size, feature_dim)
+         self.pos_emb = torch.nn.Embedding(SEQ_LEN - 1, feature_dim)
+         self.final_layer = torch.nn.Linear(feature_dim, vocab_size)
+
+     def forward(self, img_feat, target_seq):
+         x = self.token_emb(target_seq)
+         pos = torch.arange(x.size(1), device=x.device).clamp(max=self.pos_emb.num_embeddings - 1)
+         x = x + self.pos_emb(pos)
+         x = x + img_feat.unsqueeze(1)  # broadcast the image feature over every position
+         return self.final_layer(x)
+
+ # Set up the device
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # Load ViT (frozen)
+ vit = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
+ vit.eval()
+ vit.to(device)
+
+ # Load the decoder weights ('pytorch_model.bin' from this repo, assumed to
+ # be in the current directory or a specified path)
+ decoder = BiasDecoder().to(device)
+ decoder.load_state_dict(torch.load("pytorch_model.bin", map_location=device))
+ decoder.eval()
+
+ # Load the tokenizer ('vocab.json' from this repo)
+ tokenizer = SimpleTokenizer.load("./")
+ pad_idx = tokenizer.word2idx["<PAD>"]
+
+ # Beam-search caption generation
+ @torch.no_grad()
+ def generate_caption(model, img_feat, max_len=SEQ_LEN, beam_size=3):
+     model.eval()
+     img_feat = img_feat.to(device)
+     beams = [([tokenizer.word2idx["<SOS>"]], 0.0)]
+     for _ in range(max_len - 1):
+         candidates = []
+         for seq, score in beams:
+             inp = torch.tensor(seq + [pad_idx] * (SEQ_LEN - len(seq)), device=device).unsqueeze(0)
+             logits = model(img_feat, inp)
+             probs = torch.nn.functional.log_softmax(logits[0, len(seq) - 1], dim=-1)
+             top_p, top_i = torch.topk(probs, beam_size)
+             for i in range(beam_size):
+                 candidates.append((seq + [top_i[i].item()], score + top_p[i].item()))
+         beams = sorted(candidates, key=lambda x: x[1], reverse=True)[:beam_size]
+         if all(s[-1] == tokenizer.word2idx["<EOS>"] for s, _ in beams):
+             break
+     skip = {pad_idx, tokenizer.word2idx["<SOS>"], tokenizer.word2idx["<EOS>"]}
+     words = [tokenizer.idx2word.get(i, "<UNK>") for i in beams[0][0] if i not in skip]
+     return " ".join(words)
+
+ # Example: generate a caption for an image
+ # img_path = "path/to/your/image.jpg"
+ # image = Image.open(img_path).convert("RGB")
+ # img_tensor = preprocess_image(image).unsqueeze(0).to(device)
+ # img_feat = vit(pixel_values=img_tensor).pooler_output
+ # print(f"Generated caption: {generate_caption(decoder, img_feat)}")
+ ```
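+
+ As a quick smoke test, the pipeline can be run on a synthetic image: the caption will be meaningless, but it confirms that the weights, vocab and beam search wire together. This assumes the objects defined in the block above are in scope.
+
+ ```python
+ import numpy as np
+ from PIL import Image
+
+ # Random RGB image standing in for a real radiograph (smoke test only)
+ dummy = Image.fromarray(np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8))
+ img_tensor = preprocess_image(dummy).unsqueeze(0).to(device)
+ img_feat = vit(pixel_values=img_tensor).pooler_output  # shape: (1, 768)
+ print(generate_caption(decoder, img_feat))
+ ```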
+
+ ---
+
+ ## Evaluation (on ROCO Test Set)
+
+ - **BLEU-1**: N/A
+ - **BLEU-2**: N/A
+ - **BLEU-3**: N/A
+ - **BLEU-4**: N/A
+ - **Overall BLEU Score**: N/A
+
+ *Note: BLEU score computation was interrupted, so no scores are reported yet. Re-run the evaluation cell (`eXra19D_oqcs`) after pushing to obtain accurate scores.*
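+
+ Until those numbers are filled in, BLEU can be recomputed locally. The snippet below is a minimal sketch using NLTK's corpus BLEU, assuming `references` (one reference caption per test image) and `hypotheses` (the generated captions) have already been collected from the ROCO test split; it is not the original evaluation cell.
+
+ ```python
+ # Hypothetical BLEU evaluation sketch (not the original notebook cell)
+ from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
+
+ refs = [[r.lower().split()] for r in references]  # list of reference lists
+ hyps = [h.lower().split() for h in hypotheses]
+ smooth = SmoothingFunction().method1
+
+ for n in range(1, 5):
+     weights = tuple([1.0 / n] * n)
+     score = corpus_bleu(refs, hyps, weights=weights, smoothing_function=smooth)
+     print(f"BLEU-{n}: {score:.4f}")
+ ```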
config.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "model_type": "vit-captioner-bias-decoder",
+   "feature_extractor": "google/vit-base-patch16-224-in21k",
+   "vocab_size": 75460,
+   "seq_len": 32,
+   "feature_dim": 768,
+   "training_epochs": 3,
+   "dataset": "ROCO-radiology (train + val + test)",
+   "trainable": "Decoder + ViT biases only",
+   "description": "ROCO radiology captioner trained for 3 epochs on full dataset using cached ViT features."
+ }
generation_config.json ADDED
@@ -0,0 +1 @@
+ {"max_length": 32, "beam_size": 3}
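
These two JSON files record the architecture and decoding hyper-parameters rather than a standard `transformers` configuration, so they are read manually instead of via `AutoModel`. A minimal sketch of loading them back, assuming the `BiasDecoder` and `generate_caption` definitions from the README above:

```python
import json

# Rebuild the decoder from config.json and pick up decoding settings
with open("config.json") as f:
    cfg = json.load(f)
with open("generation_config.json") as f:
    gen_cfg = json.load(f)

decoder = BiasDecoder(feature_dim=cfg["feature_dim"], vocab_size=cfg["vocab_size"])
# caption = generate_caption(decoder, img_feat,
#                            max_len=gen_cfg["max_length"], beam_size=gen_cfg["beam_size"])
```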
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2e9147eef4c764eb8c960f0b62ed15368d1e5b4f1cddead75885dad44b1595bc
+ size 464025945
vocab.json ADDED
The diff for this file is too large to render. See raw diff