updating for gpu
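
This commit adds GPU support to the demo. VideoModel now selects a torch.device at initialization and moves the model onto it when CUDA is available; text and video inputs are placed on the same device before encoding, and predict() copies its output back to the CPU before converting it to NumPy. The super() calls are also simplified to the argument-free Python 3 form.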
app.py CHANGED
@@ -53,6 +53,7 @@ def main():
         "configs/ego_mcq/svitt.yml",
         sample_videos,
     )
+
     def predict(text):
         idx = sample_text_dict[text]
         ft_action, gt_action = svitt.predict(idx, text)
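For context, predict() looks up the sample index for the chosen caption and runs the model on it. In a Gradio Space this function is typically wired into an interface roughly as below; the widget types, labels, and launch call are illustrative assumptions, not code from this commit.

# Hypothetical Gradio wiring for predict() (assumed, not from this repo).
import gradio as gr

iface = gr.Interface(
    fn=predict,                            # defined in app.py above
    inputs=gr.Dropdown(
        choices=list(sample_text_dict),    # captions the demo knows about
        label="Narration",
    ),
    outputs=gr.Textbox(label="Predicted vs. ground-truth action"),
)
iface.launch()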
demo.py CHANGED
@@ -24,9 +24,13 @@ class VideoModel(nn.Module):
         Parameters:
             config: config file
         """
-        super(VideoModel, self).__init__()
+        super().__init__()
         self.cfg = load_cfg(config)
         self.model = self.build_model()
+        use_gpu = torch.cuda.is_available()
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        if use_gpu:
+            self.model = self.model.to(self.device)
         self.templates = ['{}']
         self.dataset = self.cfg['data']['dataset']
         self.eval()
@@ -74,7 +78,7 @@ class VideoModel(nn.Module):
 class VideoCLSModel(VideoModel):
     """ Video model for video classification tasks (Charades-Ego, EGTEA). """
     def __init__(self, config, sample_videos):
-        super(VideoCLSModel, self).__init__(config)
+        super().__init__(config)
         self.sample_videos = sample_videos
         self.video_transform = self.init_video_transform()
 
@@ -125,7 +129,7 @@ class VideoCLSModel(VideoModel):
             truncation=True,
             max_length=self.model_cfg.max_txt_l.video,
             return_tensors="pt",
-        )
+        ).to(self.device)
         _, class_embeddings = self.model.encode_text(embeddings)
         return class_embeddings
 
@@ -143,7 +147,7 @@ class VideoCLSModel(VideoModel):
         pooled_image_feat_all = []
         for i in range(clips.shape[0]):
 
-            images = clips[i,:].unsqueeze(0)
+            images = clips[i,:].unsqueeze(0).to(self.device)
             bsz = images.shape[0]
 
             _, pooled_image_feat, *outputs = self.model.encode_image(images)
@@ -161,5 +165,5 @@ class VideoCLSModel(VideoModel):
     @torch.no_grad()
     def predict(self, idx, text=None):
         output, target = self.forward(idx, text)
-        return output.numpy(), target
+        return output.cpu().numpy(), target
 
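Taken together, the demo.py edits follow the standard PyTorch device-placement pattern: pick the device once, move the module's weights, move each input batch to the same device, and copy results back to the CPU before calling .numpy(), which raises on CUDA tensors. A minimal, self-contained sketch of that pattern follows; ToyModel is a placeholder standing in for the SViTT model, not code from this repo.

# Minimal sketch of the device-placement pattern applied in demo.py.
# ToyModel is a placeholder; it is not part of this repo.
import torch
import torch.nn as nn

class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()                # argument-free Python 3 form, as in the diff
        self.proj = nn.Linear(16, 4)

    def forward(self, x):
        return self.proj(x)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ToyModel().to(device)             # move weights once, at init
model.eval()

with torch.no_grad():                     # mirrors @torch.no_grad() on predict()
    x = torch.randn(1, 16).to(device)     # move each input to the same device
    out = model(x)

result = out.cpu().numpy()                # .numpy() requires a CPU tensor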