enable fp16 inference
Files changed:
- app.py +1 -1
- utils/dc_utils.py +2 -0
- video_depth_anything/dpt_temporal.py +4 -2
- video_depth_anything/video_depth.py +2 -1

The commit wraps the model's forward pass in torch.autocast so inference runs in half precision, runs the final output convolution in fp32 with autocast disabled (a common numerical-stability measure), sets the default "max process length" slider value to 500, and prints the original and downsampled video sizes.
app.py
@@ -128,7 +128,7 @@ def construct_demo():
                     label="max process length",
                     minimum=-1,
                     maximum=1000,
-                    value
+                    value=500,
                     step=1,
                 )
                 target_fps = gr.Slider(
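The app.py change only touches the default of the "max process length" slider. For reference, gr.Slider takes these keyword arguments directly, so the widget after this commit would read roughly as below; the gr.Blocks wrapper and the variable name max_len are assumptions for the sketch, not copied from app.py.

import gradio as gr

with gr.Blocks() as demo:
    max_len = gr.Slider(
        label="max process length",
        minimum=-1,    # -1 presumably means no cap on processed frames
        maximum=1000,
        value=500,     # new default introduced by this commit
        step=1,
    )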
utils/dc_utils.py
@@ -22,10 +22,12 @@ def read_video_frames(video_path, process_length, target_fps=-1, max_res=-1):
     original_height, original_width = vid.get_batch([0]).shape[1:3]
     height = original_height
     width = original_width
+    print(f'==> original video size: {original_height} x {original_width}')
     if max_res > 0 and max(height, width) > max_res:
         scale = max_res / max(original_height, original_width)
         height = ensure_even(round(original_height * scale))
         width = ensure_even(round(original_width * scale))
+        print(f'==> downsample video size: {height} x {width}')

     vid = VideoReader(video_path, ctx=cpu(0), width=width, height=height)

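The two added print statements log the clip size before and after the cap applied by max_res: the clip is scaled so its longer side equals max_res, and both dimensions are snapped to even values, which video codecs commonly require. ensure_even is defined elsewhere in dc_utils.py; a plausible definition consistent with its use here, plus a worked example:

def ensure_even(n):
    # Round down to the nearest even integer; a guess at the helper's
    # body, consistent with how read_video_frames uses it.
    return n if n % 2 == 0 else n - 1

# Worked example: a 1920x1080 clip with max_res=720.
scale = 720 / 1920                          # 0.375
height = ensure_even(round(1080 * scale))   # round(405.0) = 405 -> 404
width = ensure_even(round(1920 * scale))    # 720 is already even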
video_depth_anything/dpt_temporal.py
@@ -91,6 +91,8 @@ class DPTHeadTemporal(DPTHead):
         out = F.interpolate(
             out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True
         )
-        out = self.scratch.output_conv2(out)
+        ori_type = out.dtype
+        with torch.autocast(device_type="cuda", enabled=False):
+            out = self.scratch.output_conv2(out.float())

-        return out
+        return out.to(ori_type)
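This hunk is a standard mixed-precision idiom: record the dtype autocast has produced, disable autocast for the precision-sensitive final convolution and run it in fp32, then cast the result back so callers still see a consistent dtype. A self-contained sketch of the same pattern, with stand-in convolutions rather than the repo's modules:

import torch
import torch.nn as nn

feat_conv = nn.Conv2d(3, 32, 3, padding=1).cuda()  # stand-in feature layer
out_conv = nn.Conv2d(32, 1, 3, padding=1).cuda()   # stand-in for scratch.output_conv2
x = torch.randn(1, 3, 64, 64, device="cuda")

with torch.autocast(device_type="cuda", dtype=torch.float16):
    h = feat_conv(x)                  # convolutions autocast to fp16
    ori_type = h.dtype                # torch.float16
    with torch.autocast(device_type="cuda", enabled=False):
        out = out_conv(h.float())     # fp32 island for the sensitive op
    out = out.to(ori_type)            # hand back the autocast dtype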
video_depth_anything/video_depth.py
@@ -104,7 +104,8 @@ class VideoDepthAnything(nn.Module):
             cur_input[:, :OVERLAP, ...] = pre_input[:, KEYFRAMES, ...]

             with torch.no_grad():
-                depth = self.forward(cur_input) # depth shape: [1, T, H, W]
+                with torch.autocast(device_type=device, enabled=True):
+                    depth = self.forward(cur_input) # depth shape: [1, T, H, W]

             depth = F.interpolate(depth.flatten(0,1).unsqueeze(1), size=(frame_height, frame_width), mode='bilinear', align_corners=True)
             depth_list += [depth[i][0].cpu().numpy() for i in range(depth.shape[0])]
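Note that torch.autocast defaults its dtype to torch.float16 when device_type is "cuda", so enabled=True is all it takes to switch the forward pass to half precision here. A minimal sketch of the no_grad/autocast nesting, with a placeholder model and input:

import torch

model = torch.nn.Linear(512, 512).cuda().eval()   # placeholder model
cur_input = torch.randn(32, 512, device="cuda")   # placeholder input

with torch.no_grad():                    # no autograd graph during inference
    with torch.autocast(device_type="cuda", enabled=True):
        depth = model(cur_input)         # matmul runs in fp16 under autocast
print(depth.dtype)                       # torch.float16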