Spaces: Running on Zero

xiaoyuxi committed · cd14f82
Parent: 54c1d7b

support HubMixin
Browse files:

- .gitignore +2 -19
- README.md +1 -1
- _viz/viz_template.html +338 -1
- app.py +29 -33
- app_3rd/spatrack_utils/infer_track.py +1 -1
- models/SpaTrackV2/models/SpaTrack.py +13 -79
- models/SpaTrackV2/models/predictor.py +6 -67
- models/SpaTrackV2/models/tracker3D/TrackRefiner.py +6 -55
- requirements.txt +2 -2
.gitignore CHANGED

```diff
@@ -23,47 +23,30 @@ __pycache__/
 /**/**/__pycache__
 /**/__pycache__
 
-outputs
-scripts/lauch_exp/config
-scripts/lauch_exp/submit_job.log
-scripts/lauch_exp/hydra_output
-scripts/lauch_wulan
-scripts/custom_video
 # ignore the visualizer
 viser
 viser_result
 benchmark/results
 benchmark
 
-ossutil_output
-
 prev_version
 spat_ceres
 wandb
 *.log
 seg_target.py
 
-eval_davis.py
-eval_multiple_gpu.py
-eval_pose_scan.py
-eval_single_gpu.py
-
 infer_cam.py
 infer_stream.py
 
 *.egg-info/
 **/*.egg-info
 
-eval_kinectics.py
-models/SpaTrackV2/datasets
 
-scripts
 config/fix_2d.yaml
 
-models/SpaTrackV2/datasets
-scripts/
 
 models/**/build
 models/**/dist
 
-temp_local
+temp_local
+examples/results
```
README.md CHANGED

```diff
@@ -11,4 +11,4 @@ license: mit
 short_description: Official Space for SpatialTrackerV2
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
```
_viz/viz_template.html CHANGED

```diff
@@ -671,6 +671,38 @@
         </div>
     </div>
 
+    <div class="settings-group">
+        <h3>Keep History</h3>
+        <div class="checkbox-container">
+            <label class="toggle-switch">
+                <input type="checkbox" id="enable-keep-history">
+                <span class="toggle-slider"></span>
+            </label>
+            <label for="enable-keep-history">Enable Keep History</label>
+        </div>
+        <div class="slider-container">
+            <label for="history-stride">Stride</label>
+            <select id="history-stride">
+                <option value="1">1</option>
+                <option value="2">2</option>
+                <option value="5" selected>5</option>
+                <option value="10">10</option>
+                <option value="20">20</option>
+            </select>
+        </div>
+    </div>
+
+    <div class="settings-group">
+        <h3>Background</h3>
+        <div class="checkbox-container">
+            <label class="toggle-switch">
+                <input type="checkbox" id="white-background">
+                <span class="toggle-slider"></span>
+            </label>
+            <label for="white-background">White Background</label>
+        </div>
+    </div>
+
     <div class="settings-group">
         <div class="btn-group">
             <button id="reset-view-btn" style="flex: 1; margin-right: 5px;">Reset View</button>
@@ -739,7 +771,10 @@
             showCameraFrustum: document.getElementById('show-camera-frustum'),
             frustumSize: document.getElementById('frustum-size'),
             hideSettingsBtn: document.getElementById('hide-settings-btn'),
-            showSettingsBtn: document.getElementById('show-settings-btn')
+            showSettingsBtn: document.getElementById('show-settings-btn'),
+            enableKeepHistory: document.getElementById('enable-keep-history'),
+            historyStride: document.getElementById('history-stride'),
+            whiteBackground: document.getElementById('white-background')
         };
 
         this.scene = null;
@@ -750,6 +785,12 @@
         this.trajectories = [];
         this.cameraFrustum = null;
 
+        // Keep History functionality
+        this.historyPointClouds = [];
+        this.historyTrajectories = [];
+        this.historyFrames = [];
+        this.maxHistoryFrames = 20;
+
         this.initThreeJS();
         this.loadDefaultSettings().then(() => {
             this.initEventListeners();
@@ -977,6 +1018,28 @@
                 this.ui.showSettingsBtn.style.display = 'none';
             });
         }
+
+        // Keep History event listeners
+        if (this.ui.enableKeepHistory) {
+            this.ui.enableKeepHistory.addEventListener('change', () => {
+                if (!this.ui.enableKeepHistory.checked) {
+                    this.clearHistory();
+                }
+            });
+        }
+
+        if (this.ui.historyStride) {
+            this.ui.historyStride.addEventListener('change', () => {
+                this.clearHistory();
+            });
+        }
+
+        // Background toggle event listener
+        if (this.ui.whiteBackground) {
+            this.ui.whiteBackground.addEventListener('change', () => {
+                this.toggleBackground();
+            });
+        }
     }
 
     makeElementDraggable(element) {
@@ -1296,6 +1359,9 @@
 
         this.updateTrajectories(frameIndex);
 
+        // Keep History management
+        this.updateHistory(frameIndex);
+
         const progress = (frameIndex + 1) / this.config.totalFrames;
         this.ui.progress.style.width = `${progress * 100}%`;
 
@@ -1752,15 +1818,286 @@
         this.updateCameraFrustum(this.currentFrame);
     }
 
+    // Keep History methods
+    updateHistory(frameIndex) {
+        if (!this.ui.enableKeepHistory.checked || !this.data) return;
+
+        const stride = parseInt(this.ui.historyStride.value);
+        const newHistoryFrames = this.calculateHistoryFrames(frameIndex, stride);
+
+        // Check if history frames changed
+        if (this.arraysEqual(this.historyFrames, newHistoryFrames)) return;
+
+        this.clearHistory();
+        this.historyFrames = newHistoryFrames;
+
+        // Create history point clouds and trajectories
+        this.historyFrames.forEach(historyFrame => {
+            if (historyFrame !== frameIndex) {
+                this.createHistoryPointCloud(historyFrame);
+                this.createHistoryTrajectories(historyFrame);
+            }
+        });
+    }
+
+    calculateHistoryFrames(currentFrame, stride) {
+        const frames = [];
+        let frame = 1; // Start from frame 1
+
+        while (frame <= currentFrame && frames.length < this.maxHistoryFrames) {
+            frames.push(frame);
+            frame += stride;
+        }
+
+        // Always include current frame
+        if (!frames.includes(currentFrame)) {
+            frames.push(currentFrame);
+        }
+
+        return frames.sort((a, b) => a - b);
+    }
+
+    createHistoryPointCloud(frameIndex) {
+        const numPoints = this.config.resolution[0] * this.config.resolution[1];
+        const positions = new Float32Array(numPoints * 3);
+        const colors = new Float32Array(numPoints * 3);
+
+        const geometry = new THREE.BufferGeometry();
+        geometry.setAttribute('position', new THREE.BufferAttribute(positions, 3));
+        geometry.setAttribute('color', new THREE.BufferAttribute(colors, 3));
+
+        const material = new THREE.PointsMaterial({
+            size: parseFloat(this.ui.pointSize.value),
+            vertexColors: true,
+            transparent: true,
+            opacity: 0.5, // Transparent for history
+            sizeAttenuation: true
+        });
+
+        const historyPointCloud = new THREE.Points(geometry, material);
+        this.scene.add(historyPointCloud);
+        this.historyPointClouds.push(historyPointCloud);
+
+        // Update the history point cloud with data
+        this.updateHistoryPointCloud(historyPointCloud, frameIndex);
+    }
+
+    updateHistoryPointCloud(pointCloud, frameIndex) {
+        const positions = pointCloud.geometry.attributes.position.array;
+        const colors = pointCloud.geometry.attributes.color.array;
+
+        const rgbVideo = this.data.rgb_video;
+        const depthsRgb = this.data.depths_rgb;
+        const intrinsics = this.data.intrinsics;
+        const invExtrinsics = this.data.inv_extrinsics;
+
+        const width = this.config.resolution[0];
+        const height = this.config.resolution[1];
+        const numPoints = width * height;
+
+        const K = this.get3x3Matrix(intrinsics.data, intrinsics.shape, frameIndex);
+        const fx = K[0][0], fy = K[1][1], cx = K[0][2], cy = K[1][2];
+
+        const invExtrMat = this.get4x4Matrix(invExtrinsics.data, invExtrinsics.shape, frameIndex);
+        const transform = this.getTransformElements(invExtrMat);
+
+        const rgbFrame = this.getFrame(rgbVideo.data, rgbVideo.shape, frameIndex);
+        const depthFrame = this.getFrame(depthsRgb.data, depthsRgb.shape, frameIndex);
+
+        const maxDepth = parseFloat(this.ui.maxDepth.value) || 10.0;
+
+        let validPointCount = 0;
+
+        for (let i = 0; i < numPoints; i++) {
+            const xPix = i % width;
+            const yPix = Math.floor(i / width);
+
+            const d0 = depthFrame[i * 3];
+            const d1 = depthFrame[i * 3 + 1];
+            const depthEncoded = d0 | (d1 << 8);
+            const depthValue = (depthEncoded / ((1 << 16) - 1)) *
+                (this.config.depthRange[1] - this.config.depthRange[0]) +
+                this.config.depthRange[0];
+
+            if (depthValue === 0 || depthValue > maxDepth) {
+                continue;
+            }
+
+            const X = ((xPix - cx) * depthValue) / fx;
+            const Y = ((yPix - cy) * depthValue) / fy;
+            const Z = depthValue;
+
+            const tx = transform.m11 * X + transform.m12 * Y + transform.m13 * Z + transform.m14;
+            const ty = transform.m21 * X + transform.m22 * Y + transform.m23 * Z + transform.m24;
+            const tz = transform.m31 * X + transform.m32 * Y + transform.m33 * Z + transform.m34;
+
+            const index = validPointCount * 3;
+            positions[index] = tx;
+            positions[index + 1] = -ty;
+            positions[index + 2] = -tz;
+
+            colors[index] = rgbFrame[i * 3] / 255;
+            colors[index + 1] = rgbFrame[i * 3 + 1] / 255;
+            colors[index + 2] = rgbFrame[i * 3 + 2] / 255;
+
+            validPointCount++;
+        }
+
+        pointCloud.geometry.setDrawRange(0, validPointCount);
+        pointCloud.geometry.attributes.position.needsUpdate = true;
+        pointCloud.geometry.attributes.color.needsUpdate = true;
+    }
+
+    createHistoryTrajectories(frameIndex) {
+        if (!this.data.trajectories) return;
+
+        const trajectoryData = this.data.trajectories.data;
+        const [totalFrames, numTrajectories] = this.data.trajectories.shape;
+        const palette = this.createColorPalette(numTrajectories);
+
+        const historyTrajectoryGroup = new THREE.Group();
+
+        for (let i = 0; i < numTrajectories; i++) {
+            const ballSize = parseFloat(this.ui.trajectoryBallSize.value);
+            const sphereGeometry = new THREE.SphereGeometry(ballSize, 16, 16);
+            const sphereMaterial = new THREE.MeshBasicMaterial({
+                color: palette[i],
+                transparent: true,
+                opacity: 0.3 // Transparent for history
+            });
+            const positionMarker = new THREE.Mesh(sphereGeometry, sphereMaterial);
+
+            const currentOffset = (frameIndex * numTrajectories + i) * 3;
+            positionMarker.position.set(
+                trajectoryData[currentOffset],
+                -trajectoryData[currentOffset + 1],
+                -trajectoryData[currentOffset + 2]
+            );
+
+            historyTrajectoryGroup.add(positionMarker);
+        }
+
+        this.scene.add(historyTrajectoryGroup);
+        this.historyTrajectories.push(historyTrajectoryGroup);
+    }
+
+    clearHistory() {
+        // Clear history point clouds
+        this.historyPointClouds.forEach(pointCloud => {
+            if (pointCloud.geometry) pointCloud.geometry.dispose();
+            if (pointCloud.material) pointCloud.material.dispose();
+            this.scene.remove(pointCloud);
+        });
+        this.historyPointClouds = [];
+
+        // Clear history trajectories
+        this.historyTrajectories.forEach(trajectoryGroup => {
+            trajectoryGroup.children.forEach(child => {
+                if (child.geometry) child.geometry.dispose();
+                if (child.material) child.material.dispose();
+            });
+            this.scene.remove(trajectoryGroup);
+        });
+        this.historyTrajectories = [];
+
+        this.historyFrames = [];
+    }
+
+    arraysEqual(a, b) {
+        if (a.length !== b.length) return false;
+        for (let i = 0; i < a.length; i++) {
+            if (a[i] !== b[i]) return false;
+        }
+        return true;
+    }
+
+    toggleBackground() {
+        const isWhiteBackground = this.ui.whiteBackground.checked;
+
+        if (isWhiteBackground) {
+            // Switch to white background
+            document.body.style.backgroundColor = '#ffffff';
+            this.scene.background = new THREE.Color(0xffffff);
+
+            // Update UI elements for white background
+            document.documentElement.style.setProperty('--bg', '#ffffff');
+            document.documentElement.style.setProperty('--text', '#333333');
+            document.documentElement.style.setProperty('--text-secondary', '#666666');
+            document.documentElement.style.setProperty('--border', '#cccccc');
+            document.documentElement.style.setProperty('--surface', '#f5f5f5');
+            document.documentElement.style.setProperty('--shadow', 'rgba(0, 0, 0, 0.1)');
+            document.documentElement.style.setProperty('--shadow-hover', 'rgba(0, 0, 0, 0.2)');
+
+            // Update status bar and control panel backgrounds
+            this.ui.statusBar.style.background = 'rgba(245, 245, 245, 0.9)';
+            this.ui.statusBar.style.color = '#333333';
+
+            const controlPanel = document.getElementById('control-panel');
+            if (controlPanel) {
+                controlPanel.style.background = 'rgba(245, 245, 245, 0.95)';
+            }
+
+            const settingsPanel = document.getElementById('settings-panel');
+            if (settingsPanel) {
+                settingsPanel.style.background = 'rgba(245, 245, 245, 0.98)';
+            }
+
+        } else {
+            // Switch back to dark background
+            document.body.style.backgroundColor = '#1a1a1a';
+            this.scene.background = new THREE.Color(0x1a1a1a);
+
+            // Restore original dark theme variables
+            document.documentElement.style.setProperty('--bg', '#1a1a1a');
+            document.documentElement.style.setProperty('--text', '#e0e0e0');
+            document.documentElement.style.setProperty('--text-secondary', '#a0a0a0');
+            document.documentElement.style.setProperty('--border', '#444444');
+            document.documentElement.style.setProperty('--surface', '#2c2c2c');
+            document.documentElement.style.setProperty('--shadow', 'rgba(0, 0, 0, 0.2)');
+            document.documentElement.style.setProperty('--shadow-hover', 'rgba(0, 0, 0, 0.3)');
+
+            // Restore original UI backgrounds
+            this.ui.statusBar.style.background = 'rgba(30, 30, 30, 0.9)';
+            this.ui.statusBar.style.color = '#e0e0e0';
+
+            const controlPanel = document.getElementById('control-panel');
+            if (controlPanel) {
+                controlPanel.style.background = 'rgba(44, 44, 44, 0.95)';
+            }
+
+            const settingsPanel = document.getElementById('settings-panel');
+            if (settingsPanel) {
+                settingsPanel.style.background = 'rgba(44, 44, 44, 0.98)';
+            }
+        }
+
+        // Show status message
+        this.ui.statusBar.textContent = isWhiteBackground ? "Switched to white background" : "Switched to dark background";
+        this.ui.statusBar.classList.remove('hidden');
+
+        setTimeout(() => {
+            this.ui.statusBar.classList.add('hidden');
+        }, 2000);
+    }
+
     resetSettings() {
         if (!this.defaultSettings) return;
 
         this.applyDefaultSettings();
 
+        // Reset background to dark theme
+        if (this.ui.whiteBackground) {
+            this.ui.whiteBackground.checked = false;
+            this.toggleBackground();
+        }
+
         this.updatePointCloudSettings();
         this.updateTrajectorySettings();
         this.updateFrustumDimensions();
 
+        // Clear history when resetting settings
+        this.clearHistory();
+
         this.ui.statusBar.textContent = "Settings reset to defaults";
         this.ui.statusBar.classList.remove('hidden');
 
```
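For reference, the geometry in `updateHistoryPointCloud` above can be restated compactly in NumPy. This is a sketch, not code from the repo: the helper name and array shapes are assumptions, while the depth packing (16 bits in the low two channels of the depth frame), the pinhole unprojection, and the y/z sign flip mirror the JavaScript exactly.

```python
import numpy as np

def unproject_frame(depth_rgb, rgb, K, inv_extrinsic, depth_range, max_depth=10.0):
    """Rebuild a world-space point cloud from one encoded depth frame (sketch)."""
    H, W, _ = depth_rgb.shape
    # 16-bit depth is packed little-endian into the first two channels: d0 | (d1 << 8)
    d = depth_rgb[..., 0].astype(np.uint32) | (depth_rgb[..., 1].astype(np.uint32) << 8)
    depth = d / (2**16 - 1) * (depth_range[1] - depth_range[0]) + depth_range[0]

    # Pinhole unprojection with intrinsics K
    u, v = np.meshgrid(np.arange(W), np.arange(H))
    fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]
    X = (u - cx) * depth / fx
    Y = (v - cy) * depth / fy
    pts_cam = np.stack([X, Y, depth, np.ones_like(depth)], axis=-1).reshape(-1, 4)

    # Camera -> world via the inverse extrinsic, then drop invalid depths
    pts_world = pts_cam @ inv_extrinsic.T
    keep = (depth.reshape(-1) > 0) & (depth.reshape(-1) <= max_depth)
    pts = pts_world[keep, :3].copy()
    pts[:, 1:] *= -1  # the viewer negates y and z for its three.js coordinate frame
    return pts, rgb.reshape(-1, 3)[keep] / 255.0
```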
app.py CHANGED

```diff
@@ -26,6 +26,9 @@ import logging
 from concurrent.futures import ThreadPoolExecutor
 import atexit
 import uuid
+from models.SpaTrackV2.models.vggt4track.models.vggt_moe import VGGT4Track
+from models.SpaTrackV2.models.vggt4track.utils.load_fn import preprocess_image
+from models.SpaTrackV2.models.predictor import Predictor
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -78,20 +81,15 @@ def create_user_temp_dir():
     return temp_dir
 
 from huggingface_hub import hf_hub_download
-# init the model
-os.environ["VGGT_DIR"] = hf_hub_download("Yuxihenry/SpatialTrackerCkpts", "spatrack_front.pth") #, force_download=True)
 
-
-
-
-vggt_model = VGGT_MoE()
-vggt_model.load_state_dict(torch.load(os.environ.get("VGGT_DIR")), strict=False)
-vggt_model.eval()
-vggt_model = vggt_model.to("cuda")
+vggt4track_model = VGGT4Track.from_pretrained("Yuxihenry/SpatialTrackerV2_Front")
+vggt4track_model.eval()
+vggt4track_model = vggt4track_model.to("cuda")
 
 # Global model initialization
 print("🚀 Initializing local models...")
-tracker_model
+tracker_model = Predictor.from_pretrained("Yuxihenry/SpatialTrackerV2-Offline")
+tracker_model.eval()
 predictor = get_sam_predictor()
 print("✅ Models loaded successfully!")
 
@@ -131,7 +129,8 @@ def gpu_run_tracker(tracker_model_arg, tracker_viser_arg, temp_dir, video_name,
     print("Initializing tracker models inside GPU function...")
     out_dir = os.path.join(temp_dir, "results")
     os.makedirs(out_dir, exist_ok=True)
-    tracker_model_arg, tracker_viser_arg = get_tracker_predictor(out_dir, vo_points=vo_points,
+    tracker_model_arg, tracker_viser_arg = get_tracker_predictor(out_dir, vo_points=vo_points,
+                                                                 tracker_model=tracker_model.cuda())
 
     # Setup paths
     video_path = os.path.join(temp_dir, f"{video_name}.mp4")
@@ -161,25 +160,23 @@ def gpu_run_tracker(tracker_model_arg, tracker_viser_arg, temp_dir, video_name,
     data_npz_load = {}
 
     # run vggt
-
-
-
-    with torch.
-
-
-
-
-            depth_map, depth_conf = predictions["points_map"][..., 2], predictions["unc_metric"]
-
-    depth_tensor = depth_map.squeeze().cpu().numpy()
-    extrs = np.eye(4)[None].repeat(len(depth_tensor), axis=0)
-    extrs = extrinsic.squeeze().cpu().numpy()
-    intrs = intrinsic.squeeze().cpu().numpy()
-    video_tensor = video_tensor.squeeze()
-    #NOTE: 20% of the depth is not reliable
-    # threshold = depth_conf.squeeze()[0].view(-1).quantile(0.6).item()
-    unc_metric = depth_conf.squeeze().cpu().numpy() > 0.5
+    # process the image tensor
+    video_tensor = preprocess_image(video_tensor)[None]
+    with torch.no_grad():
+        with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+            # Predict attributes including cameras, depth maps, and point maps.
+            predictions = vggt4track_model(video_tensor.cuda()/255)
+            extrinsic, intrinsic = predictions["poses_pred"], predictions["intrs"]
+            depth_map, depth_conf = predictions["points_map"][..., 2], predictions["unc_metric"]
 
+    depth_tensor = depth_map.squeeze().cpu().numpy()
+    extrs = np.eye(4)[None].repeat(len(depth_tensor), axis=0)
+    extrs = extrinsic.squeeze().cpu().numpy()
+    intrs = intrinsic.squeeze().cpu().numpy()
+    video_tensor = video_tensor.squeeze()
+    #NOTE: 20% of the depth is not reliable
+    # threshold = depth_conf.squeeze()[0].view(-1).quantile(0.6).item()
+    unc_metric = depth_conf.squeeze().cpu().numpy() > 0.5
     # Load and process mask
     if os.path.exists(mask_path):
         mask = cv2.imread(mask_path)
@@ -201,7 +198,6 @@ def gpu_run_tracker(tracker_model_arg, tracker_viser_arg, temp_dir, video_name,
 
     query_xyt = torch.cat([torch.zeros_like(grid_pts[:, :, :1]), grid_pts], dim=2)[0].cpu().numpy()
     print(f"Query points shape: {query_xyt.shape}")
-
     # Run model inference
     with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
         (
@@ -212,8 +208,8 @@ def gpu_run_tracker(tracker_model_arg, tracker_viser_arg, temp_dir, video_name,
             queries=query_xyt,
             fps=1, full_point=False, iters_track=4,
             query_no_BA=True, fixed_cam=False, stage=1, unc_metric=unc_metric,
-            support_frame=len(video_tensor)-1, replace_ratio=0.2)
-
+            support_frame=len(video_tensor)-1, replace_ratio=0.2)
+
     # Resize results to avoid large I/O
     max_size = 224
     h, w = video.shape[2:]
@@ -1117,7 +1113,7 @@ if __name__ == "__main__":
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
-        share=
+        share=False,
         debug=True,
         show_error=True
     )
```
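The net effect in app.py is that both models now come from the Hub in one call each, instead of a manual hf_hub_download plus load_state_dict. A condensed sketch of the new startup and front-end inference path (the repo ids, imports, and prediction keys come straight from the diff; the dummy clip shape and dtype are assumptions):

```python
import torch
from models.SpaTrackV2.models.vggt4track.models.vggt_moe import VGGT4Track
from models.SpaTrackV2.models.vggt4track.utils.load_fn import preprocess_image
from models.SpaTrackV2.models.predictor import Predictor

# One call per model instead of hf_hub_download + load_state_dict
vggt4track_model = VGGT4Track.from_pretrained("Yuxihenry/SpatialTrackerV2_Front").eval().to("cuda")
tracker_model = Predictor.from_pretrained("Yuxihenry/SpatialTrackerV2-Offline").eval()

frames = torch.zeros(8, 3, 336, 336, dtype=torch.uint8)  # dummy clip; real shape/dtype is an assumption
video_tensor = preprocess_image(frames)[None]
with torch.no_grad(), torch.cuda.amp.autocast(dtype=torch.bfloat16):
    predictions = vggt4track_model(video_tensor.cuda() / 255)
    extrinsic, intrinsic = predictions["poses_pred"], predictions["intrs"]
    depth_map, depth_conf = predictions["points_map"][..., 2], predictions["unc_metric"]
```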
app_3rd/spatrack_utils/infer_track.py CHANGED

```diff
@@ -20,7 +20,7 @@ from huggingface_hub import hf_hub_download
 
 config = {
     "ckpt_dir": "Yuxihenry/SpatialTrackerCkpts", # HuggingFace repo ID
-    "cfg_dir": "config/
+    "cfg_dir": "config/magic_infer_offline.yaml",
 }
 
 def get_tracker_predictor(output_dir: str, vo_points: int = 756, tracker_model=None):
```
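The config entry now points at the offline-inference YAML. A minimal sketch of consuming it (the yaml load and the key names shown are assumptions, inferred from how the rest of this commit indexes the config as a plain dict):

```python
import yaml

with open("config/magic_infer_offline.yaml", "r") as f:
    cfg = yaml.safe_load(f)

# elsewhere in this commit the config is indexed like a plain dict, e.g.
# cfg["Track_cfg"]["s_wind"], cfg["Track_cfg"]["overlap"], cfg["mode"]
```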
models/SpaTrackV2/models/SpaTrack.py CHANGED

```diff
@@ -40,6 +40,7 @@ class SpaTrack2(nn.Module, PyTorchModelHubMixin):
         resolution=518,
         max_len=600, # the maximum video length we can preprocess,
         track_num=768,
+        moge_as_base=False,
     ):
 
         self.chunk_size = chunk_size
@@ -51,26 +52,29 @@ class SpaTrack2(nn.Module, PyTorchModelHubMixin):
         backbone_ckpt_dir = base_cfg.pop('ckpt_dir', None)
 
         super(SpaTrack2, self).__init__()
-        if
-
+        if moge_as_base:
+            if os.path.exists(backbone_ckpt_dir)==False:
+                base_model = MoGeModel.from_pretrained('Ruicheng/moge-vitl')
+            else:
+                checkpoint = torch.load(backbone_ckpt_dir, map_location='cpu', weights_only=True)
+                base_model = MoGeModel(**checkpoint["model_config"])
+                base_model.load_state_dict(checkpoint['model'])
         else:
-
-            base_model = MoGeModel(**checkpoint["model_config"])
-            base_model.load_state_dict(checkpoint['model'])
+            base_model = None
         # avoid the base_model is a member of SpaTrack2
         object.__setattr__(self, 'base_model', base_model)
 
         # Tracker model
         self.Track3D = TrackRefiner3D(Track_cfg)
-        track_base_ckpt_dir = Track_cfg
+        track_base_ckpt_dir = Track_cfg["base_ckpt"]
         if os.path.exists(track_base_ckpt_dir):
             track_pretrain = torch.load(track_base_ckpt_dir)
             self.Track3D.load_state_dict(track_pretrain, strict=False)
 
         # wrap the function of make lora trainable
         self.make_paras_trainable = partial(self.make_paras_trainable,
-                                            mode=ft_cfg
-                                            paras_name=ft_cfg
+                                            mode=ft_cfg["mode"],
+                                            paras_name=ft_cfg["paras_name"])
         self.track_num = track_num
 
     def make_paras_trainable(self, mode: str = 'fix', paras_name: List[str] = []):
@@ -145,7 +149,7 @@ class SpaTrack2(nn.Module, PyTorchModelHubMixin):
     ):
         # step 1 allocate the query points on the grid
         T, C, H, W = video.shape
-
+
         if annots_train is not None:
             vis_gt = annots_train["vis"]
             _, _, N = vis_gt.shape
@@ -296,39 +300,6 @@ class SpaTrack2(nn.Module, PyTorchModelHubMixin):
                          **kwargs, annots=annots)
             if self.training:
                 loss += out["loss"].squeeze()
-                # from models.SpaTrackV2.utils.visualizer import Visualizer
-                # vis_track = Visualizer(grayscale=False,
-                #                        fps=10, pad_value=50, tracks_leave_trace=0)
-                # vis_track.visualize(video=segment,
-                #                     tracks=out["traj_est"][...,:2],
-                #                     visibility=out["vis_est"],
-                #                     save_video=True)
-                # # visualize 4d
-                # import os, json
-                # import os.path as osp
-                # viser4d_dir = os.path.join("viser_4d_results")
-                # os.makedirs(viser4d_dir, exist_ok=True)
-                # depth_est = annots["depth_gt"][0]
-                # unc_metric = out["unc_metric"]
-                # mask = (unc_metric > 0.5).squeeze(1)
-                # # pose_est = out["poses_pred"].squeeze(0)
-                # pose_est = annots["traj_mat"][0]
-                # rgb_tracks = out["rgb_tracks"].squeeze(0)
-                # intrinsics = out["intrs"].squeeze(0)
-                # for i_k in range(out["depth"].shape[0]):
-                #     img_i = out["imgs_raw"][0][i_k].permute(1, 2, 0).cpu().numpy()
-                #     img_i = cv2.cvtColor(img_i, cv2.COLOR_BGR2RGB)
-                #     cv2.imwrite(osp.join(viser4d_dir, f'frame_{i_k:04d}.png'), img_i)
-                #     if stage == 1:
-                #         depth = depth_est[i_k].squeeze().cpu().numpy()
-                #         np.save(osp.join(viser4d_dir, f'frame_{i_k:04d}.npy'), depth)
-                #     else:
-                #         point_map_vis = out["points_map"][i_k].cpu().numpy()
-                #         np.save(osp.join(viser4d_dir, f'point_{i_k:04d}.npy'), point_map_vis)
-                # np.save(os.path.join(viser4d_dir, f'intrinsics.npy'), intrinsics.cpu().numpy())
-                # np.save(os.path.join(viser4d_dir, f'extrinsics.npy'), pose_est.cpu().numpy())
-                # np.save(os.path.join(viser4d_dir, f'conf.npy'), mask.float().cpu().numpy())
 
             queries_len = len(queries_new)
             # update the track3d and track2d
@@ -720,40 +691,3 @@ class SpaTrack2(nn.Module, PyTorchModelHubMixin):
         }
 
         return ret
-
-
-
-
-    # three stages of training
-
-    # stage 1:
-    # gt depth and intrinsics synthetic (includes Dynamic Replica, Kubric, Pointodyssey, Vkitti, TartanAir and Indoor() ) Motion Patern (tapvid3d)
-    # Tracking and Pose as well -> based on gt depth and intrinsics
-    # (Finished) -> (megasam + base model) vs. tapip3d. (use depth from megasam or pose, which keep the same setting as tapip3d.)
-
-    # stage 2: fixed 3D tracking
-    # Joint depth refiner
-    # input depth from whatever + rgb -> temporal module + scale and shift token -> coarse alignment -> scale and shift
-    # estimate the 3D tracks -> 3D tracks combine with pointmap -> update for pointmap (iteratively) -> residual map B T 3 H W
-    # ongoing two days
-
-    # stage 3: train multi windows by propagation
-    # 4 frames overlapped -> train on 64 -> fozen image encoder and finetuning the transformer (learnable parameters pretty small)
-
-    # types of scenarioes:
-    # 1. auto driving (waymo open dataset)
-    # 2. robot
-    # 3. internet ego video
-
-
-    # Iterative Transformer -- Solver -- General Neural MegaSAM + Tracks
-    # Update Variables:
-    # 1. 3D tracks B T N 3 xyz.
-    # 2. 2D tracks B T N 2 x y.
-    # 3. Dynamic Mask B T H W.
-    # 4. Camera Pose B T 4 4.
-    # 5. Video Depth.
-
-    # (RGB, RGBD, RGBD+Pose) x (Static, Dynamic)
-    # Campatiablity by product.
```
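The new moge_as_base flag makes the MoGe backbone optional, which is what allows a Hub checkpoint to carry everything Predictor needs on its own. The added branch can be read in isolation as follows (a sketch: the function wrapper and the MoGeModel import path are assumptions; the two load paths mirror the diff):

```python
import os
import torch
from moge.model import MoGeModel  # import path is an assumption

def resolve_base_model(moge_as_base, backbone_ckpt_dir):
    """Mirror of the branch added to SpaTrack2.__init__ (sketch)."""
    if not moge_as_base:
        return None  # no monocular backbone; the HubMixin checkpoint supplies all weights
    if backbone_ckpt_dir is None or not os.path.exists(backbone_ckpt_dir):
        # no local checkpoint: pull MoGe from the Hub
        return MoGeModel.from_pretrained('Ruicheng/moge-vitl')
    # local checkpoint: rebuild the model from its stored config, then load weights
    checkpoint = torch.load(backbone_ckpt_dir, map_location='cpu', weights_only=True)
    base_model = MoGeModel(**checkpoint["model_config"])
    base_model.load_state_dict(checkpoint['model'])
    return base_model
```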
models/SpaTrackV2/models/predictor.py CHANGED

```diff
@@ -16,80 +16,20 @@ from typing import Union, Optional
 import cv2
 import os
 import decord
+from huggingface_hub import PyTorchModelHubMixin # used for model hub
 
-class Predictor(torch.nn.Module):
+class Predictor(torch.nn.Module, PyTorchModelHubMixin):
     def __init__(self, args=None):
         super().__init__()
         self.args = args
         self.spatrack = SpaTrack2(loggers=[None, None, None], **args)
-        self.S_wind = args
-        self.overlap = args
+        self.S_wind = args["Track_cfg"]["s_wind"]
+        self.overlap = args["Track_cfg"]["overlap"]
 
     def to(self, device: Union[str, torch.device]):
         self.spatrack.to(device)
-        self.spatrack.base_model
-
-    @classmethod
-    def from_pretrained(
-        cls,
-        pretrained_model_name_or_path: Union[str, Path],
-        *,
-        force_download: bool = False,
-        cache_dir: Optional[str] = None,
-        device: Optional[Union[str, torch.device]] = None,
-        model_cfg: Optional[dict] = None,
-        **kwargs,
-    ) -> "SpaTrack2":
-        """
-        Load a pretrained model from a local file or a remote repository.
-
-        Args:
-            pretrained_model_name_or_path (str or Path):
-                - Path to a local model file (e.g., `./model.pth`).
-                - HuggingFace Hub model ID (e.g., `username/model-name`).
-            force_download (bool, optional):
-                Whether to force re-download even if cached. Default: False.
-            cache_dir (str, optional):
-                Custom cache directory. Default: None (use default cache).
-            device (str or torch.device, optional):
-                Target device (e.g., "cuda", "cpu"). Default: None (keep original).
-            **kwargs:
-                Additional config overrides.
-
-        Returns:
-            SpaTrack2: Loaded pretrained model.
-        """
-        # (1) check the path is local or remote
-        if isinstance(pretrained_model_name_or_path, Path):
-            model_path = str(pretrained_model_name_or_path)
-        else:
-            model_path = pretrained_model_name_or_path
-        # (2) if the path is remote, download it
-        if not os.path.exists(model_path):
-            raise NotImplementedError("Remote download not implemented yet. Use a local path.")
-        # (3) load the model weights
-        state_dict = torch.load(model_path, map_location="cpu")
-        # (4) initialize the model (can load config.json if exists)
-        config_path = os.path.join(os.path.dirname(model_path), "config.json")
-        config = {}
-        if os.path.exists(config_path):
-            import json
-            with open(config_path, "r") as f:
-                config.update(json.load(f))
-        config.update(kwargs) # allow override the config
-        if model_cfg is not None:
-            config = model_cfg
-        model = cls(config)
-        if "model" in state_dict:
-            model.spatrack.load_state_dict(state_dict["model"], strict=False)
-        else:
-            model.spatrack.load_state_dict(state_dict, strict=False)
-        # (5) device management
-        if device is not None:
-            model.to(device)
-
-        return model
+        if self.spatrack.base_model is not None:
+            self.spatrack.base_model.to(device)
 
     def forward(self, video: str|torch.Tensor|np.ndarray,
                 depth: str|torch.Tensor|np.ndarray=None,
@@ -145,7 +85,6 @@ class Predictor(torch.nn.Module):
                       window_len=self.S_wind, overlap_len=self.overlap, track2d_gt=track2d_gt, full_point=full_point, iters_track=iters_track,
                       fixed_cam=fixed_cam, query_no_BA=query_no_BA, stage=stage, support_frame=support_frame, replace_ratio=replace_ratio) + (video[:T_],)
 
-
         return ret
 
 
```
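This is the core of the commit: roughly sixty lines of hand-rolled checkpoint plumbing (local paths only, with remote download left unimplemented) are replaced by inheriting huggingface_hub's PyTorchModelHubMixin, which contributes from_pretrained, save_pretrained, and push_to_hub. A minimal standalone sketch of the pattern (class and repo names are illustrative, not from the repo):

```python
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin

class TinyPredictor(nn.Module, PyTorchModelHubMixin):
    def __init__(self, hidden: int = 16):
        super().__init__()
        self.fc = nn.Linear(hidden, hidden)

    def forward(self, x):
        return self.fc(x)

model = TinyPredictor(hidden=32)
model.save_pretrained("tiny-predictor")                      # writes weights + config.json
reloaded = TinyPredictor.from_pretrained("tiny-predictor")   # rebuilds from config, loads weights
# model.push_to_hub("username/tiny-predictor")               # same mechanics against the Hub
```

The mixin serializes the JSON-serializable __init__ kwargs into config.json, which is presumably why Predictor.__init__ can keep taking a plain args dict and why app.py can now call Predictor.from_pretrained("Yuxihenry/SpatialTrackerV2-Offline") directly.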
models/SpaTrackV2/models/tracker3D/TrackRefiner.py CHANGED

```diff
@@ -24,14 +24,13 @@ from models.SpaTrackV2.models.tracker3D.spatrack_modules.utils import (
 )
 from models.SpaTrackV2.models.tracker3D.spatrack_modules.ba import extract_static_from_3DTracks, ba_pycolmap
 from models.SpaTrackV2.models.tracker3D.spatrack_modules.pointmap_updator import PointMapUpdator
-from models.SpaTrackV2.models.depth_refiner.depth_refiner import TrackStablizer
 from models.SpaTrackV2.models.tracker3D.spatrack_modules.alignment import affine_invariant_global_loss
 from models.SpaTrackV2.models.tracker3D.delta_utils.upsample_transformer import UpsampleTransformerAlibi
 
 class TrackRefiner3D(CoTrackerThreeOffline):
 
     def __init__(self, args=None):
-        super().__init__(**args
+        super().__init__(**args["base"])
 
         """
         This is 3D warpper from cotracker, which load the cotracker pretrain and
@@ -47,15 +46,7 @@ class TrackRefiner3D(CoTrackerThreeOffline):
         self.proj_xyz_embed = Mlp(in_features=1210+50, hidden_features=1110, out_features=1110)
         # get the anchor point's embedding, and init the pts refiner
         update_pts = True
-
-        # CorrPointformer(
-        #     dim=128,
-        #     num_heads=8,
-        #     head_dim=128 // 8,
-        #     mlp_ratio=4.0,
-        # )
-        # for _ in range(self.corr_levels)
-        # ])
+
         self.corr_transformer = nn.ModuleList([
             CorrPointformer(
                 dim=128,
@@ -68,29 +59,11 @@ class TrackRefiner3D(CoTrackerThreeOffline):
         self.fnet = BasicEncoder(input_dim=3,
                                  output_dim=self.latent_dim, stride=self.stride)
         self.corr3d_radius = 3
-
-        if args.stablizer:
-            self.scale_shift_tokens = nn.Parameter(torch.randn(1, 2, self.latent_dim, requires_grad=True))
-            self.upsample_kernel_size = 5
-            self.residual_embedding = nn.Parameter(torch.randn(
-                self.latent_dim, self.model_resolution[0]//16,
-                self.model_resolution[1]//16, requires_grad=True))
-            self.dense_mlp = nn.Conv2d(2*self.latent_dim+63, self.latent_dim, kernel_size=1, stride=1, padding=0)
-            self.upsample_factor = 4
-            self.upsample_transformer = UpsampleTransformerAlibi(
-                kernel_size=self.upsample_kernel_size, # kernel_size=3, #
-                stride=self.stride,
-                latent_dim=self.latent_dim,
-                num_attn_blocks=2,
-                upsample_factor=4,
-            )
-        else:
-            self.update_pointmap = None
 
-        self.mode = args
+        self.mode = args["mode"]
         if self.mode == "online":
-            self.s_wind = args
-            self.overlap = args
+            self.s_wind = args["s_wind"]
+            self.overlap = args["overlap"]
 
     def upsample_with_mask(
         self, inp: torch.Tensor, mask: torch.Tensor
@@ -1062,29 +1035,7 @@ class TrackRefiner3D(CoTrackerThreeOffline):
             vis_est = (vis_est>0.5).float()
             sync_loss += (vis_est.detach()[...,None]*(coords_proj_curr - coords_proj).norm(dim=-1, keepdim=True)*(1-mask_nan[...,None].float())).mean()
             # coords_proj_curr[~mask_nan.view(B*T, N)] = coords_proj.view(B*T, N, 2)[~mask_nan.view(B*T, N)].to(coords_proj_curr.dtype)
-
-            # import pdb; pdb.set_trace()
-
-            if False:
-                point_map_resize = point_map.clone().view(B, T, 3, H, W)
-                update_input = torch.cat([point_map_resize, metric_unc.view(B,T,1,H,W)], dim=2)
-                coords_append_resize = coords.clone().detach()
-                coords_append_resize[..., :2] = coords_append_resize[..., :2] * float(self.stride)
-                update_track_input = self.norm_xyz(cam_pts_est)*5
-                update_track_input = torch.cat([update_track_input, vis_est[...,None]], dim=-1)
-                update_track_input = posenc(update_track_input, min_deg=0, max_deg=12)
-                update = self.update_pointmap.stablizer(update_input,
-                                                        update_track_input, coords_append_resize)#, imgs=video, vis_track=viser)
-                #NOTE: update the point map
-                point_map_resize += update
-                point_map_refine_out = F.interpolate(point_map_resize.view(B*T, -1, H, W),
-                                                     size=(self.image_size[0].item(), self.image_size[1].item()), mode='nearest')
-                point_map_refine_out = rearrange(point_map_refine_out, '(b t) c h w -> b t c h w', t=T, b=B)
-                point_map_preds.append(self.denorm_xyz(point_map_refine_out))
-                point_map_org = self.denorm_xyz(point_map_refine_out).view(B*T, 3, H_, W_)
-
-            # if torch.isnan(coords).sum()>0:
-            #     import pdb; pdb.set_trace()
+
             #NOTE: the 2d tracking + unproject depth
             fix_cam_est = coords_append.clone()
             fix_cam_est[...,2] = depth_unproj
```
requirements.txt CHANGED

```diff
@@ -22,8 +22,8 @@ git+https://github.com/facebookresearch/segment-anything.git
 git+https://github.com/EasternJournalist/utils3d.git#egg=utils3d
 huggingface_hub
 pyceres
-kornia
-xformers
+kornia==0.8.1
+xformers==0.0.28
 timm
 PyJWT
 gdown
```
|