add some comments

2024-08-05 23:36:58 +02:00
parent 36d1566750
commit 6e7bcd2d26
55 changed files with 3946 additions and 4095 deletions
--- a/cotracker/pycache/init.cpython-38.pyc
+++ b/cotracker/pycache/init.cpython-38.pyc
--- a/cotracker/pycache/init.cpython-39.pyc
+++ b/cotracker/pycache/init.cpython-39.pyc
--- a/cotracker/pycache/predictor.cpython-38.pyc
+++ b/cotracker/pycache/predictor.cpython-38.pyc
--- a/cotracker/pycache/predictor.cpython-39.pyc
+++ b/cotracker/pycache/predictor.cpython-39.pyc
--- a/cotracker/models/pycache/init.cpython-38.pyc
+++ b/cotracker/models/pycache/init.cpython-38.pyc
--- a/cotracker/models/pycache/init.cpython-39.pyc
+++ b/cotracker/models/pycache/init.cpython-39.pyc
--- a/cotracker/models/pycache/build_cotracker.cpython-38.pyc
+++ b/cotracker/models/pycache/build_cotracker.cpython-38.pyc
--- a/cotracker/models/pycache/build_cotracker.cpython-39.pyc
+++ b/cotracker/models/pycache/build_cotracker.cpython-39.pyc
--- a/cotracker/models/core/pycache/init.cpython-38.pyc
+++ b/cotracker/models/core/pycache/init.cpython-38.pyc
--- a/cotracker/models/core/pycache/init.cpython-39.pyc
+++ b/cotracker/models/core/pycache/init.cpython-39.pyc
--- a/cotracker/models/core/pycache/embeddings.cpython-38.pyc
+++ b/cotracker/models/core/pycache/embeddings.cpython-38.pyc
--- a/cotracker/models/core/pycache/embeddings.cpython-39.pyc
+++ b/cotracker/models/core/pycache/embeddings.cpython-39.pyc
--- a/cotracker/models/core/pycache/model_utils.cpython-38.pyc
+++ b/cotracker/models/core/pycache/model_utils.cpython-38.pyc
--- a/cotracker/models/core/pycache/model_utils.cpython-39.pyc
+++ b/cotracker/models/core/pycache/model_utils.cpython-39.pyc
--- a/cotracker/models/core/cotracker/pycache/init.cpython-38.pyc
+++ b/cotracker/models/core/cotracker/pycache/init.cpython-38.pyc
--- a/cotracker/models/core/cotracker/pycache/init.cpython-39.pyc
+++ b/cotracker/models/core/cotracker/pycache/init.cpython-39.pyc
--- a/cotracker/models/core/cotracker/pycache/blocks.cpython-38.pyc
+++ b/cotracker/models/core/cotracker/pycache/blocks.cpython-38.pyc
--- a/cotracker/models/core/cotracker/pycache/blocks.cpython-39.pyc
+++ b/cotracker/models/core/cotracker/pycache/blocks.cpython-39.pyc
--- a/cotracker/models/core/cotracker/pycache/cotracker.cpython-38.pyc
+++ b/cotracker/models/core/cotracker/pycache/cotracker.cpython-38.pyc
--- a/cotracker/models/core/cotracker/pycache/cotracker.cpython-39.pyc
+++ b/cotracker/models/core/cotracker/pycache/cotracker.cpython-39.pyc
--- a/cotracker/models/core/cotracker/blocks.py
+++ b/cotracker/models/core/cotracker/blocks.py
@@ -191,6 +191,7 @@ class BasicEncoder(nn.Module):
        x = self.norm1(x)
        x = self.relu1(x)
        # Four residual layers, applied in sequence
        a = self.layer1(x)
        b = self.layer2(a)
        c = self.layer3(b)
--- a/cotracker/models/core/cotracker/cotracker.py
+++ b/cotracker/models/core/cotracker/cotracker.py
@@ -41,6 +41,7 @@ class CoTracker2(nn.Module):
        self.hidden_dim = 256
        self.latent_dim = 128
        self.add_space_attn = add_space_attn
        self.fnet = BasicEncoder(output_dim=self.latent_dim)
        self.num_virtual_tracks = num_virtual_tracks
        self.model_resolution = model_resolution
@@ -107,6 +108,7 @@ class CoTracker2(nn.Module):
        B, S_init, N, __ = track_mask.shape
        B, S, *_ = fmaps.shape
        # Zero-pad track_mask along the frame axis (S_init -> S) so that its
        # frame count matches that of the feature maps.
        track_mask = F.pad(track_mask, (0, 0, 0, 0, 0, S - S_init), "constant")
        track_mask_vis = (
            torch.cat([track_mask, vis], dim=-1).permute(0, 2, 1, 3).reshape(B * N, S, 2)
@@ -171,6 +173,7 @@ class CoTracker2(nn.Module):
            ],
            dim=-1,
        )
        # Sample the feature maps at the given coordinates (bilinear sampling)
        sample_track_feats = sample_features5d(fmaps, sample_coords)
        return sample_track_feats
@@ -227,22 +230,24 @@ class CoTracker2(nn.Module):
        # The first channel is the frame number
        # The rest are the coordinates of points we want to track
-        queried_frames = queries[:, :, 0].long()
+        queried_frames = queries[:, :, 0].long() # 获取帧数字
        queried_coords = queries[..., 1:]
-        queried_coords = queried_coords / self.stride
+        queried_coords = queried_coords / self.stride # 缩放
        # We store our predictions here
-        coords_predicted = torch.zeros((B, T, N, 2), device=device)
+        coords_predicted = torch.zeros((B, T, N, 2), device=device) # 等待处理的预测的点
        vis_predicted = torch.zeros((B, T, N), device=device)
        if is_online:
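            # Online mode: on the first call the stored predictions are initialised
            # with the all-zero coords/vis tensors created above; on later calls the
            # stored predictions are zero-padded along the frame axis for the new window.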
            if self.online_coords_predicted is None:
                # Init online predictions with zeros
                self.online_coords_predicted = coords_predicted
                self.online_vis_predicted = vis_predicted
            else:
                # Pad online predictions with zeros for the current window
-                pad = min(step, T - step)
+                pad = min(step, T - step) # 确保填充量不会超过剩余的时间帧数
                coords_predicted = F.pad(
                    self.online_coords_predicted, (0, 0, 0, 0, 0, pad), "constant"
                )
@@ -250,19 +255,24 @@ class CoTracker2(nn.Module):
        all_coords_predictions, all_vis_predictions = [], []
        # Pad the video so that an integer number of sliding windows fit into it
        # 填充视频，使得一个整数的滑动窗口能够适应它
        # TODO: we may drop this requirement because the transformer should not care
        # TODO: pad the features instead of the video
        # Number of frames to pad: S - T in online mode; otherwise just enough
        # to make T a multiple of S.
        pad = S - T if is_online else (S - T % S) % S  # We don't want to pad if T % S == 0
        # Pad at the end of the frame axis by repeating the last frame `pad` times
        # ("replicate" mode).
        video = F.pad(video.reshape(B, 1, T, C * H * W), (0, 0, 0, pad), "replicate").reshape(
            B, -1, C, H, W
        )
        # Compute convolutional features for the video or for the current chunk in case of online mode
        # 计算视频的卷积特征或者是在线计算当前的块
        fmaps = self.fnet(video.reshape(-1, C, H, W)).reshape(
            B, -1, self.latent_dim, H // self.stride, W // self.stride
        )
        # We compute track features
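        # get_track_feat samples per-track features from the feature maps
        # (presumably via bilinear sampling -- confirm in get_track_feat).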
        track_feat = self.get_track_feat(
            fmaps,
            queried_frames - self.online_ind if is_online else queried_frames,
@@ -284,14 +294,17 @@ class CoTracker2(nn.Module):
        # We process only the current video chunk in the online mode
        indices = [self.online_ind] if is_online else range(0, step * num_windows, step)
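        # Broadcast the queried coordinates to every frame of a window:
        # reshape to (B, 1, N, 2), then expand to (B, S, N, 2).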
        coords_init = queried_coords.reshape(B, 1, N, 2).expand(B, S, N, 2).float()
        vis_init = torch.ones((B, S, N, 1), device=device).float() * 10
        for ind in indices:
            # We copy over coords and vis for tracks that are queried
            # by the end of the previous window, which is ind + overlap
            # Handle the frames that overlap with the previous window
            if ind > 0:
                overlap = S - step
                copy_over = (queried_frames < ind + overlap)[:, None, :, None]  # B 1 N 1
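                # Take the previous predictions for the overlapping frames
                # (divided by the stride) and pad them by `step` frames.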
                coords_prev = torch.nn.functional.pad(
                    coords_predicted[:, ind : ind + overlap] / self.stride,
                    (0, 0, 0, 0, 0, step),
@@ -304,16 +317,18 @@ class CoTracker2(nn.Module):
                )  # B S N 1
                coords_init = torch.where(
                    copy_over.expand_as(coords_init), coords_prev, coords_init
-                )
+                )# True就是coords_prev, False 就是coords_init
                vis_init = torch.where(copy_over.expand_as(vis_init), vis_prev, vis_init)
            # The attention mask is 1 for the spatio-temporal points within
            # a track which is updated in the current window
            # 用于表示在当前窗口内需要更新的时间-空间点
            attention_mask = (queried_frames < ind + S).reshape(B, 1, N).repeat(1, S, 1)  # B S N
            # The track mask is 1 for the spatio-temporal points that actually
            # need updating: only after begin queried, and not if contained
            # in a previous window
            # track_mask表示实际需要更新的
            track_mask = (
                queried_frames[:, None, :, None]
                <= torch.arange(ind, ind + S, device=device)[None, :, None, None]
@@ -323,6 +338,7 @@ class CoTracker2(nn.Module):
                track_mask[:, :overlap, :, :] = False
            # Predict the coordinates and visibility for the current window
            # 用forward_window 来更新coords和vis
            coords, vis = self.forward_window(
                fmaps=fmaps if is_online else fmaps[:, ind : ind + S],
                coords=coords_init,
--- a/cotracker/predictor.py
+++ b/cotracker/predictor.py
@@ -56,6 +56,10 @@ class CoTrackerPredictor(torch.nn.Module):
        return tracks, visibilities
    # gpu dense inference time
    # raft gpu comparison
    # vision effects
    # raft integrated
    def _compute_dense_tracks(self, video, grid_query_frame, grid_size=80, backward_tracking=False):
        *_, H, W = video.shape
        grid_step = W // grid_size
--- a/cotracker/utils/pycache/init.cpython-38.pyc
+++ b/cotracker/utils/pycache/init.cpython-38.pyc
--- a/cotracker/utils/pycache/init.cpython-39.pyc
+++ b/cotracker/utils/pycache/init.cpython-39.pyc
--- a/cotracker/utils/pycache/visualizer.cpython-38.pyc
+++ b/cotracker/utils/pycache/visualizer.cpython-38.pyc
--- a/cotracker/utils/pycache/visualizer.cpython-39.pyc
+++ b/cotracker/utils/pycache/visualizer.cpython-39.pyc
--- a/notebooks/demo.ipynb
+++ b/notebooks/demo.ipynb