Initial commit

nikitakaraevv
2023-07-17 17:49:06 -07:00
commit 6d62d873fa
41 changed files with 5989 additions and 0 deletions

View File

@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

View File

@@ -0,0 +1,6 @@
defaults:
  - default_config_eval

exp_dir: ./outputs/cotracker

dataset_name: badja

View File

@@ -0,0 +1,6 @@
defaults:
  - default_config_eval

exp_dir: ./outputs/cotracker

dataset_name: fastcapture

View File

@@ -0,0 +1,6 @@
defaults:
  - default_config_eval

exp_dir: ./outputs/cotracker

dataset_name: tapvid_davis_first

View File

@@ -0,0 +1,6 @@
defaults:
  - default_config_eval

exp_dir: ./outputs/cotracker

dataset_name: tapvid_davis_strided

View File

@@ -0,0 +1,6 @@
defaults:
  - default_config_eval

exp_dir: ./outputs/cotracker

dataset_name: tapvid_kinetics_first
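
These five YAML stubs only pick the benchmark (dataset_name) and the output directory (exp_dir); everything else comes from the default_config_eval node that evaluate.py (shown further below) registers in Hydra's ConfigStore from its DefaultConfig dataclass. A rough sketch of what that composition amounts to, assuming the file lives at cotracker/evaluation/evaluate.py:

# Illustration only, not part of the commit: roughly what Hydra produces when the
# YAML stub above is selected on top of the registered DefaultConfig node.
from omegaconf import OmegaConf

from cotracker.evaluation.evaluate import DefaultConfig

base = OmegaConf.structured(DefaultConfig)
override = OmegaConf.create(
    {"exp_dir": "./outputs/cotracker", "dataset_name": "tapvid_kinetics_first"}
)
cfg = OmegaConf.merge(base, override)
print(cfg.dataset_name)  # tapvid_kinetics_first
print(cfg.checkpoint)    # ./checkpoints/cotracker_stride_4_wind_8.pth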

View File

@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

View File

@@ -0,0 +1,144 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
from typing import Mapping
def compute_tapvid_metrics(
    query_points: np.ndarray,
    gt_occluded: np.ndarray,
    gt_tracks: np.ndarray,
    pred_occluded: np.ndarray,
    pred_tracks: np.ndarray,
    query_mode: str,
) -> Mapping[str, np.ndarray]:
    """Computes TAP-Vid metrics (Jaccard, Pts. Within Thresh, Occ. Acc.)

    See the TAP-Vid paper for details on the metric computation. All inputs are
    given in raster coordinates. The first three arguments should be the direct
    outputs of the reader: the 'query_points', 'occluded', and 'target_points'.
    The paper metrics assume these are scaled relative to 256x256 images.
    pred_occluded and pred_tracks are your algorithm's predictions.

    This function takes a batch of inputs, and computes metrics separately for
    each video. The metrics for the full benchmark are a simple mean of the
    metrics across the full set of videos. These numbers are between 0 and 1,
    but the paper multiplies them by 100 to ease reading.

    Args:
        query_points: The query points, in the format [t, y, x]. Its size is
            [b, n, 3], where b is the batch size and n is the number of queries.
        gt_occluded: A boolean array of shape [b, n, t], where t is the number
            of frames. True indicates that the point is occluded.
        gt_tracks: The target points, of shape [b, n, t, 2]. Each point is in
            the format [x, y].
        pred_occluded: A boolean array of predicted occlusions, in the same
            format as gt_occluded.
        pred_tracks: An array of track predictions from your algorithm, in the
            same format as gt_tracks.
        query_mode: Either 'first' or 'strided', depending on how queries are
            sampled. If 'first', we assume the prior knowledge that all points
            before the query point are occluded, and these are removed from the
            evaluation.

    Returns:
        A dict with the following keys:
            occlusion_accuracy: Accuracy at predicting occlusion.
            pts_within_{x} for x in [1, 2, 4, 8, 16]: Fraction of points
                predicted to be within the given pixel threshold, ignoring
                occlusion prediction.
            jaccard_{x} for x in [1, 2, 4, 8, 16]: Jaccard metric for the given
                threshold.
            average_pts_within_thresh: average across pts_within_{x}.
            average_jaccard: average across jaccard_{x}.
    """
    metrics = {}

    # Don't evaluate the query point. Numpy doesn't have one_hot, so we
    # replicate it by indexing into an identity matrix.
    one_hot_eye = np.eye(gt_tracks.shape[2])
    query_frame = query_points[..., 0]
    query_frame = np.round(query_frame).astype(np.int32)
    evaluation_points = one_hot_eye[query_frame] == 0

    # If we're using the first point on the track as a query, don't evaluate the
    # other points.
    if query_mode == "first":
        for i in range(gt_occluded.shape[0]):
            index = np.where(gt_occluded[i] == 0)[0][0]
            evaluation_points[i, :index] = False
    elif query_mode != "strided":
        raise ValueError("Unknown query mode " + query_mode)

    # Occlusion accuracy is simply how often the predicted occlusion equals the
    # ground truth.
    occ_acc = (
        np.sum(
            np.equal(pred_occluded, gt_occluded) & evaluation_points,
            axis=(1, 2),
        )
        / np.sum(evaluation_points)
    )
    metrics["occlusion_accuracy"] = occ_acc

    # Next, convert the predictions and ground truth positions into pixel
    # coordinates.
    visible = np.logical_not(gt_occluded)
    pred_visible = np.logical_not(pred_occluded)
    all_frac_within = []
    all_jaccard = []
    for thresh in [1, 2, 4, 8, 16]:
        # True positives are points that are within the threshold and where both
        # the prediction and the ground truth are listed as visible.
        within_dist = (
            np.sum(
                np.square(pred_tracks - gt_tracks),
                axis=-1,
            )
            < np.square(thresh)
        )
        is_correct = np.logical_and(within_dist, visible)

        # Compute the frac_within_threshold, which is the fraction of points
        # within the threshold among points that are visible in the ground truth,
        # ignoring whether they're predicted to be visible.
        count_correct = np.sum(
            is_correct & evaluation_points,
            axis=(1, 2),
        )
        count_visible_points = np.sum(visible & evaluation_points, axis=(1, 2))
        frac_correct = count_correct / count_visible_points
        metrics["pts_within_" + str(thresh)] = frac_correct
        all_frac_within.append(frac_correct)

        true_positives = np.sum(
            is_correct & pred_visible & evaluation_points, axis=(1, 2)
        )

        # The denominator of the jaccard metric is the true positives plus
        # false positives plus false negatives. However, note that true positives
        # plus false negatives is simply the number of points in the ground truth
        # which is easier to compute than trying to compute all three quantities.
        # Thus we just add the number of points in the ground truth to the number
        # of false positives.
        #
        # False positives are simply points that are predicted to be visible,
        # but the ground truth is not visible or too far from the prediction.
        gt_positives = np.sum(visible & evaluation_points, axis=(1, 2))
        false_positives = (~visible) & pred_visible
        false_positives = false_positives | ((~within_dist) & pred_visible)
        false_positives = np.sum(false_positives & evaluation_points, axis=(1, 2))
        jaccard = true_positives / (gt_positives + false_positives)
        metrics["jaccard_" + str(thresh)] = jaccard
        all_jaccard.append(jaccard)

    metrics["average_jaccard"] = np.mean(
        np.stack(all_jaccard, axis=1),
        axis=1,
    )
    metrics["average_pts_within_thresh"] = np.mean(
        np.stack(all_frac_within, axis=1),
        axis=1,
    )
    return metrics
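
As the comments above spell out, jaccard_{x} is TP / (TP + FP + FN), with TP + FN replaced by the count of visible ground-truth points. Below is a small synthetic sanity check of compute_tapvid_metrics (not part of the commit); the shapes follow the docstring ([b, n, 3] queries, [b, n, t] occlusion flags, [b, n, t, 2] tracks) and the values are made up:

# Illustration only, not part of the commit: exercise compute_tapvid_metrics on dummy data.
import numpy as np

from cotracker.evaluation.core.eval_utils import compute_tapvid_metrics

b, n, t = 1, 4, 8
rng = np.random.default_rng(0)
query_points = np.zeros((b, n, 3))  # every query at frame 0, in [t, y, x] order
gt_tracks = rng.uniform(0.0, 255.0, size=(b, n, t, 2))  # [x, y] in 256x256 raster coords
gt_occluded = np.zeros((b, n, t), dtype=bool)  # all ground-truth points visible
pred_tracks = gt_tracks + rng.normal(scale=2.0, size=gt_tracks.shape)  # noisy predictions
pred_occluded = np.zeros_like(gt_occluded)  # predict everything visible

out = compute_tapvid_metrics(
    query_points, gt_occluded, gt_tracks, pred_occluded, pred_tracks, query_mode="strided"
)
print(out["occlusion_accuracy"], out["pts_within_4"], out["average_jaccard"])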

View File

@@ -0,0 +1,252 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
from collections import defaultdict
import os
from typing import Optional
import torch
from tqdm import tqdm
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from cotracker.datasets.utils import dataclass_to_cuda_
from cotracker.utils.visualizer import Visualizer
from cotracker.models.core.model_utils import reduce_masked_mean
from cotracker.evaluation.core.eval_utils import compute_tapvid_metrics
import logging
class Evaluator:
"""
A class defining the CoTracker evaluator.
"""
def __init__(self, exp_dir) -> None:
# Visualization
self.exp_dir = exp_dir
os.makedirs(exp_dir, exist_ok=True)
self.visualization_filepaths = defaultdict(lambda: defaultdict(list))
self.visualize_dir = os.path.join(exp_dir, "visualisations")
def compute_metrics(self, metrics, sample, pred_trajectory, dataset_name):
if isinstance(pred_trajectory, tuple):
pred_trajectory, pred_visibility = pred_trajectory
else:
pred_visibility = None
if dataset_name == "badja":
sample.segmentation = (sample.segmentation > 0).float()
*_, N, _ = sample.trajectory.shape
accs = []
accs_3px = []
for s1 in range(1, sample.video.shape[1]): # target frame
for n in range(N):
vis = sample.visibility[0, s1, n]
if vis > 0:
coord_e = pred_trajectory[0, s1, n] # 2
coord_g = sample.trajectory[0, s1, n] # 2
dist = torch.sqrt(torch.sum((coord_e - coord_g) ** 2, dim=0))
area = torch.sum(sample.segmentation[0, s1])
# print_('0.2*sqrt(area)', 0.2*torch.sqrt(area))
thr = 0.2 * torch.sqrt(area)
# correct =
accs.append((dist < thr).float())
# print('thr',thr)
accs_3px.append((dist < 3.0).float())
res = torch.mean(torch.stack(accs)) * 100.0
res_3px = torch.mean(torch.stack(accs_3px)) * 100.0
metrics[sample.seq_name[0]] = res.item()
metrics[sample.seq_name[0] + "_accuracy"] = res_3px.item()
print(metrics)
print(
"avg", np.mean([v for k, v in metrics.items() if "accuracy" not in k])
)
print(
"avg acc 3px",
np.mean([v for k, v in metrics.items() if "accuracy" in k]),
)
elif dataset_name == "fastcapture" or ("kubric" in dataset_name):
*_, N, _ = sample.trajectory.shape
accs = []
for s1 in range(1, sample.video.shape[1]): # target frame
for n in range(N):
vis = sample.visibility[0, s1, n]
if vis > 0:
coord_e = pred_trajectory[0, s1, n] # 2
coord_g = sample.trajectory[0, s1, n] # 2
dist = torch.sqrt(torch.sum((coord_e - coord_g) ** 2, dim=0))
thr = 3
correct = (dist < thr).float()
accs.append(correct)
res = torch.mean(torch.stack(accs)) * 100.0
metrics[sample.seq_name[0] + "_accuracy"] = res.item()
print(metrics)
print("avg", np.mean([v for v in metrics.values()]))
elif "tapvid" in dataset_name:
B, T, N, D = sample.trajectory.shape
traj = sample.trajectory.clone()
thr = 0.9
if pred_visibility is None:
logging.warning("visibility is NONE")
pred_visibility = torch.zeros_like(sample.visibility)
if not pred_visibility.dtype == torch.bool:
pred_visibility = pred_visibility > thr
# pred_trajectory
query_points = sample.query_points.clone().cpu().numpy()
pred_visibility = pred_visibility[:, :, :N]
pred_trajectory = pred_trajectory[:, :, :N]
gt_tracks = traj.permute(0, 2, 1, 3).cpu().numpy()
gt_occluded = (
torch.logical_not(sample.visibility.clone().permute(0, 2, 1))
.cpu()
.numpy()
)
pred_occluded = (
torch.logical_not(pred_visibility.clone().permute(0, 2, 1))
.cpu()
.numpy()
)
pred_tracks = pred_trajectory.permute(0, 2, 1, 3).cpu().numpy()
out_metrics = compute_tapvid_metrics(
query_points,
gt_occluded,
gt_tracks,
pred_occluded,
pred_tracks,
query_mode="strided" if "strided" in dataset_name else "first",
)
metrics[sample.seq_name[0]] = out_metrics
for metric_name in out_metrics.keys():
if "avg" not in metrics:
metrics["avg"] = {}
metrics["avg"][metric_name] = np.mean(
[v[metric_name] for k, v in metrics.items() if k != "avg"]
)
logging.info(f"Metrics: {out_metrics}")
logging.info(f"avg: {metrics['avg']}")
print("metrics", out_metrics)
print("avg", metrics["avg"])
else:
rgbs = sample.video
trajs_g = sample.trajectory
valids = sample.valid
vis_g = sample.visibility
B, S, C, H, W = rgbs.shape
assert C == 3
B, S, N, D = trajs_g.shape
assert torch.sum(valids) == B * S * N
vis_g = (torch.sum(vis_g, dim=1, keepdim=True) >= 4).float().repeat(1, S, 1)
ate = torch.norm(pred_trajectory - trajs_g, dim=-1) # B, S, N
metrics["things_all"] = reduce_masked_mean(ate, valids).item()
metrics["things_vis"] = reduce_masked_mean(ate, valids * vis_g).item()
metrics["things_occ"] = reduce_masked_mean(
ate, valids * (1.0 - vis_g)
).item()
    @torch.no_grad()
    def evaluate_sequence(
        self,
        model,
        test_dataloader: torch.utils.data.DataLoader,
        dataset_name: str,
        train_mode=False,
        writer: Optional[SummaryWriter] = None,
        step: Optional[int] = 0,
    ):
        metrics = {}

        vis = Visualizer(
            save_dir=self.exp_dir,
            fps=7,
        )

        for ind, sample in enumerate(tqdm(test_dataloader)):
            if isinstance(sample, tuple):
                sample, gotit = sample
                if not all(gotit):
                    print("batch is None")
                    continue
            dataclass_to_cuda_(sample)

            if (
                not train_mode
                and hasattr(model, "sequence_len")
                and (sample.visibility[:, : model.sequence_len].sum() == 0)
            ):
                print(f"skipping batch {ind}")
                continue

            if "tapvid" in dataset_name:
                queries = sample.query_points.clone().float()
                queries = torch.stack(
                    [
                        queries[:, :, 0],
                        queries[:, :, 2],
                        queries[:, :, 1],
                    ],
                    dim=2,
                )
            else:
                queries = torch.cat(
                    [
                        torch.zeros_like(sample.trajectory[:, 0, :, :1]),
                        sample.trajectory[:, 0],
                    ],
                    dim=2,
                )

            pred_tracks = model(sample.video, queries)
            if "strided" in dataset_name:
                inv_video = sample.video.flip(1).clone()
                inv_queries = queries.clone()
                inv_queries[:, :, 0] = inv_video.shape[1] - inv_queries[:, :, 0] - 1

                pred_trj, pred_vsb = pred_tracks
                inv_pred_trj, inv_pred_vsb = model(inv_video, inv_queries)

                inv_pred_trj = inv_pred_trj.flip(1)
                inv_pred_vsb = inv_pred_vsb.flip(1)

                mask = pred_trj == 0

                pred_trj[mask] = inv_pred_trj[mask]
                pred_vsb[mask[:, :, :, 0]] = inv_pred_vsb[mask[:, :, :, 0]]

                pred_tracks = pred_trj, pred_vsb

            if dataset_name == "badja" or dataset_name == "fastcapture":
                seq_name = sample.seq_name[0]
            else:
                seq_name = str(ind)

            vis.visualize(
                sample.video,
                pred_tracks[0] if isinstance(pred_tracks, tuple) else pred_tracks,
                filename=dataset_name + "_" + seq_name,
                writer=writer,
                step=step,
            )

            self.compute_metrics(metrics, sample, pred_tracks, dataset_name)
        return metrics
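
A hedged sketch of wiring the Evaluator up by hand, mirroring what evaluate.py further below does; the checkpoint and dataset paths are placeholders, while the constructors and signatures are the ones used in that file:

# Illustration only, not part of the commit. Paths and the output directory are placeholders.
import torch

from cotracker.datasets.tap_vid_datasets import TapVidDataset
from cotracker.datasets.utils import collate_fn
from cotracker.evaluation.core.evaluator import Evaluator
from cotracker.models.build_cotracker import build_cotracker
from cotracker.models.evaluation_predictor import EvaluationPredictor

model = build_cotracker("./checkpoints/cotracker_stride_4_wind_8.pth")
predictor = EvaluationPredictor(
    model, grid_size=6, local_grid_size=6, single_point=True, n_iters=6
)
dataset = TapVidDataset(
    dataset_type="davis",
    data_root="./tapvid_davis/tapvid_davis.pkl",
    queried_first=True,
)
loader = torch.utils.data.DataLoader(
    dataset, batch_size=1, shuffle=False, collate_fn=collate_fn
)
metrics = Evaluator(exp_dir="./outputs/cotracker").evaluate_sequence(
    predictor, loader, dataset_name="tapvid_davis_first"
)
print(metrics["avg"])  # running TAP-Vid averages accumulated by compute_metrics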

View File

@@ -0,0 +1,179 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import json
import os
from dataclasses import dataclass, field
import hydra
import numpy as np
import torch
from omegaconf import OmegaConf
from cotracker.datasets.badja_dataset import BadjaDataset
from cotracker.datasets.fast_capture_dataset import FastCaptureDataset
from cotracker.datasets.tap_vid_datasets import TapVidDataset
from cotracker.datasets.utils import collate_fn
from cotracker.models.evaluation_predictor import EvaluationPredictor
from cotracker.evaluation.core.evaluator import Evaluator
from cotracker.models.build_cotracker import (
    build_cotracker,
)
@dataclass(eq=False)
class DefaultConfig:
    # Directory where all outputs of the experiment will be saved.
    exp_dir: str = "./outputs"

    # Name of the dataset to be used for the evaluation.
    dataset_name: str = "badja"
    # The root directory of the dataset.
    dataset_root: str = "./"

    # Path to the pre-trained model checkpoint to be used for the evaluation.
    # The default value is the path to a specific CoTracker model checkpoint.
    # Other available options are commented.
    checkpoint: str = "./checkpoints/cotracker_stride_4_wind_8.pth"
    # cotracker_stride_4_wind_12
    # cotracker_stride_8_wind_16

    # EvaluationPredictor parameters
    # The size (N) of the support grid used in the predictor.
    # The total number of points is (N*N).
    grid_size: int = 6
    # The size (N) of the local support grid.
    local_grid_size: int = 6
    # A flag indicating whether to evaluate one ground truth point at a time.
    single_point: bool = True
    # The number of iterative updates for each sliding window.
    n_iters: int = 6

    seed: int = 0
    gpu_idx: int = 0

    # Override hydra's working directory to current working dir,
    # also disable storing the .hydra logs:
    hydra: dict = field(
        default_factory=lambda: {
            "run": {"dir": "."},
            "output_subdir": None,
        }
    )
def run_eval(cfg: DefaultConfig):
"""
The function evaluates CoTracker on a specified benchmark dataset based on a provided configuration.
Args:
cfg (DefaultConfig): An instance of DefaultConfig class which includes:
- exp_dir (str): The directory path for the experiment.
- dataset_name (str): The name of the dataset to be used.
- dataset_root (str): The root directory of the dataset.
- checkpoint (str): The path to the CoTracker model's checkpoint.
- single_point (bool): A flag indicating whether to evaluate one ground truth point at a time.
- n_iters (int): The number of iterative updates for each sliding window.
- seed (int): The seed for setting the random state for reproducibility.
- gpu_idx (int): The index of the GPU to be used.
"""
# Creating the experiment directory if it doesn't exist
os.makedirs(cfg.exp_dir, exist_ok=True)
# Saving the experiment configuration to a .yaml file in the experiment directory
cfg_file = os.path.join(cfg.exp_dir, "expconfig.yaml")
with open(cfg_file, "w") as f:
OmegaConf.save(config=cfg, f=f)
evaluator = Evaluator(cfg.exp_dir)
cotracker_model = build_cotracker(cfg.checkpoint)
# Creating the EvaluationPredictor object
predictor = EvaluationPredictor(
cotracker_model,
grid_size=cfg.grid_size,
local_grid_size=cfg.local_grid_size,
single_point=cfg.single_point,
n_iters=cfg.n_iters,
)
# Setting the random seeds
torch.manual_seed(cfg.seed)
np.random.seed(cfg.seed)
# Constructing the specified dataset
curr_collate_fn = collate_fn
if cfg.dataset_name == "badja":
test_dataset = BadjaDataset(data_root=os.path.join(cfg.dataset_root, "BADJA"))
elif cfg.dataset_name == "fastcapture":
test_dataset = FastCaptureDataset(
data_root=os.path.join(cfg.dataset_root, "fastcapture"),
max_seq_len=100,
max_num_points=20,
)
elif "tapvid" in cfg.dataset_name:
dataset_type = cfg.dataset_name.split("_")[1]
if dataset_type == "davis":
data_root = os.path.join(cfg.dataset_root, "/tapvid_davis/tapvid_davis.pkl")
elif dataset_type == "kinetics":
data_root = os.path.join(
cfg.dataset_root, "/kinetics/kinetics-dataset/k700-2020/tapvid_kinetics"
)
test_dataset = TapVidDataset(
dataset_type=dataset_type,
data_root=data_root,
queried_first=not "strided" in cfg.dataset_name,
)
# Creating the DataLoader object
test_dataloader = torch.utils.data.DataLoader(
test_dataset,
batch_size=1,
shuffle=False,
num_workers=14,
collate_fn=curr_collate_fn,
)
# Timing and conducting the evaluation
import time
start = time.time()
evaluate_result = evaluator.evaluate_sequence(
predictor,
test_dataloader,
dataset_name=cfg.dataset_name,
)
end = time.time()
print(end - start)
# Saving the evaluation results to a .json file
if not "tapvid" in cfg.dataset_name:
print("evaluate_result", evaluate_result)
else:
evaluate_result = evaluate_result["avg"]
result_file = os.path.join(cfg.exp_dir, f"result_eval_.json")
evaluate_result["time"] = end - start
print(f"Dumping eval results to {result_file}.")
with open(result_file, "w") as f:
json.dump(evaluate_result, f)
cs = hydra.core.config_store.ConfigStore.instance()
cs.store(name="default_config_eval", node=DefaultConfig)
@hydra.main(config_path="./configs/", config_name="default_config_eval")
def evaluate(cfg: DefaultConfig) -> None:
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = str(cfg.gpu_idx)
run_eval(cfg)
if __name__ == "__main__":
evaluate()
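
The @hydra.main entry point above composes the registered default_config_eval node with whatever is selected from ./configs/. For completeness, a hedged sketch of bypassing the CLI and calling run_eval directly with the config dataclass, assuming this file lives at cotracker/evaluation/evaluate.py; the dataset_root layout is an assumption:

# Illustration only, not part of the commit: run the evaluation without the Hydra CLI.
from cotracker.evaluation.evaluate import DefaultConfig, run_eval

cfg = DefaultConfig(
    exp_dir="./outputs/cotracker",
    dataset_name="tapvid_davis_first",
    dataset_root="./datasets",  # assumed to contain the TAP-Vid DAVIS pickle under this root
    checkpoint="./checkpoints/cotracker_stride_4_wind_8.pth",
)
run_eval(cfg)  # writes expconfig.yaml and result_eval_.json into cfg.exp_dir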