From c79beb54777ecb834420671378fad6e0716657df Mon Sep 17 00:00:00 2001 From: Tomasz Stanczyk Date: Fri, 6 Feb 2026 16:37:37 +0100 Subject: [PATCH 01/14] Very first runnable BoT-SORT. Implement camera motion compensation with orb. --- trackers/core/botsort/__init__.py | 8 + trackers/core/botsort/cmc.py | 181 ++++++++++++ trackers/core/botsort/kalman_box_tracker.py | 146 ++++++++++ trackers/core/botsort/tracker.py | 303 ++++++++++++++++++++ 4 files changed, 638 insertions(+) create mode 100644 trackers/core/botsort/__init__.py create mode 100644 trackers/core/botsort/cmc.py create mode 100644 trackers/core/botsort/kalman_box_tracker.py create mode 100644 trackers/core/botsort/tracker.py diff --git a/trackers/core/botsort/__init__.py b/trackers/core/botsort/__init__.py new file mode 100644 index 00000000..e0bc8c7c --- /dev/null +++ b/trackers/core/botsort/__init__.py @@ -0,0 +1,8 @@ +# ------------------------------------------------------------------------ +# Trackers +# Copyright (c) 2026 Roboflow. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +from .tracker import BoTSORTTracker + +__all__ = ["BoTSORTTracker"] \ No newline at end of file diff --git a/trackers/core/botsort/cmc.py b/trackers/core/botsort/cmc.py new file mode 100644 index 00000000..838eb356 --- /dev/null +++ b/trackers/core/botsort/cmc.py @@ -0,0 +1,181 @@ +# ------------------------------------------------------------------------ +# Trackers +# Copyright (c) 2026 Roboflow. All Rights Reserved. 
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Optional + +import copy +import numpy as np +import cv2 + + +@dataclass +class CMCConfig: + downscale: int = 2 + fast_threshold: int = 20 + + # Affine estimation + ransac_reproj_threshold: float = 3.0 + + # Filtering matches by spatial displacement (fraction of image size) + max_spatial_distance_frac: float = 0.25 + + # Keep features from central ROI (avoid borders) + roi_min_frac: float = 0.02 + roi_max_frac: float = 0.98 + + +class CMC: + def __init__(self, cfg: Optional[CMCConfig] = None) -> None: + self.cfg = cfg or CMCConfig() + self.downscale = max(1, int(self.cfg.downscale)) + + self.detector = cv2.FastFeatureDetector_create(self.cfg.fast_threshold) + self.extractor = cv2.ORB_create() + self.matcher = cv2.BFMatcher(cv2.NORM_HAMMING) + + self._initialized = False + self._prev_kps = None + self._prev_desc: Optional[np.ndarray] = None + + def reset(self) -> None: + self._initialized = False + self._prev_kps = None + self._prev_desc = None + + def estimate(self, frame_bgr: np.ndarray, dets_xyxy: Optional[np.ndarray] = None) -> np.ndarray: + if frame_bgr is None: + return np.eye(2, 3, dtype=np.float32) + + H_img, W_img = frame_bgr.shape[:2] + gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY) + + # Downscale for speed / robustness + if self.downscale > 1: + gray = cv2.resize(gray, (W_img // self.downscale, H_img // self.downscale)) + H, W = gray.shape[:2] + + # Build mask: central ROI + remove detections (background features) + mask = np.zeros_like(gray, dtype=np.uint8) + y0 = int(self.cfg.roi_min_frac * H) + y1 = int(self.cfg.roi_max_frac * H) + x0 = int(self.cfg.roi_min_frac * W) + x1 = int(self.cfg.roi_max_frac * W) + mask[y0:y1, x0:x1] = 255 + + if dets_xyxy is not None and len(dets_xyxy) > 0: + dets = 
np.asarray(dets_xyxy, dtype=np.float32) / float(self.downscale) + dets = dets.astype(np.int32) + dets[:, 0] = np.clip(dets[:, 0], 0, W - 1) + dets[:, 2] = np.clip(dets[:, 2], 0, W - 1) + dets[:, 1] = np.clip(dets[:, 1], 0, H - 1) + dets[:, 3] = np.clip(dets[:, 3], 0, H - 1) + for x1b, y1b, x2b, y2b in dets: + if x2b > x1b and y2b > y1b: + mask[y1b:y2b, x1b:x2b] = 0 + + # Detect + describe + kps = self.detector.detect(gray, mask) + kps, desc = self.extractor.compute(gray, kps) + + H_aff = np.eye(2, 3, dtype=np.float32) + + # First frame: only initialize + if not self._initialized: + self._prev_kps = copy.copy(kps) + self._prev_desc = None if desc is None else copy.copy(desc) + self._initialized = True + return H_aff + + # If missing descriptors + if self._prev_desc is None or desc is None or len(desc) == 0: + self._prev_kps = copy.copy(kps) + self._prev_desc = None if desc is None else copy.copy(desc) + return H_aff + + # KNN match (k=2) + ratio test + knn = self.matcher.knnMatch(self._prev_desc, desc, k=2) + if len(knn) == 0: + self._prev_kps = copy.copy(kps) + self._prev_desc = copy.copy(desc) + return H_aff + + max_spatial = self.cfg.max_spatial_distance_frac * np.array([W, H], dtype=np.float32) + + prev_pts = [] + curr_pts = [] + spatial = [] + + for pair in knn: + if len(pair) < 2: + continue + m, n = pair + if m.distance < 0.9 * n.distance: + p_prev = np.array(self._prev_kps[m.queryIdx].pt, dtype=np.float32) + p_curr = np.array(kps[m.trainIdx].pt, dtype=np.float32) + d = p_prev - p_curr + if (abs(d[0]) < max_spatial[0]) and (abs(d[1]) < max_spatial[1]): + spatial.append(d) + prev_pts.append(p_prev) + curr_pts.append(p_curr) + + if len(prev_pts) >= 5: + spatial = np.asarray(spatial, dtype=np.float32) + mean = spatial.mean(axis=0) + std = spatial.std(axis=0) + 1e-6 + inl = np.logical_and( + np.abs(spatial[:, 0] - mean[0]) < 2.5 * std[0], + np.abs(spatial[:, 1] - mean[1]) < 2.5 * std[1], + ) + prev_pts_np = np.asarray(prev_pts, dtype=np.float32)[inl] + 
curr_pts_np = np.asarray(curr_pts, dtype=np.float32)[inl] + + if len(prev_pts_np) >= 5: + H_est, _ = cv2.estimateAffinePartial2D( + prev_pts_np, + curr_pts_np, + method=cv2.RANSAC, + ransacReprojThreshold=self.cfg.ransac_reproj_threshold, + ) + if H_est is not None: + H_aff = H_est.astype(np.float32) + if self.downscale > 1: + H_aff[0, 2] *= self.downscale + H_aff[1, 2] *= self.downscale + + # Update prev + self._prev_kps = copy.copy(kps) + self._prev_desc = copy.copy(desc) + + return H_aff + + @staticmethod + def apply_to_tracks(tracks: list, H: np.ndarray) -> None: + if H is None or len(tracks) == 0: + return + + H = H.astype(np.float32) + R = H[:2, :2] + t = H[:2, 2:3] # (2,1) + + # A4 maps [x1,y1,x2,y2] + A4 = np.zeros((4, 4), dtype=np.float32) + A4[0:2, 0:2] = R + A4[2:4, 2:4] = R + + # A8 maps state (pos and vel blocks) + A8 = np.zeros((8, 8), dtype=np.float32) + A8[0:4, 0:4] = A4 + A8[4:8, 4:8] = A4 + + trans4 = np.array([t[0, 0], t[1, 0], t[0, 0], t[1, 0]], dtype=np.float32).reshape(4, 1) + + for trk in tracks: + trk.state = (A8 @ trk.state).astype(np.float32) + trk.state[0:4] += trans4 + trk.P = (A8 @ trk.P @ A8.T).astype(np.float32) diff --git a/trackers/core/botsort/kalman_box_tracker.py b/trackers/core/botsort/kalman_box_tracker.py new file mode 100644 index 00000000..dc19df67 --- /dev/null +++ b/trackers/core/botsort/kalman_box_tracker.py @@ -0,0 +1,146 @@ +# ------------------------------------------------------------------------ +# Trackers +# Copyright (c) 2026 Roboflow. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ + +import numpy as np + +class BoTSORTKalmanBoxTracker: + """ + The `BoTSORTKalmanBoxTracker` class represents the internals of a single + tracked object (bounding box), with a Kalman filter to predict and update + its position. + + Attributes: + tracker_id: Unique identifier for the tracker. 
+ number_of_successful_updates: Number of times the object has been + updated successfully. + time_since_update: Number of frames since the last update. + state: State vector of the bounding box. + F: State transition matrix. + H: Measurement matrix. + Q: Process noise covariance matrix. + R: Measurement noise covariance matrix. + P: Error covariance matrix. + count_id: Class variable to assign unique IDs to each tracker. + + Args: + bbox: Initial bounding box in the form [x1, y1, x2, y2]. + """ + + count_id = 0 + + @classmethod + def get_next_tracker_id(cls) -> int: + """ + Class method that returns the next available tracker ID. + + Returns: + The next available tracker ID. + """ + next_id = cls.count_id + cls.count_id += 1 + return next_id + + def __init__(self, bbox: np.ndarray): + # Initialize with a temporary ID of -1 + # Will be assigned a real ID when the track is considered mature + self.tracker_id = -1 + + # Number of hits indicates how many times the object has been + # updated successfully + self.number_of_successful_updates = 1 + # Number of frames since the last update + self.time_since_update = 0 + + # For simplicity, we keep a small state vector: + # (x, y, x2, y2, vx, vy, vx2, vy2). + # We'll store the bounding box in "self.state" + self.state = np.zeros((8, 1), dtype=np.float32) + + # Initialize state directly from the first detection + self.state[0] = bbox[0] + self.state[1] = bbox[1] + self.state[2] = bbox[2] + self.state[3] = bbox[3] + + # Basic constant velocity model + self._initialize_kalman_filter() + + def _initialize_kalman_filter(self) -> None: + """ + Sets up the matrices for the Kalman filter. + """ + # State transition matrix (F): 8x8 + # We assume a constant velocity model. Positions are incremented by + # velocity each step. 
+ self.F = np.eye(8, dtype=np.float32) + for i in range(4): + self.F[i, i + 4] = 1.0 + + # Measurement matrix (H): we directly measure x1, y1, x2, y2 + self.H = np.eye(4, 8, dtype=np.float32) # 4x8 + + # Process covariance matrix (Q) + self.Q = np.eye(8, dtype=np.float32) * 0.01 + + # Measurement covariance (R): noise in detection + self.R = np.eye(4, dtype=np.float32) * 0.1 + + # Error covariance matrix (P) + self.P = np.eye(8, dtype=np.float32) + + def predict(self) -> None: + """ + Predict the next state of the bounding box (applies the state transition). + """ + # Predict state + self.state = self.F @ self.state + # Predict error covariance + self.P = self.F @ self.P @ self.F.T + self.Q + + # Increase time since update + self.time_since_update += 1 + + def update(self, bbox: np.ndarray) -> None: + """ + Updates the state with a new detected bounding box. + + Args: + bbox: Detected bounding box in the form [x1, y1, x2, y2]. + """ + self.time_since_update = 0 + self.number_of_successful_updates += 1 + + # Kalman Gain + S = self.H @ self.P @ self.H.T + self.R + K = self.P @ self.H.T @ np.linalg.inv(S) + + # Residual + measurement = bbox.reshape((4, 1)) + y = measurement - self.H @ self.state + + # Update state + self.state = self.state + K @ y + + # Update covariance + identity_matrix = np.eye(8, dtype=np.float32) + self.P = (identity_matrix - K @ self.H) @ self.P + + def get_state_bbox(self) -> np.ndarray: + """ + Returns the current bounding box estimate from the state vector. + + Returns: + The bounding box [x1, y1, x2, y2]. 
+ """ + return np.array( + [ + self.state[0], # x1 + self.state[1], # y1 + self.state[2], # x2 + self.state[3], # y2 + ], + dtype=float, + ).reshape(-1) diff --git a/trackers/core/botsort/tracker.py b/trackers/core/botsort/tracker.py new file mode 100644 index 00000000..c9ebcdba --- /dev/null +++ b/trackers/core/botsort/tracker.py @@ -0,0 +1,303 @@ +# ------------------------------------------------------------------------ +# Trackers +# Copyright (c) 2026 Roboflow. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ + +from copy import deepcopy +from typing import cast + +import numpy as np +import supervision as sv +from scipy.optimize import linear_sum_assignment + +from trackers.core.base import BaseTracker +from trackers.core.botsort.kalman_box_tracker import BoTSORTKalmanBoxTracker +from trackers.utils.sort_utils import ( + get_alive_trackers, + get_iou_matrix, +) +from trackers.core.botsort.cmc import CMC, CMCConfig + +class BoTSORTTracker(BaseTracker): + def __init__( + self, + lost_track_buffer: int = 30, + frame_rate: float = 30.0, + track_activation_threshold: float = 0.7, + minimum_consecutive_frames: int = 2, + minimum_iou_threshold: float = 0.1, + high_conf_det_threshold: float = 0.6, + enable_cmc: bool = True + + ) -> None: + # Calculate maximum frames without update based on lost_track_buffer and + # frame_rate. This scales the buffer based on the frame rate to ensure + # consistent time-based tracking across different frame rates. 
+ self.maximum_frames_without_update = int(frame_rate / 30.0 * lost_track_buffer) + self.minimum_consecutive_frames = minimum_consecutive_frames + self.minimum_iou_threshold = minimum_iou_threshold + self.track_activation_threshold = track_activation_threshold + self.high_conf_det_threshold = high_conf_det_threshold + self.tracks: list[BoTSORTKalmanBoxTracker] = [] + + self.enable_cmc = enable_cmc + self.cmc = CMC(CMCConfig()) if enable_cmc else None + + def _update_detections( + self, + tracks: list[BoTSORTKalmanBoxTracker], + detections: sv.Detections, + updated_detections: list[sv.Detections], + matched_indices: list[tuple[int, int]], + ) -> list[sv.Detections]: + # Update matched tracks with assigned detections. + det_bboxes = detections.xyxy + for row, col in matched_indices: + t = tracks[row] + t.update(det_bboxes[col]) + # If tracker is mature but still has ID -1, assign a new ID + if ( + t.number_of_successful_updates >= self.minimum_consecutive_frames + and t.tracker_id == -1 + ): # Check maturity before assigning ID + t.tracker_id = BoTSORTKalmanBoxTracker.get_next_tracker_id() + + new_det = deepcopy(detections[col : col + 1]) + # Add cast to clarify type for mypy + new_det = cast(sv.Detections, new_det) # ADDED cast + new_det.tracker_id = np.array([t.tracker_id]) + updated_detections.append(new_det) + return updated_detections + + def update( + self, + detections: sv.Detections, + frame: np.ndarray, + ) -> sv.Detections: + if len(self.tracks) == 0 and len(detections) == 0: + detections.tracker_id = np.array([], dtype=int) + return detections + updated_detections: list[ + sv.Detections + ] = [] # List for returning the updated detections with its new assigned track id # noqa: E501 + + # Predict new locations for existing tracks + for tracker in self.tracks: + tracker.predict() + # Assign a default tracker_id with the correct shape + detections.tracker_id = -np.ones(len(detections)) + # Split into high confidence boxes and lower based on 
self.high_conf_det_threshold # noqa: E501 + high_prob_detections, low_prob_detections = ( + self._get_high_and_low_probability_detections(detections) + ) + + # CMC (ORB) apply to all predicted tracks before association + if self.enable_cmc and self.cmc is not None and frame is not None: + mask_boxes = high_prob_detections.xyxy if len(high_prob_detections) > 0 else None + H = self.cmc.estimate(frame, mask_boxes) + self.cmc.apply_to_tracks(self.tracks, H) + + # Step 1: first association, with high confidence boxes + matched_indices, unmatched_tracks, unmatched_high_prob_detections = ( + self._similarity_step( + high_prob_detections, + self.tracks, + ) + ) + + # Update matched tracks with high-confidence detections + self._update_detections( + self.tracks, + high_prob_detections, + updated_detections, + matched_indices, + ) + + remaining_tracks = [self.tracks[i] for i in unmatched_tracks] + + # Step 2: associate Low Probability detections with remaining tracks + matched_indices, unmatched_tracks, unmatched_detections = self._similarity_step( + low_prob_detections, remaining_tracks + ) + + # Update matched tracks with low-confidence detections + self._update_detections( + remaining_tracks, + low_prob_detections, + updated_detections, + matched_indices, + ) + + # Add unmatched low prob predictions to updated predictions + for det_index in unmatched_detections: + new_det = deepcopy(low_prob_detections[det_index : det_index + 1]) + + new_det.tracker_id = np.array([-1]) + updated_detections.append(new_det) + + self._spawn_new_trackers( + high_prob_detections, + high_prob_detections.xyxy, + unmatched_high_prob_detections, + updated_detections, + ) + + # Kill lost tracks + self.tracks = get_alive_trackers( + trackers=self.tracks, + maximum_frames_without_update=self.maximum_frames_without_update, + minimum_consecutive_frames=self.minimum_consecutive_frames, + ) + final_updated_detections: sv.Detections = sv.Detections.merge( + updated_detections + ) + if 
len(final_updated_detections) == 0: + final_updated_detections.tracker_id = np.array([], dtype=int) + return final_updated_detections + + def _get_high_and_low_probability_detections( + self, detections: sv.Detections + ) -> tuple[sv.Detections, sv.Detections]: + """ + Splits the input detections into high-confidence and low-confidence sets + based on the `self.high_conf_det_threshold`. + + Args: + detections: The input detections with confidence scores. + + Returns: + A tuple containing two `sv.Detections objects`: the first for + high-confidence detections `(confidence >= threshold)` and the second + for low-confidence detections `(confidence < threshold)`. + """ + # Check if confidence scores exist before comparing + if detections.confidence is not None: + # Perform element-wise comparison if confidence is a NumPy array + condition = detections.confidence >= self.high_conf_det_threshold + else: + # If no confidence scores, no detections meet the threshold + # Create a boolean array of False with the same length as detections + condition = np.zeros(len(detections), dtype=bool) + + high_confidence = detections[condition] + low_confidence = detections[np.logical_not(condition)] + return high_confidence, low_confidence + + def _get_associated_indices( + self, + similarity_matrix: np.ndarray, + min_similarity_thresh: float, + ) -> tuple[list[tuple[int, int]], set[int], set[int]]: + """ + Associate detections to tracks based on Similarity (IoU) using the + Jonker-Volgenant algorithm approach with no initialization instead of the + Hungarian algorithm as mentioned in the SORT paper, but it solves the + assignment problem in an optimal way. + + Args: + similarity_matrix: Similarity matrix between tracks (rows) and detections (columns). + min_similarity_thresh: Minimum similarity threshold for a valid match. + + Returns: + Matched indices (list of (tracker_idx, detection_idx)), indices of + unmatched tracks, indices of unmatched detections. 
+ """ # noqa: E501 + matched_indices = [] + n_tracks, n_detections = similarity_matrix.shape + unmatched_tracks = set(range(n_tracks)) + unmatched_detections = set(range(n_detections)) + + if n_tracks > 0 and n_detections > 0: + row_indices, col_indices = linear_sum_assignment( + similarity_matrix, maximize=True + ) + for row, col in zip(row_indices, col_indices): + if similarity_matrix[row, col] >= min_similarity_thresh: + matched_indices.append((row, col)) + unmatched_tracks.remove(row) + unmatched_detections.remove(col) + + return matched_indices, unmatched_tracks, unmatched_detections + + def _spawn_new_trackers( + self, + detections: sv.Detections, + detection_boxes: np.ndarray, + unmatched_detections: set[int], + updated_detections: list[sv.Detections], + ): + """ + Create new trackers for unmatched detections and + append detections to updated_detections detections. + + Args: + detections: Current detections. + detection_boxes: Bounding boxes for detections. + unmatched_detections: Indices of unmatched detections. 
+ updated_detections: List with all the detections + + """ + for detection_idx in unmatched_detections: + # Check for detections.confidence existence and index bounds + if detections.confidence is not None and detection_idx < len( + detections.confidence + ): + # Assign to a temporary variable with explicit type hint + confidence_score: float = float(detections.confidence[detection_idx]) + + # Use the temporary variable in the comparison + if confidence_score >= self.track_activation_threshold: + # Original logic for high confidence detection + + new_tracker = BoTSORTKalmanBoxTracker( + bbox=detection_boxes[detection_idx] + ) + self.tracks.append(new_tracker) + + new_det = deepcopy(detections[detection_idx : detection_idx + 1]) + new_det = cast(sv.Detections, new_det) # Cast added previously + new_det.tracker_id = np.array([-1]) + updated_detections.append(new_det) + else: + pass # Do nothing, the detection remains unmatched + + def _similarity_step( + self, + detections: sv.Detections, + tracks: list[BoTSORTKalmanBoxTracker], + ) -> tuple[list[tuple[int, int]], set[int], set[int]]: + """Measures similarity based on IoU between tracks and detections and returns the matches + and unmatched tracks/detections. Is used for step 1 and 2 of the BYTE algorithm. + + Args: + detections: The set of object detections. + tracks: The list of tracks that will be matched to the detections. + + Returns: + A tuple containing: + - matched_indices: A list of (tracker_idx, detection_idx) pairs. + - unmatched_tracks_indices: A set of indices for tracks that + were not matched. + - unmatched_detections_indices: A set of indices for detections + that were not matched. 
+ """ # noqa: E501 + # Build IoU cost matrix between detections and predicted bounding boxes + similarity_matrix = get_iou_matrix(tracks, detections.xyxy) + thresh = self.minimum_iou_threshold + + # Associate detections to tracks based on the higher value of the + # similarity matrix, using the Jonker-Volgenant algorithm (linear_sum_assignment). # noqa: E501 + matched_indices, unmatched_tracks, unmatched_detections = ( + self._get_associated_indices(similarity_matrix, thresh) + ) + return matched_indices, unmatched_tracks, unmatched_detections + + def reset(self) -> None: + """Reset tracker state by clearing all tracks and resetting ID counter. + Call this method when switching to a new video or scene. + """ + self.tracks = [] + BoTSORTKalmanBoxTracker.count_id = 0 + if self.cmc is not None: + self.cmc.reset() From de8a7b52cc6b97b2b8667df9adb61acfc35ff678 Mon Sep 17 00:00:00 2001 From: Tomasz Stanczyk Date: Wed, 4 Mar 2026 10:07:51 +0100 Subject: [PATCH 02/14] Add sparse optical flow cmc to BoT-SORT. Update code documentation. --- trackers/core/botsort/cmc.py | 333 ++++++++++++++++++++++++++++--- trackers/core/botsort/tracker.py | 101 +++++++++- trackers/core/botsort/utils.py | 153 ++++++++++++++ 3 files changed, 561 insertions(+), 26 deletions(-) create mode 100644 trackers/core/botsort/utils.py diff --git a/trackers/core/botsort/cmc.py b/trackers/core/botsort/cmc.py index 838eb356..4fec6eec 100644 --- a/trackers/core/botsort/cmc.py +++ b/trackers/core/botsort/cmc.py @@ -7,60 +7,244 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Optional +from typing import Optional, Literal import copy import numpy as np import cv2 +CMCTMethod = Literal["orb", "sparseOptFlow"] @dataclass class CMCConfig: + """ + Configuration for camera motion compensation (CMC). + + The CMC module estimates a global 2D affine transform `H` (2x3) between consecutive frames. 
+ This transform is then applied to predicted track states before data association. + + Attributes: + method: + Camera motion estimation method. + + - "orb": Feature matching using FAST keypoints + ORB descriptors + BFMatcher, + followed by robust affine estimation (RANSAC). + Optionally masks out detection boxes so features are extracted from background. + - "sparseOptFlow": Sparse optical flow using corner tracking: + goodFeaturesToTrack -> calcOpticalFlowPyrLK -> robust affine estimation (RANSAC). + + downscale: + Integer downscale factor applied to frames before running CMC. + + Purpose: + - Speeds up feature extraction / optical flow. + + Behavior: + - Frames are resized to (W//downscale, H//downscale) for motion estimation. + - The resulting affine translation components H[0,2], H[1,2] are scaled back + by multiplying by `downscale`, so the transform is in original image coordinates. + + fast_threshold: + (ORB only) Threshold for the FAST keypoint detector. + Higher values yield fewer keypoints (more selective); lower values yield more keypoints. + + ransac_reproj_threshold: + (ORB only) RANSAC reprojection threshold in pixels passed to + OpenCV’s affine estimation. It controls how far a point is allowed to deviate from the + estimated model while still being counted as an inlier. + Smaller values are stricter (reject more matches); larger values are more tolerant. + + max_spatial_distance_frac: + (ORB only) Maximum allowed spatial displacement for a tentative match, expressed as a + fraction of (image width, image height) *after downscale*. + + Example: + If max_spatial_distance_frac = 0.25 and the downscaled frame is (W, H), + then a match is rejected if |dx| >= 0.25*W or |dy| >= 0.25*H. + + Motivation: + Reject obviously incorrect descriptor matches whose displacement is implausibly large. + + roi_min_frac: + (ORB only) Lower bound of the region-of-interest (ROI) used to select keypoints, + expressed as a fraction of frame size. 
Points outside the ROI are masked out. + + Example: + roi_min_frac=0.02 means we ignore a ~2% border on each side. + + roi_max_frac: + (ORB only) Upper bound of the ROI used to select keypoints (fraction of frame size). + Together with roi_min_frac, it defines a central rectangle: + [roi_min_frac..roi_max_frac] in both x and y. + + sof_max_corners: + (SparseOptFlow only) `maxCorners` passed to `cv2.goodFeaturesToTrack`. + Maximum number of corners to detect for tracking. + Larger values can improve robustness (more points), but cost more compute. + + sof_quality_level: + (SparseOptFlow only) `qualityLevel` passed to `cv2.goodFeaturesToTrack`. + Minimum accepted quality of corners. A higher value keeps only stronger corners; + a lower value yields more corners (including weaker ones). + + sof_min_distance: + (SparseOptFlow only) `minDistance` passed to `cv2.goodFeaturesToTrack`. + Minimum Euclidean distance (in pixels) between returned corners. + Higher values produce more spatially spread points; lower values allow clustering. + + sof_block_size: + (SparseOptFlow only) `blockSize` passed to `cv2.goodFeaturesToTrack`. + Size of the neighborhood used to compute corner quality (structure tensor window). + + sof_use_harris: + (SparseOptFlow only) `useHarrisDetector` passed to `cv2.goodFeaturesToTrack`. + If True, uses the Harris corner measure; if False, uses the Shi-Tomasi measure. + + sof_k: + (SparseOptFlow only) `k` passed to `cv2.goodFeaturesToTrack`. + Harris detector free parameter. Ignored if `sof_use_harris` is False. 
+ """ + method: CMCTMethod = "orb" downscale: int = 2 - fast_threshold: int = 20 - # Affine estimation + # ORB parameters + fast_threshold: int = 20 ransac_reproj_threshold: float = 3.0 - - # Filtering matches by spatial displacement (fraction of image size) max_spatial_distance_frac: float = 0.25 - - # Keep features from central ROI (avoid borders) roi_min_frac: float = 0.02 roi_max_frac: float = 0.98 + # Sparse optical flow parameters (goodFeaturesToTrack) + sof_max_corners: int = 1000 + sof_quality_level: float = 0.01 + sof_min_distance: int = 1 + sof_block_size: int = 3 + sof_use_harris: bool = False + sof_k: float = 0.04 + class CMC: + """ + Camera motion compensation estimator and track state warper. + + Typical usage in the tracker loop: + H = cmc.estimate(frame_bgr, mask_boxes_xyxy) + CMC.apply_to_tracks(tracks, H) + + Internal state: + - Keeps previous-frame features / points depending on the chosen method. + - On the first frame (or after reset), returns identity transform. + + Notes: + - H maps points from previous frame coordinates to current frame coordinates. + - This class does not perform any drawing/visualization; it only estimates transforms. + """ + def __init__(self, cfg: Optional[CMCConfig] = None) -> None: + """ + Initialize CMC. + + Args: + cfg: Optional configuration. If None, defaults are used. + + Notes: + - ORB detector/extractor/matcher are only created if method == "orb". + - Sparse optical flow parameters are always initialized (cheap). 
+ """ self.cfg = cfg or CMCConfig() self.downscale = max(1, int(self.cfg.downscale)) - self.detector = cv2.FastFeatureDetector_create(self.cfg.fast_threshold) - self.extractor = cv2.ORB_create() - self.matcher = cv2.BFMatcher(cv2.NORM_HAMMING) + # ORB init (only if needed) + self.detector = None + self.extractor = None + self.matcher = None + if self.cfg.method == "orb": + self.detector = cv2.FastFeatureDetector_create(self.cfg.fast_threshold) + self.extractor = cv2.ORB_create() + self.matcher = cv2.BFMatcher(cv2.NORM_HAMMING) + + # SparseOptFlow params + self.feature_params = dict( + maxCorners=self.cfg.sof_max_corners, + qualityLevel=self.cfg.sof_quality_level, + minDistance=self.cfg.sof_min_distance, + blockSize=self.cfg.sof_block_size, + useHarrisDetector=self.cfg.sof_use_harris, + k=self.cfg.sof_k, + ) + + self.reset() + def reset(self) -> None: + """ + Reset internal state. + + After calling reset: + - The next `estimate()` call returns identity and initializes prev-frame state. + - This should be called when starting a new sequence or after a scene cut. + """ self._initialized = False + + # ORB state self._prev_kps = None self._prev_desc: Optional[np.ndarray] = None - def reset(self) -> None: - self._initialized = False - self._prev_kps = None - self._prev_desc = None + # SparseOptFlow state + self._prev_frame_gray: Optional[np.ndarray] = None + self._prev_points: Optional[np.ndarray] = None # shape (N,1,2) from goodFeaturesToTrack def estimate(self, frame_bgr: np.ndarray, dets_xyxy: Optional[np.ndarray] = None) -> np.ndarray: + """ + Estimate global affine transform H (2x3) from previous frame to current frame. + + Args: + frame_bgr: Current frame in BGR format (uint8), shape (H, W, 3). + dets_xyxy: Optional detections (N,4) in xyxy format, in original image scale. + Used only by ORB method for masking out object regions (background-only features). + + Returns: + H: Affine transform matrix of shape (2, 3), dtype float32. 
+ Identity if not enough correspondences or if not initialized yet. + """ if frame_bgr is None: return np.eye(2, 3, dtype=np.float32) + if self.cfg.method == "orb": + return self._estimate_orb(frame_bgr, dets_xyxy) + + if self.cfg.method == "sparseOptFlow": + return self._estimate_sparse_optflow(frame_bgr) + + # fallback + return np.eye(2, 3, dtype=np.float32) + + def _estimate_orb(self, frame_bgr: np.ndarray, dets_xyxy: Optional[np.ndarray] = None) -> np.ndarray: + """ + ORB-based affine estimation. + + Steps: + 1) Convert to grayscale (+ optional downscale). + 2) Create ROI mask and optionally mask out detections (background emphasis). + 3) Detect FAST keypoints and compute ORB descriptors. + 4) KNN match descriptors against previous frame (ratio test). + 5) Filter matches by max spatial displacement and by 2.5*std inliers. + 6) Estimate affine transform with RANSAC. + 7) Scale translation back up if downscaled. + + Args: + frame_bgr: Current BGR frame. + dets_xyxy: Optional detection boxes for masking (original image scale). + + Returns: + H: (2,3) affine transform mapping previous-current, float32. 
+ """ H_img, W_img = frame_bgr.shape[:2] gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY) - # Downscale for speed / robustness if self.downscale > 1: gray = cv2.resize(gray, (W_img // self.downscale, H_img // self.downscale)) H, W = gray.shape[:2] - # Build mask: central ROI + remove detections (background features) mask = np.zeros_like(gray, dtype=np.uint8) y0 = int(self.cfg.roi_min_frac * H) y1 = int(self.cfg.roi_max_frac * H) @@ -71,10 +255,13 @@ def estimate(self, frame_bgr: np.ndarray, dets_xyxy: Optional[np.ndarray] = None if dets_xyxy is not None and len(dets_xyxy) > 0: dets = np.asarray(dets_xyxy, dtype=np.float32) / float(self.downscale) dets = dets.astype(np.int32) + + # Safety clipping dets[:, 0] = np.clip(dets[:, 0], 0, W - 1) dets[:, 2] = np.clip(dets[:, 2], 0, W - 1) dets[:, 1] = np.clip(dets[:, 1], 0, H - 1) dets[:, 3] = np.clip(dets[:, 3], 0, H - 1) + for x1b, y1b, x2b, y2b in dets: if x2b > x1b and y2b > y1b: mask[y1b:y2b, x1b:x2b] = 0 @@ -85,20 +272,17 @@ def estimate(self, frame_bgr: np.ndarray, dets_xyxy: Optional[np.ndarray] = None H_aff = np.eye(2, 3, dtype=np.float32) - # First frame: only initialize if not self._initialized: self._prev_kps = copy.copy(kps) self._prev_desc = None if desc is None else copy.copy(desc) self._initialized = True return H_aff - # If missing descriptors if self._prev_desc is None or desc is None or len(desc) == 0: self._prev_kps = copy.copy(kps) self._prev_desc = None if desc is None else copy.copy(desc) return H_aff - # KNN match (k=2) + ratio test knn = self.matcher.knnMatch(self._prev_desc, desc, k=2) if len(knn) == 0: self._prev_kps = copy.copy(kps) @@ -148,14 +332,119 @@ def estimate(self, frame_bgr: np.ndarray, dets_xyxy: Optional[np.ndarray] = None H_aff[0, 2] *= self.downscale H_aff[1, 2] *= self.downscale - # Update prev self._prev_kps = copy.copy(kps) self._prev_desc = copy.copy(desc) + return H_aff + + def _estimate_sparse_optflow(self, frame_bgr: np.ndarray) -> np.ndarray: + """ + Sparse 
optical-flow-based affine estimation. + + Steps: + 1) grayscale (+ optional downscale) + 2) detect corners using goodFeaturesToTrack + 3) compute correspondences via calcOpticalFlowPyrLK(prev, curr, prev_points) + 4) keep only points with status == 1 + 5) estimate affine transform with RANSAC + 6) scale translation back up if downscaled + + Args: + frame_bgr: Current BGR frame. + + Returns: + H: (2,3) affine transform mapping previous-current, float32. + """ + H_img, W_img = frame_bgr.shape[:2] + frame = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY) + + H_aff = np.eye(2, 3, dtype=np.float32) + + # Downscale + if self.downscale > 1: + frame = cv2.resize(frame, (W_img // self.downscale, H_img // self.downscale)) + + # Find keypoints in current frame + keypoints = cv2.goodFeaturesToTrack(frame, mask=None, **self.feature_params) + + # First frame: init and return identity + if not self._initialized: + self._prev_frame_gray = frame.copy() + self._prev_points = copy.copy(keypoints) + self._initialized = True + return H_aff + + # If we don't have points, re-init + if self._prev_frame_gray is None or self._prev_points is None or keypoints is None: + self._prev_frame_gray = frame.copy() + self._prev_points = copy.copy(keypoints) + return H_aff + + # Optical flow correspondences + # calcOpticalFlowPyrLK will throw or return nonsense if we give it None + matched, status, _err = cv2.calcOpticalFlowPyrLK(self._prev_frame_gray, frame, self._prev_points, None) + + if status is None or matched is None: + self._prev_frame_gray = frame.copy() + self._prev_points = copy.copy(keypoints) + return H_aff + + # Keep only good correspondences + prev_pts = [] + curr_pts = [] + # status is (N,1) or (N,) + status_flat = status.reshape(-1) + + for i in range(len(status_flat)): + if status_flat[i]: + prev_pts.append(self._prev_points[i]) + curr_pts.append(matched[i]) + + prev_pts = np.array(prev_pts) + curr_pts = np.array(curr_pts) + + # Find rigid matrix + # if (np.size(prev_pts, 0) > 4) and 
(np.size(prev_pts, 0) == np.size(prev_pts, 0)): + if (np.size(prev_pts, 0) > 4) and (np.size(prev_pts, 0) == np.size(curr_pts, 0)): + H_est, _ = cv2.estimateAffinePartial2D(prev_pts, curr_pts, cv2.RANSAC) + if H_est is not None: + H_aff = H_est.astype(np.float32) + + # Handle downscale translation back to original image coords + if self.downscale > 1: + H_aff[0, 2] *= self.downscale + H_aff[1, 2] *= self.downscale + else: + print('Warning: not enough matching points') + + # Store to next iteration + self._prev_frame_gray = frame.copy() + # self._prev_points = copy.copy(keypoints) + self._prev_points = None if keypoints is None else keypoints.copy() return H_aff @staticmethod def apply_to_tracks(tracks: list, H: np.ndarray) -> None: + """ + Apply affine transform H (2x3) to tracker states and covariances in-place. + + This implementation assumes each track has: + - `state`: (8,1) float vector [x1,y1,x2,y2,vx,vy,vx2,vy2]^T + - `P`: (8,8) covariance matrix + + The transform is applied as: + state := A * state + translation + P := A * P * A^T + + Where A applies the 2x2 rotation/shear block to each 2D component block in the state. + + Args: + tracks: List of track objects with `.state` and `.P` attributes. + H: Affine transform (2,3) mapping prev -> curr. + + Returns: + None. Tracks are modified in-place. 
+ """ if H is None or len(tracks) == 0: return @@ -163,12 +452,10 @@ def apply_to_tracks(tracks: list, H: np.ndarray) -> None: R = H[:2, :2] t = H[:2, 2:3] # (2,1) - # A4 maps [x1,y1,x2,y2] A4 = np.zeros((4, 4), dtype=np.float32) A4[0:2, 0:2] = R A4[2:4, 2:4] = R - # A8 maps state (pos and vel blocks) A8 = np.zeros((8, 8), dtype=np.float32) A8[0:4, 0:4] = A4 A8[4:8, 4:8] = A4 @@ -178,4 +465,4 @@ def apply_to_tracks(tracks: list, H: np.ndarray) -> None: for trk in tracks: trk.state = (A8 @ trk.state).astype(np.float32) trk.state[0:4] += trans4 - trk.P = (A8 @ trk.P @ A8.T).astype(np.float32) + trk.P = (A8 @ trk.P @ A8.T).astype(np.float32) \ No newline at end of file diff --git a/trackers/core/botsort/tracker.py b/trackers/core/botsort/tracker.py index c9ebcdba..c49808a2 100644 --- a/trackers/core/botsort/tracker.py +++ b/trackers/core/botsort/tracker.py @@ -13,13 +13,40 @@ from trackers.core.base import BaseTracker from trackers.core.botsort.kalman_box_tracker import BoTSORTKalmanBoxTracker -from trackers.utils.sort_utils import ( +from trackers.core.botsort.utils import ( get_alive_trackers, get_iou_matrix, ) from trackers.core.botsort.cmc import CMC, CMCConfig class BoTSORTTracker(BaseTracker): + """ + BoT-SORT-style multi-object tracker (IoU association + optional CMC). + + The tracker maintains a list of active tracks (Kalman-filter-based) and, for each frame, + performs: + 1) Predict existing track states (Kalman predict) + 2) Split detections into high/low confidence groups + 3) Apply camera motion compensation to predicted tracks + 4) Associate high-confidence detections to tracks (IoU + assignment) + 5) Associate low-confidence detections to remaining tracks + 6) Spawn new tracks from unmatched high-confidence detections + 7) Remove tracks that have been lost for too long + + Parameters in __init__ control thresholds and lifecycle logic similarly to ByteTrack/BoT-SORT. + + Attributes: + tracks: List of active `BoTSORTKalmanBoxTracker` objects. 
+ maximum_frames_without_update: Max number of consecutive frames a track can go unmatched + before being removed. + minimum_consecutive_frames: Track maturity threshold before assigning a permanent ID. + minimum_iou_threshold: Minimum IoU required for a valid match. + track_activation_threshold: Confidence threshold for spawning a new track. + high_conf_det_threshold: Confidence threshold splitting detections into high/low groups. + enable_cmc: Whether to run camera motion compensation each frame (if `cmc` is set). + cmc: Camera motion compensation instance (or None if disabled). + """ + def __init__( self, lost_track_buffer: int = 30, @@ -28,9 +55,36 @@ def __init__( minimum_consecutive_frames: int = 2, minimum_iou_threshold: float = 0.1, high_conf_det_threshold: float = 0.6, - enable_cmc: bool = True + enable_cmc: bool = True, + # cmc_method: str = "orb", + cmc_method: str = "sparseOptFlow", + cmc_downscale: int = 2, ) -> None: + """ + Initialize the tracker. + + Args: + lost_track_buffer: Time buffer (in frames at 30 FPS) for keeping lost tracks alive + before deletion. This is scaled by `frame_rate`. + frame_rate: Video frame rate used to scale the lost track buffer to time-like behavior. + track_activation_threshold: Minimum detection confidence to spawn a new track. + minimum_consecutive_frames: Number of successful updates required before assigning + a stable track ID (different than initial -1). + minimum_iou_threshold: Minimum IoU to accept a detection-track association. + high_conf_det_threshold: Confidence threshold used to split detections into: + - high confidence: confidence >= threshold + - low confidence: confidence < threshold + enable_cmc: Whether to enable camera motion compensation (CMC). + cmc_method: CMC method string passed into `CMCConfig(method=...)`. Supported values + depend on `CMC` (e.g. "orb", "sparseOptFlow"). See CMCConfig. + cmc_downscale: Downscale factor used inside CMC for speed/robustness. 
+ + Notes: + - `maximum_frames_without_update` is computed as: + int(frame_rate / 30.0 * lost_track_buffer) + to maintain consistent “seconds” worth of buffer across different FPS. + """ # Calculate maximum frames without update based on lost_track_buffer and # frame_rate. This scales the buffer based on the frame rate to ensure # consistent time-based tracking across different frame rates. @@ -42,7 +96,7 @@ def __init__( self.tracks: list[BoTSORTKalmanBoxTracker] = [] self.enable_cmc = enable_cmc - self.cmc = CMC(CMCConfig()) if enable_cmc else None + self.cmc = CMC(CMCConfig(method=cmc_method, downscale=cmc_downscale)) if enable_cmc else None def _update_detections( self, @@ -51,6 +105,26 @@ def _update_detections( updated_detections: list[sv.Detections], matched_indices: list[tuple[int, int]], ) -> list[sv.Detections]: + """ + Apply matched detection updates to tracks and append corresponding outputs. + + For each (track_idx, det_idx) match: + - Update the track’s Kalman state with the detection bbox. + - If the track is “mature” (>= minimum_consecutive_frames) and still has tracker_id == -1, + assign a new unique tracker ID. + - Create a single-row `sv.Detections` object for the matched detection and set its + tracker_id to the track ID (or -1 if not mature yet). + - Append it to `updated_detections`. + + Args: + tracks: Tracks being updated. + detections: Detections used for update. + updated_detections: Accumulator list of per-detection outputs for this frame. + matched_indices: List of (track_row_index, detection_col_index) pairs. + + Returns: + The same `updated_detections` list, returned for convenience. + """ # Update matched tracks with assigned detections. det_bboxes = detections.xyxy for row, col in matched_indices: @@ -75,6 +149,27 @@ def update( detections: sv.Detections, frame: np.ndarray, ) -> sv.Detections: + """ + Update the tracker with detections from the current frame. + + This is the main per-frame entry point. 
+ + Args: + detections: Supervision detections for the current frame. Must include `.xyxy`. + Confidence (`detections.confidence`) is optional but recommended. + The method writes/overwrites `detections.tracker_id`. + frame: Current video frame in BGR format (H, W, 3), required if CMC is enabled. + + Returns: + A merged `sv.Detections` object containing detections from this frame with + `tracker_id` assigned: + - >= 0 indicates a confirmed track ID + - -1 indicates unconfirmed/untracked (e.g., new / low confidence / not yet mature) + + Notes: + - If CMC is enabled, the tracker estimates a global affine transform (2x3) from the + frame and uses it to warp predicted track states before association. + """ if len(self.tracks) == 0 and len(detections) == 0: detections.tracker_id = np.array([], dtype=int) return detections diff --git a/trackers/core/botsort/utils.py b/trackers/core/botsort/utils.py new file mode 100644 index 00000000..078542e4 --- /dev/null +++ b/trackers/core/botsort/utils.py @@ -0,0 +1,153 @@ +# ------------------------------------------------------------------------ +# Trackers +# Copyright (c) 2026 Roboflow. All Rights Reserved. 
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ + +from collections.abc import Sequence +from copy import deepcopy +from typing import TypeVar + +import numpy as np +import supervision as sv + +from trackers.core.botsort.kalman_box_tracker import BoTSORTKalmanBoxTracker + +KalmanBoxTrackerType = TypeVar( + "KalmanBoxTrackerType", bound=BoTSORTKalmanBoxTracker +) + +BoTSORTKalmanBoxTracker + + +def get_alive_trackers( + trackers: Sequence[KalmanBoxTrackerType], + minimum_consecutive_frames: int, + maximum_frames_without_update: int, +) -> list[KalmanBoxTrackerType]: + """ + Remove dead or immature lost tracklets and get alive trackers + that are within `maximum_frames_without_update` AND (it's mature OR + it was just updated). + + Args: + trackers: List of KalmanBoxTracker objects. + minimum_consecutive_frames: Number of consecutive frames that an object + must be tracked before it is considered a 'valid' track. + maximum_frames_without_update: Maximum number of frames without update + before a track is considered dead. + + Returns: + List of alive trackers. + """ + alive_trackers = [] + for tracker in trackers: + is_mature = tracker.number_of_successful_updates >= minimum_consecutive_frames + is_active = tracker.time_since_update == 0 + if tracker.time_since_update < maximum_frames_without_update and ( + is_mature or is_active + ): + alive_trackers.append(tracker) + return alive_trackers + + +def get_iou_matrix( + trackers: Sequence[KalmanBoxTrackerType], detection_boxes: np.ndarray +) -> np.ndarray: + """ + Build IOU cost matrix between detections and predicted bounding boxes + + Args: + trackers: List of KalmanBoxTracker objects. + detection_boxes: Detected bounding boxes in the + form [x1, y1, x2, y2]. + + Returns: + IOU cost matrix. 
+ """ + predicted_boxes = np.array([t.get_state_bbox() for t in trackers]) + if len(predicted_boxes) == 0 and len(trackers) > 0: + # Handle case where get_state_bbox might return empty array + predicted_boxes = np.zeros((len(trackers), 4), dtype=np.float32) + + if len(trackers) > 0 and len(detection_boxes) > 0: + iou_matrix = sv.box_iou_batch(predicted_boxes, detection_boxes) + else: + iou_matrix = np.zeros((len(trackers), len(detection_boxes)), dtype=np.float32) + + return iou_matrix + + +def update_detections_with_track_ids( + trackers: Sequence[KalmanBoxTrackerType], + detections: sv.Detections, + detection_boxes: np.ndarray, + minimum_iou_threshold: float, + minimum_consecutive_frames: int, +) -> sv.Detections: + """ + The function prepares the updated Detections with track IDs. + If a tracker is "mature" (>= `minimum_consecutive_frames`) or recently updated, + it is assigned an ID to the detection that just updated it. + + Args: + trackers: List of BoTSORTKalmanBoxTracker objects. + detections: The latest set of object detections. + detection_boxes: Detected bounding boxes in the + form [x1, y1, x2, y2]. + minimum_iou_threshold: IOU threshold for associating detections to + existing tracks. + minimum_consecutive_frames: Number of consecutive frames that an object + must be tracked before it is considered a 'valid' track. + + Returns: + A copy of the detections with `tracker_id` set + for each detection that is tracked. 
+ """ + # Re-run association in the same way (could also store direct mapping) + final_tracker_ids = [-1] * len(detection_boxes) + + # Recalculate predicted_boxes based on current trackers after some may have + # been removed + predicted_boxes = np.array([t.get_state_bbox() for t in trackers]) + iou_matrix_final = np.zeros((len(trackers), len(detection_boxes)), dtype=np.float32) + + # Ensure predicted_boxes is properly shaped before the second iou calculation + if len(predicted_boxes) == 0 and len(trackers) > 0: + predicted_boxes = np.zeros((len(trackers), 4), dtype=np.float32) + + if len(trackers) > 0 and len(detection_boxes) > 0: + iou_matrix_final = sv.box_iou_batch(predicted_boxes, detection_boxes) + + row_indices, col_indices = np.where(iou_matrix_final > minimum_iou_threshold) + sorted_pairs = sorted( + zip(row_indices, col_indices), + key=lambda x: iou_matrix_final[x[0], x[1]], + reverse=True, + ) + used_rows: set[int] = set() + used_cols: set[int] = set() + for row, col in sorted_pairs: + # Double check index is in range + if row < len(trackers): + tracker_obj = trackers[int(row)] + # Only assign if the track is "mature" or is new but has enough hits + if (int(row) not in used_rows) and (int(col) not in used_cols): + if ( + tracker_obj.number_of_successful_updates + >= minimum_consecutive_frames + ): + # If tracker is mature but still has ID -1, assign a new ID + if tracker_obj.tracker_id == -1: + tracker_obj.tracker_id = ( + BoTSORTKalmanBoxTracker.get_next_tracker_id() + ) + final_tracker_ids[int(col)] = tracker_obj.tracker_id + used_rows.add(int(row)) + used_cols.add(int(col)) + + # Assign tracker IDs to the returned Detections + updated_detections = deepcopy(detections) + updated_detections.tracker_id = np.array(final_tracker_ids) + + return updated_detections From 13db13687603a5f5bcf55f149b6cd39e08b7a423 Mon Sep 17 00:00:00 2001 From: Tomasz Stanczyk Date: Wed, 4 Mar 2026 17:05:47 +0100 Subject: [PATCH 03/14] Add SIFT cmc to BoT-SORT. Refactor code. 
Update code documentation. --- trackers/core/botsort/cmc.py | 180 +++++++++++++++++++++---------- trackers/core/botsort/tracker.py | 90 +++++++++------- 2 files changed, 174 insertions(+), 96 deletions(-) diff --git a/trackers/core/botsort/cmc.py b/trackers/core/botsort/cmc.py index 4fec6eec..5db0cb74 100644 --- a/trackers/core/botsort/cmc.py +++ b/trackers/core/botsort/cmc.py @@ -4,8 +4,6 @@ # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------ -from __future__ import annotations - from dataclasses import dataclass from typing import Optional, Literal @@ -13,25 +11,35 @@ import numpy as np import cv2 -CMCTMethod = Literal["orb", "sparseOptFlow"] +CMCTMethod = Literal["orb", "sift", "sparseOptFlow"] @dataclass class CMCConfig: """ Configuration for camera motion compensation (CMC). - The CMC module estimates a global 2D affine transform `H` (2x3) between consecutive frames. - This transform is then applied to predicted track states before data association. + The CMC module estimates a global 2D affine transform `H` (2x3) between consecutive + frames. This transform is then applied to predicted track states before data + association. Attributes: method: Camera motion estimation method. - - "orb": Feature matching using FAST keypoints + ORB descriptors + BFMatcher, + - "orb": Feature matching using + FAST keypoints + ORB descriptors + BFMatcher (Hamming), followed by robust affine estimation (RANSAC). - Optionally masks out detection boxes so features are extracted from background. + Optionally masks out detection boxes so features are extracted from + background. + - "sift": Feature matching using + SIFT keypoints + SIFT descriptors + BFMatcher (L2), + followed by robust affine estimation (RANSAC). + Optionally masks out detection boxes so features are extracted from + background. 
"sift" generally produces fewer but more distinctive matches + than ORB at higher compute cost. - "sparseOptFlow": Sparse optical flow using corner tracking: - goodFeaturesToTrack -> calcOpticalFlowPyrLK -> robust affine estimation (RANSAC). + goodFeaturesToTrack -> calcOpticalFlowPyrLK -> robust affine estimation + (RANSAC). downscale: Integer downscale factor applied to frames before running CMC. @@ -42,41 +50,61 @@ class CMCConfig: Behavior: - Frames are resized to (W//downscale, H//downscale) for motion estimation. - The resulting affine translation components H[0,2], H[1,2] are scaled back - by multiplying by `downscale`, so the transform is in original image coordinates. + by multiplying by `downscale`, so the transform is in original image + coordinates. fast_threshold: (ORB only) Threshold for the FAST keypoint detector. - Higher values yield fewer keypoints (more selective); lower values yield more keypoints. + Higher values yield fewer keypoints (more selective); lower values yield + more keypoints. ransac_reproj_threshold: (ORB only) RANSAC reprojection threshold in pixels passed to - OpenCV’s affine estimation. It controls how far a point is allowed to deviate from the - estimated model while still being counted as an inlier. - Smaller values are stricter (reject more matches); larger values are more tolerant. + OpenCV’s affine estimation. It controls how far a point is allowed to + deviate from the estimated model while still being counted as an inlier. + Smaller values are stricter (reject more matches); larger values are more + tolerant. max_spatial_distance_frac: - (ORB only) Maximum allowed spatial displacement for a tentative match, expressed as a - fraction of (image width, image height) *after downscale*. + (ORB only) Maximum allowed spatial displacement for a tentative match, + expressed as a fraction of (image width, image height) *after downscale*. 
Example: If max_spatial_distance_frac = 0.25 and the downscaled frame is (W, H), then a match is rejected if |dx| >= 0.25*W or |dy| >= 0.25*H. Motivation: - Reject obviously incorrect descriptor matches whose displacement is implausibly large. + Reject obviously incorrect descriptor matches whose displacement is + implausibly large. roi_min_frac: - (ORB only) Lower bound of the region-of-interest (ROI) used to select keypoints, - expressed as a fraction of frame size. Points outside the ROI are masked out. + (ORB only) Lower bound of the region-of-interest (ROI) used to select + keypoints, expressed as a fraction of frame size. Points outside the ROI + are masked out. Example: roi_min_frac=0.02 means we ignore a ~2% border on each side. roi_max_frac: - (ORB only) Upper bound of the ROI used to select keypoints (fraction of frame size). - Together with roi_min_frac, it defines a central rectangle: + (ORB only) Upper bound of the ROI used to select keypoints (fraction of + frame size). Together with roi_min_frac, it defines a central rectangle: [roi_min_frac..roi_max_frac] in both x and y. + sift_n_octave_layers: + (SIFT only) Number of octave layers used by SIFT when constructing the + scale-space pyramid. Increasing this can increase sensitivity to scale + changes, at higher compute cost. + + sift_contrast_threshold: + (SIFT only) Threshold controlling how sensitive SIFT is + to low-contrast keypoints. Lower values generally produce more keypoints; + higher values are stricter. + + sift_edge_threshold: + (SIFT only) Threshold controlling rejection of keypoints on edges. + Lower values reject more edge-like responses; higher values are more + permissive. + sof_max_corners: (SparseOptFlow only) `maxCorners` passed to `cv2.goodFeaturesToTrack`. Maximum number of corners to detect for tracking. @@ -84,36 +112,46 @@ class CMCConfig: sof_quality_level: (SparseOptFlow only) `qualityLevel` passed to `cv2.goodFeaturesToTrack`. - Minimum accepted quality of corners. 
A higher value keeps only stronger corners; - a lower value yields more corners (including weaker ones). + Minimum accepted quality of corners. A higher value keeps only stronger + corners; a lower value yields more corners (including weaker ones). sof_min_distance: (SparseOptFlow only) `minDistance` passed to `cv2.goodFeaturesToTrack`. Minimum Euclidean distance (in pixels) between returned corners. - Higher values produce more spatially spread points; lower values allow clustering. + Higher values produce more spatially spread points; lower values allow + clustering. sof_block_size: (SparseOptFlow only) `blockSize` passed to `cv2.goodFeaturesToTrack`. - Size of the neighborhood used to compute corner quality (structure tensor window). + Size of the neighborhood used to compute corner quality (structure tensor + window). sof_use_harris: - (SparseOptFlow only) `useHarrisDetector` passed to `cv2.goodFeaturesToTrack`. - If True, uses the Harris corner measure; if False, uses the Shi-Tomasi measure. + (SparseOptFlow only) `useHarrisDetector` passed to + `cv2.goodFeaturesToTrack`. If True, uses the Harris corner measure; + if False, uses the Shi-Tomasi measure. sof_k: (SparseOptFlow only) `k` passed to `cv2.goodFeaturesToTrack`. Harris detector free parameter. Ignored if `sof_use_harris` is False. 
""" - method: CMCTMethod = "orb" + method: CMCTMethod = "sparseOptFlow" downscale: int = 2 - # ORB parameters - fast_threshold: int = 20 + # Shared ORB and SIFT parameters (_estimate_feature_affine) ransac_reproj_threshold: float = 3.0 max_spatial_distance_frac: float = 0.25 roi_min_frac: float = 0.02 roi_max_frac: float = 0.98 + # ORB parameters + fast_threshold: int = 20 + + # SIFT parameters + sift_n_octave_layers: int = 3 + sift_contrast_threshold: float = 0.02 + sift_edge_threshold: int = 20 + # Sparse optical flow parameters (goodFeaturesToTrack) sof_max_corners: int = 1000 sof_quality_level: float = 0.01 @@ -137,7 +175,8 @@ class CMC: Notes: - H maps points from previous frame coordinates to current frame coordinates. - - This class does not perform any drawing/visualization; it only estimates transforms. + - This class does not perform any drawing/visualization; it only estimates + transforms. """ def __init__(self, cfg: Optional[CMCConfig] = None) -> None: @@ -148,8 +187,8 @@ def __init__(self, cfg: Optional[CMCConfig] = None) -> None: cfg: Optional configuration. If None, defaults are used. Notes: - - ORB detector/extractor/matcher are only created if method == "orb". - - Sparse optical flow parameters are always initialized (cheap). + - Detector/extractor/matcher are only created if method is "orb" or "sift". + - feature_params are only created if method is "sparseOptFlow". 
""" self.cfg = cfg or CMCConfig() self.downscale = max(1, int(self.cfg.downscale)) @@ -162,16 +201,27 @@ def __init__(self, cfg: Optional[CMCConfig] = None) -> None: self.detector = cv2.FastFeatureDetector_create(self.cfg.fast_threshold) self.extractor = cv2.ORB_create() self.matcher = cv2.BFMatcher(cv2.NORM_HAMMING) - - # SparseOptFlow params - self.feature_params = dict( - maxCorners=self.cfg.sof_max_corners, - qualityLevel=self.cfg.sof_quality_level, - minDistance=self.cfg.sof_min_distance, - blockSize=self.cfg.sof_block_size, - useHarrisDetector=self.cfg.sof_use_harris, - k=self.cfg.sof_k, - ) + elif self.cfg.method == "sift": + self.detector = cv2.SIFT_create( + nOctaveLayers=self.cfg.sift_n_octave_layers, + contrastThreshold=self.cfg.sift_contrast_threshold, + edgeThreshold=int(self.cfg.sift_edge_threshold), + ) + self.extractor = cv2.SIFT_create( + nOctaveLayers=self.cfg.sift_n_octave_layers, + contrastThreshold=self.cfg.sift_contrast_threshold, + edgeThreshold=int(self.cfg.sift_edge_threshold), + ) + self.matcher = cv2.BFMatcher(cv2.NORM_L2) + elif self.cfg.method == "sparseOptFlow": + self.feature_params = dict( + maxCorners=self.cfg.sof_max_corners, + qualityLevel=self.cfg.sof_quality_level, + minDistance=self.cfg.sof_min_distance, + blockSize=self.cfg.sof_block_size, + useHarrisDetector=self.cfg.sof_use_harris, + k=self.cfg.sof_k, + ) self.reset() @@ -191,16 +241,20 @@ def reset(self) -> None: # SparseOptFlow state self._prev_frame_gray: Optional[np.ndarray] = None - self._prev_points: Optional[np.ndarray] = None # shape (N,1,2) from goodFeaturesToTrack - def estimate(self, frame_bgr: np.ndarray, dets_xyxy: Optional[np.ndarray] = None) -> np.ndarray: + # shape (N,1,2) from goodFeaturesToTrack + self._prev_points: Optional[np.ndarray] = None + + def estimate(self, frame_bgr: np.ndarray, + dets_xyxy: Optional[np.ndarray] = None) -> np.ndarray: """ Estimate global affine transform H (2x3) from previous frame to current frame. 
Args: frame_bgr: Current frame in BGR format (uint8), shape (H, W, 3). - dets_xyxy: Optional detections (N,4) in xyxy format, in original image scale. - Used only by ORB method for masking out object regions (background-only features). + dets_xyxy: Optional detections (N,4) in xyxy format, in original image + scale. Used only by the ORB and SIFT methods for masking out object + regions (background-only features). Returns: H: Affine transform matrix of shape (2, 3), dtype float32. @@ -209,8 +263,8 @@ def estimate(self, frame_bgr: np.ndarray, dets_xyxy: Optional[np.ndarray] = None if frame_bgr is None: return np.eye(2, 3, dtype=np.float32) - if self.cfg.method == "orb": - return self._estimate_orb(frame_bgr, dets_xyxy) + if self.cfg.method == "orb" or self.cfg.method == "sift": + return self._estimate_feature_affine(frame_bgr, dets_xyxy) if self.cfg.method == "sparseOptFlow": return self._estimate_sparse_optflow(frame_bgr) @@ -218,14 +272,17 @@ def estimate(self, frame_bgr: np.ndarray, dets_xyxy: Optional[np.ndarray] = None # fallback return np.eye(2, 3, dtype=np.float32) - def _estimate_orb(self, frame_bgr: np.ndarray, dets_xyxy: Optional[np.ndarray] = None) -> np.ndarray: + def _estimate_feature_affine(self, frame_bgr: np.ndarray, + dets_xyxy: Optional[np.ndarray] = None) -> np.ndarray: """ - ORB-based affine estimation. + Feature affine estimation. ORB-based or SIFT-based + (different initializations of self.detector, self.extractor and self.matcher for + ORB and SIFT) Steps: 1) Convert to grayscale (+ optional downscale). 2) Create ROI mask and optionally mask out detections (background emphasis). - 3) Detect FAST keypoints and compute ORB descriptors. + 3) Detect keypoints (FAST or SIFT) and compute ORB or SIFT descriptors. 4) KNN match descriptors against previous frame (ratio test). 5) Filter matches by max spatial displacement and by 2.5*std inliers. 6) Estimate affine transform with RANSAC. 
@@ -256,7 +313,7 @@ def _estimate_orb(self, frame_bgr: np.ndarray, dets_xyxy: Optional[np.ndarray] = dets = np.asarray(dets_xyxy, dtype=np.float32) / float(self.downscale) dets = dets.astype(np.int32) - # Safety clipping + # Safety clipping to avoid negative/out-of-bounds slicing dets[:, 0] = np.clip(dets[:, 0], 0, W - 1) dets[:, 2] = np.clip(dets[:, 2], 0, W - 1) dets[:, 1] = np.clip(dets[:, 1], 0, H - 1) @@ -266,12 +323,13 @@ def _estimate_orb(self, frame_bgr: np.ndarray, dets_xyxy: Optional[np.ndarray] = if x2b > x1b and y2b > y1b: mask[y1b:y2b, x1b:x2b] = 0 - # Detect + describe + # Detect + describe (ORB) kps = self.detector.detect(gray, mask) kps, desc = self.extractor.compute(gray, kps) H_aff = np.eye(2, 3, dtype=np.float32) + # First frame init if not self._initialized: self._prev_kps = copy.copy(kps) self._prev_desc = None if desc is None else copy.copy(desc) @@ -289,7 +347,8 @@ def _estimate_orb(self, frame_bgr: np.ndarray, dets_xyxy: Optional[np.ndarray] = self._prev_desc = copy.copy(desc) return H_aff - max_spatial = self.cfg.max_spatial_distance_frac * np.array([W, H], dtype=np.float32) + max_spatial = self.cfg.max_spatial_distance_frac * np.array([W, H], + dtype=np.float32) prev_pts = [] curr_pts = [] @@ -361,7 +420,8 @@ def _estimate_sparse_optflow(self, frame_bgr: np.ndarray) -> np.ndarray: # Downscale if self.downscale > 1: - frame = cv2.resize(frame, (W_img // self.downscale, H_img // self.downscale)) + frame = cv2.resize(frame, (W_img // self.downscale, + H_img // self.downscale)) # Find keypoints in current frame keypoints = cv2.goodFeaturesToTrack(frame, mask=None, **self.feature_params) @@ -381,7 +441,8 @@ def _estimate_sparse_optflow(self, frame_bgr: np.ndarray) -> np.ndarray: # Optical flow correspondences # calcOpticalFlowPyrLK will throw or return nonsense if we give it None - matched, status, _err = cv2.calcOpticalFlowPyrLK(self._prev_frame_gray, frame, self._prev_points, None) + matched, status, _err = 
cv2.calcOpticalFlowPyrLK(self._prev_frame_gray, frame, + self._prev_points, None) if status is None or matched is None: self._prev_frame_gray = frame.copy() @@ -403,7 +464,6 @@ def _estimate_sparse_optflow(self, frame_bgr: np.ndarray) -> np.ndarray: curr_pts = np.array(curr_pts) # Find rigid matrix - # if (np.size(prev_pts, 0) > 4) and (np.size(prev_pts, 0) == np.size(prev_pts, 0)): if (np.size(prev_pts, 0) > 4) and (np.size(prev_pts, 0) == np.size(curr_pts, 0)): H_est, _ = cv2.estimateAffinePartial2D(prev_pts, curr_pts, cv2.RANSAC) if H_est is not None: @@ -436,7 +496,8 @@ def apply_to_tracks(tracks: list, H: np.ndarray) -> None: state := A * state + translation P := A * P * A^T - Where A applies the 2x2 rotation/shear block to each 2D component block in the state. + Where A applies the 2x2 rotation/shear block to each 2D component block in the + state. Args: tracks: List of track objects with `.state` and `.P` attributes. @@ -460,7 +521,8 @@ def apply_to_tracks(tracks: list, H: np.ndarray) -> None: A8[0:4, 0:4] = A4 A8[4:8, 4:8] = A4 - trans4 = np.array([t[0, 0], t[1, 0], t[0, 0], t[1, 0]], dtype=np.float32).reshape(4, 1) + trans4 = np.array([t[0, 0], t[1, 0], t[0, 0], t[1, 0]], + dtype=np.float32).reshape(4, 1) for trk in tracks: trk.state = (A8 @ trk.state).astype(np.float32) diff --git a/trackers/core/botsort/tracker.py b/trackers/core/botsort/tracker.py index c49808a2..05717551 100644 --- a/trackers/core/botsort/tracker.py +++ b/trackers/core/botsort/tracker.py @@ -23,8 +23,8 @@ class BoTSORTTracker(BaseTracker): """ BoT-SORT-style multi-object tracker (IoU association + optional CMC). 
- The tracker maintains a list of active tracks (Kalman-filter-based) and, for each frame, - performs: + The tracker maintains a list of active tracks (Kalman-filter-based) and, for each + frame, performs: 1) Predict existing track states (Kalman predict) 2) Split detections into high/low confidence groups 3) Apply camera motion compensation to predicted tracks @@ -33,17 +33,21 @@ class BoTSORTTracker(BaseTracker): 6) Spawn new tracks from unmatched high-confidence detections 7) Remove tracks that have been lost for too long - Parameters in __init__ control thresholds and lifecycle logic similarly to ByteTrack/BoT-SORT. + Parameters in __init__ control thresholds and lifecycle logic similarly to + ByteTrack. Attributes: tracks: List of active `BoTSORTKalmanBoxTracker` objects. - maximum_frames_without_update: Max number of consecutive frames a track can go unmatched - before being removed. - minimum_consecutive_frames: Track maturity threshold before assigning a permanent ID. + maximum_frames_without_update: Max number of consecutive frames a track can go + unmatched before being removed. + minimum_consecutive_frames: Track maturity threshold before assigning a + permanent ID. minimum_iou_threshold: Minimum IoU required for a valid match. track_activation_threshold: Confidence threshold for spawning a new track. - high_conf_det_threshold: Confidence threshold splitting detections into high/low groups. - enable_cmc: Whether to run camera motion compensation each frame (if `cmc` is set). + high_conf_det_threshold: Confidence threshold splitting detections into + high/low groups. + enable_cmc: Whether to run camera motion compensation each frame + (if `cmc` is set). cmc: Camera motion compensation instance (or None if disabled). 
""" @@ -56,7 +60,6 @@ def __init__( minimum_iou_threshold: float = 0.1, high_conf_det_threshold: float = 0.6, enable_cmc: bool = True, - # cmc_method: str = "orb", cmc_method: str = "sparseOptFlow", cmc_downscale: int = 2, @@ -65,19 +68,22 @@ def __init__( Initialize the tracker. Args: - lost_track_buffer: Time buffer (in frames at 30 FPS) for keeping lost tracks alive - before deletion. This is scaled by `frame_rate`. - frame_rate: Video frame rate used to scale the lost track buffer to time-like behavior. - track_activation_threshold: Minimum detection confidence to spawn a new track. - minimum_consecutive_frames: Number of successful updates required before assigning - a stable track ID (different than initial -1). + lost_track_buffer: Time buffer (in frames at 30 FPS) for keeping lost tracks + alive before deletion. This is scaled by `frame_rate`. + frame_rate: Video frame rate used to scale the lost track buffer to + time-like behavior. + track_activation_threshold: Minimum detection confidence to spawn a new + track. + minimum_consecutive_frames: Number of successful updates required before + assigning a stable track ID (different than initial -1). minimum_iou_threshold: Minimum IoU to accept a detection-track association. high_conf_det_threshold: Confidence threshold used to split detections into: - high confidence: confidence >= threshold - low confidence: confidence < threshold enable_cmc: Whether to enable camera motion compensation (CMC). - cmc_method: CMC method string passed into `CMCConfig(method=...)`. Supported values - depend on `CMC` (e.g. "orb", "sparseOptFlow"). See CMCConfig. + cmc_method: CMC method string passed into `CMCConfig(method=...)`. + Supported values depend on `CMC` (e.g. "orb", "sift", "sparseOptFlow"). + See CMCConfig. cmc_downscale: Downscale factor used inside CMC for speed/robustness. 
Notes: @@ -96,7 +102,8 @@ def __init__( self.tracks: list[BoTSORTKalmanBoxTracker] = [] self.enable_cmc = enable_cmc - self.cmc = CMC(CMCConfig(method=cmc_method, downscale=cmc_downscale)) if enable_cmc else None + self.cmc = CMC(CMCConfig(method=cmc_method, + downscale=cmc_downscale)) if enable_cmc else None def _update_detections( self, @@ -110,16 +117,17 @@ def _update_detections( For each (track_idx, det_idx) match: - Update the track’s Kalman state with the detection bbox. - - If the track is “mature” (>= minimum_consecutive_frames) and still has tracker_id == -1, - assign a new unique tracker ID. - - Create a single-row `sv.Detections` object for the matched detection and set its - tracker_id to the track ID (or -1 if not mature yet). + - If the track is “mature” (>= minimum_consecutive_frames) and still has + tracker_id == -1, assign a new unique tracker ID. + - Create a single-row `sv.Detections` object for the matched detection and set + its tracker_id to the track ID (or -1 if not mature yet). - Append it to `updated_detections`. Args: tracks: Tracks being updated. detections: Detections used for update. - updated_detections: Accumulator list of per-detection outputs for this frame. + updated_detections: Accumulator list of per-detection outputs for this + frame. matched_indices: List of (track_row_index, detection_col_index) pairs. Returns: @@ -155,34 +163,39 @@ def update( This is the main per-frame entry point. Args: - detections: Supervision detections for the current frame. Must include `.xyxy`. - Confidence (`detections.confidence`) is optional but recommended. - The method writes/overwrites `detections.tracker_id`. - frame: Current video frame in BGR format (H, W, 3), required if CMC is enabled. + detections: Supervision detections for the current frame. Must include + `.xyxy`. Confidence (`detections.confidence`) is optional but + recommended. The method writes/overwrites `detections.tracker_id`.
+ frame: Current video frame in BGR format (H, W, 3), required if CMC is + enabled. Returns: A merged `sv.Detections` object containing detections from this frame with `tracker_id` assigned: - >= 0 indicates a confirmed track ID - - -1 indicates unconfirmed/untracked (e.g., new / low confidence / not yet mature) + - -1 indicates unconfirmed/untracked (e.g., new / low confidence / not yet + mature) Notes: - - If CMC is enabled, the tracker estimates a global affine transform (2x3) from the - frame and uses it to warp predicted track states before association. + - If CMC is enabled, the tracker estimates a global affine transform (2x3) + from the frame and uses it to warp predicted track states before + association. """ if len(self.tracks) == 0 and len(detections) == 0: detections.tracker_id = np.array([], dtype=int) return detections updated_detections: list[ sv.Detections - ] = [] # List for returning the updated detections with its new assigned track id # noqa: E501 + ] = [] # List for returning the updated detections with its new assigned + # track id # noqa: E501 # Predict new locations for existing tracks for tracker in self.tracks: tracker.predict() # Assign a default tracker_id with the correct shape detections.tracker_id = -np.ones(len(detections)) - # Split into high confidence boxes and lower based on self.high_conf_det_threshold # noqa: E501 + # Split into high confidence boxes and lower based on + # self.high_conf_det_threshold # noqa: E501 high_prob_detections, low_prob_detections = ( self._get_high_and_low_probability_detections(detections) ) @@ -291,8 +304,9 @@ def _get_associated_indices( assignment problem in an optimal way. Args: - similarity_matrix: Similarity matrix between tracks (rows) and detections (columns). - min_similarity_thresh: Minimum similarity threshold for a valid match. + similarity_matrix: Similarity matrix between tracks (rows) and + detections (columns). + min_similarity_thresh: Minimum similarity threshold for a valid match.
Returns: Matched indices (list of (tracker_idx, detection_idx)), indices of @@ -362,8 +376,9 @@ def _similarity_step( detections: sv.Detections, tracks: list[BoTSORTKalmanBoxTracker], ) -> tuple[list[tuple[int, int]], set[int], set[int]]: - """Measures similarity based on IoU between tracks and detections and returns the matches - and unmatched tracks/detections. Is used for step 1 and 2 of the BYTE algorithm. + """Measures similarity based on IoU between tracks and detections and returns + the matches and unmatched tracks/detections. Is used for step 1 and 2 of the + BYTE algorithm. Args: detections: The set of object detections. @@ -382,7 +397,8 @@ def _similarity_step( thresh = self.minimum_iou_threshold # Associate detections to tracks based on the higher value of the - # similarity matrix, using the Jonker-Volgenant algorithm (linear_sum_assignment). # noqa: E501 + # similarity matrix, using the Jonker-Volgenant algorithm + # (linear_sum_assignment). # noqa: E501 matched_indices, unmatched_tracks, unmatched_detections = ( self._get_associated_indices(similarity_matrix, thresh) ) From 1db6826f346b1fc88c87f1b79075355bb13cd6fc Mon Sep 17 00:00:00 2001 From: Tomasz Stanczyk Date: Tue, 10 Mar 2026 15:30:10 +0100 Subject: [PATCH 04/14] Adjust Kalman filter as in original BoT-SORT --- trackers/core/botsort/cmc.py | 126 ++++-- trackers/core/botsort/kalman_box_tracker.py | 433 ++++++++++++++++---- 2 files changed, 460 insertions(+), 99 deletions(-) diff --git a/trackers/core/botsort/cmc.py b/trackers/core/botsort/cmc.py index 5db0cb74..0fc16f72 100644 --- a/trackers/core/botsort/cmc.py +++ b/trackers/core/botsort/cmc.py @@ -302,6 +302,7 @@ def _estimate_feature_affine(self, frame_bgr: np.ndarray, gray = cv2.resize(gray, (W_img // self.downscale, H_img // self.downscale)) H, W = gray.shape[:2] + # Build mask: central ROI + remove detections (background features) mask = np.zeros_like(gray, dtype=np.uint8) y0 = int(self.cfg.roi_min_frac * H) y1 = int(self.cfg.roi_max_frac
* H) @@ -486,45 +487,120 @@ def _estimate_sparse_optflow(self, frame_bgr: np.ndarray) -> np.ndarray: @staticmethod def apply_to_tracks(tracks: list, H: np.ndarray) -> None: """ - Apply affine transform H (2x3) to tracker states and covariances in-place. + Apply a global affine motion transform to tracker states and covariances in-place. - This implementation assumes each track has: - - `state`: (8,1) float vector [x1,y1,x2,y2,vx,vy,vx2,vy2]^T - - `P`: (8,8) covariance matrix + This method updates each track according to the affine transform - The transform is applied as: - state := A * state + translation - P := A * P * A^T + x' = R x + t - Where A applies the 2x2 rotation/shear block to each 2D component block in the - state. + where: + R: + 2x2 linear part of the affine transform (rotation / shear / scale-like part). + t: + 2D translation vector. + + The input transform `H` is expected in standard OpenCV affine form: + + H = [ R | t ] + + with shape (2, 3). + + Tracker state convention: + Each track is assumed to store its Kalman state as + + [xc, yc, w, h, vxc, vyc, vw, vh]^T + + where: + xc, yc: + Bounding box center coordinates. + w, h: + Bounding box width and height. + vxc, vyc: + Velocities of the center coordinates. + vw, vh: + Velocities of the width and height. + + State update logic: + The affine transform is applied only to the geometric quantities that live in + the 2D image plane as position or velocity vectors: + + 1) Center position: + [xc, yc]^T = R @ [xc, yc]^T + t + + 2) Center velocity: + [vxc, vyc]^T = R @ [vxc, vyc]^T + + 3) Width, height, and their velocities: + [w, h, vw, vh] remain unchanged + + Why width and height are not transformed here: + Width and height are scalar box dimensions, not 2D point coordinates. + In this implementation, camera motion compensation is used to correct the + object center location and its image-plane velocity, while the box size terms + are left unchanged. 
This keeps the compensation simple and consistent with the + state representation used by the tracker. + + Covariance update: + Each track also stores a covariance matrix `P` describing uncertainty in the + 8D Kalman state. After the mean state is transformed, the covariance is updated + using the linear transform + + P = A @ P @ A.T + + where `A` is an 8x8 block matrix that applies `R` to: + - the center position block [xc, yc] + - the center velocity block [vxc, vyc] + + and leaves the remaining state dimensions unchanged. + + Concretely: + - A[0:2, 0:2] = R + - A[4:6, 4:6] = R + - all other diagonal entries remain 1 Args: - tracks: List of track objects with `.state` and `.P` attributes. - H: Affine transform (2,3) mapping prev -> curr. + tracks: + List of track objects. Each track is expected to expose: + - `state`: NumPy array of shape (8, 1) + - `P`: NumPy array of shape (8, 8) + H: + Affine transform matrix of shape (2, 3), mapping previous-frame image + coordinates to current-frame image coordinates. Returns: - None. Tracks are modified in-place. + None. + The tracks are modified in-place. + + Notes: + - If `H` is None or `tracks` is empty, this method does nothing. + - The method assumes that `H` has already been estimated in image coordinates + consistent with the tracker state. + - This method does not perform any validity checks on whether the estimated + transform is physically plausible; it simply applies the provided transform. """ if H is None or len(tracks) == 0: return H = H.astype(np.float32) R = H[:2, :2] - t = H[:2, 2:3] # (2,1) + t = H[:2, 2] - A4 = np.zeros((4, 4), dtype=np.float32) - A4[0:2, 0:2] = R - A4[2:4, 2:4] = R + for trk in tracks: + x = trk.state.reshape(-1) - A8 = np.zeros((8, 8), dtype=np.float32) - A8[0:4, 0:4] = A4 - A8[4:8, 4:8] = A4 + # Update the state mean using the affine transform. 
+ pos = x[0:2] + vel = x[4:6] - trans4 = np.array([t[0, 0], t[1, 0], t[0, 0], t[1, 0]], - dtype=np.float32).reshape(4, 1) + x[0:2] = R @ pos + t + x[4:6] = R @ vel - for trk in tracks: - trk.state = (A8 @ trk.state).astype(np.float32) - trk.state[0:4] += trans4 - trk.P = (A8 @ trk.P @ A8.T).astype(np.float32) \ No newline at end of file + trk.state = x.reshape(8, 1).astype(np.float32) + + # Update the state covariance under the corresponding linear transform. + A = np.eye(8, dtype=np.float32) + A[0:2, 0:2] = R # center position + A[4:6, 4:6] = R # center velocity + # Box size terms (w, h, vw, vh) are not transformed in this implementation. + + trk.P = (A @ trk.P @ A.T).astype(np.float32) \ No newline at end of file diff --git a/trackers/core/botsort/kalman_box_tracker.py b/trackers/core/botsort/kalman_box_tracker.py index dc19df67..90cc04b2 100644 --- a/trackers/core/botsort/kalman_box_tracker.py +++ b/trackers/core/botsort/kalman_box_tracker.py @@ -6,119 +6,412 @@ import numpy as np + class BoTSORTKalmanBoxTracker: """ - The `BoTSORTKalmanBoxTracker` class represents the internals of a single - tracked object (bounding box), with a Kalman filter to predict and update - its position. - - Attributes: - tracker_id: Unique identifier for the tracker. - number_of_successful_updates: Number of times the object has been - updated successfully. - time_since_update: Number of frames since the last update. - state: State vector of the bounding box. - F: State transition matrix. - H: Measurement matrix. - Q: Process noise covariance matrix. - R: Measurement noise covariance matrix. - P: Error covariance matrix. - count_id: Class variable to assign unique IDs to each tracker. - - Args: - bbox: Initial bounding box in the form [x1, y1, x2, y2]. + Kalman-filter-based state estimator for a single tracked object. + + This class maintains the motion state of one object using a linear Kalman filter + with a constant-velocity model. 
The tracker stores the object state internally in + center-width-height form, but accepts detections and returns boxes in standard + corner format. + + Internal state vector: + [xc, yc, w, h, vxc, vyc, vw, vh]^T + + where: + xc, yc: + Bounding box center coordinates. + w, h: + Bounding box width and height. + vxc, vyc: + Velocities of the center coordinates. + vw, vh: + Velocities of the width and height. + + Public input/output convention: + - input detections to `__init__()` and `update()` are expected in xyxy format: + [x1, y1, x2, y2] + - output from `get_state_bbox()` is returned in xyxy format: + [x1, y1, x2, y2] + + Kalman filter matrices used in this class: + F: + State transition matrix. Propagates the state from one frame to the next + under a constant-velocity assumption. + H: + Measurement matrix. Maps the internal 8D state to the observable 4D + measurement space [xc, yc, w, h]. + Q: + Process noise covariance. Models uncertainty in the motion model used + during prediction. + R: + Measurement noise covariance. Models uncertainty in incoming detections + during the update step. + P: + State covariance matrix. Represents the current uncertainty of the full + 8D state estimate. + + Lifecycle-related attributes: + tracker_id: + Permanent track identifier. Starts at -1 and is assigned later by the + outer tracking logic once the track is considered mature. + number_of_successful_updates: + Number of successful detection-based updates received by this track. + time_since_update: + Number of consecutive prediction steps since the last measurement update. + + Notes: + - The process and measurement noise are scaled using the current object width + and height. This makes the uncertainty proportional to object size. + - Width and height are constrained to remain positive after prediction and + update to avoid degenerate boxes. """ count_id = 0 @classmethod def get_next_tracker_id(cls) -> int: - """ - Class method that returns the next available tracker ID. 
- - Returns: - The next available tracker ID. - """ next_id = cls.count_id cls.count_id += 1 return next_id def __init__(self, bbox: np.ndarray): - # Initialize with a temporary ID of -1 - # Will be assigned a real ID when the track is considered mature - self.tracker_id = -1 + """ + Initialize a new track from the first observed bounding box. + + Args: + bbox: + Initial detection in xyxy format: [x1, y1, x2, y2]. + + Initialization steps: + 1) Set track-management attributes such as `tracker_id`, + `number_of_successful_updates`, and `time_since_update`. + 2) Allocate the internal 8D Kalman state vector: + [xc, yc, w, h, vxc, vyc, vw, vh]^T + 3) Convert the input bounding box from xyxy to xywh form: + [xc, yc, w, h] + 4) Store that measurement in the position/size part of the state. + 5) Initialize the Kalman filter matrices F, H, Q, R, and P. - # Number of hits indicates how many times the object has been - # updated successfully + Notes: + - Initial velocities are set to zero. + - The initial covariance matrix P is set in `_initialize_kalman_filter()` + and reflects uncertainty about both position/size and velocity. + """ + self.tracker_id = -1 self.number_of_successful_updates = 1 - # Number of frames since the last update self.time_since_update = 0 - # For simplicity, we keep a small state vector: - # (x, y, x2, y2, vx, vy, vx2, vy2). 
- # We'll store the bounding box in "self.state" + # State mean: [xc, yc, w, h, vxc, vyc, vw, vh]^T self.state = np.zeros((8, 1), dtype=np.float32) - # Initialize state directly from the first detection - self.state[0] = bbox[0] - self.state[1] = bbox[1] - self.state[2] = bbox[2] - self.state[3] = bbox[3] + # Initialize from first detection in xyxy + measurement = self.xyxy_to_xywh(bbox) + self.state[0:4, 0] = measurement - # Basic constant velocity model - self._initialize_kalman_filter() + self._initialize_kalman_filter(measurement) - def _initialize_kalman_filter(self) -> None: + @staticmethod + def xyxy_to_xywh(bbox: np.ndarray) -> np.ndarray: """ - Sets up the matrices for the Kalman filter. + Convert a bounding box from corner format to center-size format. + + Args: + bbox: + Bounding box in xyxy format: [x1, y1, x2, y2]. + + Returns: + Bounding box in xywh format: [xc, yc, w, h]. + """ + x1, y1, x2, y2 = bbox.astype(np.float32) + w = x2 - x1 + h = y2 - y1 + xc = x1 + w / 2.0 + yc = y1 + h / 2.0 + return np.array([xc, yc, w, h], dtype=np.float32) + + @staticmethod + def xywh_to_xyxy(state_xywh: np.ndarray) -> np.ndarray: + """ + Convert a bounding box from center-size format to corner format. + + Args: + state_xywh: + Bounding box in xywh format: [xc, yc, w, h]. + + Returns: + Bounding box in xyxy format: [x1, y1, x2, y2]. + """ + xc, yc, w, h = state_xywh.astype(np.float32) + x1 = xc - w / 2.0 + y1 = yc - h / 2.0 + x2 = xc + w / 2.0 + y2 = yc + h / 2.0 + return np.array([x1, y1, x2, y2], dtype=np.float32) + + def _initialize_kalman_filter(self, measurement: np.ndarray) -> None: + """ + Initialize the Kalman filter matrices for the current track. + + Args: + measurement: + Initial object measurement in xywh format: + [xc, yc, w, h]. + + This method initializes the following matrices: + + State transition matrix: + F is an 8x8 matrix defining how the state evolves from one frame to the next. 
+ It implements a constant-velocity model: + xc <- xc + vxc + yc <- yc + vyc + w <- w + vw + h <- h + vh + while the velocity terms are carried forward unchanged. + + Measurement matrix: + H is a 4x8 matrix mapping the internal 8D state + [xc, yc, w, h, vxc, vyc, vw, vh]^T + to the observable 4D measurement + [xc, yc, w, h]^T. + In other words, only the first four state components are directly observed. + + Process noise covariance: + Q is an 8x8 diagonal matrix representing uncertainty in the motion model + used during prediction. Larger values allow the predicted state to change + more freely from frame to frame. + + Measurement noise covariance: + R is a 4x4 diagonal matrix representing uncertainty in the detector + measurements used during correction/update. + + State covariance: + P is the initial 8x8 covariance matrix representing uncertainty in the + initial state estimate. The velocity terms are initialized with larger + uncertainty than the position/size terms because they are not directly + observed in the first frame. + + Noise scaling: + The diagonal entries of Q, R, and P are scaled using the initial object + width and height. This makes the uncertainty proportional to object size: + larger objects are allowed proportionally larger absolute motion and noise. + + Notes: + - `sigma_p` controls the scale of position/size process noise. + - `sigma_v` controls the scale of velocity process noise. + - `sigma_m` controls the scale of measurement noise. + - All covariance matrices are diagonal in this implementation. """ - # State transition matrix (F): 8x8 - # We assume a constant velocity model. Positions are incremented by - # velocity each step. self.F = np.eye(8, dtype=np.float32) for i in range(4): self.F[i, i + 4] = 1.0 - # Measurement matrix (H): we directly measure x1, y1, x2, y2 - self.H = np.eye(4, 8, dtype=np.float32) # 4x8 + self.H = np.eye(4, 8, dtype=np.float32) + + # BoT-SORT-style scale-aware noise using width/height. 
+ sigma_p = 0.05 + sigma_v = 0.00625 + sigma_m = 0.05 - # Process covariance matrix (Q) - self.Q = np.eye(8, dtype=np.float32) * 0.01 + w, h = measurement[2], measurement[3] + + q_diag = np.array([ + (sigma_p * w) ** 2, + (sigma_p * h) ** 2, + (sigma_p * w) ** 2, + (sigma_p * h) ** 2, + (sigma_v * w) ** 2, + (sigma_v * h) ** 2, + (sigma_v * w) ** 2, + (sigma_v * h) ** 2, + ], dtype=np.float32) + self.Q = np.diag(q_diag) + + r_diag = np.array([ + (sigma_m * w) ** 2, + (sigma_m * h) ** 2, + (sigma_m * w) ** 2, + (sigma_m * h) ** 2, + ], dtype=np.float32) + self.R = np.diag(r_diag) + + # Initial covariance, as in original BoT-SORT KF + p_diag = np.array([ + (2 * sigma_p * w) ** 2, + (2 * sigma_p * h) ** 2, + (2 * sigma_p * w) ** 2, + (2 * sigma_p * h) ** 2, + (10 * sigma_v * w) ** 2, + (10 * sigma_v * h) ** 2, + (10 * sigma_v * w) ** 2, + (10 * sigma_v * h) ** 2, + ], dtype=np.float32) + self.P = np.diag(p_diag) + + def _update_process_and_measurement_noise(self) -> None: + """ + Recompute the process and measurement noise covariances from the current box size. - # Measurement covariance (R): noise in detection - self.R = np.eye(4, dtype=np.float32) * 0.1 + This method updates: - # Error covariance matrix (P) - self.P = np.eye(8, dtype=np.float32) + Q: + Process noise covariance, used in the prediction step. + It models uncertainty in how the state changes from one frame to the next. + + R: + Measurement noise covariance, used in the update step. + It models uncertainty in the current detection measurement. + + Why this update is needed: + The scale of the uncertainty should depend on the current object size. + For example, a 2-pixel error is relatively more important for a small object + than for a large one. Therefore, the diagonal entries of Q and R are computed + from the current predicted width and height stored in the state. 
+ + Implementation details: + - Width and height are read from the current state: + w = state[2], h = state[3] + - They are clamped to a small positive minimum to avoid zero or negative values. + - The resulting Q and R matrices remain diagonal. + + Notes: + This method does not update P directly. It only refreshes the noise models + used later in `predict()` and `update()`. + """ + sigma_p = 0.05 + sigma_v = 0.00625 + sigma_m = 0.05 + + w = max(float(self.state[2, 0]), 1e-3) + h = max(float(self.state[3, 0]), 1e-3) + + q_diag = np.array([ + (sigma_p * w) ** 2, + (sigma_p * h) ** 2, + (sigma_p * w) ** 2, + (sigma_p * h) ** 2, + (sigma_v * w) ** 2, + (sigma_v * h) ** 2, + (sigma_v * w) ** 2, + (sigma_v * h) ** 2, + ], dtype=np.float32) + self.Q = np.diag(q_diag) + + r_diag = np.array([ + (sigma_m * w) ** 2, + (sigma_m * h) ** 2, + (sigma_m * w) ** 2, + (sigma_m * h) ** 2, + ], dtype=np.float32) + self.R = np.diag(r_diag) def predict(self) -> None: """ - Predict the next state of the bounding box (applies the state transition). + Predict the next state and covariance using the Kalman motion model. + + This method performs the Kalman filter prediction step: + + state <- F @ state + P <- F @ P @ F.T + Q + + where: + F: + State transition matrix. + P: + Current state covariance matrix. + Q: + Process noise covariance. + + Effect of the prediction: + - The center position and box size are advanced using their current velocities. + - The covariance matrix P is propagated forward and increased by Q to reflect + additional uncertainty introduced during motion prediction. + + Additional behavior: + - The process and measurement noise matrices are refreshed first by calling + `_update_process_and_measurement_noise()`. + - Width and height are clamped to remain positive after prediction. + - `time_since_update` is incremented because this frame has not yet received + a measurement update. + + Notes: + This method does not use any detection input. 
It only extrapolates the track + state forward in time. """ + self._update_process_and_measurement_noise() + # Predict state self.state = self.F @ self.state - # Predict error covariance + + # Predict error (uncertainty) covariance self.P = self.F @ self.P @ self.F.T + self.Q + # Prevent degenerate box shape + self.state[2, 0] = max(self.state[2, 0], 1e-3) + self.state[3, 0] = max(self.state[3, 0], 1e-3) + # Increase time since update self.time_since_update += 1 def update(self, bbox: np.ndarray) -> None: """ - Updates the state with a new detected bounding box. + Correct the predicted state using a new detection. Args: - bbox: Detected bounding box in the form [x1, y1, x2, y2]. + bbox: + Detection bounding box in xyxy format: [x1, y1, x2, y2]. + + This method performs the Kalman filter correction/update step: + + measurement = xyxy_to_xywh(bbox) + S = H @ P @ H.T + R + K = P @ H.T @ inv(S) + y = measurement - H @ state + state = state + K @ y + P = (I - K @ H) @ P + + where: + measurement: + Observed bounding box converted to [xc, yc, w, h]. + S: + Innovation covariance. Represents uncertainty in the predicted + measurement. + K: + Kalman gain. Controls how strongly the state is corrected toward + the new measurement. + y: + Innovation (also called residual), i.e. the difference between the + observed measurement and the predicted measurement. + I: + Identity matrix of appropriate size. + + Effect of the update: + - The predicted state is corrected toward the observed detection. + - The covariance matrix P is reduced to reflect increased confidence + after receiving a measurement. + + Additional behavior: + - `time_since_update` is reset to zero. + - `number_of_successful_updates` is incremented. + - Width and height are clamped to remain positive after correction. + + Notes: + The measurement only directly observes [xc, yc, w, h], not the velocity + terms. 
However, the velocity estimates can still change indirectly through + the Kalman gain and the state covariance structure. """ self.time_since_update = 0 self.number_of_successful_updates += 1 + measurement = self.xyxy_to_xywh(bbox).reshape((4, 1)) + self._update_process_and_measurement_noise() + # Kalman Gain S = self.H @ self.P @ self.H.T + self.R K = self.P @ self.H.T @ np.linalg.inv(S) - # Residual - measurement = bbox.reshape((4, 1)) + # Innovation (residual) y = measurement - self.H @ self.state # Update state @@ -128,19 +421,11 @@ def update(self, bbox: np.ndarray) -> None: identity_matrix = np.eye(8, dtype=np.float32) self.P = (identity_matrix - K @ self.H) @ self.P + self.state[2, 0] = max(self.state[2, 0], 1e-3) + self.state[3, 0] = max(self.state[3, 0], 1e-3) + def get_state_bbox(self) -> np.ndarray: """ - Returns the current bounding box estimate from the state vector. - - Returns: - The bounding box [x1, y1, x2, y2]. - """ - return np.array( - [ - self.state[0], # x1 - self.state[1], # y1 - self.state[2], # x2 - self.state[3], # y2 - ], - dtype=float, - ).reshape(-1) + Return current predicted box in xyxy format. 
+ """ + return self.xywh_to_xyxy(self.state[0:4, 0]) \ No newline at end of file From 809cf3013de65750ca5e50d8a344e2f99c58338a Mon Sep 17 00:00:00 2001 From: Tomasz Stanczyk Date: Tue, 10 Mar 2026 15:38:52 +0100 Subject: [PATCH 05/14] Adjust comment line lengths --- trackers/core/botsort/cmc.py | 27 ++++++++++++--------- trackers/core/botsort/kalman_box_tracker.py | 21 +++++++++------- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/trackers/core/botsort/cmc.py b/trackers/core/botsort/cmc.py index 0fc16f72..45ddeeb9 100644 --- a/trackers/core/botsort/cmc.py +++ b/trackers/core/botsort/cmc.py @@ -487,7 +487,8 @@ def _estimate_sparse_optflow(self, frame_bgr: np.ndarray) -> np.ndarray: @staticmethod def apply_to_tracks(tracks: list, H: np.ndarray) -> None: """ - Apply a global affine motion transform to tracker states and covariances in-place. + Apply a global affine motion transform to tracker states and covariances + in-place. This method updates each track according to the affine transform @@ -495,7 +496,8 @@ def apply_to_tracks(tracks: list, H: np.ndarray) -> None: where: R: - 2x2 linear part of the affine transform (rotation / shear / scale-like part). + 2x2 linear part of the affine transform (rotation / shear / scale-like + part). t: 2D translation vector. @@ -521,8 +523,8 @@ def apply_to_tracks(tracks: list, H: np.ndarray) -> None: Velocities of the width and height. State update logic: - The affine transform is applied only to the geometric quantities that live in - the 2D image plane as position or velocity vectors: + The affine transform is applied only to the geometric quantities that live + in the 2D image plane as position or velocity vectors: 1) Center position: [xc, yc]^T = R @ [xc, yc]^T + t @@ -536,14 +538,14 @@ def apply_to_tracks(tracks: list, H: np.ndarray) -> None: Why width and height are not transformed here: Width and height are scalar box dimensions, not 2D point coordinates. 
In this implementation, camera motion compensation is used to correct the - object center location and its image-plane velocity, while the box size terms - are left unchanged. This keeps the compensation simple and consistent with the - state representation used by the tracker. + object center location and its image-plane velocity, while the box size + terms are left unchanged. This keeps the compensation simple and consistent + with the state representation used by the tracker. Covariance update: Each track also stores a covariance matrix `P` describing uncertainty in the - 8D Kalman state. After the mean state is transformed, the covariance is updated - using the linear transform + 8D Kalman state. After the mean state is transformed, the covariance is + updated using the linear transform P = A @ P @ A.T @@ -573,10 +575,11 @@ def apply_to_tracks(tracks: list, H: np.ndarray) -> None: Notes: - If `H` is None or `tracks` is empty, this method does nothing. - - The method assumes that `H` has already been estimated in image coordinates - consistent with the tracker state. + - The method assumes that `H` has already been estimated in image + coordinates consistent with the tracker state. - This method does not perform any validity checks on whether the estimated - transform is physically plausible; it simply applies the provided transform. + transform is physically plausible; it simply applies the provided + transform. """ if H is None or len(tracks) == 0: return diff --git a/trackers/core/botsort/kalman_box_tracker.py b/trackers/core/botsort/kalman_box_tracker.py index 90cc04b2..91b1a409 100644 --- a/trackers/core/botsort/kalman_box_tracker.py +++ b/trackers/core/botsort/kalman_box_tracker.py @@ -162,8 +162,8 @@ def _initialize_kalman_filter(self, measurement: np.ndarray) -> None: This method initializes the following matrices: State transition matrix: - F is an 8x8 matrix defining how the state evolves from one frame to the next.
- It implements a constant-velocity model: + F is an 8x8 matrix defining how the state evolves from one frame to the + next. It implements a constant-velocity model: xc <- xc + vxc yc <- yc + vyc w <- w + vw @@ -251,7 +251,8 @@ def _initialize_kalman_filter(self, measurement: np.ndarray) -> None: def _update_process_and_measurement_noise(self) -> None: """ - Recompute the process and measurement noise covariances from the current box size. + Recompute the process and measurement noise covariances from the current box + size. This method updates: @@ -266,13 +267,14 @@ def _update_process_and_measurement_noise(self) -> None: Why this update is needed: The scale of the uncertainty should depend on the current object size. For example, a 2-pixel error is relatively more important for a small object - than for a large one. Therefore, the diagonal entries of Q and R are computed - from the current predicted width and height stored in the state. + than for a large one. Therefore, the diagonal entries of Q and R are + computed from the current predicted width and height stored in the state. Implementation details: - Width and height are read from the current state: w = state[2], h = state[3] - - They are clamped to a small positive minimum to avoid zero or negative values. + - They are clamped to a small positive minimum to avoid zero or negative + values. - The resulting Q and R matrices remain diagonal. Notes: @@ -324,9 +326,10 @@ def predict(self) -> None: Process noise covariance. Effect of the prediction: - - The center position and box size are advanced using their current velocities. - - The covariance matrix P is propagated forward and increased by Q to reflect - additional uncertainty introduced during motion prediction. + - The center position and box size are advanced using their current + velocities. + - The covariance matrix P is propagated forward and increased by Q to + reflect additional uncertainty introduced during motion prediction. 
Additional behavior: - The process and measurement noise matrices are refreshed first by calling From 8cdf29dd639faf27386771d5e5925279654f45e5 Mon Sep 17 00:00:00 2001 From: Tomasz Stanczyk Date: Tue, 10 Mar 2026 15:46:09 +0100 Subject: [PATCH 06/14] Remove the bad quotation sign from docs --- trackers/core/botsort/cmc.py | 2 +- trackers/core/botsort/tracker.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/trackers/core/botsort/cmc.py b/trackers/core/botsort/cmc.py index 45ddeeb9..6243c159 100644 --- a/trackers/core/botsort/cmc.py +++ b/trackers/core/botsort/cmc.py @@ -60,7 +60,7 @@ class CMCConfig: ransac_reproj_threshold: (ORB only) RANSAC reprojection threshold in pixels passed to - OpenCV’s affine estimation. It controls how far a point is allowed to + OpenCV's affine estimation. It controls how far a point is allowed to deviate from the estimated model while still being counted as an inlier. Smaller values are stricter (reject more matches); larger values are more tolerant. diff --git a/trackers/core/botsort/tracker.py b/trackers/core/botsort/tracker.py index 05717551..fa2db77a 100644 --- a/trackers/core/botsort/tracker.py +++ b/trackers/core/botsort/tracker.py @@ -116,7 +116,7 @@ def _update_detections( Apply matched detection updates to tracks and append corresponding outputs. For each (track_idx, det_idx) match: - - Update the track’s Kalman state with the detection bbox. + - Update the track's Kalman state with the detection bbox. - If the track is “mature” (>= minimum_consecutive_frames) and still has tracker_id == -1, assign a new unique tracker ID. 
- Create a single-row `sv.Detections` object for the matched detection and set From 504b7b58576a03fb13c5c8795b4b562457317c2d Mon Sep 17 00:00:00 2001 From: Tomasz Stanczyk Date: Mon, 16 Mar 2026 14:39:00 +0100 Subject: [PATCH 07/14] Add ECC camera motion compensation --- trackers/core/botsort/cmc.py | 121 ++++++++++++++++++++++++++++++- trackers/core/botsort/tracker.py | 4 +- 2 files changed, 119 insertions(+), 6 deletions(-) diff --git a/trackers/core/botsort/cmc.py b/trackers/core/botsort/cmc.py index 6243c159..e480226b 100644 --- a/trackers/core/botsort/cmc.py +++ b/trackers/core/botsort/cmc.py @@ -11,7 +11,7 @@ import numpy as np import cv2 -CMCTMethod = Literal["orb", "sift", "sparseOptFlow"] +CMCTMethod = Literal["orb", "sift", "sparseOptFlow", "ecc"] @dataclass class CMCConfig: @@ -40,6 +40,10 @@ class CMCConfig: - "sparseOptFlow": Sparse optical flow using corner tracking: goodFeaturesToTrack -> calcOpticalFlowPyrLK -> robust affine estimation (RANSAC). + - "ecc": Global image alignment using the Enhanced Correlation Coefficient + (ECC) optimization method. This estimates a 2D Euclidean transform + directly from grayscale image intensities rather than from sparse feature + correspondences. downscale: Integer downscale factor applied to frames before running CMC. @@ -134,6 +138,19 @@ class CMCConfig: sof_k: (SparseOptFlow only) `k` passed to `cv2.goodFeaturesToTrack`. Harris detector free parameter. Ignored if `sof_use_harris` is False. + + ecc_number_of_iterations: + (ECC only) Maximum number of optimization iterations used by the ECC + alignment procedure. + + ecc_termination_eps: + (ECC only) Convergence tolerance used by the ECC optimizer. + Smaller values require a more precise fit and may increase runtime. + + ecc_gaussian_filter_size: + (ECC only) Gaussian filter size parameter passed to OpenCV's + `findTransformECC`. This can help stabilize optimization on noisy frames. + A value of 1 matches the current implementation. 
""" method: CMCTMethod = "sparseOptFlow" downscale: int = 2 @@ -160,6 +177,18 @@ class CMCConfig: sof_use_harris: bool = False sof_k: float = 0.04 + # ECC parameters + + # BoT-SORT's original - resulting in veeery long (=unacceptably long) execution time + # ecc_number_of_iterations: int = 5000 + # ecc_termination_eps: float = 1e-6 + + # Adjusted + ecc_number_of_iterations: int = 50 + ecc_termination_eps: float = 1e-4 + + ecc_gaussian_filter_size: int = 1 + class CMC: """ @@ -189,6 +218,7 @@ def __init__(self, cfg: Optional[CMCConfig] = None) -> None: Notes: - Detector/extractor/matcher are only created if method is "orb" or "sift". - feature_paramsare only created if method is "sparseOptFlow". + - ECC optimization settings are created for "ecc". """ self.cfg = cfg or CMCConfig() self.downscale = max(1, int(self.cfg.downscale)) @@ -222,6 +252,13 @@ def __init__(self, cfg: Optional[CMCConfig] = None) -> None: useHarrisDetector=self.cfg.sof_use_harris, k=self.cfg.sof_k, ) + elif self.cfg.method == "ecc": + self.warp_mode = cv2.MOTION_EUCLIDEAN + self.criteria = ( + cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, + self.cfg.ecc_number_of_iterations, + self.cfg.ecc_termination_eps, + ) self.reset() @@ -252,9 +289,9 @@ def estimate(self, frame_bgr: np.ndarray, Args: frame_bgr: Current frame in BGR format (uint8), shape (H, W, 3). - dets_xyxy: Optional detections (N,4) in xyxy format, in original image - scale. Used only by ORB method for masking out object regions - (background-only features). + dets_xyxy: Optional detections (N,4) in xyxy format, in original image + scale. Used by feature-based methods (ORB and SIFT) to mask out object + regions during motion estimation. Returns: H: Affine transform matrix of shape (2, 3), dtype float32. 
@@ -269,6 +306,9 @@ def estimate(self, frame_bgr: np.ndarray, if self.cfg.method == "sparseOptFlow": return self._estimate_sparse_optflow(frame_bgr) + if self.cfg.method == "ecc": + return self._estimate_ecc(frame_bgr) + # fallback return np.eye(2, 3, dtype=np.float32) @@ -483,6 +523,79 @@ def _estimate_sparse_optflow(self, frame_bgr: np.ndarray) -> np.ndarray: self._prev_points = None if keypoints is None else keypoints.copy() return H_aff + + + def _estimate_ecc(self, frame_bgr: np.ndarray) -> np.ndarray: + """ + ECC-based affine motion estimation. + + This method estimates a global 2D Euclidean transform between the previous + frame and the current frame using OpenCV's Enhanced Correlation Coefficient + (ECC) image alignment algorithm. + + Steps: + 1) Convert the current frame to grayscale. + 2) Optionally smooth and downscale the frame. + 3) If this is the first frame, store it and return identity. + 4) Optimize a 2x3 warp matrix aligning the previous frame to the current + frame. + 5) If optimization succeeds, return the estimated transform. + Otherwise, keep the identity transform. + 6) Store the current frame for the next call. + + Args: + frame_bgr: + Current frame in BGR format. + + Returns: + H: + Affine transform matrix of shape (2, 3), dtype float32, mapping + previous-frame coordinates to current-frame coordinates. Returns + identity if initialization has not yet occurred or if ECC optimization + fails. 
+ """ + H_img, W_img = frame_bgr.shape[:2] + frame = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY) + + H_aff = np.eye(2, 3, dtype=np.float32) + + if self.downscale > 1: + frame = cv2.GaussianBlur(frame, (3, 3), 1.5) + frame = cv2.resize(frame, (W_img // self.downscale, H_img // self.downscale)) + + if not self._initialized: + self._prev_frame_gray = frame.copy() + self._initialized = True + return H_aff + + if self._prev_frame_gray is None: + self._prev_frame_gray = frame.copy() + return H_aff + + try: + _cc, H_est = cv2.findTransformECC( + self._prev_frame_gray, + frame, + H_aff, + self.warp_mode, + self.criteria, + None, + self.cfg.ecc_gaussian_filter_size, + ) + if H_est is not None: + H_aff = H_est.astype(np.float32) + except cv2.error as e: + print('Warning: find transform failed. Set warp as identity') + pass + + # NOTE: this line is not included in the original BoT-SORT. However, + # in a working recurrent estimator, you do need to update the previous frame + # after each call. Otherwise the next call would keep aligning against an old + # frame. + self._prev_frame_gray = frame.copy() + + return H_aff + @staticmethod def apply_to_tracks(tracks: list, H: np.ndarray) -> None: diff --git a/trackers/core/botsort/tracker.py b/trackers/core/botsort/tracker.py index fa2db77a..b7bb99dc 100644 --- a/trackers/core/botsort/tracker.py +++ b/trackers/core/botsort/tracker.py @@ -82,8 +82,8 @@ def __init__( - low confidence: confidence < threshold enable_cmc: Whether to enable camera motion compensation (CMC). cmc_method: CMC method string passed into `CMCConfig(method=...)`. - Supported values depend on `CMC` (e.g. "orb", "sift", "sparseOptFlow"). - See CMCConfig. + Supported values depend on `CMC` (e.g. "orb", "sift", "sparseOptFlow", + "ecc"). See CMCConfig. cmc_downscale: Downscale factor used inside CMC for speed/robustness. 
Notes: From 4a8e2762f855c1f7adfb8e83a61413675c93095d Mon Sep 17 00:00:00 2001 From: Tomasz Stanczyk Date: Sun, 22 Mar 2026 12:01:38 +0100 Subject: [PATCH 08/14] Change min_iou_match thresh value. Introduce separate thresh for second assoc step --- trackers/core/botsort/tracker.py | 78 +++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 26 deletions(-) diff --git a/trackers/core/botsort/tracker.py b/trackers/core/botsort/tracker.py index b7bb99dc..8b09b953 100644 --- a/trackers/core/botsort/tracker.py +++ b/trackers/core/botsort/tracker.py @@ -42,7 +42,10 @@ class BoTSORTTracker(BaseTracker): unmatched before being removed. minimum_consecutive_frames: Track maturity threshold before assigning a permanent ID. - minimum_iou_threshold: Minimum IoU required for a valid match. + minimum_iou_threshold_first_assoc: Minimum IoU required for a valid match + in the first association step + minimum_iou_threshold_second_assoc: Minimum IoU required for a valid match + in the second association step track_activation_threshold: Confidence threshold for spawning a new track. high_conf_det_threshold: Confidence threshold splitting detections into high/low groups. @@ -57,7 +60,8 @@ def __init__( frame_rate: float = 30.0, track_activation_threshold: float = 0.7, minimum_consecutive_frames: int = 2, - minimum_iou_threshold: float = 0.1, + minimum_iou_threshold_first_assoc: float = 0.2, + minimum_iou_threshold_second_assoc: float = 0.5, high_conf_det_threshold: float = 0.6, enable_cmc: bool = True, cmc_method: str = "sparseOptFlow", @@ -76,7 +80,10 @@ def __init__( track. minimum_consecutive_frames: Number of successful updates required before assigning a stable track ID (different than initial -1). - minimum_iou_threshold: Minimum IoU to accept a detection-track association. + minimum_iou_threshold_first_assoc: Minimum IoU to accept a detection-track + association during the first association step. 
+ minimum_iou_threshold_second_assoc: Minimum IoU to accept a detection-track + association during the second association step. high_conf_det_threshold: Confidence threshold used to split detections into: - high confidence: confidence >= threshold - low confidence: confidence < threshold @@ -96,7 +103,8 @@ def __init__( # consistent time-based tracking across different frame rates. self.maximum_frames_without_update = int(frame_rate / 30.0 * lost_track_buffer) self.minimum_consecutive_frames = minimum_consecutive_frames - self.minimum_iou_threshold = minimum_iou_threshold + self.minimum_iou_threshold_first_assoc = minimum_iou_threshold_first_assoc + self.minimum_iou_threshold_second_assoc = minimum_iou_threshold_second_assoc self.track_activation_threshold = track_activation_threshold self.high_conf_det_threshold = high_conf_det_threshold self.tracks: list[BoTSORTKalmanBoxTracker] = [] @@ -211,6 +219,7 @@ def update( self._similarity_step( high_prob_detections, self.tracks, + self.minimum_iou_threshold_first_assoc ) ) @@ -226,7 +235,9 @@ def update( # Step 2: associate Low Probability detections with remaining tracks matched_indices, unmatched_tracks, unmatched_detections = self._similarity_step( - low_prob_detections, remaining_tracks + low_prob_detections, + remaining_tracks, + self.minimum_iou_threshold_second_assoc ) # Update matched tracks with low-confidence detections @@ -263,33 +274,47 @@ def update( if len(final_updated_detections) == 0: final_updated_detections.tracker_id = np.array([], dtype=int) return final_updated_detections - + def _get_high_and_low_probability_detections( - self, detections: sv.Detections - ) -> tuple[sv.Detections, sv.Detections]: + self, detections: sv.Detections + ) -> tuple[sv.Detections, sv.Detections]: """ - Splits the input detections into high-confidence and low-confidence sets - based on the `self.high_conf_det_threshold`. + Split detections into high-confidence and low-confidence sets. 
+ + Detections with confidence <= 0.1 are discarded completely and are not + used by the tracker. + + Rules: + high-confidence: + confidence >= self.high_conf_det_threshold + + low-confidence: + 0.1 < confidence < self.high_conf_det_threshold + + discarded: + confidence <= 0.1 Args: - detections: The input detections with confidence scores. + detections: + Input detections containing confidence scores. Returns: - A tuple containing two `sv.Detections objects`: the first for - high-confidence detections `(confidence >= threshold)` and the second - for low-confidence detections `(confidence < threshold)`. + Tuple: + (high_confidence_detections, low_confidence_detections) """ - # Check if confidence scores exist before comparing - if detections.confidence is not None: - # Perform element-wise comparison if confidence is a NumPy array - condition = detections.confidence >= self.high_conf_det_threshold - else: - # If no confidence scores, no detections meet the threshold - # Create a boolean array of False with the same length as detections - condition = np.zeros(len(detections), dtype=bool) - - high_confidence = detections[condition] - low_confidence = detections[np.logical_not(condition)] + + if detections.confidence is None: + # If no confidence information exists, treat all detections as high-confidence + return detections, detections[:0] + + conf = detections.confidence + + high_mask = conf >= self.high_conf_det_threshold + low_mask = (conf > 0.1) & (conf < self.high_conf_det_threshold) + + high_confidence = detections[high_mask] + low_confidence = detections[low_mask] + return high_confidence, low_confidence def _get_associated_indices( @@ -375,6 +400,7 @@ def _similarity_step( self, detections: sv.Detections, tracks: list[BoTSORTKalmanBoxTracker], + thresh: float ) -> tuple[list[tuple[int, int]], set[int], set[int]]: """Measures similarity based on IoU between tracks and detections and returns the matches and unmatched tracks/detections. 
Is used for step 1 and 2 of the @@ -383,6 +409,7 @@ def _similarity_step( Args: detections: The set of object detections. tracks: The list of tracks that will be matched to the detections. + thresh: Minimum IoU required for a valid match. Returns: A tuple containing: @@ -394,7 +421,6 @@ def _similarity_step( """ # noqa: E501 # Build IoU cost matrix between detections and predicted bounding boxes similarity_matrix = get_iou_matrix(tracks, detections.xyxy) - thresh = self.minimum_iou_threshold # Associate detections to tracks based on the higher value of the # similarity matrix, using the Jonker-Volgenant algorithm From 6db9eded520032161759aeb4f605a6e58082bb96 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 22 Mar 2026 11:03:18 +0000 Subject: [PATCH 09/14] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto?= =?UTF-8?q?=20format=20pre-commit=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- trackers/core/botsort/__init__.py | 2 +- trackers/core/botsort/cmc.py | 169 +++++++++++--------- trackers/core/botsort/kalman_box_tracker.py | 115 +++++++------ trackers/core/botsort/tracker.py | 93 ++++++----- trackers/core/botsort/utils.py | 4 +- 5 files changed, 207 insertions(+), 176 deletions(-) diff --git a/trackers/core/botsort/__init__.py b/trackers/core/botsort/__init__.py index e0bc8c7c..8bae3857 100644 --- a/trackers/core/botsort/__init__.py +++ b/trackers/core/botsort/__init__.py @@ -5,4 +5,4 @@ # ------------------------------------------------------------------------ from .tracker import BoTSORTTracker -__all__ = ["BoTSORTTracker"] \ No newline at end of file +__all__ = ["BoTSORTTracker"] diff --git a/trackers/core/botsort/cmc.py b/trackers/core/botsort/cmc.py index e480226b..8deb976b 100644 --- a/trackers/core/botsort/cmc.py +++ b/trackers/core/botsort/cmc.py @@ -4,41 +4,42 @@ # Licensed under the Apache License, Version 2.0 [see LICENSE for 
details] # ------------------------------------------------------------------------ +import copy from dataclasses import dataclass -from typing import Optional, Literal +from typing import Literal -import copy -import numpy as np import cv2 +import numpy as np CMCTMethod = Literal["orb", "sift", "sparseOptFlow", "ecc"] + @dataclass class CMCConfig: """ Configuration for camera motion compensation (CMC). - The CMC module estimates a global 2D affine transform `H` (2x3) between consecutive - frames. This transform is then applied to predicted track states before data + The CMC module estimates a global 2D affine transform `H` (2x3) between consecutive + frames. This transform is then applied to predicted track states before data association. Attributes: method: Camera motion estimation method. - - "orb": Feature matching using + - "orb": Feature matching using FAST keypoints + ORB descriptors + BFMatcher (Hamming), followed by robust affine estimation (RANSAC). Optionally masks out detection boxes so features are extracted from background. - - "sift": Feature matching using - SIFT keypoints + SIFT descriptors + BFMatcher (L2), - followed by robust affine estimation (RANSAC). + - "sift": Feature matching using + SIFT keypoints + SIFT descriptors + BFMatcher (L2), + followed by robust affine estimation (RANSAC). Optionally masks out detection boxes so features are extracted from - background. "sift" generally produces fewer but more distinctive matches + background. "sift" generally produces fewer but more distinctive matches than ORB at higher compute cost. - "sparseOptFlow": Sparse optical flow using corner tracking: - goodFeaturesToTrack -> calcOpticalFlowPyrLK -> robust affine estimation + goodFeaturesToTrack -> calcOpticalFlowPyrLK -> robust affine estimation (RANSAC). - "ecc": Global image alignment using the Enhanced Correlation Coefficient (ECC) optimization method. 
This estimates a 2D Euclidean transform @@ -54,23 +55,23 @@ class CMCConfig: Behavior: - Frames are resized to (W//downscale, H//downscale) for motion estimation. - The resulting affine translation components H[0,2], H[1,2] are scaled back - by multiplying by `downscale`, so the transform is in original image + by multiplying by `downscale`, so the transform is in original image coordinates. fast_threshold: (ORB only) Threshold for the FAST keypoint detector. - Higher values yield fewer keypoints (more selective); lower values yield + Higher values yield fewer keypoints (more selective); lower values yield more keypoints. ransac_reproj_threshold: (ORB only) RANSAC reprojection threshold in pixels passed to - OpenCV's affine estimation. It controls how far a point is allowed to + OpenCV's affine estimation. It controls how far a point is allowed to deviate from the estimated model while still being counted as an inlier. - Smaller values are stricter (reject more matches); larger values are more + Smaller values are stricter (reject more matches); larger values are more tolerant. max_spatial_distance_frac: - (ORB only) Maximum allowed spatial displacement for a tentative match, + (ORB only) Maximum allowed spatial displacement for a tentative match, expressed as a fraction of (image width, image height) *after downscale*. Example: @@ -78,37 +79,37 @@ class CMCConfig: then a match is rejected if |dx| >= 0.25*W or |dy| >= 0.25*H. Motivation: - Reject obviously incorrect descriptor matches whose displacement is + Reject obviously incorrect descriptor matches whose displacement is implausibly large. roi_min_frac: - (ORB only) Lower bound of the region-of-interest (ROI) used to select - keypoints, expressed as a fraction of frame size. Points outside the ROI + (ORB only) Lower bound of the region-of-interest (ROI) used to select + keypoints, expressed as a fraction of frame size. Points outside the ROI are masked out. 
Example: roi_min_frac=0.02 means we ignore a ~2% border on each side. roi_max_frac: - (ORB only) Upper bound of the ROI used to select keypoints (fraction of + (ORB only) Upper bound of the ROI used to select keypoints (fraction of frame size). Together with roi_min_frac, it defines a central rectangle: [roi_min_frac..roi_max_frac] in both x and y. - sift_n_octave_layers: - (SIFT only) Number of octave layers used by SIFT when constructing the - scale-space pyramid. Increasing this can increase sensitivity to scale + sift_n_octave_layers: + (SIFT only) Number of octave layers used by SIFT when constructing the + scale-space pyramid. Increasing this can increase sensitivity to scale changes, at higher compute cost. - sift_contrast_threshold: - (SIFT only) Threshold controlling how sensitive SIFT is - to low-contrast keypoints. Lower values generally produce more keypoints; + sift_contrast_threshold: + (SIFT only) Threshold controlling how sensitive SIFT is + to low-contrast keypoints. Lower values generally produce more keypoints; higher values are stricter. - sift_edge_threshold: - (SIFT only) Threshold controlling rejection of keypoints on edges. - Lower values reject more edge-like responses; higher values are more + sift_edge_threshold: + (SIFT only) Threshold controlling rejection of keypoints on edges. + Lower values reject more edge-like responses; higher values are more permissive. - + sof_max_corners: (SparseOptFlow only) `maxCorners` passed to `cv2.goodFeaturesToTrack`. Maximum number of corners to detect for tracking. @@ -116,23 +117,23 @@ class CMCConfig: sof_quality_level: (SparseOptFlow only) `qualityLevel` passed to `cv2.goodFeaturesToTrack`. - Minimum accepted quality of corners. A higher value keeps only stronger + Minimum accepted quality of corners. A higher value keeps only stronger corners; a lower value yields more corners (including weaker ones). sof_min_distance: (SparseOptFlow only) `minDistance` passed to `cv2.goodFeaturesToTrack`. 
Minimum Euclidean distance (in pixels) between returned corners. - Higher values produce more spatially spread points; lower values allow + Higher values produce more spatially spread points; lower values allow clustering. sof_block_size: (SparseOptFlow only) `blockSize` passed to `cv2.goodFeaturesToTrack`. - Size of the neighborhood used to compute corner quality (structure tensor + Size of the neighborhood used to compute corner quality (structure tensor window). sof_use_harris: - (SparseOptFlow only) `useHarrisDetector` passed to - `cv2.goodFeaturesToTrack`. If True, uses the Harris corner measure; + (SparseOptFlow only) `useHarrisDetector` passed to + `cv2.goodFeaturesToTrack`. If True, uses the Harris corner measure; if False, uses the Shi-Tomasi measure. sof_k: @@ -152,6 +153,7 @@ class CMCConfig: `findTransformECC`. This can help stabilize optimization on noisy frames. A value of 1 matches the current implementation. """ + method: CMCTMethod = "sparseOptFlow" downscale: int = 2 @@ -179,7 +181,7 @@ class CMCConfig: # ECC parameters - # BoT-SORT's original - resulting in veeery long (=unacceptably long) execution time + # BoT-SORT's original - resulting in veeery long (=unacceptably long) execution time # ecc_number_of_iterations: int = 5000 # ecc_termination_eps: float = 1e-6 @@ -204,11 +206,11 @@ class CMC: Notes: - H maps points from previous frame coordinates to current frame coordinates. - - This class does not perform any drawing/visualization; it only estimates + - This class does not perform any drawing/visualization; it only estimates transforms. """ - def __init__(self, cfg: Optional[CMCConfig] = None) -> None: + def __init__(self, cfg: CMCConfig | None = None) -> None: """ Initialize CMC. 
@@ -274,23 +276,24 @@ def reset(self) -> None: # ORB state self._prev_kps = None - self._prev_desc: Optional[np.ndarray] = None + self._prev_desc: np.ndarray | None = None # SparseOptFlow state - self._prev_frame_gray: Optional[np.ndarray] = None + self._prev_frame_gray: np.ndarray | None = None # shape (N,1,2) from goodFeaturesToTrack - self._prev_points: Optional[np.ndarray] = None + self._prev_points: np.ndarray | None = None - def estimate(self, frame_bgr: np.ndarray, - dets_xyxy: Optional[np.ndarray] = None) -> np.ndarray: + def estimate( + self, frame_bgr: np.ndarray, dets_xyxy: np.ndarray | None = None + ) -> np.ndarray: """ Estimate global affine transform H (2x3) from previous frame to current frame. Args: frame_bgr: Current frame in BGR format (uint8), shape (H, W, 3). dets_xyxy: Optional detections (N,4) in xyxy format, in original image - scale. Used by feature-based methods (ORB and SIFT) to mask out object + scale. Used by feature-based methods (ORB and SIFT) to mask out object regions during motion estimation. Returns: @@ -312,10 +315,11 @@ def estimate(self, frame_bgr: np.ndarray, # fallback return np.eye(2, 3, dtype=np.float32) - def _estimate_feature_affine(self, frame_bgr: np.ndarray, - dets_xyxy: Optional[np.ndarray] = None) -> np.ndarray: + def _estimate_feature_affine( + self, frame_bgr: np.ndarray, dets_xyxy: np.ndarray | None = None + ) -> np.ndarray: """ - Feature affine estimation. ORB-based or SIFT-based + Feature affine estimation. 
ORB-based or SIFT-based (different initializations of self.detector, self.extractor and self.matcher for ORB and SIFT) @@ -388,8 +392,9 @@ def _estimate_feature_affine(self, frame_bgr: np.ndarray, self._prev_desc = copy.copy(desc) return H_aff - max_spatial = self.cfg.max_spatial_distance_frac * np.array([W, H], - dtype=np.float32) + max_spatial = self.cfg.max_spatial_distance_frac * np.array( + [W, H], dtype=np.float32 + ) prev_pts = [] curr_pts = [] @@ -461,8 +466,9 @@ def _estimate_sparse_optflow(self, frame_bgr: np.ndarray) -> np.ndarray: # Downscale if self.downscale > 1: - frame = cv2.resize(frame, (W_img // self.downscale, - H_img // self.downscale)) + frame = cv2.resize( + frame, (W_img // self.downscale, H_img // self.downscale) + ) # Find keypoints in current frame keypoints = cv2.goodFeaturesToTrack(frame, mask=None, **self.feature_params) @@ -475,15 +481,20 @@ def _estimate_sparse_optflow(self, frame_bgr: np.ndarray) -> np.ndarray: return H_aff # If we don't have points, re-init - if self._prev_frame_gray is None or self._prev_points is None or keypoints is None: + if ( + self._prev_frame_gray is None + or self._prev_points is None + or keypoints is None + ): self._prev_frame_gray = frame.copy() self._prev_points = copy.copy(keypoints) return H_aff # Optical flow correspondences # calcOpticalFlowPyrLK will throw or return nonsense if we give it None - matched, status, _err = cv2.calcOpticalFlowPyrLK(self._prev_frame_gray, frame, - self._prev_points, None) + matched, status, _err = cv2.calcOpticalFlowPyrLK( + self._prev_frame_gray, frame, self._prev_points, None + ) if status is None or matched is None: self._prev_frame_gray = frame.copy() @@ -505,7 +516,9 @@ def _estimate_sparse_optflow(self, frame_bgr: np.ndarray) -> np.ndarray: curr_pts = np.array(curr_pts) # Find rigid matrix - if (np.size(prev_pts, 0) > 4) and (np.size(prev_pts, 0) == np.size(curr_pts, 0)): + if (np.size(prev_pts, 0) > 4) and ( + np.size(prev_pts, 0) == np.size(curr_pts, 0) + ): 
H_est, _ = cv2.estimateAffinePartial2D(prev_pts, curr_pts, cv2.RANSAC) if H_est is not None: H_aff = H_est.astype(np.float32) @@ -515,7 +528,7 @@ def _estimate_sparse_optflow(self, frame_bgr: np.ndarray) -> np.ndarray: H_aff[0, 2] *= self.downscale H_aff[1, 2] *= self.downscale else: - print('Warning: not enough matching points') + print("Warning: not enough matching points") # Store to next iteration self._prev_frame_gray = frame.copy() @@ -523,7 +536,6 @@ def _estimate_sparse_optflow(self, frame_bgr: np.ndarray) -> np.ndarray: self._prev_points = None if keypoints is None else keypoints.copy() return H_aff - def _estimate_ecc(self, frame_bgr: np.ndarray) -> np.ndarray: """ @@ -537,7 +549,7 @@ def _estimate_ecc(self, frame_bgr: np.ndarray) -> np.ndarray: 1) Convert the current frame to grayscale. 2) Optionally smooth and downscale the frame. 3) If this is the first frame, store it and return identity. - 4) Optimize a 2x3 warp matrix aligning the previous frame to the current + 4) Optimize a 2x3 warp matrix aligning the previous frame to the current frame. 5) If optimization succeeds, return the estimated transform. Otherwise, keep the identity transform. @@ -550,8 +562,8 @@ def _estimate_ecc(self, frame_bgr: np.ndarray) -> np.ndarray: Returns: H: Affine transform matrix of shape (2, 3), dtype float32, mapping - previous-frame coordinates to current-frame coordinates. Returns - identity if initialization has not yet occurred or if ECC optimization + previous-frame coordinates to current-frame coordinates. Returns + identity if initialization has not yet occurred or if ECC optimization fails. 
""" H_img, W_img = frame_bgr.shape[:2] @@ -561,7 +573,9 @@ def _estimate_ecc(self, frame_bgr: np.ndarray) -> np.ndarray: if self.downscale > 1: frame = cv2.GaussianBlur(frame, (3, 3), 1.5) - frame = cv2.resize(frame, (W_img // self.downscale, H_img // self.downscale)) + frame = cv2.resize( + frame, (W_img // self.downscale, H_img // self.downscale) + ) if not self._initialized: self._prev_frame_gray = frame.copy() @@ -584,23 +598,22 @@ def _estimate_ecc(self, frame_bgr: np.ndarray) -> np.ndarray: ) if H_est is not None: H_aff = H_est.astype(np.float32) - except cv2.error as e: - print('Warning: find transform failed. Set warp as identity') + except cv2.error: + print("Warning: find transform failed. Set warp as identity") pass # NOTE: this line is not included in the original BoT-SORT. However, - # in a working recurrent estimator, you do need to update the previous frame - # after each call. Otherwise the next call would keep aligning against an old + # in a working recurrent estimator, you do need to update the previous frame + # after each call. Otherwise the next call would keep aligning against an old # frame. self._prev_frame_gray = frame.copy() - - return H_aff + return H_aff @staticmethod def apply_to_tracks(tracks: list, H: np.ndarray) -> None: """ - Apply a global affine motion transform to tracker states and covariances + Apply a global affine motion transform to tracker states and covariances in-place. This method updates each track according to the affine transform @@ -609,7 +622,7 @@ def apply_to_tracks(tracks: list, H: np.ndarray) -> None: where: R: - 2x2 linear part of the affine transform (rotation / shear / scale-like + 2x2 linear part of the affine transform (rotation / shear / scale-like part). t: 2D translation vector. @@ -636,7 +649,7 @@ def apply_to_tracks(tracks: list, H: np.ndarray) -> None: Velocities of the width and height. 
State update logic: - The affine transform is applied only to the geometric quantities that live + The affine transform is applied only to the geometric quantities that live in the 2D image plane as position or velocity vectors: 1) Center position: @@ -651,13 +664,13 @@ def apply_to_tracks(tracks: list, H: np.ndarray) -> None: Why width and height are not transformed here: Width and height are scalar box dimensions, not 2D point coordinates. In this implementation, camera motion compensation is used to correct the - object center location and its image-plane velocity, while the box size - terms are left unchanged. This keeps the compensation simple and consistent + object center location and its image-plane velocity, while the box size + terms are left unchanged. This keeps the compensation simple and consistent with the state representation used by the tracker. Covariance update: Each track also stores a covariance matrix `P` describing uncertainty in the - 8D Kalman state. After the mean state is transformed, the covariance is + 8D Kalman state. After the mean state is transformed, the covariance is updated using the linear transform P = A @ P @ A.T @@ -688,10 +701,10 @@ def apply_to_tracks(tracks: list, H: np.ndarray) -> None: Notes: - If `H` is None or `tracks` is empty, this method does nothing. - - The method assumes that `H` has already been estimated in image + - The method assumes that `H` has already been estimated in image coordinates onsistent with the tracker state. - This method does not perform any validity checks on whether the estimated - transform is physically plausible; it simply applies the provided + transform is physically plausible; it simply applies the provided transform. """ if H is None or len(tracks) == 0: @@ -715,8 +728,8 @@ def apply_to_tracks(tracks: list, H: np.ndarray) -> None: # Update the state covariance under the corresponding linear transform. 
A = np.eye(8, dtype=np.float32) - A[0:2, 0:2] = R # center position - A[4:6, 4:6] = R # center velocity + A[0:2, 0:2] = R # center position + A[4:6, 4:6] = R # center velocity # Box size terms (w, h, vw, vh) are not transformed in this implementation. - trk.P = (A @ trk.P @ A.T).astype(np.float32) \ No newline at end of file + trk.P = (A @ trk.P @ A.T).astype(np.float32) diff --git a/trackers/core/botsort/kalman_box_tracker.py b/trackers/core/botsort/kalman_box_tracker.py index 91b1a409..c5d53769 100644 --- a/trackers/core/botsort/kalman_box_tracker.py +++ b/trackers/core/botsort/kalman_box_tracker.py @@ -162,7 +162,7 @@ def _initialize_kalman_filter(self, measurement: np.ndarray) -> None: This method initializes the following matrices: State transition matrix: - F is an 8x8 matrix defining how the state evolves from one frame to the + F is an 8x8 matrix defining how the state evolves from one frame to the next. It implements a constant-velocity model: xc <- xc + vxc yc <- yc + vyc @@ -209,49 +209,58 @@ def _initialize_kalman_filter(self, measurement: np.ndarray) -> None: self.H = np.eye(4, 8, dtype=np.float32) - # BoT-SORT-style scale-aware noise using width/height. + # BoT-SORT-style scale-aware noise using width/height. 
sigma_p = 0.05 sigma_v = 0.00625 sigma_m = 0.05 w, h = measurement[2], measurement[3] - q_diag = np.array([ - (sigma_p * w) ** 2, - (sigma_p * h) ** 2, - (sigma_p * w) ** 2, - (sigma_p * h) ** 2, - (sigma_v * w) ** 2, - (sigma_v * h) ** 2, - (sigma_v * w) ** 2, - (sigma_v * h) ** 2, - ], dtype=np.float32) + q_diag = np.array( + [ + (sigma_p * w) ** 2, + (sigma_p * h) ** 2, + (sigma_p * w) ** 2, + (sigma_p * h) ** 2, + (sigma_v * w) ** 2, + (sigma_v * h) ** 2, + (sigma_v * w) ** 2, + (sigma_v * h) ** 2, + ], + dtype=np.float32, + ) self.Q = np.diag(q_diag) - r_diag = np.array([ - (sigma_m * w) ** 2, - (sigma_m * h) ** 2, - (sigma_m * w) ** 2, - (sigma_m * h) ** 2, - ], dtype=np.float32) + r_diag = np.array( + [ + (sigma_m * w) ** 2, + (sigma_m * h) ** 2, + (sigma_m * w) ** 2, + (sigma_m * h) ** 2, + ], + dtype=np.float32, + ) self.R = np.diag(r_diag) # Initial covariance, as in original BoT-SORT KF - p_diag = np.array([ - (2 * sigma_p * w) ** 2, - (2 * sigma_p * h) ** 2, - (2 * sigma_p * w) ** 2, - (2 * sigma_p * h) ** 2, - (10 * sigma_v * w) ** 2, - (10 * sigma_v * h) ** 2, - (10 * sigma_v * w) ** 2, - (10 * sigma_v * h) ** 2, - ], dtype=np.float32) + p_diag = np.array( + [ + (2 * sigma_p * w) ** 2, + (2 * sigma_p * h) ** 2, + (2 * sigma_p * w) ** 2, + (2 * sigma_p * h) ** 2, + (10 * sigma_v * w) ** 2, + (10 * sigma_v * h) ** 2, + (10 * sigma_v * w) ** 2, + (10 * sigma_v * h) ** 2, + ], + dtype=np.float32, + ) self.P = np.diag(p_diag) def _update_process_and_measurement_noise(self) -> None: """ - Recompute the process and measurement noise covariances from the current box + Recompute the process and measurement noise covariances from the current box size. This method updates: @@ -267,13 +276,13 @@ def _update_process_and_measurement_noise(self) -> None: Why this update is needed: The scale of the uncertainty should depend on the current object size. For example, a 2-pixel error is relatively more important for a small object - than for a large one. 
Therefore, the diagonal entries of Q and R are + than for a large one. Therefore, the diagonal entries of Q and R are computed from the current predicted width and height stored in the state. Implementation details: - Width and height are read from the current state: w = state[2], h = state[3] - - They are clamped to a small positive minimum to avoid zero or negative + - They are clamped to a small positive minimum to avoid zero or negative values. - The resulting Q and R matrices remain diagonal. @@ -288,24 +297,30 @@ def _update_process_and_measurement_noise(self) -> None: w = max(float(self.state[2, 0]), 1e-3) h = max(float(self.state[3, 0]), 1e-3) - q_diag = np.array([ - (sigma_p * w) ** 2, - (sigma_p * h) ** 2, - (sigma_p * w) ** 2, - (sigma_p * h) ** 2, - (sigma_v * w) ** 2, - (sigma_v * h) ** 2, - (sigma_v * w) ** 2, - (sigma_v * h) ** 2, - ], dtype=np.float32) + q_diag = np.array( + [ + (sigma_p * w) ** 2, + (sigma_p * h) ** 2, + (sigma_p * w) ** 2, + (sigma_p * h) ** 2, + (sigma_v * w) ** 2, + (sigma_v * h) ** 2, + (sigma_v * w) ** 2, + (sigma_v * h) ** 2, + ], + dtype=np.float32, + ) self.Q = np.diag(q_diag) - r_diag = np.array([ - (sigma_m * w) ** 2, - (sigma_m * h) ** 2, - (sigma_m * w) ** 2, - (sigma_m * h) ** 2, - ], dtype=np.float32) + r_diag = np.array( + [ + (sigma_m * w) ** 2, + (sigma_m * h) ** 2, + (sigma_m * w) ** 2, + (sigma_m * h) ** 2, + ], + dtype=np.float32, + ) self.R = np.diag(r_diag) def predict(self) -> None: @@ -326,9 +341,9 @@ def predict(self) -> None: Process noise covariance. Effect of the prediction: - - The center position and box size are advanced using their current + - The center position and box size are advanced using their current velocities. - - The covariance matrix P is propagated forward and increased by Q to + - The covariance matrix P is propagated forward and increased by Q to reflect additional uncertainty introduced during motion prediction. 
Additional behavior: @@ -431,4 +446,4 @@ def get_state_bbox(self) -> np.ndarray: """ Return current predicted box in xyxy format. """ - return self.xywh_to_xyxy(self.state[0:4, 0]) \ No newline at end of file + return self.xywh_to_xyxy(self.state[0:4, 0]) diff --git a/trackers/core/botsort/tracker.py b/trackers/core/botsort/tracker.py index 8b09b953..f4d35842 100644 --- a/trackers/core/botsort/tracker.py +++ b/trackers/core/botsort/tracker.py @@ -12,18 +12,19 @@ from scipy.optimize import linear_sum_assignment from trackers.core.base import BaseTracker +from trackers.core.botsort.cmc import CMC, CMCConfig from trackers.core.botsort.kalman_box_tracker import BoTSORTKalmanBoxTracker from trackers.core.botsort.utils import ( get_alive_trackers, get_iou_matrix, ) -from trackers.core.botsort.cmc import CMC, CMCConfig + class BoTSORTTracker(BaseTracker): """ BoT-SORT-style multi-object tracker (IoU association + optional CMC). - The tracker maintains a list of active tracks (Kalman-filter-based) and, for each + The tracker maintains a list of active tracks (Kalman-filter-based) and, for each frame, performs: 1) Predict existing track states (Kalman predict) 2) Split detections into high/low confidence groups @@ -33,23 +34,23 @@ class BoTSORTTracker(BaseTracker): 6) Spawn new tracks from unmatched high-confidence detections 7) Remove tracks that have been lost for too long - Parameters in __init__ control thresholds and lifecycle logic similarly to + Parameters in __init__ control thresholds and lifecycle logic similarly to ByteTrack. Attributes: tracks: List of active `BoTSORTKalmanBoxTracker` objects. - maximum_frames_without_update: Max number of consecutive frames a track can go + maximum_frames_without_update: Max number of consecutive frames a track can go unmatched before being removed. - minimum_consecutive_frames: Track maturity threshold before assigning a + minimum_consecutive_frames: Track maturity threshold before assigning a permanent ID. 
minimum_iou_threshold_first_assoc: Minimum IoU required for a valid match in the first association step minimum_iou_threshold_second_assoc: Minimum IoU required for a valid match in the second association step track_activation_threshold: Confidence threshold for spawning a new track. - high_conf_det_threshold: Confidence threshold splitting detections into + high_conf_det_threshold: Confidence threshold splitting detections into high/low groups. - enable_cmc: Whether to run camera motion compensation each frame + enable_cmc: Whether to run camera motion compensation each frame (if `cmc` is set). cmc: Camera motion compensation instance (or None if disabled). """ @@ -66,7 +67,6 @@ def __init__( enable_cmc: bool = True, cmc_method: str = "sparseOptFlow", cmc_downscale: int = 2, - ) -> None: """ Initialize the tracker. @@ -74,22 +74,22 @@ def __init__( Args: lost_track_buffer: Time buffer (in frames at 30 FPS) for keeping lost tracks alive before deletion. This is scaled by `frame_rate`. - frame_rate: Video frame rate used to scale the lost track buffer to + frame_rate: Video frame rate used to scale the lost track buffer to time-like behavior. - track_activation_threshold: Minimum detection confidence to spawn a new + track_activation_threshold: Minimum detection confidence to spawn a new track. - minimum_consecutive_frames: Number of successful updates required before + minimum_consecutive_frames: Number of successful updates required before assigning a stable track ID (different than initial -1). - minimum_iou_threshold_first_assoc: Minimum IoU to accept a detection-track + minimum_iou_threshold_first_assoc: Minimum IoU to accept a detection-track association during the first association step. - minimum_iou_threshold_second_assoc: Minimum IoU to accept a detection-track + minimum_iou_threshold_second_assoc: Minimum IoU to accept a detection-track association during the second association step. 
high_conf_det_threshold: Confidence threshold used to split detections into: - high confidence: confidence >= threshold - low confidence: confidence < threshold enable_cmc: Whether to enable camera motion compensation (CMC). - cmc_method: CMC method string passed into `CMCConfig(method=...)`. - Supported values depend on `CMC` (e.g. "orb", "sift", "sparseOptFlow", + cmc_method: CMC method string passed into `CMCConfig(method=...)`. + Supported values depend on `CMC` (e.g. "orb", "sift", "sparseOptFlow", "ecc"). See CMCConfig. cmc_downscale: Downscale factor used inside CMC for speed/robustness. @@ -110,8 +110,11 @@ def __init__( self.tracks: list[BoTSORTKalmanBoxTracker] = [] self.enable_cmc = enable_cmc - self.cmc = CMC(CMCConfig(method=cmc_method, - downscale=cmc_downscale)) if enable_cmc else None + self.cmc = ( + CMC(CMCConfig(method=cmc_method, downscale=cmc_downscale)) + if enable_cmc + else None + ) def _update_detections( self, @@ -125,16 +128,16 @@ def _update_detections( For each (track_idx, det_idx) match: - Update the track's Kalman state with the detection bbox. - - If the track is “mature” (>= minimum_consecutive_frames) and still has + - If the track is “mature” (>= minimum_consecutive_frames) and still has tracker_id == -1, assign a new unique tracker ID. - - Create a single-row `sv.Detections` object for the matched detection and set + - Create a single-row `sv.Detections` object for the matched detection and set its tracker_id to the track ID (or -1 if not mature yet). - Append it to `updated_detections`. Args: tracks: Tracks being updated. detections: Detections used for update. - updated_detections: Accumulator list of per-detection outputs for this + updated_detections: Accumulator list of per-detection outputs for this frame. matched_indices: List of (track_row_index, detection_col_index) pairs. @@ -172,9 +175,9 @@ def update( Args: detections: Supervision detections for the current frame. Must include ` - .xyxy`. 
Confidence (`detections.confidence`) is optional but + .xyxy`. Confidence (`detections.confidence`) is optional but recommended. The method writes/overwrites `detections.tracker_id`. - frame: Current video frame in BGR format (H, W, 3), required if CMC is + frame: Current video frame in BGR format (H, W, 3), required if CMC is enabled. Returns: @@ -185,8 +188,8 @@ def update( mature) Notes: - - If CMC is enabled, the tracker estimates a global affine transform (2x3) - from the frame and uses it to warp predicted track states before + - If CMC is enabled, the tracker estimates a global affine transform (2x3) + from the frame and uses it to warp predicted track states before association. """ if len(self.tracks) == 0 and len(detections) == 0: @@ -194,23 +197,25 @@ def update( return detections updated_detections: list[ sv.Detections - ] = [] # List for returning the updated detections with its new assigned - # track id # noqa: E501 + ] = [] # List for returning the updated detections with its new assigned + # track id # Predict new locations for existing tracks for tracker in self.tracks: tracker.predict() # Assign a default tracker_id with the correct shape detections.tracker_id = -np.ones(len(detections)) - # Split into high confidence boxes and lower based on - # self.high_conf_det_threshold # noqa: E501 + # Split into high confidence boxes and lower based on + # self.high_conf_det_threshold high_prob_detections, low_prob_detections = ( self._get_high_and_low_probability_detections(detections) ) # CMC (ORB) apply to all predicted tracks before association if self.enable_cmc and self.cmc is not None and frame is not None: - mask_boxes = high_prob_detections.xyxy if len(high_prob_detections) > 0 else None + mask_boxes = ( + high_prob_detections.xyxy if len(high_prob_detections) > 0 else None + ) H = self.cmc.estimate(frame, mask_boxes) self.cmc.apply_to_tracks(self.tracks, H) @@ -219,7 +224,7 @@ def update( self._similarity_step( high_prob_detections, self.tracks, - 
self.minimum_iou_threshold_first_assoc + self.minimum_iou_threshold_first_assoc, ) ) @@ -235,9 +240,9 @@ def update( # Step 2: associate Low Probability detections with remaining tracks matched_indices, unmatched_tracks, unmatched_detections = self._similarity_step( - low_prob_detections, - remaining_tracks, - self.minimum_iou_threshold_second_assoc + low_prob_detections, + remaining_tracks, + self.minimum_iou_threshold_second_assoc, ) # Update matched tracks with low-confidence detections @@ -274,10 +279,10 @@ def update( if len(final_updated_detections) == 0: final_updated_detections.tracker_id = np.array([], dtype=int) return final_updated_detections - + def _get_high_and_low_probability_detections( - self, detections: sv.Detections - ) -> tuple[sv.Detections, sv.Detections]: + self, detections: sv.Detections + ) -> tuple[sv.Detections, sv.Detections]: """ Split detections into high-confidence and low-confidence sets. @@ -329,14 +334,14 @@ def _get_associated_indices( assignment problem in an optimal way. Args: - similarity_matrix: Similarity matrix between tracks (rows) and detections - (columns). min_similarity_thresh: Minimum similarity threshold for a valid + similarity_matrix: Similarity matrix between tracks (rows) and detections + (columns). min_similarity_thresh: Minimum similarity threshold for a valid match. Returns: Matched indices (list of (tracker_idx, detection_idx)), indices of unmatched tracks, indices of unmatched detections. 
- """ # noqa: E501 + """ matched_indices = [] n_tracks, n_detections = similarity_matrix.shape unmatched_tracks = set(range(n_tracks)) @@ -400,9 +405,9 @@ def _similarity_step( self, detections: sv.Detections, tracks: list[BoTSORTKalmanBoxTracker], - thresh: float + thresh: float, ) -> tuple[list[tuple[int, int]], set[int], set[int]]: - """Measures similarity based on IoU between tracks and detections and returns + """Measures similarity based on IoU between tracks and detections and returns the matches and unmatched tracks/detections. Is used for step 1 and 2 of the BYTE algorithm. @@ -418,13 +423,13 @@ def _similarity_step( were not matched. - unmatched_detections_indices: A set of indices for detections that were not matched. - """ # noqa: E501 + """ # Build IoU cost matrix between detections and predicted bounding boxes similarity_matrix = get_iou_matrix(tracks, detections.xyxy) # Associate detections to tracks based on the higher value of the - # similarity matrix, using the Jonker-Volgenant algorithm - # (linear_sum_assignment). # noqa: E501 + # similarity matrix, using the Jonker-Volgenant algorithm + # (linear_sum_assignment). 
matched_indices, unmatched_tracks, unmatched_detections = ( self._get_associated_indices(similarity_matrix, thresh) ) diff --git a/trackers/core/botsort/utils.py b/trackers/core/botsort/utils.py index 078542e4..3f4fdf23 100644 --- a/trackers/core/botsort/utils.py +++ b/trackers/core/botsort/utils.py @@ -13,9 +13,7 @@ from trackers.core.botsort.kalman_box_tracker import BoTSORTKalmanBoxTracker -KalmanBoxTrackerType = TypeVar( - "KalmanBoxTrackerType", bound=BoTSORTKalmanBoxTracker -) +KalmanBoxTrackerType = TypeVar("KalmanBoxTrackerType", bound=BoTSORTKalmanBoxTracker) BoTSORTKalmanBoxTracker From 3e1c18f4d09da28d5a676f14e275c95aa9a85e6e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 25 Mar 2026 15:04:00 +0100 Subject: [PATCH 10/14] =?UTF-8?q?chore(pre=5Fcommit):=20=E2=AC=86=20pre=5F?= =?UTF-8?q?commit=20autoupdate=20(#333)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.15.6 → v0.15.7](https://github.com/astral-sh/ruff-pre-commit/compare/v0.15.6...v0.15.7) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0168ef5f..73b34531 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -34,7 +34,7 @@ repos: - id: mixed-line-ending - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.15.6 + rev: v0.15.7 hooks: - id: ruff-check args: [--fix] From a77fbbdcf168544d7fa5543d1e6097700cd694b7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 25 Mar 2026 15:09:34 +0100 Subject: [PATCH 11/14] :arrow_up: Bump uv from 0.10.10 to 0.10.12 (#332) Bumps [uv](https://github.com/astral-sh/uv) from 0.10.10 to 0.10.12. 
- [Release notes](https://github.com/astral-sh/uv/releases) - [Changelog](https://github.com/astral-sh/uv/blob/main/CHANGELOG.md) - [Commits](https://github.com/astral-sh/uv/compare/0.10.10...0.10.12) --- updated-dependencies: - dependency-name: uv dependency-version: 0.10.12 dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Piotr Skalski --- uv.lock | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/uv.lock b/uv.lock index e93d28d6..4a8bb04d 100644 --- a/uv.lock +++ b/uv.lock @@ -3957,28 +3957,28 @@ wheels = [ [[package]] name = "uv" -version = "0.10.10" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/77/22/21476e738938bbb36fa0029d369c6989ade90039110a7013a24f4c6211c0/uv-0.10.10.tar.gz", hash = "sha256:266b24bf85aa021af37d3fb22d84ef40746bc4da402e737e365b12badff60e89", size = 3976117, upload-time = "2026-03-13T20:04:44.335Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7a/2b/2cbc9ebc53dc84ad698c31583735605eb55627109af59d9d3424eb824935/uv-0.10.10-py3-none-linux_armv6l.whl", hash = "sha256:2c89017c0532224dc1ec6f3be1bc4ec3d8c3f291c23a229e8a40e3cc5828f599", size = 22712805, upload-time = "2026-03-13T20:03:36.034Z" }, - { url = "https://files.pythonhosted.org/packages/14/44/4e8db982a986a08808cc5236e73c12bd6619823b3be41c9d6322d4746ebd/uv-0.10.10-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:ee47b5bc1b8ccd246a3801611b2b71c8107db3a2b528e64463d737fd8e4f2798", size = 21857826, upload-time = "2026-03-13T20:03:52.852Z" }, - { url = "https://files.pythonhosted.org/packages/6f/98/aca12549cafc4c0346b04f8fed7f7ee3bfc2231b45b7e59d062d5b519746/uv-0.10.10-py3-none-macosx_11_0_arm64.whl", hash = "sha256:009a4c534e83bada52c8e2cccea6250e3486d01d609e4eb874cd302e2e534269", size = 
20381437, upload-time = "2026-03-13T20:04:00.735Z" }, - { url = "https://files.pythonhosted.org/packages/93/c4/f3f832e4871b2bb86423c4cdbbd40b10c835a426449e86951f992d63120a/uv-0.10.10-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:5dd85cc8ff9fa967c02c3edbf2b77d54b56bedcb56b323edec0df101f37f26e2", size = 22334006, upload-time = "2026-03-13T20:04:32.887Z" }, - { url = "https://files.pythonhosted.org/packages/75/e1/852d1eb2630410f465287e858c93b2f2c81b668b7fa63c3f05356896706d/uv-0.10.10-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.musllinux_1_1_armv7l.whl", hash = "sha256:49235f8a745ef10eea24b2f07be1ee77da056792cef897630b78c391c5f1e2e4", size = 22303994, upload-time = "2026-03-13T20:04:04.849Z" }, - { url = "https://files.pythonhosted.org/packages/3f/39/1678ed510b7ee6d68048460c428ca26d57cc798ca34d4775e113e7801144/uv-0.10.10-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f97709570158efc87d52ddca90f2c96293eea382d81be295b1fd7088153d6a83", size = 22301619, upload-time = "2026-03-13T20:03:40.56Z" }, - { url = "https://files.pythonhosted.org/packages/81/2f/e4137b7f3f07c0cc1597b49c341b30f09cea13dbe57cd83ad14f5839dfff/uv-0.10.10-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9c863fb46a62f3c8a1b7bc1520b0939c05cf4fab06e7233fc48ed17538e6601e", size = 23669879, upload-time = "2026-03-13T20:04:20.356Z" }, - { url = "https://files.pythonhosted.org/packages/ff/11/44f7f067b7dcfc57e21500918a50e0f2d56b23acdc9b2148dbd4d07b5078/uv-0.10.10-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8f56734baf7a8bd616da69cd7effe1a237c2cb364ec4feefe6a4b180f1cf5ec2", size = 24480854, upload-time = "2026-03-13T20:03:31.645Z" }, - { url = "https://files.pythonhosted.org/packages/9c/b5/d2bed329892b5298c493709bc851346d9750bafed51f8ba2b31e7d3ae0cc/uv-0.10.10-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:1085cc907a1315002015bc218cc88e42c5171a03a705421341cdb420400ee2f3", size = 23677933, upload-time = "2026-03-13T20:03:57.052Z" }, - { url = "https://files.pythonhosted.org/packages/02/95/84166104b968c02c2bb54c32082d702d29beb24384fb3f13ade0cb2456fb/uv-0.10.10-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e42e9e4a196ef75d1089715574eb1fe9bb62d390da05c6c8b36650a4de23d59f", size = 23473055, upload-time = "2026-03-13T20:03:48.648Z" }, - { url = "https://files.pythonhosted.org/packages/b9/b6/9cc6e5442e3734615b5dbf45dcacf94cd46a05b1d04066cbdb992701e6bf/uv-0.10.10-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:fbd827042dbdcadeb5e3418bee73ded9feb5ead8edac23e6e1b5dadb5a90f8b2", size = 22403569, upload-time = "2026-03-13T20:04:08.514Z" }, - { url = "https://files.pythonhosted.org/packages/cf/8c/2e0a3690603e86f8470bae3a27896a9f8b56677b5cd337d131c4d594e0dc/uv-0.10.10-py3-none-manylinux_2_31_riscv64.musllinux_1_1_riscv64.whl", hash = "sha256:41a3cc94e0c43070e48a521b6b26156ffde1cdc2088339891aa35eb2245ac5cf", size = 23309789, upload-time = "2026-03-13T20:03:44.764Z" }, - { url = "https://files.pythonhosted.org/packages/24/e5/5af4d7426e39d7a7a751f8d1a7646d04e042a3c2c2c6aeb9d940ddc34df0/uv-0.10.10-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:8a59c80ade3aa20baf9ec5d17b6449f4fdba9212f6e3d1bdf2a6db94cbc64c21", size = 23329370, upload-time = "2026-03-13T20:04:24.525Z" }, - { url = "https://files.pythonhosted.org/packages/3a/10/94b773933cd2e39aa9768dd11f85f32844e4dcb687c6df0714dfb3c0234a/uv-0.10.10-py3-none-musllinux_1_1_i686.whl", hash = "sha256:e77e52ba74e0085a1c03a16611146c6f813034787f83a2fd260cdc8357e18d2d", size = 22818945, upload-time = "2026-03-13T20:04:29.064Z" }, - { url = "https://files.pythonhosted.org/packages/85/71/6fb74f35ef3afdb6b3f77e35a29a571a5c789e89d97ec5cb7fd1285eb48e/uv-0.10.10-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:4f9fd7f62df91c2d91c02e2039d4c5bad825077d04ebd27af8ea35a8cc736daf", size = 23667652, upload-time 
= "2026-03-13T20:04:41.239Z" }, - { url = "https://files.pythonhosted.org/packages/df/7b/3042f2fb5bf7288cbe7f954ca64badb1243bbac207c0119b4a2cef561564/uv-0.10.10-py3-none-win32.whl", hash = "sha256:52e8b70a4fd7a734833c6a55714b679a10b29cf69b2e663e657df1995cf11c6a", size = 21778937, upload-time = "2026-03-13T20:04:37.11Z" }, - { url = "https://files.pythonhosted.org/packages/89/c8/d314c4aab369aa105959a6b266e3e082a1252b8517564ea7a28b439726a2/uv-0.10.10-py3-none-win_amd64.whl", hash = "sha256:3da90c197e8e9f5d49862556fa9f4a9dd5b8617c0bbcc88585664e777209a315", size = 24176234, upload-time = "2026-03-13T20:04:16.406Z" }, - { url = "https://files.pythonhosted.org/packages/e8/89/ea5852f4dadf01d6490131e5be88b2e12ea85b9cd5ffdc2efc933a3b6892/uv-0.10.10-py3-none-win_arm64.whl", hash = "sha256:3873b965d62b282ab51e328f4b15a760b32b11a7231dc3fe658fa11d98f20136", size = 22561685, upload-time = "2026-03-13T20:04:12.36Z" }, +version = "0.10.12" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8d/b7/6a27678654caa7f2240d9c5be9bd032bfff90a58858f0078575e7a9b6d9f/uv-0.10.12.tar.gz", hash = "sha256:fa722691c7ae5c023778ad0b040ab8619367bcfe44fd0d9e05a58751af86cdf8", size = 3988720, upload-time = "2026-03-19T21:50:41.015Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fe/aa/dde1b7300f8e924606ab0fe192aa25ca79736c5883ee40310ba8a5b34042/uv-0.10.12-py3-none-linux_armv6l.whl", hash = "sha256:7099bdefffbe2df81accad52579657b8f9f870170caa779049c9fd82d645c9b3", size = 22662810, upload-time = "2026-03-19T21:50:43.108Z" }, + { url = "https://files.pythonhosted.org/packages/5c/90/4fd10d7337a084847403cdbff288395a6a12adbaaac975943df4f46c2d31/uv-0.10.12-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:e0f0ef58f0ba6fbfaf5f91b67aad6852252c49b8f78015a2a5800cf74c7538d5", size = 21852701, upload-time = "2026-03-19T21:51:06.216Z" }, + { url = 
"https://files.pythonhosted.org/packages/ce/db/c41ace81b8ef5d5952433df38e321c0b6e5f88ce210c508b14f84817963f/uv-0.10.12-py3-none-macosx_11_0_arm64.whl", hash = "sha256:551f799d53e397843b6cde7e3c61de716fb487da512a21a954b7d0cbc06967e0", size = 20454594, upload-time = "2026-03-19T21:50:53.693Z" }, + { url = "https://files.pythonhosted.org/packages/5d/07/a990708c5ba064b4eb1a289f1e9c484ebf5c1a0ea8cad049c86625f3b467/uv-0.10.12-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:a5afe619e8a861fe4d49df8e10d2c6963de0dac6b79350c4832bf3366c8496cf", size = 22212546, upload-time = "2026-03-19T21:51:08.76Z" }, + { url = "https://files.pythonhosted.org/packages/b7/26/7f5ac4af027846c24bd7bf0edbd48b805f9e7daec145c62c632b5ce94e5f/uv-0.10.12-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.musllinux_1_1_armv7l.whl", hash = "sha256:8dc352c93a47a4760cf824c31c55ce26511af780481e8f67c796d2779acaa928", size = 22278457, upload-time = "2026-03-19T21:51:19.895Z" }, + { url = "https://files.pythonhosted.org/packages/02/00/c9043c73fb958482c9b42ad39ba81d1bd1ceffef11c4757412cb17f12316/uv-0.10.12-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bd84379292e3c1a1bf0a05847c7c72b66bb581dccf8da1ef94cc82bf517efa7c", size = 22239751, upload-time = "2026-03-19T21:50:51.25Z" }, + { url = "https://files.pythonhosted.org/packages/5c/d1/31fe74bf2a049446dd95213890ffed98f733d0f5e3badafec59164951608/uv-0.10.12-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2ace05115bd9ee1b30d341728257fe051817c4c0a652c085c90d4bd4fb0bc8f2", size = 23697005, upload-time = "2026-03-19T21:50:48.767Z" }, + { url = "https://files.pythonhosted.org/packages/8b/9a/dd58ef59e622a1651e181ec5b7d304ae482e591f28a864c474d09ea00aff/uv-0.10.12-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:be85acae8f31c68311505cd96202bad43165cbd7be110c59222f918677e93248", size = 24453680, upload-time = "2026-03-19T21:51:11.443Z" }, + { url = 
"https://files.pythonhosted.org/packages/09/26/b5920b43d7c91e720b72feaf81ea8575fa6188b626607695199fb9a0b683/uv-0.10.12-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2bb5893d79179727253e4a283871a693d7773c662a534fb897aa65496aa35765", size = 23570067, upload-time = "2026-03-19T21:51:13.976Z" }, + { url = "https://files.pythonhosted.org/packages/8d/42/139e68d7d92bb90a33b5e269dbe474acb00b6c9797541032f859c5bf4c4d/uv-0.10.12-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:101481a1f48db6becf219914a591a588c0b3bfd05bef90768a5d04972bd6455e", size = 23498314, upload-time = "2026-03-19T21:50:36.104Z" }, + { url = "https://files.pythonhosted.org/packages/0c/75/40b237d005e4cdef9f960c215d3e2c0ab4f459ca009c3800cdcb07fbaa1d/uv-0.10.12-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:384b7f36a1ae50efe5f50fe299f276a83bf7acc8b7147517f34e27103270f016", size = 22314017, upload-time = "2026-03-19T21:50:56.45Z" }, + { url = "https://files.pythonhosted.org/packages/d0/c3/e65a6d795d5baf6fc113ff764650cc6dd792d745ff23f657e4c302877365/uv-0.10.12-py3-none-manylinux_2_31_riscv64.musllinux_1_1_riscv64.whl", hash = "sha256:2c21e1b36c384f75dd3fd4a818b04871158ce115efff0bb4fdcd18ba2df7bd48", size = 23321597, upload-time = "2026-03-19T21:50:39.012Z" }, + { url = "https://files.pythonhosted.org/packages/65/ad/00f561b90b0ddfd1d591a78299fdeae68566e9cf82a4913548e4b700afef/uv-0.10.12-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:006812a086fce03d230fc987299f7295c7a73d17a1f1c17de1d1f327826f8481", size = 23336447, upload-time = "2026-03-19T21:50:58.764Z" }, + { url = "https://files.pythonhosted.org/packages/f1/6e/ddf50c9ad12cffa99dbb6d1ab920da8ba95e510982cf53df3424e8cbc228/uv-0.10.12-py3-none-musllinux_1_1_i686.whl", hash = "sha256:2c5dfc7560453186e911c8c2e4ce95cd1c91e1c5926c3b34c5a825a307217be9", size = 22855873, upload-time = "2026-03-19T21:51:01.13Z" }, + { url = 
"https://files.pythonhosted.org/packages/7a/9a/31a9c2f939849e56039bbe962aef6fb960df68c31bebd834d956876decfc/uv-0.10.12-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:b9ca1d264059cb016c853ebbc4f21c72d983e0f347c927ca29e283aec2f596cf", size = 23675276, upload-time = "2026-03-19T21:51:17.262Z" }, + { url = "https://files.pythonhosted.org/packages/81/83/9225e3032f24fcb3b80ff97bbd4c28230de19f0f6b25dbad3ba6efda035e/uv-0.10.12-py3-none-win32.whl", hash = "sha256:cca36540d637c80d11d8a44a998a068355f0c78b75ec6b0f152ecbf89dfdd67b", size = 21739726, upload-time = "2026-03-19T21:50:46.155Z" }, + { url = "https://files.pythonhosted.org/packages/b5/9c/1954092ce17c00a8c299d39f8121e4c8d60f22a69c103f34d8b8dc68444d/uv-0.10.12-py3-none-win_amd64.whl", hash = "sha256:76ebe11572409dfbe20ec25a823f9bc8781400ece5356aa33ec44903af7ec316", size = 24219668, upload-time = "2026-03-19T21:51:03.591Z" }, + { url = "https://files.pythonhosted.org/packages/37/92/9ca420deb5a7b6716d8746e1b05eb2c35a305ff3b4aa57061919087d82dd/uv-0.10.12-py3-none-win_arm64.whl", hash = "sha256:6727e3a0208059cd4d621684e580d5e254322dacbd806e0d218360abd0d48a68", size = 22544602, upload-time = "2026-03-19T21:51:22.678Z" }, ] [[package]] From 4341c0b8413f0e1aa8c661e72d8b14cca55103af Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 25 Mar 2026 16:15:55 +0100 Subject: [PATCH 12/14] :arrow_up: Bump inference-models from 0.20.1 to 0.22.0 (#331) Bumps inference-models from 0.20.1 to 0.22.0. --- updated-dependencies: - dependency-name: inference-models dependency-version: 0.22.0 dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Piotr Skalski --- uv.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/uv.lock b/uv.lock index 4a8bb04d..78ae1354 100644 --- a/uv.lock +++ b/uv.lock @@ -934,7 +934,7 @@ wheels = [ [[package]] name = "inference-models" -version = "0.20.1" +version = "0.22.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "accelerate" }, @@ -968,9 +968,9 @@ dependencies = [ { name = "torchvision" }, { name = "transformers" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/31/2c/fbb5d52bfb477a5bbbf433f9ddbaecdda799d2c12d77acc31cd1dc731b9a/inference_models-0.20.1.tar.gz", hash = "sha256:71f74139b5d9db717a32c8f153151972fd1e6a8771252154b626fcdf36aa1aa9", size = 1652212, upload-time = "2026-03-12T15:49:00.66Z" } +sdist = { url = "https://files.pythonhosted.org/packages/7f/ed/91c7da89709edd53a30d3b2b5d5ac91fe947a78ea284c6ec16ac189edaec/inference_models-0.22.0.tar.gz", hash = "sha256:06dec33ee9a868f1e8261e34e60e1b15fd2c3d8126d5cdf0ae8ebdfde8e1ff30", size = 1660412, upload-time = "2026-03-20T18:04:48.988Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/dd/39/971f866d150e1aff85e45fc8a935c2229d8c2d2284ebc98097c67e0f6b97/inference_models-0.20.1-py3-none-any.whl", hash = "sha256:5e2787c43c47ed938748f309a95f04c7ecb74374db7bc27b7936329a330a01e8", size = 1817133, upload-time = "2026-03-12T15:48:58.749Z" }, + { url = "https://files.pythonhosted.org/packages/06/99/18215129b667c46a993f89e9b37da25cd0b982b0a3a296269fed7a18d276/inference_models-0.22.0-py3-none-any.whl", hash = "sha256:6839111d5dd5b5403404b55e182fa3b1c44cd38c5a26430c589b7a0021ece126", size = 1827857, upload-time = "2026-03-20T18:04:47.399Z" }, ] [[package]] From 148613ae234679a8aa06f15562c11e6dbe347df1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 25 Mar 2026 
16:26:49 +0100 Subject: [PATCH 13/14] :arrow_up: Bump mkdocs-material from 9.7.5 to 9.7.6 (#330) Bumps [mkdocs-material](https://github.com/squidfunk/mkdocs-material) from 9.7.5 to 9.7.6. - [Release notes](https://github.com/squidfunk/mkdocs-material/releases) - [Changelog](https://github.com/squidfunk/mkdocs-material/blob/master/CHANGELOG) - [Commits](https://github.com/squidfunk/mkdocs-material/compare/9.7.5...9.7.6) --- updated-dependencies: - dependency-name: mkdocs-material dependency-version: 9.7.6 dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Piotr Skalski --- uv.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/uv.lock b/uv.lock index 78ae1354..651054bf 100644 --- a/uv.lock +++ b/uv.lock @@ -1414,7 +1414,7 @@ wheels = [ [[package]] name = "mkdocs-material" -version = "9.7.5" +version = "9.7.6" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "babel" }, @@ -1429,9 +1429,9 @@ dependencies = [ { name = "pymdown-extensions" }, { name = "requests" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/74/76/5c202fecdc45d53e83e03a85bae70c48b6c81e9f87f0bc19a9e9c723bdc0/mkdocs_material-9.7.5.tar.gz", hash = "sha256:f76bdab532bad1d9c57ca7187b37eccf64dd12e1586909307f8856db3be384ea", size = 4097749, upload-time = "2026-03-10T15:43:22.809Z" } +sdist = { url = "https://files.pythonhosted.org/packages/45/29/6d2bcf41ae40802c4beda2432396fff97b8456fb496371d1bc7aad6512ec/mkdocs_material-9.7.6.tar.gz", hash = "sha256:00bdde50574f776d328b1862fe65daeaf581ec309bd150f7bff345a098c64a69", size = 4097959, upload-time = "2026-03-19T15:41:58.161Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/45/e1/e8080dcfa95cca267662a6f4afe29237452bdeb5a2a6555ac83646d21915/mkdocs_material-9.7.5-py3-none-any.whl", hash = 
"sha256:7cf9df2ff121fd098ff6e05c732b0be3699afca9642e2dfe4926c40eb5873eec", size = 9305251, upload-time = "2026-03-10T15:43:19.089Z" }, + { url = "https://files.pythonhosted.org/packages/2c/01/bc663630c510822c95c47a66af9fa7a443c295b47d5f041e5e6ae62ef659/mkdocs_material-9.7.6-py3-none-any.whl", hash = "sha256:71b84353921b8ea1ba84fe11c50912cc512da8fe0881038fcc9a0761c0e635ba", size = 9305470, upload-time = "2026-03-19T15:41:55.217Z" }, ] [[package]] From 62e46b973729a04f7e98925fe0beff9ff9751939 Mon Sep 17 00:00:00 2001 From: Piotr Skalski Date: Thu, 26 Mar 2026 13:20:27 +0100 Subject: [PATCH 14/14] feat(botsort): add integration tests and fix ruff/mypy errors (#335) --- pyproject.toml | 2 + test/core/test_tracker_integration.py | 2 +- test/data/tracker_expected_dancetrack.json | 6 +++ test/data/tracker_expected_sportsmot.json | 6 +++ trackers/__init__.py | 2 + trackers/core/botsort/cmc.py | 44 +++++++++++---------- trackers/core/botsort/kalman_box_tracker.py | 6 +-- trackers/core/botsort/tracker.py | 24 ++++++----- 8 files changed, 58 insertions(+), 34 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c662c6ee..c625c582 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -196,5 +196,7 @@ module = [ "torchvision", "torchvision.transforms", "firerequests", + "scipy", + "scipy.optimize", ] ignore_missing_imports = true diff --git a/test/core/test_tracker_integration.py b/test/core/test_tracker_integration.py index 31c4688b..b00e8119 100644 --- a/test/core/test_tracker_integration.py +++ b/test/core/test_tracker_integration.py @@ -17,7 +17,7 @@ from trackers.eval import evaluate_mot_sequences from trackers.io.mot import _load_mot_file, _mot_frame_to_detections, _MOTOutput -_TRACKER_IDS = ["sort", "bytetrack", "ocsort"] +_TRACKER_IDS = ["sort", "bytetrack", "ocsort", "botsort"] _METRICS = ["CLEAR", "HOTA", "Identity"] _TEST_DATA_DIR = Path(__file__).resolve().parent.parent / "data" diff --git a/test/data/tracker_expected_dancetrack.json 
b/test/data/tracker_expected_dancetrack.json index ebf5990b..c5c60aa8 100644 --- a/test/data/tracker_expected_dancetrack.json +++ b/test/data/tracker_expected_dancetrack.json @@ -16,5 +16,11 @@ "MOTA": 98.187, "IDF1": 74.367, "IDSW": 631 + }, + "botsort": { + "HOTA": 79.999, + "MOTA": 99.511, + "IDF1": 76.389, + "IDSW": 614 } } diff --git a/test/data/tracker_expected_sportsmot.json b/test/data/tracker_expected_sportsmot.json index fadde849..08c658a6 100644 --- a/test/data/tracker_expected_sportsmot.json +++ b/test/data/tracker_expected_sportsmot.json @@ -16,5 +16,11 @@ "MOTA": 97.791, "IDF1": 79.21, "IDSW": 917 + }, + "botsort": { + "HOTA": 85.544, + "MOTA": 98.925, + "IDF1": 80.53, + "IDSW": 1107 } } diff --git a/trackers/__init__.py b/trackers/__init__.py index 31c646df..a34ba7a9 100644 --- a/trackers/__init__.py +++ b/trackers/__init__.py @@ -7,6 +7,7 @@ from __future__ import annotations from trackers.annotators.trace import MotionAwareTraceAnnotator +from trackers.core.botsort.tracker import BoTSORTTracker from trackers.core.bytetrack.tracker import ByteTrackTracker from trackers.core.ocsort.tracker import OCSORTTracker from trackers.core.sort.tracker import SORTTracker @@ -22,6 +23,7 @@ from trackers.utils.converters import xcycsr_to_xyxy, xyxy_to_xcycsr __all__ = [ + "BoTSORTTracker", "ByteTrackTracker", "CoordinatesTransformation", "Dataset", diff --git a/trackers/core/botsort/cmc.py b/trackers/core/botsort/cmc.py index 8deb976b..f04526ee 100644 --- a/trackers/core/botsort/cmc.py +++ b/trackers/core/botsort/cmc.py @@ -230,16 +230,16 @@ def __init__(self, cfg: CMCConfig | None = None) -> None: self.extractor = None self.matcher = None if self.cfg.method == "orb": - self.detector = cv2.FastFeatureDetector_create(self.cfg.fast_threshold) - self.extractor = cv2.ORB_create() + self.detector = cv2.FastFeatureDetector_create(self.cfg.fast_threshold) # type: ignore[attr-defined] + self.extractor = cv2.ORB_create() # type: ignore[attr-defined] self.matcher = 
cv2.BFMatcher(cv2.NORM_HAMMING) elif self.cfg.method == "sift": - self.detector = cv2.SIFT_create( + self.detector = cv2.SIFT_create( # type: ignore[attr-defined] nOctaveLayers=self.cfg.sift_n_octave_layers, contrastThreshold=self.cfg.sift_contrast_threshold, edgeThreshold=int(self.cfg.sift_edge_threshold), ) - self.extractor = cv2.SIFT_create( + self.extractor = cv2.SIFT_create( # type: ignore[attr-defined] nOctaveLayers=self.cfg.sift_n_octave_layers, contrastThreshold=self.cfg.sift_contrast_threshold, edgeThreshold=int(self.cfg.sift_edge_threshold), @@ -369,8 +369,8 @@ def _estimate_feature_affine( mask[y1b:y2b, x1b:x2b] = 0 # Detect + describe (ORB) - kps = self.detector.detect(gray, mask) - kps, desc = self.extractor.compute(gray, kps) + kps = self.detector.detect(gray, mask) # type: ignore[union-attr] + kps, desc = self.extractor.compute(gray, kps) # type: ignore[union-attr] H_aff = np.eye(2, 3, dtype=np.float32) @@ -386,7 +386,7 @@ def _estimate_feature_affine( self._prev_desc = None if desc is None else copy.copy(desc) return H_aff - knn = self.matcher.knnMatch(self._prev_desc, desc, k=2) + knn = self.matcher.knnMatch(self._prev_desc, desc, k=2) # type: ignore[union-attr] if len(knn) == 0: self._prev_kps = copy.copy(kps) self._prev_desc = copy.copy(desc) @@ -405,7 +405,7 @@ def _estimate_feature_affine( continue m, n = pair if m.distance < 0.9 * n.distance: - p_prev = np.array(self._prev_kps[m.queryIdx].pt, dtype=np.float32) + p_prev = np.array(self._prev_kps[m.queryIdx].pt, dtype=np.float32) # type: ignore[index] p_curr = np.array(kps[m.trainIdx].pt, dtype=np.float32) d = p_prev - p_curr if (abs(d[0]) < max_spatial[0]) and (abs(d[1]) < max_spatial[1]): @@ -414,12 +414,12 @@ def _estimate_feature_affine( curr_pts.append(p_curr) if len(prev_pts) >= 5: - spatial = np.asarray(spatial, dtype=np.float32) - mean = spatial.mean(axis=0) - std = spatial.std(axis=0) + 1e-6 + spatial_arr = np.asarray(spatial, dtype=np.float32) + mean = spatial_arr.mean(axis=0) + std = 
spatial_arr.std(axis=0) + 1e-6 inl = np.logical_and( - np.abs(spatial[:, 0] - mean[0]) < 2.5 * std[0], - np.abs(spatial[:, 1] - mean[1]) < 2.5 * std[1], + np.abs(spatial_arr[:, 0] - mean[0]) < 2.5 * std[0], + np.abs(spatial_arr[:, 1] - mean[1]) < 2.5 * std[1], ) prev_pts_np = np.asarray(prev_pts, dtype=np.float32)[inl] curr_pts_np = np.asarray(curr_pts, dtype=np.float32)[inl] @@ -471,7 +471,7 @@ def _estimate_sparse_optflow(self, frame_bgr: np.ndarray) -> np.ndarray: ) # Find keypoints in current frame - keypoints = cv2.goodFeaturesToTrack(frame, mask=None, **self.feature_params) + keypoints = cv2.goodFeaturesToTrack(frame, mask=None, **self.feature_params) # type: ignore[call-overload] # First frame: init and return identity if not self._initialized: @@ -492,7 +492,7 @@ def _estimate_sparse_optflow(self, frame_bgr: np.ndarray) -> np.ndarray: # Optical flow correspondences # calcOpticalFlowPyrLK will throw or return nonsense if we give it None - matched, status, _err = cv2.calcOpticalFlowPyrLK( + matched, status, _err = cv2.calcOpticalFlowPyrLK( # type: ignore[call-overload] self._prev_frame_gray, frame, self._prev_points, None ) @@ -512,14 +512,16 @@ def _estimate_sparse_optflow(self, frame_bgr: np.ndarray) -> np.ndarray: prev_pts.append(self._prev_points[i]) curr_pts.append(matched[i]) - prev_pts = np.array(prev_pts) - curr_pts = np.array(curr_pts) + prev_pts_arr = np.array(prev_pts) + curr_pts_arr = np.array(curr_pts) # Find rigid matrix - if (np.size(prev_pts, 0) > 4) and ( - np.size(prev_pts, 0) == np.size(curr_pts, 0) + if (np.size(prev_pts_arr, 0) > 4) and ( + np.size(prev_pts_arr, 0) == np.size(curr_pts_arr, 0) ): - H_est, _ = cv2.estimateAffinePartial2D(prev_pts, curr_pts, cv2.RANSAC) + H_est, _ = cv2.estimateAffinePartial2D( # type: ignore[call-overload] + prev_pts_arr, curr_pts_arr, cv2.RANSAC + ) if H_est is not None: H_aff = H_est.astype(np.float32) @@ -587,7 +589,7 @@ def _estimate_ecc(self, frame_bgr: np.ndarray) -> np.ndarray: return H_aff try: - 
_cc, H_est = cv2.findTransformECC( + _cc, H_est = cv2.findTransformECC( # type: ignore[call-overload] self._prev_frame_gray, frame, H_aff, diff --git a/trackers/core/botsort/kalman_box_tracker.py b/trackers/core/botsort/kalman_box_tracker.py index c5d53769..f38a1939 100644 --- a/trackers/core/botsort/kalman_box_tracker.py +++ b/trackers/core/botsort/kalman_box_tracker.py @@ -360,10 +360,10 @@ def predict(self) -> None: self._update_process_and_measurement_noise() # Predict state - self.state = self.F @ self.state + self.state = self.F @ self.state # type: ignore[assignment] # Predict error (uncertainty) covariance - self.P = self.F @ self.P @ self.F.T + self.Q + self.P = self.F @ self.P @ self.F.T + self.Q # type: ignore[assignment] # Prevent degenerate box shape self.state[2, 0] = max(self.state[2, 0], 1e-3) @@ -437,7 +437,7 @@ def update(self, bbox: np.ndarray) -> None: # Update covariance identity_matrix = np.eye(8, dtype=np.float32) - self.P = (identity_matrix - K @ self.H) @ self.P + self.P = (identity_matrix - K @ self.H) @ self.P # type: ignore[assignment] self.state[2, 0] = max(self.state[2, 0], 1e-3) self.state[3, 0] = max(self.state[3, 0], 1e-3) diff --git a/trackers/core/botsort/tracker.py b/trackers/core/botsort/tracker.py index f4d35842..696280c0 100644 --- a/trackers/core/botsort/tracker.py +++ b/trackers/core/botsort/tracker.py @@ -5,7 +5,7 @@ # ------------------------------------------------------------------------ from copy import deepcopy -from typing import cast +from typing import Literal, cast import numpy as np import supervision as sv @@ -55,6 +55,8 @@ class BoTSORTTracker(BaseTracker): cmc: Camera motion compensation instance (or None if disabled). 
""" + tracker_id = "botsort" + def __init__( self, lost_track_buffer: int = 30, @@ -65,7 +67,7 @@ def __init__( minimum_iou_threshold_second_assoc: float = 0.5, high_conf_det_threshold: float = 0.6, enable_cmc: bool = True, - cmc_method: str = "sparseOptFlow", + cmc_method: Literal["orb", "sift", "sparseOptFlow", "ecc"] = "sparseOptFlow", cmc_downscale: int = 2, ) -> None: """ @@ -163,10 +165,10 @@ def _update_detections( updated_detections.append(new_det) return updated_detections - def update( + def update( # type: ignore[override] self, detections: sv.Detections, - frame: np.ndarray, + frame: np.ndarray | None = None, ) -> sv.Detections: """ Update the tracker with detections from the current frame. @@ -255,7 +257,10 @@ def update( # Add unmatched low prob predictions to updated predictions for det_index in unmatched_detections: - new_det = deepcopy(low_prob_detections[det_index : det_index + 1]) + new_det = cast( + sv.Detections, + deepcopy(low_prob_detections[det_index : det_index + 1]), + ) new_det.tracker_id = np.array([-1]) updated_detections.append(new_det) @@ -309,16 +314,17 @@ def _get_high_and_low_probability_detections( """ if detections.confidence is None: - # If no confidence information exists, treat all detections as high-confidence - return detections, detections[:0] + # If no confidence information exists, treat all detections + # as high-confidence + return detections, cast(sv.Detections, detections[:0]) conf = detections.confidence high_mask = conf >= self.high_conf_det_threshold low_mask = (conf > 0.1) & (conf < self.high_conf_det_threshold) - high_confidence = detections[high_mask] - low_confidence = detections[low_mask] + high_confidence = cast(sv.Detections, detections[high_mask]) + low_confidence = cast(sv.Detections, detections[low_mask]) return high_confidence, low_confidence