# Football_tracking_data_preparation/test_detector.py at main · tse-coder/Football_tracking_data_preparation · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
"""
test_detector.py

A module for filtering football (soccer) replays from broadcast video using a multi-stage
pipeline: shot boundary detection, YOLOv8 spatial classification, SlowFast temporal
refinement, and FFmpeg video merging.

Designed for Google Colab execution.
"""
import os
import logging
from pathlib import Path

# Set up simple logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# Try importing required dependencies, raising informative errors if missing.
try:
    import cv2
except ImportError:
    raise ImportError("OpenCV (cv2) is not installed. Please install it using: pip install opencv-python")

try:
    import ffmpeg
except ImportError:
    raise ImportError("ffmpeg-python is not installed. Please install it using: pip install ffmpeg-python")

try:
    from scenedetect import SceneManager, open_video
    from scenedetect.detectors import ContentDetector
except ImportError:
    raise ImportError("PySceneDetect is not installed. Please install it using: pip install scenedetect")

try:
    from ultralytics import YOLO
except ImportError:
    raise ImportError("YOLO (ultralytics) is not installed. Please install it using: pip install ultralytics")

import sys
# Import PySlowFast. First try a normal import; if that fails, look for a local
# clone of the SlowFast repository in a few fixed locations, put the first match
# at the front of sys.path and retry the import.
try:
    # PySlowFast is officially available via cloning the facebookresearch/SlowFast repo.
    # It is not on PyPI. If standard import fails, we dynamically add the 'SlowFast'
    # directory to the path.
    import slowfast.utils.checkpoint as cu
    from slowfast.models import build_model
    from slowfast.config.defaults import get_cfg
except ImportError:
    # Resolve the absolute path to the directory containing this script
    current_dir = os.path.dirname(os.path.abspath(__file__))

    # Check multiple possible locations for the SlowFast clone.
    # Order matters: the first candidate that passes the checks below wins.
    possible_paths = [
        os.path.join(current_dir, "SlowFast"), # Assuming it's in the same project folder
        "/content/SlowFast",                   # Default Colab clone location
        os.path.abspath(os.path.join(current_dir, "..", "SlowFast")), # One level up
        "./SlowFast"                           # Current working directory
    ]

    # Stays None when no candidate directory contains a "slowfast" entry.
    slowfast_path = None
    for p in possible_paths:
        if os.path.exists(p) and os.path.isdir(p):
            # Check if it actually contains the slowfast package inside
            if os.path.exists(os.path.join(p, "slowfast")):
                slowfast_path = p
                break

    if slowfast_path:
        # Prepend (not append) so the clone takes precedence over other entries.
        if slowfast_path not in sys.path:
            sys.path.insert(0, slowfast_path)
        # NOTE: this message is logged even when the path was already on sys.path.
        logger.info(f"Dynamically added SlowFast path: {slowfast_path}")

        try:
            import slowfast.utils.checkpoint as cu
            from slowfast.models import build_model
            from slowfast.config.defaults import get_cfg
        except ImportError as e:
            # A clone was found but is not importable: fail at import time.
            raise ImportError(f"Found SlowFast at {slowfast_path} but failed to import its modules. Error: {e}")
    else:
        # No clone found: only warn, so the module still imports. On this path the
        # names cu, build_model and get_cfg are NOT defined at module level;
        # TestDetector.temporal_refine() re-imports them itself before use.
        logger.warning(
            "PySlowFast not found in expected locations. "
            "Please ensure you ran: !git clone https://github.com/facebookresearch/SlowFast "
            "and that the folder is accessible."
        )

class TestDetector:
    """
    TestDetector implements a football replay filtering pipeline.

    Each stage consumes the state written by the previous one and raises
    ``ValueError`` when that state is missing.

    Stages:
    1. detect_shots: Detects boundaries between different camera shots.
    2. sample_frames: Samples frames from each shot for spatial classification.
    3. classify_frames: Uses YOLOv8 to classify sampled frames into match states.
    4. temporal_refine: Uses PySlowFast to temporally refine replay predictions.
    5. save_clean_video: Merges live match segments back into a clean video.

    Attributes:
        video_path: Path of the input video.
        yolo_weights_path: Path of the YOLOv8 classification weights.
        slowfast_config_path: Path of the SlowFast/X3D YAML config, or None.
        shots: List of (start_seconds, end_seconds) tuples, one per shot.
        sampled_frames: Dict mapping shot index to the list of frames read for it.
        shot_classes: Dict mapping shot index to its majority class name.
        refined_shots: Dict mapping shot index to its final class name.
    """

    # The class name starts with "Test" and the file is named test_*.py, so pytest
    # would try to collect it as a test class; this opts out of collection.
    __test__ = False

    # Categories a shot can be assigned to. "match_play" must stay first: it is
    # the fallback label and wins ties in the majority vote.
    _CLASS_NAMES = ("match_play", "replay", "crowd", "ads")

    def __init__(self, video_path: str, yolo_weights_path: str = 'yolov8n-cls.pt', slowfast_config_path: str = None):
        """
        Initializes the TestDetector pipeline.

        Args:
            video_path (str): Path to the input video file.
            yolo_weights_path (str): Path to YOLOv8 classification model weights.
            slowfast_config_path (str): Path to SlowFast/X3D config YAML file.
                When None, temporal_refine() copies the YOLO labels unchanged.

        Raises:
            FileNotFoundError: If video_path does not exist.
        """
        self.video_path = video_path
        self.yolo_weights_path = yolo_weights_path
        self.slowfast_config_path = slowfast_config_path

        if not os.path.exists(self.video_path):
            raise FileNotFoundError(f"Input video not found at: {self.video_path}")

        # Pipeline state
        self.shots = []       # List of detected scenes (start_time, end_time)
        self.sampled_frames = {} # Dict mapping shot index to lists of sampled frames paths/arrays
        self.shot_classes = {}   # Dict mapping shot index to major class ('match_play', 'replay', etc.)
        self.refined_shots = {}  # Dict mapping shot index to final refined class

        logger.info(f"Initialized TestDetector with video: {self.video_path}")

    @staticmethod
    def _interior_sample_times(start_sec, end_sec, count):
        """Return `count` evenly spaced timestamps strictly inside (start_sec, end_sec)."""
        duration = end_sec - start_sec
        return [start_sec + duration * (i + 1) / (count + 1) for i in range(count)]

    @staticmethod
    def _clip_frame_indices(start_sec, end_sec, fps, count):
        """Return `count` evenly spaced frame indices covering [start_sec, end_sec).

        The end of a shot is the start of the next one, so the last index is
        capped one frame before the end boundary.
        """
        # round() rather than int(): boundaries are frame_number / fps, and the
        # product can land a hair below the integer it came from.
        first = int(round(start_sec * fps))
        last = max(first, int(round(end_sec * fps)) - 1)
        steps = max(1, count - 1)  # avoids division by zero for count == 1
        span = last - first
        return [first + (span * i) // steps for i in range(count)]

    @staticmethod
    def _merge_adjacent_segments(segments):
        """Merge touching/overlapping (start, end) segments; drop empty ones.

        `segments` must be ordered by start time. Returns a new list of tuples.
        """
        merged = []
        for start, end in segments:
            if end <= start:
                continue  # nothing to cut out of the video
            if merged and start <= merged[-1][1]:
                prev_start, prev_end = merged[-1]
                merged[-1] = (prev_start, max(prev_end, end))
            else:
                merged.append((start, end))
        return merged

    def _open_capture(self):
        """Open self.video_path with OpenCV and return (capture, fps).

        Raises:
            RuntimeError: If the video cannot be opened or reports no valid FPS.
                The capture is released before raising.
        """
        cap = cv2.VideoCapture(self.video_path)
        if not cap.isOpened():
            cap.release()
            raise RuntimeError(f"Could not open video for reading: {self.video_path}")
        fps = cap.get(cv2.CAP_PROP_FPS)
        # "not fps > 0" also rejects NaN. Without a usable FPS every timestamp
        # would map to frame 0.
        if not fps > 0:
            cap.release()
            raise RuntimeError(f"Video reports an invalid frame rate ({fps}): {self.video_path}")
        return cap, fps

    def detect_shots(self, threshold: float = 30.0):
        """
        Detects shot boundaries in the video using PySceneDetect.

        Results of later stages (sampled frames, classes, refined classes) are
        cleared, because they are keyed by shot index and would no longer match.

        Args:
            threshold (float): Detection threshold for ContentDetector.

        Returns:
            list: A list of tuples containing (start_time_seconds, end_time_seconds) for each shot.
        """
        logger.info("Starting shot boundary detection...")
        video = open_video(self.video_path)
        scene_manager = SceneManager()
        scene_manager.add_detector(ContentDetector(threshold=threshold))

        # Detect scenes
        scene_manager.detect_scenes(video)
        # start_in_scene=True: a video without any cut yields one scene spanning
        # the whole video instead of an empty list.
        scene_list = scene_manager.get_scene_list(start_in_scene=True)

        self.shots = [(start.get_seconds(), end.get_seconds()) for start, end in scene_list]
        self.sampled_frames = {}
        self.shot_classes = {}
        self.refined_shots = {}
        logger.info(f"Detected {len(self.shots)} shots.")
        return self.shots

    def sample_frames(self, frames_per_shot: int = 5):
        """
        Samples a fixed number of uniformly distributed frames from each detected shot.

        The frames are stored in self.sampled_frames, replacing earlier samples.
        Frames that cannot be read are skipped, so a shot may end up with fewer
        than frames_per_shot frames.

        Args:
            frames_per_shot (int): Number of frames to sample per shot (>= 1).

        Raises:
            ValueError: If no shots were detected or frames_per_shot < 1.
            RuntimeError: If the video cannot be opened or has no valid FPS.
        """
        if not self.shots:
            raise ValueError("No shots detected. Run detect_shots() first.")
        if frames_per_shot < 1:
            raise ValueError("frames_per_shot must be a positive integer.")

        logger.info(f"Sampling {frames_per_shot} frames per shot...")
        sampled = {}
        cap, fps = self._open_capture()
        try:
            for idx, (start_sec, end_sec) in enumerate(self.shots):
                frames = []
                for t in self._interior_sample_times(start_sec, end_sec, frames_per_shot):
                    cap.set(cv2.CAP_PROP_POS_FRAMES, int(t * fps))
                    ret, frame = cap.read()
                    if ret:
                        frames.append(frame)

                if not frames:
                    logger.warning(f"No frames could be read for shot {idx}.")
                sampled[idx] = frames
        finally:
            # Release the capture even if reading fails part-way.
            cap.release()

        self.sampled_frames = sampled
        logger.info("Frame sampling completed.")

    def classify_frames(self):
        """
        Runs YOLOv8 classifier on sampled frames to determine the prominent class
        for each shot ("match_play", "replay", "crowd", "ads").

        Each frame votes with its top-1 label; the shot gets the majority label.
        Labels outside the four categories, frames without classification
        probabilities, and shots without frames all count as "match_play".
        Results are stored in self.shot_classes.

        Raises:
            ValueError: If no frames were sampled.
            RuntimeError: If the YOLO model cannot be loaded.
        """
        if not self.sampled_frames:
            raise ValueError("No frames sampled. Run sample_frames() first.")

        logger.info("Loading YOLOv8 model for frame classification...")
        try:
            model = YOLO(self.yolo_weights_path)
        except Exception as e:
            raise RuntimeError(f"Failed to load YOLO model: {e}") from e

        logger.info("Classifying frames...")
        shot_classes = {}
        unmapped_labels = set()
        for shot_idx, frames in self.sampled_frames.items():
            class_counts = dict.fromkeys(self._CLASS_NAMES, 0)
            if not frames:
                logger.warning(f"Shot {shot_idx} has no sampled frames; defaulting to 'match_play'.")

            for frame in frames:
                # YOLO classification (assuming custom trained weights mapping to classes)
                results = model(frame, verbose=False)
                try:
                    top_class_idx = results[0].probs.top1
                    class_name = results[0].names[top_class_idx]
                except AttributeError:
                    # Fallback if YOLO returns missing predictions structure
                    class_name = "match_play"

                if class_name not in class_counts:
                    # Remember the label so the fallback is visible in the log.
                    unmapped_labels.add(str(class_name))
                    class_name = "match_play"
                class_counts[class_name] += 1

            # Assign the majority class to the entire shot (ties -> earliest key)
            shot_classes[shot_idx] = max(class_counts, key=class_counts.get)

        if unmapped_labels:
            shown = sorted(unmapped_labels)[:10]
            logger.warning(
                f"{len(unmapped_labels)} YOLO label(s) are not pipeline categories and were "
                f"counted as 'match_play' (first few: {shown}). Check the model weights."
            )

        self.shot_classes = shot_classes
        logger.info("Frame classification completed.")

    def temporal_refine(self, replay_class_index: int = 1):
        """
        Runs a functional SlowFast/X3D temporal action recognition model to refine
        the classification of segments initially classified as "match_play" or "replay".

        Other classes are copied through unchanged. Without a SlowFast config the
        YOLO labels are copied as-is. Results are stored in self.refined_shots.

        Args:
            replay_class_index (int): Index of the model output interpreted as
                "replay"; every other index is treated as "match_play".

        Raises:
            ValueError: If no shots were classified.
            ImportError: If PyTorch, NumPy or PySlowFast cannot be imported.
            RuntimeError: If the video cannot be opened or has no valid FPS.
        """
        if not self.shot_classes:
            raise ValueError("No shots classified. Run classify_frames() first.")

        logger.info("Initializing temporal refinement using PySlowFast/X3D...")
        if not self.slowfast_config_path:
            logger.warning("No slowfast config provided. Skipping temporal refinement.")
            self.refined_shots = self.shot_classes.copy()
            return

        try:
            import torch
            import numpy as np
        except ImportError as e:
            raise ImportError("PyTorch or NumPy not found. Ensure they are installed in Colab.") from e

        try:
            # Imported here rather than relied on from module scope: the import
            # fallback at the top of the file only warns when SlowFast is missing.
            from slowfast.config.defaults import get_cfg
            from slowfast.models import build_model
            import slowfast.utils.checkpoint as cu
        except ImportError as e:
            raise ImportError(
                "PySlowFast is missing. In Colab, you MUST run: "
                "!git clone https://github.com/facebookresearch/SlowFast"
            ) from e

        # Use the GPU when one is available
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logger.info(f"Using device for temporal refinement: {device}")

        # Build the model from the YAML config
        logger.info(f"Loading SlowFast config from {self.slowfast_config_path}")
        cfg = get_cfg()
        cfg.merge_from_file(self.slowfast_config_path)

        logger.info("Building SlowFast model...")
        model = build_model(cfg)

        # Load pretrained weights when the config names a checkpoint
        if cfg.TEST.CHECKPOINT_FILE_PATH:
            logger.info(f"Loading checkpoint weights from {cfg.TEST.CHECKPOINT_FILE_PATH}")
            cu.load_test_checkpoint(cfg, model)
        else:
            logger.warning("No CHECKPOINT_FILE_PATH in config. Using uninitialized weights.")

        model.eval()
        model = model.to(device)

        num_frames = cfg.DATA.NUM_FRAMES
        crop_size = cfg.DATA.TEST_CROP_SIZE
        # Per-channel normalisation constants, shaped to broadcast over (C, T, H, W).
        mean = torch.tensor(cfg.DATA.MEAN).view(-1, 1, 1, 1)
        std = torch.tensor(cfg.DATA.STD).view(-1, 1, 1, 1)

        refined = {}
        # Frames are decoded with OpenCV instead of the SlowFast data pipeline.
        cap, fps = self._open_capture()
        try:
            for shot_idx, shot_class in self.shot_classes.items():
                if shot_class not in ("match_play", "replay"):
                    # Other classes (crowd, ads) are copied through unchanged
                    refined[shot_idx] = shot_class
                    logger.info(f"Shot {shot_idx} passed through without refinement: {shot_class}")
                    continue

                logger.info(f"Refining shot {shot_idx} (initially {shot_class})...")
                start_sec, end_sec = self.shots[shot_idx]

                # Uniformly sample frames from inside the shot
                frames = []
                for frame_idx in self._clip_frame_indices(start_sec, end_sec, fps, num_frames):
                    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
                    ret, frame = cap.read()
                    if not ret:
                        break
                    # Convert BGR (OpenCV) to RGB for model input
                    frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

                if not frames:
                    logger.warning(f"Failed to read frames for shot {shot_idx}. Keeping original class.")
                    refined[shot_idx] = shot_class
                    continue

                # Pad with the last frame if reading stopped early
                frames.extend([frames[-1]] * (num_frames - len(frames)))

                # Preprocess: (T, H, W, C) uint8 -> (C, T, H, W) float in [0, 1]
                clip = torch.from_numpy(np.array(frames)).float()
                clip = clip / 255.0
                clip = clip.permute(3, 0, 1, 2)

                # Resize spatially to the test crop size (the whole frame is
                # squeezed to a square; no cropping is done)
                clip = torch.nn.functional.interpolate(
                    clip.unsqueeze(0),
                    size=(num_frames, crop_size, crop_size),
                    mode='trilinear',
                    align_corners=False
                ).squeeze(0)

                clip = (clip - mean) / std

                # SlowFast takes [slow, fast] pathways; other archs take one clip
                if cfg.MODEL.ARCH in ['slowfast']:
                    alpha = cfg.SLOWFAST.ALPHA
                    fast_pathway = clip
                    # Slow pathway: every alpha-th frame of the fast pathway
                    slow_idx = torch.linspace(0, clip.shape[1] - 1, clip.shape[1] // alpha).long()
                    slow_pathway = torch.index_select(clip, 1, slow_idx)
                    inputs = [slow_pathway.unsqueeze(0).to(device), fast_pathway.unsqueeze(0).to(device)]
                else:
                    inputs = [clip.unsqueeze(0).to(device)]

                with torch.no_grad():
                    preds = model(inputs)
                    # NOTE(review): the model head may already apply softmax in
                    # eval mode; a second softmax keeps the argmax but flattens
                    # the logged probability - confirm against the config.
                    probs = torch.nn.functional.softmax(preds[0], dim=0)
                    top_prob, top_idx = torch.max(probs, dim=0)

                logger.info(f"Prediction for shot {shot_idx} -> class_index: {top_idx.item()} (prob: {top_prob.item():.4f})")

                # Map the predicted index onto the two refinable labels
                if top_idx.item() == replay_class_index:
                    refined_class = "replay"
                else:
                    refined_class = "match_play"

                refined[shot_idx] = refined_class
                logger.info(f"Shot {shot_idx} refined label assigned: {refined_class}")
        finally:
            # Release the capture even if inference fails part-way.
            cap.release()

        self.refined_shots = refined
        logger.info("Temporal refinement completed across all shots.")

    def save_clean_video(self, output_path: str = "output_clean_match.mp4"):
        """
        Merges all segments classified as live match play into a final output video using FFmpeg.

        Segments are taken in shot order, and adjacent "match_play" shots are
        merged into one cut so the FFmpeg filter graph stays small. If there is
        no "match_play" segment, a warning is logged and no file is written.

        Args:
            output_path (str): File path for the final merged video.

        Raises:
            ValueError: If no refined shots are available.
            RuntimeError: If FFmpeg fails.
        """
        if not self.refined_shots:
            raise ValueError("No refined shots available. Run temporal_refine() first.")

        logger.info(f"Extracting live 'match_play' segments and merging into {output_path}...")

        # Collect live play segment timestamps in chronological order
        live_segments = self._merge_adjacent_segments(
            self.shots[idx]
            for idx in sorted(self.refined_shots)
            if self.refined_shots[idx] == "match_play"
        )

        if not live_segments:
            logger.warning("No 'match_play' segments found. Output video will not be created.")
            return

        # Build FFmpeg complex filter command to concatenate multiple segments
        try:
            input_stream = ffmpeg.input(self.video_path)
            streams = []

            # NOTE(review): this assumes the input has an audio stream - confirm
            # for the videos this is run on.
            for start, end in live_segments:
                # Extract video and audio for each segment, timestamps reset to 0
                v = input_stream.video.trim(start=start, end=end).setpts('PTS-STARTPTS')
                a = input_stream.audio.filter('atrim', start=start, end=end).filter('asetpts', 'PTS-STARTPTS')
                streams.extend([v, a])

            # concat has two outputs (video, audio); map both explicitly.
            joined = ffmpeg.concat(*streams, v=1, a=1).node

            out = ffmpeg.output(joined[0], joined[1], output_path, vcodec='libx264', acodec='aac')
            out.run(overwrite_output=True, quiet=True)
            logger.info(f"Clean video successfully saved to {output_path}")

        except ffmpeg.Error as e:
            logger.error(f"FFmpeg error during compilation: {e.stderr.decode('utf-8') if e.stderr else str(e)}")
            raise RuntimeError("Failed to compile final video using FFmpeg.") from e

# Example Usage Block:
if __name__ == "__main__":
    # Runs the five pipeline stages in order on an example video.
    # Note: Replace paths with actual paths in your Colab environment
    try:
        # Pass dummy paths for the example run
        detector = TestDetector(video_path="sample_match.mp4", yolo_weights_path="yolov8n-cls.pt")
        detector.detect_shots()
        detector.sample_frames()
        detector.classify_frames()
        detector.temporal_refine()
        detector.save_clean_video("clean_match_only.mp4")
    except Exception as e:
        # logger.exception keeps the traceback; the non-zero exit status lets a
        # calling shell or notebook cell see that the run failed.
        logger.exception(f"Pipeline failed: {e}")
        raise SystemExit(1)