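"""Align a 3D video based on a depth video and a 2D point-tracking file.

Estimates a camera pose for every frame and writes the accumulated 4x4
transformation matrices to <depth_video>_transformations.json.
"""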
import numpy as np
import os
import cv2
import json
import argparse
import depth_map_tools

np.set_printoptions(suppress=True, precision=4)

def angle_between_rays(d1, d2):
    # Convert inputs to numpy arrays.
    d1 = np.asarray(d1)
    d2 = np.asarray(d2)

    # If the inputs are single rays (1D arrays), add a new axis.
    if d1.ndim == 1:
        d1 = d1[None, :]
    if d2.ndim == 1:
        d2 = d2[None, :]

    # Normalize each ray along the rows (axis=1).
    d1_norm = d1 / np.linalg.norm(d1, axis=1, keepdims=True)
    d2_norm = d2 / np.linalg.norm(d2, axis=1, keepdims=True)

    # Compute the cross product for each pair.
    cross = np.cross(d1_norm, d2_norm)
    cross_norm = np.linalg.norm(cross, axis=1)

    # Compute the dot product for each pair.
    dot = np.sum(d1_norm * d2_norm, axis=1)

    # Compute the angle for each pair using arctan2, which is numerically
    # more stable than arccos for angles near 0 and pi.
    angles = np.arctan2(cross_norm, dot)

    # If the original inputs were single rays, return a scalar.
    if angles.size == 1:
        return angles.item()
    return angles
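
# Illustrative behavior: perpendicular unit rays give pi/2, and batched inputs
# return one angle per row:
#   angle_between_rays([1, 0, 0], [0, 1, 0])        # -> 1.5708
#   angle_between_rays(np.eye(3), np.eye(3)[::-1])  # -> [1.5708, 0.0, 1.5708]
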
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that serializes numpy arrays as plain lists."""
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
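
# Illustrative use: json.dumps({"m": np.eye(2)}, cls=NumpyEncoder) serializes
# the array as [[1.0, 0.0], [0.0, 1.0]] instead of raising a TypeError.
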
if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Align 3D video based on depth video and a point tracking file')
    parser.add_argument('--track_file', type=str, help='file with 2d point tracking data', required=True)
    parser.add_argument('--mask_video', type=str, help='black and white mask video marking things that should not be tracked', required=False)
    parser.add_argument('--strict_mask', default=False, action='store_true', help='remove any point that has ever been masked out, even in frames where it is not masked', required=False)
    parser.add_argument('--xfov', type=int, help='fov in degrees in the x-direction, calculated from the aspect ratio and yfov if not given', required=False)
    parser.add_argument('--yfov', type=int, help='fov in degrees in the y-direction, calculated from the aspect ratio and xfov if not given', required=False)
    parser.add_argument('--depth_video', type=str, help='depth video', required=True)
    parser.add_argument('--max_frames', default=-1, type=int, help='stop after this many frames', required=False)
    parser.add_argument('--max_depth', default=100, type=int, help='the max depth that the video uses', required=False)
    parser.add_argument('--color_video', type=str, help='video file to use as color input, only used when debugging', required=False)
    parser.add_argument('--assume_stationary_camera', action='store_true', help='assume the camera is stationary; leads to better tracking', required=False)
    parser.add_argument('--use_madpose', action='store_true', help='use madpose for camera pose estimation', required=False)

    args = parser.parse_args()
    if args.xfov is None and args.yfov is None:
        print("Either --xfov or --yfov is required.")
        exit(1)

    if not os.path.isfile(args.track_file):
        raise Exception("input track_file does not exist")

    if not os.path.isfile(args.depth_video):
        raise Exception("input depth_video does not exist")

    with open(args.track_file) as json_track_file_handle:
        frames = json.load(json_track_file_handle)

    out_file = args.depth_video + "_transformations.json"

    MODEL_maxOUTPUT_depth = args.max_depth
    raw_video = cv2.VideoCapture(args.depth_video)
    frame_width, frame_height = int(raw_video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(raw_video.get(cv2.CAP_PROP_FRAME_HEIGHT))
    frame_rate = raw_video.get(cv2.CAP_PROP_FPS)

    cam_matrix = depth_map_tools.compute_camera_matrix(args.xfov, args.yfov, frame_width, frame_height)
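    # cam_matrix is assumed to be a standard 3x3 pinhole intrinsic matrix built
    # from the fov and frame size, i.e. K = [[fx, 0, cx], [0, fy, cy], [0, 0, 1]]
    # with, for example, fx = (frame_width / 2) / tan(radians(xfov) / 2).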
    color_video = None
    debug_video = None
    if args.color_video is not None:
        if not os.path.isfile(args.color_video):
            raise Exception("input color_video does not exist")
        color_video = cv2.VideoCapture(args.color_video)
        debug_video = cv2.VideoWriter(args.color_video + "_track_debug.mp4", cv2.VideoWriter_fourcc(*"avc1"), frame_rate, (frame_width, frame_height))
    mask_video = None
    if args.mask_video is not None:
        if not os.path.isfile(args.mask_video):
            raise Exception("input mask_video does not exist")
        mask_video = cv2.VideoCapture(args.mask_video)
    for i, frame in enumerate(frames):
        frames[i] = np.array(frames[i])

    used_frames = []

    # 1. Pick the first frame.
    frame_n = 0
    used_frames.append(frame_n)

    transformations = []
    to_ref_zero = np.eye(4)
    transformations.append(to_ref_zero.tolist())
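    # transformations[i] accumulates the pose of frame i relative to frame 0:
    # a 4x4 homogeneous matrix mapping points seen in frame i into the
    # coordinate system of the first frame, which therefore gets the identity.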
    # Only import madpose if we need it, as it is not on PyPI.
    if args.use_madpose:
        import madpose
        from madpose.utils import get_depths

        reproj_pix_thres = 8.0
        epipolar_pix_thres = 2.0

        # Weight for the epipolar error.
        epipolar_weight = 1.0

        # RANSAC options and estimator config.
        options = madpose.HybridLORansacOptions()
        options.min_num_iterations = 100
        options.max_num_iterations = 1000
        options.final_least_squares = True
        options.threshold_multiplier = 5.0
        options.num_lo_steps = 4
        options.squared_inlier_thresholds = [reproj_pix_thres**2, epipolar_pix_thres**2]
        options.data_type_weights = [1.0, epipolar_weight]
        options.random_seed = 0

        est_config = madpose.EstimatorConfig()
        est_config.min_depth_constraint = False  # we use metric depth models, so we do not want shift
        est_config.use_shift = False  # we use metric depth models, so we do not want shift
        est_config.ceres_num_threads = 2
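        # madpose's HybridEstimatePoseScaleOffset jointly estimates the relative
        # pose together with a depth scale (and optionally per-map shifts, which
        # are disabled above because the depth maps are metric).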
    # Load frames
    depth_frames = []
    frame_residuals = []
    depth_frames_all = []
    global_points = {}
    rgb_frames = []

    fr_n = 0
    while raw_video.isOpened():

        print("--- frame ", fr_n + 1, " ---")

        ret, raw_frame = raw_video.read()
        if not ret:
            break

        raw_frame = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2RGB)

        if color_video is not None:
            ret, col_vid = color_video.read()
            col_vid = cv2.cvtColor(col_vid, cv2.COLOR_BGR2RGB)
            rgb_frames.append(col_vid)
        else:
            rgb_frames.append(raw_frame)
        if mask_video is not None:
            ret, mask = mask_video.read()
            if ret:
                mask = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY)

                # Drop points that fall outside the frame or on a masked-out pixel.
                rem = []
                rem_global = []
                for i, point in enumerate(frames[fr_n]):
                    if point[1] >= frame_width or point[2] >= frame_height or mask[point[2], point[1]] > 0:
                        rem.append(i)
                        rem_global.append(point[0])

                if len(rem) > 0:
                    frames[fr_n] = np.delete(frames[fr_n], rem, axis=0)

                if args.strict_mask:
                    # Remove every point that has ever been masked out from all frames.
                    for global_id in rem_global:
                        for frame_id, frame in enumerate(frames):
                            rem = []
                            for i, point in enumerate(frame):
                                if global_id == point[0]:
                                    rem.append(i)
                            if len(rem) > 0:
                                frames[frame_id] = np.delete(frames[frame_id], rem, axis=0)
            else:
                print("WARNING: mask video ended before other videos")
        # Decode the depth frame: the R and G channels carry the top byte (averaged)
        # and the B channel the next byte of a fixed-point depth value (byte indices
        # 3 and 2 of the uint32 view on a little-endian machine).
        depth = np.zeros((frame_height, frame_width), dtype=np.uint32)
        depth_unit = depth.view(np.uint8).reshape((frame_height, frame_width, 4))
        depth_unit[..., 3] = ((raw_frame[..., 0].astype(np.uint32) + raw_frame[..., 1]).astype(np.uint32) / 2)
        depth_unit[..., 2] = raw_frame[..., 2]
        depth = depth.astype(np.float32) / ((255**4) / MODEL_maxOUTPUT_depth)

        depth_frames.append(depth)

        # Only used for point triangulation; should be removed eventually.
        depth_frames_all.append(depth)
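        # Illustrative decode: with max_depth = 100 and a pixel RGB = (200, 200, 50),
        # the top byte is (200 + 200) / 2 = 200 and the next byte 50, giving the
        # uint32 value 200 * 2**24 + 50 * 2**16 = 3358720000, which scales to
        # 3358720000 * 100 / 255**4 ~= 79.4 meters.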
        # DEBUG: cap the number of frames; no need to load the entire video when debugging.
        #if fr_n > 210:
        #    break

        # TODO: Add online analysis so we do not have to load everything into memory.
        if len(depth_frames) > 1:
            ref_frame_no = fr_n - 1
            this_frame_no = fr_n

            best_common_points = list(set(frames[ref_frame_no][:, 0]) & set(frames[this_frame_no][:, 0]))

            # Current frame points
            point_ids_in_this_frame = frames[this_frame_no][:, 0]
            cur_mask = np.isin(point_ids_in_this_frame, best_common_points)
            points_2d = frames[this_frame_no][cur_mask][:, 1:3]
            global_point_ids = frames[this_frame_no][cur_mask][:, 0]

            # Reference frame points
            point_ids_in_frame = frames[ref_frame_no][:, 0]
            cur_mask = np.isin(point_ids_in_frame, best_common_points)
            ref_points_2d = frames[ref_frame_no][cur_mask][:, 1:3]
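            # Note: pairing points_2d[i] with ref_points_2d[i] assumes the track
            # file lists points in a consistent order (e.g., by global id) in
            # every frame, since np.isin keeps each frame's own row order.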
            transformation_to_ref = np.eye(4)

            if args.use_madpose:
                points_3d = depth_map_tools.project_2d_points_to_3d(points_2d, depth_frames[-1], cam_matrix)
                ref_points_3d = depth_map_tools.project_2d_points_to_3d(ref_points_2d, depth_frames[-2], cam_matrix)

                # Use madpose to estimate the transformation matrix.
                pose, stats = madpose.HybridEstimatePoseScaleOffset(
                    points_2d,
                    ref_points_2d,
                    get_depths(depth_frames[-1], depth_frames[-1], points_2d),
                    get_depths(depth_frames[-2], depth_frames[-2], ref_points_2d),
                    [depth_frames[-1].min(), depth_frames[-2].min()],
                    cam_matrix,
                    cam_matrix,
                    options,
                    est_config,
                )
                R_est, t_est = pose.R(), pose.t()

                # Scale of the affine-corrected depth maps as given by madpose
                # (if it has deviated too much from 1.0, tracking has probably failed).
                s_est = pose.scale
                print("madpose scale estimate:", s_est)

                transformation_to_ref = np.eye(4)
                transformation_to_ref[:3, :3] = R_est
                transformation_to_ref[:3, 3] = t_est
            elif args.assume_stationary_camera:
                # Use SVD to find the transformation from one frame to the next.
                # This may produce inaccurate results even if the real camera is mounted
                # on a tripod, as the 3D camera is a pinhole camera and this solution
                # does not account for potential issues caused by differences between the
                # "real world" focal distance (with lenses) and the position of the 3D
                # pinhole camera.
                points_3d = depth_map_tools.project_2d_points_to_3d(points_2d, depth_frames[-1], cam_matrix)
                ref_points_3d = depth_map_tools.project_2d_points_to_3d(ref_points_2d, depth_frames[-2], cam_matrix)

                # Filter away points that are too close, to mitigate a bad mask.
                #no_near_points = points_3d[:, 2] > 1.8
                #points_3d = points_3d[no_near_points]
                #ref_points_3d = ref_points_3d[no_near_points]
                #points_2d = points_2d[no_near_points]
                #ref_points_2d = ref_points_2d[no_near_points]

                mean_depth = np.mean(points_3d[:, 2])
                distant_points = points_3d[:, 2] > mean_depth

                transformation_to_ref = depth_map_tools.svd(points_3d[distant_points], ref_points_3d[distant_points], True)
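                # depth_map_tools.svd (used above) is assumed to implement a
                # Kabsch/Procrustes-style rigid fit: given two paired 3D point sets,
                # it returns the 4x4 transform that best maps the first set onto the
                # second in a least-squares sense.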
            else:
                # Camera pose algorithm in two steps, fundamentally based on grouping
                # all points into distant and close points.
                # 1. Find the rotation and the x/y movement of the camera.
                # 2. Find the z movement.
                points_3d = depth_map_tools.project_2d_points_to_3d(points_2d, depth_frames[-1], cam_matrix)
                ref_points_3d = depth_map_tools.project_2d_points_to_3d(ref_points_2d, depth_frames[-2], cam_matrix)

                mean_depth = np.mean(points_3d[:, 2])
                distant_points = points_3d[:, 2] > mean_depth
                close_points = points_3d[:, 2] < mean_depth

                # 3 iterations works well enough in the test videos.
                for i in range(3):

                    # Find the overall rotation based on the distant points.
                    overall_rot = depth_map_tools.svd(points_3d[distant_points], ref_points_3d[distant_points], True)
                    transformation_to_ref @= overall_rot

                    # Transform the points according to the overall rotation.
                    points_3d = depth_map_tools.transform_points(points_3d, overall_rot)

                    # Find the rotation of the close points after correcting for the overall rotation.
                    close_rotation = depth_map_tools.svd(points_3d[close_points], ref_points_3d[close_points], True)

                    clos_pos_mean = np.mean(points_3d[close_points], axis=0)

                    # Convert the close-point rotation into an up/down, left/right shift.
                    pos_after_transform = depth_map_tools.transform_points(np.array([clos_pos_mean]), close_rotation)[0]
                    pos_change = pos_after_transform - clos_pos_mean
                    pos_change[2] = 0.0

                    transform_change = np.eye(4)
                    transform_change[:3, 3] = pos_change

                    # Transform the points according to the close-point movement.
                    points_3d = depth_map_tools.transform_points(points_3d, transform_change)
                    transformation_to_ref @= transform_change

                # Apply a final overall rotation after fixing the shift.
                final_overall_rot = depth_map_tools.svd(points_3d[distant_points], ref_points_3d[distant_points], True)
                points_3d = depth_map_tools.transform_points(points_3d, final_overall_rot)
                transformation_to_ref @= final_overall_rot

                # We now assume the camera is aligned in rotation and in x/y movement;
                # what remains is the depth (z) movement.

                # Precompute the image center offset.
                center_offset = np.array([frame_width // 2, frame_height // 2])
                avg_points_2d_ref = np.mean(ref_points_2d, 0) - center_offset
                ref_dist = np.linalg.norm(avg_points_2d_ref)

                # Parameters
                step_size = 0.002  # 2 mm step (in meters)
                min_step = 0.0001  # final 0.1 mm step
                max_iter = 20
                tolerance = 1e-5  # stop if the error is smaller than this

                # Initialize variables
                direction = 1.0  # positive means moving forward
                prev_error = None

                for i in range(max_iter):
                    # Project the 3D points and recenter the result.
                    points_2d = depth_map_tools.project_3d_points_to_2d(points_3d, cam_matrix)

                    # Compute the average 2D location and its distance from the center.
                    avg_2d = np.mean(points_2d, axis=0) - center_offset
                    current_dist = np.linalg.norm(avg_2d)

                    # Compute the error relative to the reference distance.
                    error = abs(current_dist - ref_dist)

                    # Break if within tolerance.
                    if error < tolerance:
                        break

                    # Reverse direction if the error worsens.
                    if prev_error is not None and error > prev_error:
                        # If we have already reversed, reduce the step size.
                        if direction < 0:
                            step_size = min_step
                        direction *= -1  # flip movement direction

                    prev_error = error

                    # Create the transformation matrix: a translation along the z-axis.
                    transform_change = np.eye(4)
                    transform_change[:3, 3] = np.array([0, 0, step_size * direction])

                    # Apply the transformation to update the 3D points and accumulate the change.
                    points_3d = depth_map_tools.transform_points(points_3d, transform_change)
                    transformation_to_ref @= transform_change
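                # The search above exploits the fact that translating the camera along
                # its optical axis changes how far the points' mean projection lies
                # from the image center; it steps in z until that distance matches
                # ref_dist from the reference frame.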
            to_ref_zero @= transformation_to_ref
            transformations.append(to_ref_zero.tolist())
            # The depth estimations have now done their part.
            # Now we use the resulting transformation to calculate more accurate points.
            # We use a reference frame 15 steps back; that way we always have common
            # points. You could probably get more accuracy by going even further back,
            # but on the other hand you lose accuracy the further back you go, as the
            # errors in the camera movement estimates may add up.
            _ref_frame_no = max(0, fr_n - 15)
            _this_frame_no = fr_n

            _best_common_points = list(set(frames[_ref_frame_no][:, 0]) & set(frames[_this_frame_no][:, 0]))

            # Current frame points
            point_ids_in_frame = frames[_this_frame_no][:, 0]
            cur_mask = np.isin(point_ids_in_frame, _best_common_points)
            points_2d_z = frames[_this_frame_no][cur_mask][:, 1:3]
            global_point_ids = frames[_this_frame_no][cur_mask][:, 0]

            # Reference frame points
            point_ids_in_frame = frames[_ref_frame_no][:, 0]
            cur_mask = np.isin(point_ids_in_frame, _best_common_points)
            ref_points_2d_z = frames[_ref_frame_no][cur_mask][:, 1:3]
            points_3d_c = depth_map_tools.project_2d_points_to_3d(points_2d_z, depth_frames[-1], cam_matrix)

            # Get the transformation from the reference frame to the current frame.
            ref_camtrans = np.array(transformations[_ref_frame_no])
            ref_camtrans_pos = ref_camtrans[:3, 3]
            camtrans_pos = to_ref_zero[:3, 3]
            ref_frame_inv_trans = np.linalg.inv(ref_camtrans)
            t_to_z = to_ref_zero @ ref_frame_inv_trans

            transformation_to_ref_rot = t_to_z.copy()
            transformation_to_ref_rot[:3, 3] = 0

            points_3d_c = depth_map_tools.transform_points(points_3d_c, transformation_to_ref_rot)

            ref_points_3d_z = depth_map_tools.project_2d_points_to_3d(ref_points_2d_z, depth_frames_all[_ref_frame_no], cam_matrix)

            ref_ray = ref_points_3d_z / np.linalg.norm(ref_points_3d_z, axis=1, keepdims=True)
            ray = points_3d_c

            cam_move = t_to_z[:3, 3]
            cam_move_dist = np.linalg.norm(cam_move)
            #print(cam_move_dist)
            cam_2_cam_ray = cam_move / cam_move_dist
            cam_2_cam_ray = np.tile(cam_2_cam_ray, (ray.shape[0], 1))

            # This angle will be somewhat wrong, as the triangle has non-planar vectors.
            ray_angle = angle_between_rays(ref_ray, ray)

            cam_2ref = angle_between_rays(ref_ray, cam_2_cam_ray)
            cam_2ray = angle_between_rays(ray, -cam_2_cam_ray)

            angle = np.pi - cam_2ref - cam_2ray

            # As the tracking gets better, the vectors should become more planar
            # and the residual should go closer to zero.
            residual = np.abs(angle - ray_angle)
            frame_residuals.append(np.sum(residual))

            #if _this_frame_no == 1000:
            #    print("residuals:", np.rad2deg(np.mean(frame_residuals)) / args.xfov)
            #    exit(0)

            # Law of sines: the camera baseline cam_move_dist is the side opposite
            # the apex angle at the 3D point, giving the camera-to-point distances.
            ref_cam2point = cam_move_dist * np.sin(cam_2ray) / np.sin(angle)
            cam2point = cam_move_dist * np.sin(cam_2ref) / np.sin(angle)
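            # Worked example with illustrative numbers: if cam_move_dist = 0.10 m,
            # cam_2ref = 1.40 rad and cam_2ray = 1.70 rad, then
            # angle = pi - 1.40 - 1.70 ~= 0.0416 rad and
            # ref_cam2point = 0.10 * sin(1.70) / sin(0.0416) ~= 2.38 m.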
            triangulated_points = (ref_ray * ref_cam2point[:, np.newaxis]) - ref_camtrans_pos
            #print(triangulated_points)

            distance_to_point = np.column_stack((global_point_ids, cam2point, ref_cam2point, angle, points_2d_z[:, 0], points_2d_z[:, 1]))

            # Filter out angles that are too small; they are too unreliable.
            #distance_to_point = distance_to_point[distance_to_point[:, 3] >= 0.01]

            for l, pt in enumerate(distance_to_point):
                global_id = int(pt[0])
                if pt[3] < 0.01:  # Do not save if the angle is too small.
                    continue
                if global_id not in global_points:
                    global_points[global_id] = []
                global_points[global_id].append(triangulated_points[l])

            #print(global_points)
            if debug_video is not None:
                max_nr = np.max(depth_frames[1])
                #print("max_nr: ", max_nr)
                debug_image = rgb_frames[-1]

                for pt in distance_to_point:
                    global_id = int(pt[0])
                    if global_id in global_points:
                        # TODO: fix, there is something wrong with this average depth.
                        avg_depth = np.linalg.norm(np.mean(global_points[global_id], axis=0) - camtrans_pos)
                        if pt[3] < 0.01:  # Do not draw if the angle is too small.
                            continue
                        avg_depth = pt[2]
                        cv2.putText(debug_image, "{:.2f}".format(avg_depth), (int(pt[4]), int(pt[5])), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0, 0, 0), 1, cv2.LINE_AA)

                red = np.array([255, 0, 0])
                blue = np.array([0, 0, 255])

                # Stamp a 3x3 red square on each current-frame point.
                x = np.clip(points_2d[:, 0].astype(np.int32), 0, frame_width - 2)
                y = np.clip(points_2d[:, 1].astype(np.int32), 0, frame_height - 2)
                for dy in (-1, 0, 1):
                    for dx in (-1, 0, 1):
                        debug_image[y + dy, x + dx] = red

                # Stamp a 3x3 blue square on each reference-frame point.
                x = np.clip(ref_points_2d[:, 0].astype(np.int32), 0, frame_width - 2)
                y = np.clip(ref_points_2d[:, 1].astype(np.int32), 0, frame_height - 2)
                for dy in (-1, 0, 1):
                    for dx in (-1, 0, 1):
                        debug_image[y + dy, x + dx] = blue

                debug_video.write(cv2.cvtColor(debug_image, cv2.COLOR_RGB2BGR))
            # Keep a rolling window of two frames.
            depth_frames.pop(0)
            if color_video is not None:
                rgb_frames.pop(0)

        fr_n += 1
        if args.max_frames < fr_n and args.max_frames != -1:
            break
    raw_video.release()
    if color_video is not None:
        color_video.release()
    if debug_video is not None:
        debug_video.release()

    # ref_mesh is used to draw the DEBUG alignment window.
    #ref_mesh = depth_map_tools.get_mesh_from_depth_map(depth_frames[frame_n], cam_matrix, rgb_frames[frame_n])
    #ref_mesh.paint_uniform_color([0, 0, 1])
    #meshes = [ref_mesh]
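    # Write the result. The entries are already plain lists (via .tolist()), so
    # NumpyEncoder is just a safety net in case numpy arrays end up in the list.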
    with open(out_file, "w") as json_file_handle:
        json_file_handle.write(json.dumps(transformations, cls=NumpyEncoder))