1+ # Copyright 2022 the Regents of the University of California, Nerfstudio Team and contributors. All rights reserved.
2+ #
3+ # Licensed under the Apache License, Version 2.0 (the "License");
4+ # you may not use this file except in compliance with the License.
5+ # You may obtain a copy of the License at
6+ #
7+ # http://www.apache.org/licenses/LICENSE-2.0
8+ #
9+ # Unless required by applicable law or agreed to in writing, software
10+ # distributed under the License is distributed on an "AS IS" BASIS,
11+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+ # See the License for the specific language governing permissions and
13+ # limitations under the License.
14+
15+ """Helper utils for processing meshroom data into the nerfstudio format."""
16+
17+ import json
18+ import math
19+ import numpy as np
20+ from pathlib import Path
21+ from typing import Dict , List , Optional
22+ from copy import deepcopy as dc
23+
24+ from nerfstudio .process_data .process_data_utils import CAMERA_MODELS
25+ from nerfstudio .utils .rich_utils import CONSOLE
26+
# Homogeneous rotation that adjusts the world coordinate system:
# maps (x, y, z) -> (x, z, -y), i.e. swaps Y/Z and negates the new Z.
ROT_MAT = np.array(
    [
        [1, 0, 0, 0],
        [0, 0, 1, 0],
        [0, -1, 0, 0],
        [0, 0, 0, 1],
    ]
)
32+
def reflect(axis, size=4):
    """Return a size x size reflection matrix that negates the given axis."""
    diagonal = np.ones(size)
    diagonal[axis] = -1.0
    return np.diag(diagonal)
39+
def Mat2Nerf(mat):
    """Convert a 4x4 matrix to the NeRF coordinate convention.

    Equivalent to ``mat @ reflect(2) @ reflect(1)``: the two reflections
    compose into a single diagonal that negates columns 1 and 2.
    """
    flip_yz = np.diag([1.0, -1.0, -1.0, 1.0])
    return np.array(mat) @ flip_yz
45+
def closest_point_2_lines(oa, da, ob, db):
    """Return the midpoint of closest approach of two rays of form o + t*d.

    Also returns the squared sine of the angle between the rays, used as a
    weight (near zero for parallel rays). Parameters ``t`` are clamped to be
    non-positive so each ray only contributes points behind its origin.
    """
    da = da / np.linalg.norm(da)
    db = db / np.linalg.norm(db)
    normal = np.cross(da, db)
    weight = np.linalg.norm(normal) ** 2
    offset = ob - oa
    # Small epsilon keeps the division finite for (near-)parallel rays.
    ta = min(np.linalg.det([offset, db, normal]) / (weight + 1e-10), 0)
    tb = min(np.linalg.det([offset, da, normal]) / (weight + 1e-10), 0)
    midpoint = (oa + ta * da + ob + tb * db) * 0.5
    return midpoint, weight
60+
def central_point(out):
    """Find a central point all cameras are looking at.

    Accumulates, over every ordered pair of frames, the weighted closest
    point between their viewing rays (origin = column 3, direction =
    column 2 of the transform matrix), skipping near-parallel pairs.
    """
    CONSOLE.print("Computing center of attention...")
    total_weight = 0.0
    total_point = np.array([0.0, 0.0, 0.0])
    frames = out["frames"]
    # Hoist the matrix conversion out of the O(n^2) pair loop.
    poses = [np.array(frame["transform_matrix"])[0:3, :] for frame in frames]
    for mf in poses:
        for mg in poses:
            p, w = closest_point_2_lines(mf[:, 3], mf[:, 2], mg[:, 3], mg[:, 2])
            if w > 0.01:
                total_point += p * w
                total_weight += w

    if not frames:
        CONSOLE.print("[bold red]No frames found when computing center of attention[/bold red]")
        return total_point

    if (total_weight == 0) and (not total_point.any()):
        CONSOLE.print("[bold red]Center of attention is zero[/bold red]")
        return total_point

    total_point /= total_weight
    CONSOLE.print(f"The center of attention is: {total_point}")

    return total_point
87+
def build_sensor(intrinsic):
    """Build nerfstudio camera intrinsics from a Meshroom intrinsic entry.

    Args:
        intrinsic: One element of the Meshroom "intrinsics" list; values are
            stored as strings (width/height in pixels, focal/sensor sizes in mm).

    Returns:
        Dict with image size, focal lengths in pixels, full fields of view,
        principal point, and (for type "radial3") distortion coefficients.
    """
    out = {}
    out["w"] = float(intrinsic["width"])
    out["h"] = float(intrinsic["height"])

    # Focal length in mm
    focal = float(intrinsic["focalLength"])

    # Sensor dimensions in mm
    sensor_width = float(intrinsic["sensorWidth"])
    sensor_height = float(intrinsic["sensorHeight"])

    # Focal length in pixels
    out["fl_x"] = (out["w"] * focal) / sensor_width

    # Check W/H ratio against the sensor ratio; a mismatch indicates
    # inconsistent Meshroom metadata, so fall back to fl_x.
    if np.isclose((out["w"] / out["h"]), (sensor_width / sensor_height)):
        out["fl_y"] = (out["h"] * focal) / sensor_height
    else:
        CONSOLE.print("[yellow]WARNING: W/H ratio does not match sensor ratio, this is likely a bug from Meshroom. Will use fl_x to set fl_y.[/yellow]")
        out["fl_y"] = out["fl_x"]

    # Full field of view: fov = 2 * atan(size / (2 * focal_px)).
    # Bug fix: previously computed atan(size / focal_px * 2) * 2 because the
    # factor of 2 was outside the parentheses, inflating both angles.
    camera_angle_x = math.atan(out["w"] / (out["fl_x"] * 2)) * 2
    camera_angle_y = math.atan(out["h"] / (out["fl_y"] * 2)) * 2

    out["camera_angle_x"] = camera_angle_x
    out["camera_angle_y"] = camera_angle_y

    # Meshroom stores the principal point as an offset from the image center.
    out["cx"] = float(intrinsic["principalPoint"][0]) + (out["w"] / 2.0)
    out["cy"] = float(intrinsic["principalPoint"][1]) + (out["h"] / 2.0)

    # Radial distortion coefficients become k1, k2, k3.
    if intrinsic["type"] == "radial3":
        for i, coef in enumerate(intrinsic["distortionParams"]):
            out[f"k{i + 1}"] = float(coef)

    return out
125+
def meshroom_to_json(
    image_filename_map: Dict[str, Path],
    json_filename: Path,
    output_dir: Path,
    ply_filename: Optional[Path] = None,
    verbose: bool = False,
) -> List[str]:
    """Convert Meshroom data into a nerfstudio dataset.

    Args:
        image_filename_map: Mapping of original image filenames to their saved locations.
        json_filename: Path to the Meshroom json file.
        output_dir: Path to the output directory.
        ply_filename: Path to the exported ply file.
        verbose: Whether to print verbose output.

    Returns:
        Summary of the conversion.
    """
    summary_log = []

    # Bug fix: read with an explicit encoding, consistent with the utf-8
    # write of transforms.json below (the default encoding is platform-dependent).
    with open(json_filename, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Create output structure
    out = {}
    out["aabb_scale"] = 16  # Default value (instant-ngp style scene bound)

    # Extract camera-to-world transforms from Meshroom poses, converting each
    # to the NeRF convention (Mat2Nerf) and applying the global axis swap.
    transforms = {}
    for pose in data.get("poses", []):
        transform = pose["pose"]["transform"]
        rot = np.asarray(transform["rotation"])
        rot = rot.reshape(3, 3).astype(float)

        ctr = np.asarray(transform["center"])
        ctr = ctr.astype(float)

        # Assemble the 4x4 homogeneous matrix: rotation block + camera center.
        M = np.eye(4)
        M[:3, :3] = rot
        M[:3, 3] = ctr

        M = Mat2Nerf(M.astype(float))
        transforms[pose["poseId"]] = np.dot(ROT_MAT, M)

    # Extract intrinsics from Meshroom data, keyed by intrinsicId
    intrinsics = {}
    for intrinsic in data.get("intrinsics", []):
        intrinsics[intrinsic["intrinsicId"]] = build_sensor(intrinsic)

    # Set camera model based on the type of the first intrinsic
    if data.get("intrinsics") and "type" in data["intrinsics"][0]:
        intrinsic_type = data["intrinsics"][0]["type"]
        if intrinsic_type in ["radial1", "radial3"]:
            out["camera_model"] = CAMERA_MODELS["perspective"].value
        elif intrinsic_type in ["fisheye", "fisheye4"]:
            out["camera_model"] = CAMERA_MODELS["fisheye"].value
        else:
            # Default to perspective
            out["camera_model"] = CAMERA_MODELS["perspective"].value
    else:
        out["camera_model"] = CAMERA_MODELS["perspective"].value

    # Build frames
    frames = []
    skipped_images = 0

    for view in data.get("views", []):
        # Get the image name from the path.
        # NOTE(review): matching is by file stem, which assumes stems are
        # unique across views and match the keys of image_filename_map —
        # confirm against the caller.
        path = Path(view["path"])
        name = path.stem

        # Check if the image exists in our mapping
        if name not in image_filename_map:
            if verbose:
                CONSOLE.print(f"[yellow]Missing image for {name}, skipping[/yellow]")
            skipped_images += 1
            continue

        # Get poseId and intrinsicId
        poseId = view["poseId"]
        intrinsicId = view["intrinsicId"]

        # Check if we have the necessary data
        if poseId not in transforms:
            if verbose:
                CONSOLE.print(f"[yellow]PoseId {poseId} not found in transforms, skipping image: {name}[/yellow]")
            skipped_images += 1
            continue

        if intrinsicId not in intrinsics:
            if verbose:
                CONSOLE.print(f"[yellow]IntrinsicId {intrinsicId} not found, skipping image: {name}[/yellow]")
            skipped_images += 1
            continue

        # Create the frame: deep-copy the shared intrinsics so later per-frame
        # edits (transform re-centering below) cannot mutate them.
        camera = {}
        camera.update(dc(intrinsics[intrinsicId]))
        camera["transform_matrix"] = transforms[poseId]
        camera["file_path"] = image_filename_map[name].as_posix()

        frames.append(camera)

    out["frames"] = frames

    # Calculate the point all cameras are (approximately) looking at
    center = central_point(out)

    # Re-center camera positions about the center of attention and convert
    # the matrices to nested lists for JSON serialization.
    for f in out["frames"]:
        f["transform_matrix"][0:3, 3] -= center
        f["transform_matrix"] = f["transform_matrix"].tolist()

    # Include point cloud if provided
    if ply_filename is not None:
        import open3d as o3d

        # Applied transform: keep rows (z, x, y) of the identity so exported
        # points are permuted into the same world convention as the poses.
        applied_transform = np.eye(4)[:3, :]
        applied_transform = applied_transform[np.array([2, 0, 1]), :]
        out["applied_transform"] = applied_transform.tolist()

        # Load the point cloud, apply the transform, and save it with the dataset.
        pc = o3d.io.read_point_cloud(str(ply_filename))
        points3D = np.asarray(pc.points)
        points3D = np.einsum("ij,bj->bi", applied_transform[:3, :3], points3D) + applied_transform[:3, 3]
        pc.points = o3d.utility.Vector3dVector(points3D)
        o3d.io.write_point_cloud(str(output_dir / "sparse_pc.ply"), pc)
        out["ply_file_path"] = "sparse_pc.ply"
        summary_log.append(f"Imported {ply_filename} as starting points")

    # Write output
    with open(output_dir / "transforms.json", "w", encoding="utf-8") as f:
        json.dump(out, f, indent=4)

    # Add summary info
    if skipped_images == 1:
        summary_log.append(f"{skipped_images} image skipped due to missing camera pose or intrinsic data.")
    elif skipped_images > 1:
        summary_log.append(f"{skipped_images} images were skipped due to missing camera poses or intrinsic data.")

    summary_log.append(f"Final dataset contains {len(out['frames'])} frames.")

    return summary_log