diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000..550d7b296f --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "vggt"] + path = vggt + url = https://github.com/jckhng/vggt.git diff --git a/nerfstudio/process_data/colmap_converter_to_nerfstudio_dataset.py b/nerfstudio/process_data/colmap_converter_to_nerfstudio_dataset.py index 55f657a3a4..699b32bf36 100644 --- a/nerfstudio/process_data/colmap_converter_to_nerfstudio_dataset.py +++ b/nerfstudio/process_data/colmap_converter_to_nerfstudio_dataset.py @@ -13,6 +13,8 @@ # limitations under the License. """Base class to processes a video or image sequence to a nerfstudio compatible dataset.""" +import sys +sys.path.append('./vggt') from dataclasses import dataclass from pathlib import Path @@ -35,7 +37,7 @@ class ColmapConverterToNerfstudioDataset(BaseConverterToNerfstudioDataset): """Feature matching method to use. Vocab tree is recommended for a balance of speed and accuracy. Exhaustive is slower but more accurate. Sequential is faster but should only be used for videos.""" - sfm_tool: Literal["any", "colmap", "hloc"] = "any" + sfm_tool: Literal["any", "colmap", "hloc", "vggt"] = "any" """Structure from motion tool to use. 
Colmap will use sift features, hloc can use many modern methods such as superpoint features and superglue matcher""" refine_pixsfm: bool = False @@ -238,6 +240,34 @@ def _run_colmap(self, mask_path: Optional[Path] = None): refine_pixsfm=self.refine_pixsfm, use_single_camera_mode=self.use_single_camera_mode, ) + elif sfm_tool == "vggt": + from vggt_to_colmap import load_model, process_images, extrinsic_to_colmap_format, filter_and_prepare_points + + model, device = load_model() + predictions, image_names = process_images(image_dir, model, device) + + quaternions, translations = extrinsic_to_colmap_format(predictions["extrinsic"]) + points3D, image_points2D = filter_and_prepare_points( + predictions, + conf_threshold=50.0, + mask_sky=False, + mask_black_bg=True, + mask_white_bg=True, + stride=1, + ) + + # Save COLMAP-compatible files + colmap_utils.save_colmap_files( + self.output_dir, + quaternions, + translations, + points3D, + image_points2D, + image_names, + predictions["intrinsic"], + predictions["images"].shape[2], + predictions["images"].shape[1], + ) else: raise RuntimeError("Invalid combination of sfm_tool, feature_type, and matcher_type, exiting") diff --git a/nerfstudio/process_data/colmap_utils.py b/nerfstudio/process_data/colmap_utils.py index 1d9405c81a..2a6ca144f3 100644 --- a/nerfstudio/process_data/colmap_utils.py +++ b/nerfstudio/process_data/colmap_utils.py @@ -712,3 +712,80 @@ def create_ply_from_colmap( x, y, z = coord r, g, b = color f.write(f"{x:8f} {y:8f} {z:8f} {r} {g} {b}\n") + + +def save_colmap_files( + output_dir: Path, + quaternions: np.ndarray, + translations: np.ndarray, + points3D: list, + image_points2D: list, + image_names: list, + intrinsic: np.ndarray, + width: int, + height: int, +): + """Save COLMAP-compatible reconstruction files in both text and binary formats. 
+ + Args: + output_dir: Path to save the COLMAP files + quaternions: Camera orientations as quaternions (Nx4) + translations: Camera positions (Nx3) + points3D: List of 3D points with rgb and observation info + image_points2D: 2D point observations for each image + image_names: Names of the images + intrinsic: Camera intrinsic matrix + width: Image width + height: Image height + """ + import sys + sys.path.append(str(Path(__file__).parent.parent.parent / "vggt")) + from vggt_to_colmap import ( + write_colmap_cameras_txt, + write_colmap_images_txt, + write_colmap_points3D_txt, + write_colmap_cameras_bin, + write_colmap_images_bin, + write_colmap_points3D_bin, + ) + + sparse_dir = output_dir / "colmap" / "sparse" / "0" + sparse_dir.mkdir(parents=True, exist_ok=True) + + # Save text format + write_colmap_cameras_txt( + sparse_dir / "cameras.txt", + intrinsic, + width, + height, + ) + write_colmap_images_txt( + sparse_dir / "images.txt", + quaternions, + translations, + image_points2D, + image_names, + ) + write_colmap_points3D_txt( + sparse_dir / "points3D.txt", + points3D, + ) + + # Save binary format + write_colmap_cameras_bin( + sparse_dir / "cameras.bin", + intrinsic, + width, + height, + ) + write_colmap_images_bin( + sparse_dir / "images.bin", + quaternions, + translations, + image_points2D, + image_names, + ) + write_colmap_points3D_bin( + sparse_dir / "points3D.bin", + points3D, + ) diff --git a/nerfstudio/process_data/process_data_utils.py b/nerfstudio/process_data/process_data_utils.py index 89f78ed53f..be3d60b3d8 100644 --- a/nerfstudio/process_data/process_data_utils.py +++ b/nerfstudio/process_data/process_data_utils.py @@ -490,7 +490,7 @@ def downscale_images( def find_tool_feature_matcher_combination( - sfm_tool: Literal["any", "colmap", "hloc"], + sfm_tool: Literal["any", "colmap", "hloc", "vggt"], feature_type: Literal[ "any", "sift", @@ -518,11 +518,11 @@ def find_tool_feature_matcher_combination( ) -> Union[ Tuple[None, None, None], Tuple[ - 
Literal["colmap", "hloc"], + Literal["colmap", "hloc", "vggt"], Literal[ "sift", "superpoint_aachen", - "superpoint_max", + "superpoint_max", "superpoint_inloc", "r2d2", "d2net-ss", @@ -546,7 +546,7 @@ def find_tool_feature_matcher_combination( Basically, replace the default parameters 'any' by usable value Args: - sfm_tool: Sfm tool name (any, colmap, hloc) + sfm_tool: Sfm tool name (any, colmap, hloc, vggt) feature_type: Type of image features (any, sift, superpoint, ...) matcher_type: Type of matching algorithm (any, NN, superglue,...) @@ -555,10 +555,19 @@ def find_tool_feature_matcher_combination( Returns (None,None,None) if no valid combination can be found """ if sfm_tool == "any": - if (feature_type in ("any", "sift")) and (matcher_type in ("any", "NN")): - sfm_tool = "colmap" - else: - sfm_tool = "hloc" + sfm_tool = "colmap" + + if sfm_tool == "vggt": + # VGGT does not require feature_type or matcher_type + return ("vggt", None, None) + elif sfm_tool == "colmap": + feature_type = "sift" + matcher_type = "NN" + elif sfm_tool == "hloc": + feature_type = feature_type or "superpoint" + matcher_type = matcher_type or "superglue" + else: + raise ValueError(f"Invalid sfm_tool: {sfm_tool}") if sfm_tool == "colmap": if (feature_type not in ("any", "sift")) or (matcher_type not in ("any", "NN")): @@ -573,7 +582,8 @@ def find_tool_feature_matcher_combination( elif matcher_type == "NN": matcher_type = "NN-mutual" - return (sfm_tool, feature_type, matcher_type) + return ("hloc", feature_type, matcher_type) + return (None, None, None) diff --git a/nerfstudio/process_data/vggt_colmap_converter_to_nerfstudio_dataset.old b/nerfstudio/process_data/vggt_colmap_converter_to_nerfstudio_dataset.old new file mode 100644 index 0000000000..0e29e11cdc --- /dev/null +++ b/nerfstudio/process_data/vggt_colmap_converter_to_nerfstudio_dataset.old @@ -0,0 +1,140 @@ +import sys +sys.path.append('./vggt') + +from dataclasses import dataclass +from pathlib import Path +from typing import 
Dict, List, Optional, Tuple +import shutil + +from nerfstudio.process_data.base_converter_to_nerfstudio_dataset import BaseConverterToNerfstudioDataset +from nerfstudio.process_data.colmap_utils import colmap_to_json +from nerfstudio.utils.rich_utils import CONSOLE + +# Adjusted imports for VGGT utilities +from vggt_to_colmap import load_model, process_images, extrinsic_to_colmap_format, filter_and_prepare_points, write_colmap_cameras_txt, write_colmap_images_txt, write_colmap_points3D_txt, write_colmap_cameras_bin, write_colmap_images_bin, write_colmap_points3D_bin + +@dataclass +class VGGTColmapConverterToNerfstudioDataset(BaseConverterToNerfstudioDataset): + """Class to process VGGT data into a Nerfstudio-compatible dataset.""" + + data: Path + """Path to the input data, either a video file or a directory of images.""" + output_dir: Path + """Path to the output directory.""" + conf_threshold: float = 50.0 + """Confidence threshold for filtering points.""" + mask_sky: bool = False + """Whether to mask sky regions.""" + mask_black_bg: bool = False + """Whether to mask black background regions.""" + mask_white_bg: bool = False + """Whether to mask white background regions.""" + stride: int = 1 + """Stride for point sampling.""" + + @property + def image_dir(self) -> Path: + return self.data + + def _run_vggt_to_colmap(self): + """Run VGGT to generate COLMAP-compatible data.""" + model, device = load_model() + predictions, image_names = process_images(self.image_dir, model, device) + + quaternions, translations = extrinsic_to_colmap_format(predictions["extrinsic"]) + points3D, image_points2D = filter_and_prepare_points( + predictions, + self.conf_threshold, + mask_sky=self.mask_sky, + mask_black_bg=self.mask_black_bg, + mask_white_bg=self.mask_white_bg, + stride=self.stride, + ) + + return quaternions, translations, points3D, image_points2D, image_names, predictions + + def _save_transforms(self, num_frames: int) -> List[str]: + """Save transforms.json after 
processing VGGT data.""" + summary_log = [] + quaternions, translations, points3D, image_points2D, image_names, predictions = self._run_vggt_to_colmap() + + with CONSOLE.status("[bold yellow]Saving results to transforms.json", spinner="balloon"): + # Save COLMAP-compatible files before calling colmap_to_json + write_colmap_cameras_txt( + self.output_dir / "cameras.txt", + predictions["intrinsic"], + predictions["images"].shape[2], + predictions["images"].shape[1], + ) + write_colmap_images_txt( + self.output_dir / "images.txt", + quaternions, + translations, + image_points2D, + image_names, + ) + write_colmap_points3D_txt( + self.output_dir / "points3D.txt", + points3D, + ) + + # Save binary COLMAP-compatible files + write_colmap_cameras_bin( + self.output_dir / "cameras.bin", + predictions["intrinsic"], + predictions["images"].shape[2], + predictions["images"].shape[1], + ) + write_colmap_images_bin( + self.output_dir / "images.bin", + quaternions, + translations, + image_points2D, + image_names, + ) + write_colmap_points3D_bin( + self.output_dir / "points3D.bin", + points3D, + ) + + num_matched_frames = colmap_to_json( + recon_dir=self.output_dir, + output_dir=self.output_dir, + camera_mask_path=None, + image_id_to_depth_path=None, + image_rename_map=None, + keep_original_world_coordinate=False, + use_single_camera_mode=True, + ) + summary_log.append(f"VGGT-Colmap matched {num_matched_frames} images") + + return summary_log + + def __post_init__(self) -> None: + super().__post_init__() + if not self.image_dir.exists(): + raise RuntimeError(f"Image directory {self.image_dir} does not exist.") + + def main(self) -> None: + """Main method to process VGGT data into a Nerfstudio-compatible dataset.""" + self.output_dir.mkdir(parents=True, exist_ok=True) + + # Copy downscaled images from the input directory to the output directory + downscaled_image_dir = self.image_dir / "downscaled" + if not downscaled_image_dir.exists(): + raise FileNotFoundError(f"Expected 
downscaled directory at {downscaled_image_dir}, but it does not exist.") + + output_image_dir = self.output_dir / "images" + output_image_dir.mkdir(parents=True, exist_ok=True) + + for image_file in downscaled_image_dir.iterdir(): + if image_file.is_file(): + shutil.copy(image_file, output_image_dir) + + print(f"Copied downscaled images to {output_image_dir}") + + summary_log = self._save_transforms(num_frames=0) + + for summary in summary_log: + CONSOLE.print(summary, justify="center") + CONSOLE.rule("[bold green]:tada: :tada: :tada: All DONE :tada: :tada: :tada:") \ No newline at end of file diff --git a/nerfstudio/scripts/process_data.py b/nerfstudio/scripts/process_data.py index 1fdd36f7f2..e296b0025c 100644 --- a/nerfstudio/scripts/process_data.py +++ b/nerfstudio/scripts/process_data.py @@ -554,7 +554,6 @@ def main(self) -> None: ... ], ] - def entrypoint(): """Entrypoint for use with pyproject scripts.""" tyro.extras.set_accent_color("bright_yellow") diff --git a/notes.md b/notes.md new file mode 100644 index 0000000000..7d269f5ffe --- /dev/null +++ b/notes.md @@ -0,0 +1,3 @@ +1. We need to read in a large batch of images. +2. The pain point is that VGGT is a memory hog. +3. How do we fix this? Do we need to batch the images, and if so, how do we combine the results after batching? \ No newline at end of file diff --git a/vggt b/vggt new file mode 160000 index 0000000000..89f1ec519f --- /dev/null +++ b/vggt @@ -0,0 +1 @@ +Subproject commit 89f1ec519fdd8b05e6c2f90d1607c6464d14c05f