Skip to content

Commit 6401aa9

Browse files
committed
Merge remote-tracking branch 'opensf/feature/deltaflow'
2 parents 0e432b5 + 6c8aafc commit 6401aa9

File tree

14 files changed

+195
-106
lines changed

14 files changed

+195
-106
lines changed

assets/slurm/dufolabel_sbatch.py

Lines changed: 0 additions & 58 deletions
This file was deleted.

assets/slurm/ssl-process.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ cd /proj/berzelius-2023-154/users/x_qinzh/OpenSceneFlow
1818

1919

2020
# data directory containing the extracted h5py files
21-
DATA_DIR="/proj/berzelius-2023-364/data/truckscenes/h5py/val"
21+
DATA_DIR="/proj/berzelius-2023-364/data/av2/h5py/sensor/train"
2222

2323
TOTAL_SCENES=$(ls ${DATA_DIR}/*.h5 | wc -l)
2424
# Process every n-th frame into DUFOMap, no need to change at least for now.

conf/eval.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11

22
dataset_path: /home/kin/data/av2/h5py/sensor
3-
checkpoint: /home/kin/model_zoo/deflow.ckpt
3+
checkpoint: /home/kin/data/model_zoo/deltaflow_public/deltaflow-av2.ckpt
44
data_mode: val # [val, test]
55
save_res: False # [True, False]
66

@@ -15,7 +15,7 @@ output: ${model.name}-${slurm_id}
1515
gpus: 1
1616
seed: 42069
1717
eval_only: True
18-
wandb_mode: offline # [offline, disabled, online]
18+
wandb_mode: disabled # [offline, disabled, online]
1919
defaults:
2020
- hydra: default
2121
- model: deflow

dataprocess/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,3 +247,6 @@ Process train data for self-supervised learning. Only training data needs this s
247247
```bash
248248
python process.py --data_dir /home/kin/data/av2/h5py/sensor/train --scene_range 0,701
249249
```
250+
251+
As some users may need multiple nodes to run this, here is an example SLURM script to run the data processing in parallel.
252+
Check [assets/slurm/ssl-process.sh](../assets/slurm/ssl-process.sh) for more details.

eval.py

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,13 @@
1313
import torch
1414
from torch.utils.data import DataLoader
1515
import lightning.pytorch as pl
16-
from lightning.pytorch.loggers import WandbLogger
16+
from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger
1717
from omegaconf import DictConfig
1818
import hydra, wandb, os, sys
1919
from hydra.core.hydra_config import HydraConfig
2020
from src.dataset import HDF5Dataset
2121
from src.trainer import ModelWrapper
22+
from src.utils import InlineTee
2223

2324
def precheck_cfg_valid(cfg):
2425
if os.path.exists(cfg.dataset_path + f"/{cfg.data_mode}") is False:
@@ -36,8 +37,8 @@ def main(cfg):
3637

3738
if 'iter_only' in cfg.model and cfg.model.iter_only:
3839
from src.runner import launch_runner
39-
print(f"---LOG[eval]: Run optmization-based method: {cfg.model.name}")
40-
launch_runner(cfg, cfg.data_mode)
40+
launch_runner(cfg, cfg.data_mode, output_dir)
41+
print(f"---LOG[eval]: Finished optimization-based evaluation. Logging saved to {output_dir}/output.log")
4142
return
4243

4344
if not os.path.exists(cfg.checkpoint):
@@ -47,27 +48,39 @@ def main(cfg):
4748
torch_load_ckpt = torch.load(cfg.checkpoint)
4849
checkpoint_params = DictConfig(torch_load_ckpt["hyper_parameters"])
4950
cfg.output = checkpoint_params.cfg.output + f"-e{torch_load_ckpt['epoch']}-{cfg.data_mode}-v{cfg.leaderboard_version}"
51+
# replace output_dir ${old_output_dir} with ${output_dir}
52+
output_dir = output_dir.replace(HydraConfig.get().runtime.output_dir.split('/')[-2], checkpoint_params.cfg.output.split('/')[-1])
5053
cfg.model.update(checkpoint_params.cfg.model)
5154
cfg.num_frames = cfg.model.target.get('num_frames', checkpoint_params.cfg.get('num_frames', cfg.get('num_frames', 2)))
5255

5356
mymodel = ModelWrapper.load_from_checkpoint(cfg.checkpoint, cfg=cfg, eval=True)
54-
print(f"\n---LOG[eval]: Loaded model from {cfg.checkpoint}. The backbone network is {checkpoint_params.cfg.model.name}.\n")
57+
os.makedirs(output_dir, exist_ok=True)
58+
sys.stdout = InlineTee(f"{output_dir}/output.log")
59+
print(f"---LOG[eval]: Loaded model from {cfg.checkpoint}. The backbone network is {checkpoint_params.cfg.model.name}.")
60+
print(f"---LOG[eval]: Evaluation data: {cfg.dataset_path}/{cfg.data_mode} set.\n")
5561

56-
wandb_logger = WandbLogger(save_dir=output_dir,
57-
entity="kth-rpl",
58-
project=f"opensf-eval",
59-
name=f"{cfg.output}",
60-
offline=(cfg.wandb_mode == "offline"))
62+
if cfg.wandb_mode != "disabled":
63+
logger = WandbLogger(save_dir=output_dir,
64+
entity="kth-rpl",
65+
project=f"opensf-eval",
66+
name=f"{cfg.output}",
67+
offline=(cfg.wandb_mode == "offline"))
68+
logger.watch(mymodel, log_graph=False)
69+
else:
70+
# check local tensorboard logging: tensorboard --logdir logs/jobs/{log folder}
71+
logger = TensorBoardLogger(save_dir=output_dir, name="logs")
6172

62-
trainer = pl.Trainer(logger=wandb_logger, devices=1)
73+
trainer = pl.Trainer(logger=logger, devices=1)
6374
# NOTE(Qingwen): search & check: def eval_only_step_(self, batch, res_dict)
6475
trainer.validate(model = mymodel, \
6576
dataloaders = DataLoader( \
6677
HDF5Dataset(cfg.dataset_path + f"/{cfg.data_mode}", \
6778
n_frames=cfg.num_frames, \
6879
eval=True, leaderboard_version=cfg.leaderboard_version), \
6980
batch_size=1, shuffle=False))
70-
wandb.finish()
81+
if cfg.wandb_mode != "disabled":
82+
wandb.finish()
83+
print(f"---LOG[eval]: Finished feed-forward evaluation. Logging saved to {output_dir}/output.log")
7184

7285
if __name__ == "__main__":
7386
main()

process.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,6 @@ def main(
186186
if not os.path.exists(gm_config_path) and run_gm:
187187
raise FileNotFoundError(f"Ground segmentation config file not found: {gm_config_path}. Please check folder")
188188

189-
190189
data_path = Path(data_dir)
191190
dataset = HDF5Data(data_path) # single frame reading.
192191
all_scene_ids = list(dataset.scene_id_bounds.keys())

src/dataset.py

Lines changed: 6 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import h5py, pickle, argparse
2424
from tqdm import tqdm
2525
import numpy as np
26+
from torchvision import transforms
2627

2728
import os, sys
2829
BASE_DIR = os.path.abspath(os.path.join( os.path.dirname( __file__ ), '..' ))
@@ -185,8 +186,8 @@ def __call__(self, data_dict):
185186
class HDF5Dataset(Dataset):
186187
def __init__(self, directory, \
187188
transform=None, n_frames=2, ssl_label=None, \
188-
eval = False, eval_input_seq = False, leaderboard_version=1, \
189-
vis_name='', flow_num=1):
189+
eval = False, leaderboard_version=1, \
190+
vis_name=''):
190191
'''
191192
Args:
192193
directory: the directory of the dataset, the folder should contain some .h5 file and index_total.pkl.
@@ -196,10 +197,8 @@ def __init__(self, directory, \
196197
* n_frames: the number of frames we use, default is 2: current (pc0), next (pc1); if it's more than 2, then it read the history from current.
197198
* ssl_label: if attr, it will read the dynamic cluster label. Otherwise, no dynamic cluster label in data dict.
198199
* eval: if True, use the eval index (only used it for leaderboard evaluation)
199-
* eval_input_seq: I forgot what it is.... xox...
200200
* leaderboard_version: 1st or 2nd, default is 1. If '2', we will use the index_eval_v2.pkl from assets/docs.
201201
* vis_name: the data of the visualization, default is ''.
202-
* flow_num: the number of future frames we read, default is 1. (pc0->pc1 flow)
203202
'''
204203
super(HDF5Dataset, self).__init__()
205204
self.directory = directory
@@ -209,12 +208,10 @@ def __init__(self, directory, \
209208
self.data_index = pickle.load(f)
210209

211210
self.eval_index = False
212-
self.eval_input_seq = eval_input_seq
213211
self.ssl_label = import_func(f"src.autolabel.{ssl_label}") if ssl_label is not None else None
214212
self.history_frames = n_frames - 2
215213
self.vis_name = vis_name if isinstance(vis_name, list) else [vis_name]
216214
self.transform = transform
217-
self.flow_num = flow_num
218215

219216
if eval:
220217
eval_index_file = os.path.join(self.directory, 'index_eval.pkl')
@@ -267,7 +264,7 @@ def __init__(self, directory, \
267264

268265
def __len__(self):
269266
# return 100 # for testing
270-
if self.eval_index and not self.eval_input_seq:
267+
if self.eval_index:
271268
return len(self.eval_data_index)
272269
elif not self.eval_index and self.train_index is not None:
273270
return len(self.train_index)
@@ -278,25 +275,17 @@ def valid_index(self, index_):
278275
Check if the index is valid for the current mode and satisfy the constraints.
279276
"""
280277
eval_flag = False
281-
if self.eval_index and not self.eval_input_seq:
278+
if self.eval_index:
282279
eval_index_ = index_
283280
scene_id, timestamp = self.eval_data_index[eval_index_]
284281
index_ = self.data_index.index([scene_id, timestamp])
285282
max_idx = self.scene_id_bounds[scene_id]["max_index"]
286283
if index_ >= max_idx:
287284
_, index_ = self.valid_index(eval_index_ - 1)
288285
eval_flag = True
289-
elif self.eval_index and self.eval_input_seq:
290-
scene_id, timestamp = self.data_index[index_]
291-
# to make sure we have continuous frames
292-
if self.scene_id_bounds[scene_id]["max_index"] <= index_:
293-
index_ = index_ - 1
294-
scene_id, timestamp = self.data_index[index_]
295-
eval_flag = True if [scene_id, timestamp] in self.eval_data_index else False
296286
elif self.train_index is not None:
297287
train_index_ = index_
298288
scene_id, timestamp = self.train_index[train_index_]
299-
# FIXME: it works now, but self.flow_num is not possible in this case.
300289
max_idx = self.scene_id_bounds[scene_id]["max_index"]
301290
index_ = self.data_index.index([scene_id, timestamp])
302291
if index_ >= max_idx:
@@ -306,7 +295,7 @@ def valid_index(self, index_):
306295
max_idx = self.scene_id_bounds[scene_id]["max_index"]
307296
min_idx = self.scene_id_bounds[scene_id]["min_index"]
308297

309-
max_valid_index_for_flow = max_idx - self.flow_num
298+
max_valid_index_for_flow = max_idx - 1
310299
min_valid_index_for_flow = min_idx + self.history_frames
311300
index_ = max(min_valid_index_for_flow, min(max_valid_index_for_flow, index_))
312301
return eval_flag, index_

src/models/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,6 @@
4949
# * pip install pytorch3d assets/cuda/histlib
5050
try:
5151
from .icpflow import ICPFlow
52-
except ImportError:
52+
except ImportError as e:
5353
print("--- WARNING [model]: ICPFlow is not imported, as it requires pytorch3d lib which is not installed.")
5454
print(f"Detail error message\033[0m: {e}. Just ignore this warning if code runs without these models.")

src/runner.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
#
1616
"""
1717

18-
import os
18+
import os, sys
1919
import torch
2020
import torch.distributed as dist
2121
import torch.multiprocessing as mp
@@ -34,7 +34,7 @@
3434
from .utils.eval_metric import OfficialMetrics, evaluate_leaderboard, evaluate_leaderboard_v2, evaluate_ssf
3535
from .utils.av2_eval import write_output_file
3636
from .utils.mics import zip_res
37-
37+
from .utils import InlineTee
3838
class SceneDistributedSampler(Sampler):
3939
"""
4040
A DistributedSampler that distributes data based on scene IDs, not individual indices.
@@ -101,11 +101,12 @@ def __init__(self, cfg, rank, world_size, mode):
101101
self.mode = mode
102102

103103
self.model.to(self.device)
104-
self.metrics = OfficialMetrics() if self.mode in ['val', 'eval'] else None
104+
self.metrics = OfficialMetrics() if self.mode in ['val', 'eval', 'valid'] else None
105+
self.res_name = cfg.get('res_name', cfg.model.name)
105106
self.save_res_path = cfg.get('save_res_path', None)
106107

107108
def _setup_dataloader(self):
108-
if self.mode in ['val', 'test', 'eval']:
109+
if self.mode in ['val', 'test', 'eval', 'valid']:
109110
dataset_path = self.cfg.dataset_path + f"/{self.cfg.data_mode}"
110111
is_eval_mode = True
111112
else: # 'save'
@@ -153,7 +154,7 @@ def _process_step(self, batch):
153154
final_flow = pose_flow.clone()
154155
final_flow[~batch['gm0']] = res_dict['flow'] + pose_flow[~batch['gm0']]
155156

156-
if self.mode in ['val', 'eval']:
157+
if self.mode in ['val', 'eval', 'valid']:
157158
eval_mask = batch['eval_mask'].squeeze()
158159
gt_flow = batch["flow"]
159160
v1_dict = evaluate_leaderboard(final_flow[eval_mask], pose_flow[eval_mask], pc0[eval_mask], \
@@ -257,7 +258,7 @@ def _run_process(cfg, mode):
257258
gathered_metrics_objects = [runner.metrics]
258259

259260
if rank == 0:
260-
if mode in ['val', 'eval']:
261+
if mode in ['val', 'eval', 'valid']:
261262
final_metrics = OfficialMetrics()
262263
print(f"\n--- [LOG] Finished processing. Aggregating results from {world_size} GPUs with {len(gathered_metrics_objects)} metrics objects...")
263264
for metrics_obj in gathered_metrics_objects:
@@ -299,17 +300,20 @@ def _run_process(cfg, mode):
299300

300301
runner.cleanup()
301302

302-
def _spawn_wrapper(rank, world_size, cfg, mode):
303+
def _spawn_wrapper(rank, world_size, cfg, mode, output_dir):
304+
log_filepath = f"{output_dir}/output.log" if output_dir else None
305+
if log_filepath and rank==0:
306+
sys.stdout = InlineTee(log_filepath, append=True)
307+
if rank == 0:
308+
print(f"---LOG[eval]: Run optimization-based method: {cfg.model.name} on {cfg.dataset_path}/{cfg.data_mode} set.\n")
303309
torch.cuda.set_device(rank)
304-
305-
# FIXME(Qingwen): better to set these through command, since we might have more nodes to connected.
306310
os.environ['RANK'] = str(rank)
307311
os.environ['WORLD_SIZE'] = str(world_size)
308312
os.environ['MASTER_ADDR'] = 'localhost'
309313
os.environ['MASTER_PORT'] = str(cfg.get('master_port', 12355))
310314
_run_process(cfg, mode)
311315

312-
def launch_runner(cfg, mode):
316+
def launch_runner(cfg, mode, output_dir):
313317
is_slurm_job = 'SLURM_PROCID' in os.environ
314318

315319
if not is_slurm_job and not dist.is_initialized():
@@ -321,7 +325,7 @@ def launch_runner(cfg, mode):
321325
cfg.save_res_path = Path(cfg.dataset_path).parent / "results" / cfg.output
322326

323327
mp.spawn(_spawn_wrapper,
324-
args=(world_size, cfg, mode),
328+
args=(world_size, cfg, mode, output_dir),
325329
nprocs=world_size,
326330
join=True)
327331

0 commit comments

Comments
 (0)