Skip to content

Commit 05c167e

Browse files
authored
Check non MRL loss for the model (#331)
- Use DINO v2 with linear projection - Don't load from base checkpoint of MRL
1 parent c638972 commit 05c167e

9 files changed

Lines changed: 63 additions & 32 deletions

File tree

configs/config.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ model:
2020
patch_size: 8
2121
shuffle: True
2222
metadata_path: configs/metadata.yaml
23-
teacher: samvit_base_patch16.sa1b
23+
teacher: vit_large_patch14_reg4_dinov2.lvd142m
2424
dolls: [16, 32, 64, 128, 256, 768, 1024]
2525
doll_weights: [1, 1, 1, 1, 1, 1, 1]
2626
lr: 5e-6
@@ -32,7 +32,7 @@ trainer:
3232
accelerator: gpu
3333
strategy: ddp
3434
devices: 8
35-
num_nodes: 20
35+
num_nodes: 48
3636
precision: bf16-mixed
3737
log_every_n_steps: 1
3838
max_epochs: 1000
@@ -48,9 +48,9 @@ trainer:
4848
init_args:
4949
entity: developmentseed
5050
project: clay
51-
group: v1.5
52-
# id: v8jh2pn9
53-
# resume: must
51+
group: v1.5-nomrl-dinov2
52+
id: 0uy3in7l
53+
resume: must
5454
log_model: false
5555
callbacks:
5656
- class_path: lightning.pytorch.callbacks.ModelCheckpoint
@@ -70,4 +70,4 @@ trainer:
7070
- class_path: src.callbacks_wandb.LogIntermediatePredictions
7171
plugins:
7272
- class_path: lightning.pytorch.plugins.io.AsyncCheckpointIO
73-
ckpt_path: null
73+
ckpt_path: checkpoints/v1.5.0/last.ckpt

src/callbacks_wandb.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -257,9 +257,7 @@ def on_validation_end(
257257

258258
for j in range(n_cols):
259259
# Plot actual images in rows 0 and 2
260-
axs[0, j].imshow(
261-
batch["pixels"][j][0], cmap="viridis"
262-
)
260+
axs[0, j].imshow(batch["pixels"][j][0], cmap="viridis")
263261
axs[0, j].set_title(f"Actual {j}")
264262
axs[0, j].axis("off")
265263

@@ -271,15 +269,11 @@ def on_validation_end(
271269
axs[2, j].axis("off")
272270

273271
# Plot predicted images in rows 1 and 3
274-
axs[1, j].imshow(
275-
pixels[j][0], cmap="viridis"
276-
)
272+
axs[1, j].imshow(pixels[j][0], cmap="viridis")
277273
axs[1, j].set_title(f"Pred {j}")
278274
axs[1, j].axis("off")
279275

280-
axs[3, j].imshow(
281-
pixels[j + n_cols][0], cmap="viridis"
282-
)
276+
axs[3, j].imshow(pixels[j + n_cols][0], cmap="viridis")
283277
axs[3, j].set_title(f"Pred {j+n_cols}")
284278
axs[3, j].axis("off")
285279

src/datamodule.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,7 @@ def setup(self, stage: Literal["fit", "predict"] | None = None) -> None:
253253
# chips_path = list(dp.list_files_by_s3(masks="*.npz"))
254254
# else: # if self.data_dir is a local data path
255255
chips_path = sorted(list(Path(self.data_dir).glob("**/*.npz")))
256-
chips_platform = [chip.parent.parent.name for chip in chips_path]
256+
chips_platform = [chip.parent.name for chip in chips_path]
257257
# chips_platform = [chip.parent.parent.name for chip in chips_path]
258258
print(f"Total number of chips: {len(chips_path)}")
259259

src/model.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@
77
import torch.nn.functional as F
88
from einops import rearrange, reduce, repeat
99
from torch import nn
10+
from torchvision.transforms import v2
1011

1112
from src.backbone import Transformer
1213
from src.factory import DynamicEmbedding
13-
from src.mrl import MRL, MRLLoss
1414
from src.utils import posemb_sincos_2d_with_gsd
1515

1616
torch.set_float32_matmul_precision("medium")
@@ -386,8 +386,13 @@ def __init__( # noqa: PLR0913
386386
self.shuffle = shuffle
387387
self.metadata = metadata
388388
self.teacher = timm.create_model(teacher, pretrained=True, num_classes=0)
389-
self.mrl = MRL(features=self.teacher.num_features, dolls=dolls)
390-
self.mrl_loss = MRLLoss(weights=doll_weights)
389+
self.teacher_chip_size = 518
390+
self.teacher_resize = v2.Resize(
391+
size=(self.teacher_chip_size, self.teacher_chip_size)
392+
)
393+
# self.mrl = MRL(features=self.teacher.num_features, dolls=dolls)
394+
# self.mrl_loss = MRLLoss(weights=doll_weights)
395+
self.proj = nn.Linear(dim, self.teacher.num_features)
391396

392397
self.encoder = Encoder(
393398
mask_ratio=mask_ratio,
@@ -516,8 +521,11 @@ def forward(self, datacube):
516521
if platform == "modis":
517522
reconstruction_loss /= 10
518523

519-
# MRL
520-
representations = self.mrl(encoded_unmasked_patches[:, 0, :]) # [(B D') ...]
524+
# # MRL
525+
# representations = self.mrl(encoded_unmasked_patches[:, 0, :]) # [(B D') ...]
526+
527+
# PROJ
528+
representations = self.proj(encoded_unmasked_patches[:, 0, :]) # [B D']
521529

522530
with torch.no_grad():
523531
if platform == "sentinel-1-rtc":
@@ -529,9 +537,12 @@ def forward(self, datacube):
529537
# Read RGB bands from the sensor to feed the teacher model
530538
indices = self.metadata[platform].rgb_indices
531539
rgb = datacube["pixels"][:, indices, :, :]
540+
rgb = self.teacher_resize(rgb)
532541
target = self.teacher(rgb)
542+
# target = self.teacher(rgb)
533543

534-
representation_loss = self.mrl_loss(representations, target)
544+
# representation_loss = self.mrl_loss(representations, target)
545+
representation_loss = 1.0 - F.cosine_similarity(representations, target).mean()
535546

536547
loss = 0.9 * reconstruction_loss + 0.1 * representation_loss
537548
return (loss, reconstruction_loss, representation_loss)

src/module.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ def __init__( # noqa: PLR0913
2727
embeddings_level: Literal["mean", "patch", "group"] = "mean",
2828
):
2929
super().__init__()
30+
# self.strict_loading = False # Allow partial loading to check if MRL was the bug
3031
self.save_hyperparameters(logger=True)
3132
self.metadata = Box(yaml.safe_load(open(metadata_path)))
3233
model_map = {
@@ -47,6 +48,26 @@ def __init__( # noqa: PLR0913
4748
"doll_weights": doll_weights,
4849
}
4950
self.model = model_map[model_size](**model_args)
51+
# checkpoint_path = 'mae_v1.5.0_epoch-76_val-loss-0.1612.ckpt'
52+
# checkpoint = torch.load(checkpoint_path, map_location="cpu")
53+
# # Extract the state dictionary
54+
# state_dict = checkpoint['state_dict']
55+
56+
# # Modify the state dictionary
57+
# new_state_dict = OrderedDict()
58+
# for k, v in state_dict.items():
59+
# # Remove 'model.' prefix if it exists
60+
# if k.startswith('model.'):
61+
# k = k[len('model.'):]
62+
# # Exclude keys related to the 'teacher'
63+
# if not (k.startswith('teacher') or k.startswith('mrl')):
64+
# new_state_dict[k] = v
65+
# with torch.no_grad():
66+
# # Load the modified state dictionary into your model
67+
# missing_keys, unexpected_keys = self.model.load_state_dict(new_state_dict, strict=False)
68+
# # Optionally, print missing and unexpected keys
69+
# print(f"Missing keys: {missing_keys}")
70+
# print(f"Unexpected keys: {unexpected_keys}")
5071
else:
5172
raise ValueError(
5273
f"Invalid model size {model_size}. Expected one of {model_map.keys()}"

src/mrl.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,13 @@ class MRL(nn.Module):
99
def __init__(self, features, dolls: list = [16, 32, 64, 128, 256, 768]) -> None:
1010
super().__init__()
1111
self.dolls = dolls
12+
self.layers = nn.ModuleDict()
1213
for doll in dolls:
13-
setattr(self, f"mrl_{doll}", nn.Linear(doll, features))
14+
self.layers[f"mrl_{doll}"] = nn.Linear(doll, features)
1415

1516
def forward(self, x):
1617
"x: (batch, features)"
17-
logits = [getattr(self, f"mrl_{doll}")(x[:, :doll]) for doll in self.dolls]
18+
logits = [self.layers[f"mrl_{doll}"](x[:, :doll]) for doll in self.dolls]
1819
return logits
1920

2021

train_clay_v2.sh

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
#!/bin/bash
22

33
#SBATCH --job-name=clay-launcher
4-
#SBATCH --nodes=20
4+
#SBATCH --nodes=24
55
#SBATCH --ntasks-per-node=8 # EDIT if it's not 8-gpus per node
66
#SBATCH --cpus-per-task=12 # EDIT this to how many cpu cores the node has divided by num of gpus
77
#SBATCH --gres=gpu:8 # EDIT this if it's not 8-gpus per node
88
#SBATCH --time=0-00:00:00 # EDIT the desired runtime
99
#SBATCH --exclusive
1010
#SBATCH --partition=gpu # EDIT to the desired partition name
11+
#SBATCH --nodelist=gpu-dy-g6-[1-12],gpu-dy-g5-[1-12]
1112
#SBATCH --output=%x-%j-%N.out
1213

1314
echo "START TIME: $(date)"
@@ -31,8 +32,11 @@ LOG_PATH="main_log.txt"
3132
# PTL doesn't need a special launcher
3233
LAUNCHER="python -u"
3334

35+
# Capture the number of nodes allocated by Slurm
36+
NUM_NODES=$SLURM_JOB_NUM_NODES
37+
3438
# EDIT the path+name of the python script and whatever args it needs
35-
PROGRAM="trainer.py fit --config configs/config.yaml"
39+
PROGRAM="trainer.py fit --config configs/config.yaml --trainer.num_nodes=$NUM_NODES"
3640

3741
export CMD="$LAUNCHER $PROGRAM"
3842

train_environment.yml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,23 @@
11
name: claymodel
22
channels:
3-
- pytorch
43
- conda-forge
4+
- nvidia
5+
- pytorch
56
dependencies:
67
- python=3.11
78
- pip
89
- pip:
10+
- --extra-index-url https://download.pytorch.org/whl/cu121
11+
- torch==2.4.0+cu121
12+
- torchvision==0.19.0+cu121
913
- einops~=0.7.0
1014
- geopandas
1115
- jsonargparse[signatures]>=4.27.7
1216
- lightning
1317
- matplotlib
1418
- python-box
15-
- torch
1619
- scikit-image
1720
- scikit-learn
1821
- timm
19-
- torchvision
2022
- vit-pytorch
2123
- wandb

trainer.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,7 @@ def cli_main():
2222
Command-line interface to run ClayMAE with ClayDataModule.
2323
"""
2424
cli = LightningCLI(
25-
ClayMAEModule,
26-
ClayDataModule,
27-
save_config_kwargs={"overwrite": True}
25+
ClayMAEModule, ClayDataModule, save_config_kwargs={"overwrite": True}
2826
)
2927
return cli
3028

0 commit comments

Comments
 (0)