NVlabs · ilessiorobotflowlabs · Mar 29, 2026
diff --git a/.claude/settings.json b/.claude/settings.json
@@ -0,0 +1,191 @@
+{
+  "permissions": {
+    "allow": [
+      "Read(*)",
+      "Edit(*)",
+      "Write(*)",
+      "Glob(*)",
+      "Grep(*)",
+      "WebFetch(*)",
+      "WebSearch(*)",
+      "Task(*)",
+      "NotebookEdit(*)",
+      "Skill(*)",
+      "Agent(*)",
+      "Bash(uv *)",
+      "Bash(pnpm *)",
+      "Bash(npm *)",
+      "Bash(npx *)",
+      "Bash(pip *)",
+      "Bash(python *)",
+      "Bash(python3 *)",
+      "Bash(node *)",
+      "Bash(tsx *)",
+      "Bash(tsc *)",
+      "Bash(pytest *)",
+      "Bash(rg *)",
+      "Bash(find *)",
+      "Bash(ls *)",
+      "Bash(cat *)",
+      "Bash(head *)",
+      "Bash(tail *)",
+      "Bash(wc *)",
+      "Bash(sort *)",
+      "Bash(grep *)",
+      "Bash(awk *)",
+      "Bash(sed *)",
+      "Bash(echo *)",
+      "Bash(printf *)",
+      "Bash(mkdir *)",
+      "Bash(cp *)",
+      "Bash(mv *)",
+      "Bash(touch *)",
+      "Bash(chmod +x *)",
+      "Bash(git add *)",
+      "Bash(git commit *)",
+      "Bash(git status*)",
+      "Bash(git log *)",
+      "Bash(git diff *)",
+      "Bash(git branch *)",
+      "Bash(git checkout *)",
+      "Bash(git stash *)",
+      "Bash(git tag *)",
+      "Bash(git remote -v*)",
+      "Bash(git rev-parse *)",
+      "Bash(git show *)",
+      "Bash(docker compose *)",
+      "Bash(docker build *)",
+      "Bash(docker ps*)",
+      "Bash(docker images*)",
+      "Bash(docker logs *)",
+      "Bash(docker inspect *)",
+      "Bash(docker exec *)",
+      "Bash(docker run *)",
+      "Bash(docker stop *)",
+      "Bash(docker start *)",
+      "Bash(curl *)",
+      "Bash(wget *)",
+      "Bash(ssh *)",
+      "Bash(rsync *)",
+      "Bash(scp *)",
+      "Bash(ping *)",
+      "Bash(ifconfig*)",
+      "Bash(networksetup *)",
+      "Bash(brew *)",
+      "Bash(which *)",
+      "Bash(env *)",
+      "Bash(export *)",
+      "Bash(source *)",
+      "Bash(eval *)",
+      "Bash(cd *)",
+      "Bash(pwd*)",
+      "Bash(date*)",
+      "Bash(df *)",
+      "Bash(du *)",
+      "Bash(free *)",
+      "Bash(top *)",
+      "Bash(htop*)",
+      "Bash(ps *)",
+      "Bash(lsof *)",
+      "Bash(nc *)",
+      "Bash(tar *)",
+      "Bash(unzip *)",
+      "Bash(zip *)",
+      "Bash(jq *)",
+      "Bash(yq *)",
+      "Bash(tree *)",
+      "Bash(xargs *)",
+      "Bash(tee *)",
+      "Bash(diff *)",
+      "Bash(patch *)",
+      "Bash(ruff *)",
+      "Bash(mypy *)",
+      "Bash(black *)",
+      "Bash(isort *)",
+      "Bash(eslint *)",
+      "Bash(prettier *)",
+      "Bash(cargo *)",
+      "Bash(rustc *)",
+      "Bash(go *)",
+      "Bash(make *)",
+      "Bash(cmake *)",
+      "Bash(conda *)",
+      "Bash(mamba *)",
+      "Bash(ros2 *)",
+      "Bash(colcon *)",
+      "Bash(osgrep *)",
+      "Bash(gh *)",
+      "Bash(rtk *)"
+    ],
+    "deny": [
+      "Bash(rm -rf /)",
+      "Bash(rm -rf ~)",
+      "Bash(rm -rf /*)",
+      "Bash(rm -rf .)",
+      "Bash(rm -rf ..)",
+      "Bash(sudo rm -rf *)",
+      "Bash(sudo rm -r /)",
+      "Bash(git push --force *)",
+      "Bash(git push -f *)",
+      "Bash(git push --force-with-lease *)",
+      "Bash(git reset --hard *)",
+      "Bash(git clean -fd*)",
+      "Bash(git checkout -- .)",
+      "Bash(git restore .)",
+      "Bash(git rebase -i *)",
+      "Bash(git push origin master*)",
+      "Bash(mkfs *)",
+      "Bash(dd if=*of=/dev/*)",
+      "Bash(shutdown *)",
+      "Bash(reboot *)",
+      "Bash(halt *)",
+      "Bash(init 0*)",
+      "Bash(:(){ :|:& };:)",
+      "Bash(> /dev/sd*)",
+      "Bash(> /dev/nvme*)",
+      "Bash(curl * | sh)",
+      "Bash(curl * | bash)",
+      "Bash(wget * | sh)",
+      "Bash(wget * | bash)",
+      "Bash(chmod 777 *)",
+      "Bash(chmod -R 777 *)",
+      "Bash(chown -R *)",
+      "Bash(chgrp -R *)",
+      "Bash(pkill -9 *)",
+      "Bash(killall *)",
+      "Bash(kill -9 -1*)",
+      "Bash(sudo *)",
+      "Bash(su *)",
+      "Bash(passwd *)",
+      "Bash(usermod *)",
+      "Bash(useradd *)",
+      "Bash(userdel *)",
+      "Bash(visudo *)",
+      "Bash(crontab -r*)",
+      "Bash(iptables -F*)",
+      "Bash(systemctl stop *)",
+      "Bash(systemctl disable *)",
+      "Bash(launchctl unload *)",
+      "Bash(npm publish *)",
+      "Bash(pip upload *)",
+      "Bash(twine upload *)",
+      "Bash(docker push *)",
+      "Bash(docker rmi -f *)",
+      "Bash(docker system prune -a*)",
+      "Bash(docker volume rm *)",
+      "Bash(dropdb *)",
+      "Bash(drop database *)",
+      "Bash(DROP DATABASE *)",
+      "Bash(mongo * --eval *dropDatabase*)",
+      "Bash(redis-cli FLUSHALL*)",
+      "Bash(aws s3 rm *--recursive*)",
+      "Bash(aws s3 rb *--force*)",
+      "Bash(terraform destroy *)",
+      "Bash(kubectl delete namespace *)",
+      "Bash(kubectl delete -f * --all*)",
+      "Bash(gh repo delete *)",
+      "Bash(gh issue close *)",
+      "Bash(gh pr close *)"
+    ]
+  }
+}
diff --git a/.gitignore b/.gitignore
@@ -65,4 +65,12 @@ gradio_queue.db
 # stable diffusion
 *.ckpt
 
-*.o
+*.o
+
+# optional third_party checkouts (source-only forks can be re-cloned locally)
+third_party/latent-diffusion/.git
+third_party/taming-transformers/.git
+
+# Third-party cloned repos (managed by bootstrap scripts)
+third_party/latent-diffusion/
+third_party/taming-transformers/
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,7 @@
+[submodule "third_party/latent-diffusion"]
+	path = third_party/latent-diffusion
+	url = https://github.com/CompVis/latent-diffusion.git
+
+[submodule "third_party/taming-transformers"]
+	path = third_party/taming-transformers
+	url = https://github.com/CompVis/taming-transformers.git
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -0,0 +1,62 @@
+# ODISE — Open-Vocabulary Panoptic Segmentation
+
+Open-vocabulary panoptic segmentation using pre-trained text-image diffusion and discriminative models (CVPR 2023 Highlight, NVIDIA).
+
+## Architecture
+```
+odise/
+├── checkpoint/     # Custom checkpointer (ODISE weights)
+├── config/         # Detectron2-style configs
+├── data/           # Dataset registration & transforms
+├── engine/         # Training loop & defaults
+├── evaluation/     # Eval metrics
+├── model_zoo/      # Pre-built model configs
+├── modeling/       # Core models (diffusion, meta-arch, backbone, wrapper)
+└── utils/          # Env collection, misc helpers
+configs/            # YAML/Python training configs
+third_party/        # Mask2Former, latent-diffusion, taming-transformers
+tools/              # train_net.py, extract_features.py, bootstrap script
+demo/               # Gradio demo app
+```
+
+## Key Dependencies
+- Python >=3.10, PyTorch >=2.0
+- detectron2, Mask2Former (local third_party)
+- open-clip-torch==2.0.2, timm==0.6.11
+- numpy<2.0, omegaconf>=2.3
+- Stable Diffusion via latent-diffusion/taming-transformers submodules
+
+## Dev Commands
+```bash
+# Activate env (GPU server)
+source /mnt/forge-data/activate.sh
+
+# Install
+uv pip install -e .
+
+# Bootstrap third-party submodules
+bash tools/bootstrap_third_party.sh
+
+# Train
+CUDA_VISIBLE_DEVICES=0,1,2,3 python tools/train_net.py --config-file configs/common/train.py --num-gpus 4
+
+# Demo
+python demo/demo.py
+
+# Lint
+ruff check odise/ --select E,F,I,B,UP
+isort --check odise/
+mypy odise/
+```
+
+## Conventions
+- Package manager: `uv` (never pip directly)
+- Search: `rg` (ripgrep), never `grep`
+- Line length: 100
+- Style: isort + ruff
+- Config: Detectron2 LazyConfig system (Python-based configs)
+- Git commit prefix: `[ODISE]`
+- Training outputs: `/mnt/artifacts-datai/`
+
+## Last Updated
+This file was last updated on 2026-03-29.
diff --git a/GETTING_STARTED.md b/GETTING_STARTED.md
@@ -6,6 +6,22 @@ For further reading, please refer to [Getting Started with Detectron2](https://g
 
 **Important Note**: ODISE's `demo/demo.py` and `tools/train_net.py` scripts link to the original pre-trained models for [Stable Diffusion v1.3](https://huggingface.co/CompVis/stable-diffusion-v-1-3-original/resolve/main/sd-v1-3.ckpt) and [CLIP](https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt). When you run them for the very first time, these scripts will automatically download the pre-trained models for Stable Diffusion and CLIP, from their original sources, to your local directories `$HOME/.torch/` and `$HOME/.cache/clip`, respectively. Their use is subject to the original license terms defined at [https://github.com/CompVis/stable-diffusion](https://github.com/CompVis/stable-diffusion) and [https://github.com/openai/CLIP](https://github.com/openai/CLIP), respectively.
 
+If you use `stable-diffusion` backbones (which depend on latent-diffusion and taming-transformers), initialize the optional `third_party` checkouts first:
+
+```bash
+bash tools/bootstrap_third_party.sh
+```
+
+If your clone did not include submodules, or if you need a clean refresh:
+
+```bash
+bash tools/bootstrap_third_party.sh --force
+```
+or
+```bash
+git submodule update --init --recursive
+```
+
 
 ### Inference Demo with Pre-trained ODISE Models
 
@@ -49,39 +65,40 @@ python demo/demo.py --input demo/examples/purse.jpeg --output demo/purse_pred.jp
 We provide a script `tools/train_net.py` that trains all configurations of ODISE.
 
 To train a model with `tools/train_net.py`, first prepare the datasets following the instructions in
-[datasets/README.md](./datasets/README.md) and then run, for single-node (8-GPUs) NVIDIA AMP-based training:
-```bash
-(node0)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --num-gpus 8 --amp 
-```
-For 4-node (32-GPUs) AMP-based training, run: 
+[datasets/README.md](./datasets/README.md) and then run, for CPU-first single-process training:
 ```bash
-(node0)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --machine-rank 0 --num-machines 4 --dist-url tcp://${MASTER_ADDR}:29500 --num-gpus 8 --amp
-(node1)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --machine-rank 1 --num-machines 4 --dist-url tcp://${MASTER_ADDR}:29500 --num-gpus 8 --amp
-(node2)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --machine-rank 2 --num-machines 4 --dist-url tcp://${MASTER_ADDR}:29500 --num-gpus 8 --amp
-(node3)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --machine-rank 3 --num-machines 4 --dist-url tcp://${MASTER_ADDR}:29500 --num-gpus 8 --amp
+./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --num-gpus 1 --force-cpu
 ```
 
-Note that our default training configurations are designed for 32 GPUs.
-Since we use the AdamW optimizer, it is not clear as to how to scale the learning rate with batch size.
-However, we provide the ability to automatically scale the learning rate and the batch size for any number of GPUs used for training by passing in the`--ref $REFERENCE_WORLD_SIZE` argument. 
-For example, if you set `$REFERENCE_WORLD_SIZE=32` while training on 8 GPUs, the batch size and learning rate will be set to 8/32 = 0.25 of the original ones.
+AMP is only enabled when CUDA is available. On CPU-only machines, training falls back to full precision.
 
-```bash
-(node0)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --num-gpus 8 --amp --ref 32
-```
+For multi-GPU training (optional, if you still run distributed CUDA), keep your existing launch pattern and pass `--num-gpus` plus `--amp` as before.
 
-ODISE trains in 6 days on 32 NVIDIA V100 GPUs.
+### High-throughput Feature Extraction
+
+`tools/extract_features.py` supports distributed extraction. For CPU-only use:
 
-To evaluate a trained ODISE model's performance, run on single node
-```
-(node0)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --num-gpus 8 --eval-only --init-from /path/to/checkpoint
-```
-or for multi-node inference:
 ```bash
-(node0)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --machine-rank 0 --num-machines 4 --dist-url tcp://${MASTER_ADDR}:29500 --num-gpus 8 --eval-only --init-from /path/to/checkpoint
-(node1)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --machine-rank 1 --num-machines 4 --dist-url tcp://${MASTER_ADDR}:29500 --num-gpus 8 --eval-only --init-from /path/to/checkpoint
-(node2)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --machine-rank 2 --num-machines 4 --dist-url tcp://${MASTER_ADDR}:29500 --num-gpus 8 --eval-only --init-from /path/to/checkpoint
-(node3)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --machine-rank 3 --num-machines 4 --dist-url tcp://${MASTER_ADDR}:29500 --num-gpus 8 --eval-only --init-from /path/to/checkpoint
+python tools/extract_features.py \
+  --config-file configs/Panoptic/odise_label_coco_50e.py \
+  --num-gpus 1 \
+  --force-cpu \
+  --num-machines 1 \
+  --init-from /path/to/checkpoint.pth \
+  --output /path/to/feature_out \
+  --dataloader dataloader.test \
+  --feature-layers s2,s3,s4,s5
+``` 
+
+You can scale this to multi-GPU later by increasing `--num-gpus` and `--num-machines` once your environment is configured for distributed execution.
+
+`--dataloader` is a dotted path inside the config; for the built-in Panoptic configs this is `dataloader.test`.
+Each `.pt` file stores a single image's normalized feature maps and metadata and can be merged later as needed.
+
+To evaluate a trained ODISE model in a single CPU-only process:
+```bash
+./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --num-gpus 1 --force-cpu --eval-only --init-from /path/to/checkpoint
 ```
+or use distributed multi-node/multi-GPU launch flags as needed in your own environment.
 
 To use our provided ODISE [model zoo](README.md#model-zoo), you can pass in the arguments `--config-file configs/Panoptic/odise_label_coco_50e.py --init-from odise://Panoptic/odise_label_coco_50e` or `--config-file configs/Panoptic/odise_caption_coco_50e.py --init-from odise://Panoptic/odise_caption_coco_50e` to `./tools/train_net.py`, respectively.