Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
191 changes: 191 additions & 0 deletions .claude/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
{
"permissions": {
"allow": [
"Read(*)",
"Edit(*)",
"Write(*)",
"Glob(*)",
"Grep(*)",
"WebFetch(*)",
"WebSearch(*)",
"Task(*)",
"NotebookEdit(*)",
"Skill(*)",
"Agent(*)",
"Bash(uv *)",
"Bash(pnpm *)",
"Bash(npm *)",
"Bash(npx *)",
"Bash(pip *)",
"Bash(python *)",
"Bash(python3 *)",
"Bash(node *)",
"Bash(tsx *)",
"Bash(tsc *)",
"Bash(pytest *)",
"Bash(rg *)",
"Bash(find *)",
"Bash(ls *)",
"Bash(cat *)",
"Bash(head *)",
"Bash(tail *)",
"Bash(wc *)",
"Bash(sort *)",
"Bash(grep *)",
"Bash(awk *)",
"Bash(sed *)",
"Bash(echo *)",
"Bash(printf *)",
"Bash(mkdir *)",
"Bash(cp *)",
"Bash(mv *)",
"Bash(touch *)",
"Bash(chmod +x *)",
"Bash(git add *)",
"Bash(git commit *)",
"Bash(git status*)",
"Bash(git log *)",
"Bash(git diff *)",
"Bash(git branch *)",
"Bash(git checkout *)",
"Bash(git stash *)",
"Bash(git tag *)",
"Bash(git remote -v*)",
"Bash(git rev-parse *)",
"Bash(git show *)",
"Bash(docker compose *)",
"Bash(docker build *)",
"Bash(docker ps*)",
"Bash(docker images*)",
"Bash(docker logs *)",
"Bash(docker inspect *)",
"Bash(docker exec *)",
"Bash(docker run *)",
"Bash(docker stop *)",
"Bash(docker start *)",
"Bash(curl *)",
"Bash(wget *)",
"Bash(ssh *)",
"Bash(rsync *)",
"Bash(scp *)",
"Bash(ping *)",
"Bash(ifconfig*)",
"Bash(networksetup *)",
"Bash(brew *)",
"Bash(which *)",
"Bash(env *)",
"Bash(export *)",
"Bash(source *)",
"Bash(eval *)",
"Bash(cd *)",
"Bash(pwd*)",
"Bash(date*)",
"Bash(df *)",
"Bash(du *)",
"Bash(free *)",
"Bash(top *)",
"Bash(htop*)",
"Bash(ps *)",
"Bash(lsof *)",
"Bash(nc *)",
"Bash(tar *)",
"Bash(unzip *)",
"Bash(zip *)",
"Bash(jq *)",
"Bash(yq *)",
"Bash(tree *)",
"Bash(xargs *)",
"Bash(tee *)",
"Bash(diff *)",
"Bash(patch *)",
"Bash(ruff *)",
"Bash(mypy *)",
"Bash(black *)",
"Bash(isort *)",
"Bash(eslint *)",
"Bash(prettier *)",
"Bash(cargo *)",
"Bash(rustc *)",
"Bash(go *)",
"Bash(make *)",
"Bash(cmake *)",
"Bash(conda *)",
"Bash(mamba *)",
"Bash(ros2 *)",
"Bash(colcon *)",
"Bash(osgrep *)",
"Bash(gh *)",
"Bash(rtk *)"
],
"deny": [
"Bash(rm -rf /)*",
"Bash(rm -rf ~)*",
"Bash(rm -rf /*)*",
"Bash(rm -rf .)*",
"Bash(rm -rf ..)*",
"Bash(sudo rm -rf *)",
"Bash(sudo rm -r /)*",
"Bash(git push --force *)",
"Bash(git push -f *)",
"Bash(git push --force-with-lease *)",
"Bash(git reset --hard *)",
"Bash(git clean -fd*)",
"Bash(git checkout -- .)*",
"Bash(git restore .)*",
"Bash(git rebase -i *)",
"Bash(git push origin master*)",
"Bash(mkfs *)",
"Bash(dd if=*of=/dev/*)",
"Bash(shutdown *)",
"Bash(reboot *)",
"Bash(halt *)",
"Bash(init 0*)",
"Bash(:(){ :|:& };:)*",
"Bash(> /dev/sd*)",
"Bash(> /dev/nvme*)",
"Bash(curl * | sh)*",
"Bash(curl * | bash)*",
"Bash(wget * | sh)*",
"Bash(wget * | bash)*",
"Bash(chmod 777 *)",
"Bash(chmod -R 777 *)",
"Bash(chown -R *)",
"Bash(chgrp -R *)",
"Bash(pkill -9 *)",
"Bash(killall *)",
"Bash(kill -9 -1*)",
"Bash(sudo *)",
"Bash(su *)",
"Bash(passwd *)",
"Bash(usermod *)",
"Bash(useradd *)",
"Bash(userdel *)",
"Bash(visudo *)",
"Bash(crontab -r*)",
"Bash(iptables -F*)",
"Bash(systemctl stop *)",
"Bash(systemctl disable *)",
"Bash(launchctl unload *)",
"Bash(npm publish *)",
"Bash(pip upload *)",
"Bash(twine upload *)",
"Bash(docker push *)",
"Bash(docker rmi -f *)",
"Bash(docker system prune -a*)",
"Bash(docker volume rm *)",
"Bash(dropdb *)",
"Bash(drop database *)",
"Bash(DROP DATABASE *)",
"Bash(mongo * --eval *dropDatabase*)",
"Bash(redis-cli FLUSHALL*)",
"Bash(aws s3 rm *--recursive*)",
"Bash(aws s3 rb *--force*)",
"Bash(terraform destroy *)",
"Bash(kubectl delete namespace *)",
"Bash(kubectl delete -f * --all*)",
"Bash(gh repo delete *)",
"Bash(gh issue close *)",
"Bash(gh pr close *)"
]
}
}
10 changes: 9 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,12 @@ gradio_queue.db
# stable diffusion
*.ckpt

*.o
*.o

# optional third_party checkouts (source-only forks can be re-cloned locally)
third_party/latent-diffusion/.git
third_party/taming-transformers/.git

# Third-party cloned repos (managed by bootstrap scripts)
third_party/latent-diffusion/
third_party/taming-transformers/
7 changes: 7 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[submodule "third_party/latent-diffusion"]
path = third_party/latent-diffusion
url = https://github.com/CompVis/latent-diffusion.git

[submodule "third_party/taming-transformers"]
path = third_party/taming-transformers
url = https://github.com/CompVis/taming-transformers.git
62 changes: 62 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# ODISE — Open-Vocabulary Panoptic Segmentation

Open-vocabulary panoptic segmentation using pre-trained text-image diffusion and discriminative models (CVPR 2023 Highlight, NVIDIA).

## Architecture
```
odise/
├── checkpoint/ # Custom checkpointer (ODISE weights)
├── config/ # Detectron2-style configs
├── data/ # Dataset registration & transforms
├── engine/ # Training loop & defaults
├── evaluation/ # Eval metrics
├── model_zoo/ # Pre-built model configs
├── modeling/ # Core models (diffusion, meta-arch, backbone, wrapper)
└── utils/ # Env collection, misc helpers
configs/ # YAML/Python training configs
third_party/ # Mask2Former, latent-diffusion, taming-transformers
tools/ # train_net.py, extract_features.py, bootstrap script
demo/ # Gradio demo app
```

## Key Dependencies
- Python >=3.10, PyTorch >=2.0
- detectron2, Mask2Former (local third_party)
- open-clip-torch==2.0.2, timm==0.6.11
- numpy<2.0, omegaconf>=2.3
- Stable Diffusion via latent-diffusion/taming-transformers submodules

## Dev Commands
```bash
# Activate env (GPU server)
source /mnt/forge-data/activate.sh

# Install
uv pip install -e .

# Bootstrap third-party submodules
bash tools/bootstrap_third_party.sh

# Train
CUDA_VISIBLE_DEVICES=0,1,2,3 python tools/train_net.py --config-file configs/common/train.py --num-gpus 4

# Demo
python demo/demo.py

# Lint
ruff check odise/ --select E,F,I,B,UP
isort --check odise/
mypy odise/
```

## Conventions
- Package manager: `uv` (never pip directly)
- Search: `rg` (ripgrep), never `grep`
- Line length: 100
- Style: isort + ruff
- Config: Detectron2 LazyConfig system (Python-based configs)
- Git commit prefix: `[ODISE]`
- Training outputs: `/mnt/artifacts-datai/`

# currentDate
Today's date is 2026-03-29.
69 changes: 43 additions & 26 deletions GETTING_STARTED.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,22 @@ For further reading, please refer to [Getting Started with Detectron2](https://g

**Important Note**: ODISE's `demo/demo.py` and `tools/train_net.py` scripts link to the original pre-trained models for [Stable Diffusion v1.3](https://huggingface.co/CompVis/stable-diffusion-v-1-3-original/resolve/main/sd-v1-3.ckpt) and [CLIP](https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt). When you run them for the very first time, these scripts will automatically download the pre-trained models for Stable Diffusion and CLIP, from their original sources, to your local directories `$HOME/.torch/` and `$HOME/.cache/clip`, respectively. Their use is subject to the original license terms defined at [https://github.com/CompVis/stable-diffusion](https://github.com/CompVis/stable-diffusion) and [https://github.com/openai/CLIP](https://github.com/openai/CLIP), respectively.

If you use `stable-diffusion` backbones (latent-diffusion/taming-transformers), initialize optional third_party checkouts first:

```bash
bash tools/bootstrap_third_party.sh
```

If your clone did not include submodules, or if you need a clean refresh:

```bash
bash tools/bootstrap_third_party.sh --force
```
or
```bash
git submodule update --init --recursive
```


### Inference Demo with Pre-trained ODISE Models

Expand Down Expand Up @@ -49,39 +65,40 @@ python demo/demo.py --input demo/examples/purse.jpeg --output demo/purse_pred.jp
We provide a script `tools/train_net.py` that trains all configurations of ODISE.

To train a model with `tools/train_net.py`, first prepare the datasets following the instructions in
[datasets/README.md](./datasets/README.md) and then run, for single-node (8-GPUs) NVIDIA AMP-based training:
```bash
(node0)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --num-gpus 8 --amp
```
For 4-node (32-GPUs) AMP-based training, run:
[datasets/README.md](./datasets/README.md) and then run, for CPU-first single-process training:
```bash
(node0)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --machine-rank 0 --num-machines 4 --dist-url tcp://${MASTER_ADDR}:29500 --num-gpus 8 --amp
(node1)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --machine-rank 1 --num-machines 4 --dist-url tcp://${MASTER_ADDR}:29500 --num-gpus 8 --amp
(node2)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --machine-rank 2 --num-machines 4 --dist-url tcp://${MASTER_ADDR}:29500 --num-gpus 8 --amp
(node3)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --machine-rank 3 --num-machines 4 --dist-url tcp://${MASTER_ADDR}:29500 --num-gpus 8 --amp
./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --num-gpus 1 --force-cpu
```

Note that our default training configurations are designed for 32 GPUs.
Since we use the AdamW optimizer, it is not clear as to how to scale the learning rate with batch size.
However, we provide the ability to automatically scale the learning rate and the batch size for any number of GPUs used for training by passing in the `--ref $REFERENCE_WORLD_SIZE` argument.
For example, if you set `$REFERENCE_WORLD_SIZE=32` while training on 8 GPUs, the batch size and learning rate will be set to 8/32 = 0.25 of the original ones.
AMP is only enabled when CUDA is available. On CPU-only machines, training falls back to full precision.

```bash
(node0)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --num-gpus 8 --amp --ref 32
```
For multi-GPU training (optional, if you still run distributed CUDA), keep your existing launch pattern and pass `--num-gpus` plus `--amp` as before.

ODISE trains in 6 days on 32 NVIDIA V100 GPUs.
### High-throughput Feature Extraction

`tools/extract_features.py` supports distributed extraction. For CPU-only use:

To evaluate a trained ODISE model's performance, run on single node
```
(node0)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --num-gpus 8 --eval-only --init-from /path/to/checkpoint
```
or for multi-node inference:
```bash
(node0)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --machine-rank 0 --num-machines 4 --dist-url tcp://${MASTER_ADDR}:29500 --num-gpus 8 --eval-only --init-from /path/to/checkpoint
(node1)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --machine-rank 1 --num-machines 4 --dist-url tcp://${MASTER_ADDR}:29500 --num-gpus 8 --eval-only --init-from /path/to/checkpoint
(node2)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --machine-rank 2 --num-machines 4 --dist-url tcp://${MASTER_ADDR}:29500 --num-gpus 8 --eval-only --init-from /path/to/checkpoint
(node3)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --machine-rank 3 --num-machines 4 --dist-url tcp://${MASTER_ADDR}:29500 --num-gpus 8 --eval-only --init-from /path/to/checkpoint
python tools/extract_features.py \
--config-file configs/Panoptic/odise_label_coco_50e.py \
--num-gpus 1 \
--force-cpu \
--num-machines 1 \
--init-from /path/to/checkpoint.pth \
--output /path/to/feature_out \
--dataloader dataloader.test \
--feature-layers s2,s3,s4,s5
```

You can scale this to multi-GPU later by increasing `--num-gpus` and `--num-machines` once your environment is configured for distributed execution.

`--dataloader` is a dotted path inside the config; for the built-in Panoptic configs this is `dataloader.test`.
Each `.pt` file stores a single image's normalized feature maps and metadata and can be merged later as needed.

To evaluate a trained ODISE model in a single CPU-only process:
```
./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --num-gpus 1 --force-cpu --eval-only --init-from /path/to/checkpoint
```
or use distributed multi-node/multi-GPU launch flags as needed in your own environment.

To use our provided ODISE [model zoo](README.md#model-zoo), you can pass in the arguments `--config-file configs/Panoptic/odise_label_coco_50e.py --init-from odise://Panoptic/odise_label_coco_50e` or `--config-file configs/Panoptic/odise_label_coco_50e.py --init-from odise://Panoptic/odise_caption_coco_50e` to `./tools/train_net.py`, respectively.
Loading