Skip to content

Commit 3ff1487

Browse files
mnabianktangsali
authored and committed
Gnn recipes bug fixes (#1672)
* gnn recipes bug fixes * minor fixes
1 parent 916eff6 commit 3ff1487

5 files changed

Lines changed: 31 additions & 34 deletions

File tree

examples/additive_manufacturing/sintering_physics/inference.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,6 @@
3232
"Mesh Graph Net Datapipe requires the Tensorflow library. Install the "
3333
+ "package at: https://www.tensorflow.org/install"
3434
)
35-
physical_devices = tf.config.list_physical_devices("GPU")
36-
37-
try:
38-
for device_ in physical_devices:
39-
tf.config.experimental.set_memory_growth(device_, True)
40-
except:
41-
# Invalid device or cannot modify virtual devices once initialized.
42-
pass
43-
4435
import hydra
4536
import torch
4637
from graph_dataset import GraphDataset

examples/additive_manufacturing/sintering_physics/requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,5 @@ pyvista
88
vtk
99
natsort
1010
scikit-learn
11-
tensorboard
11+
tensorboard
12+
tensorflow-cpu>=2.15,<3.0

examples/additive_manufacturing/sintering_physics/train.py

Lines changed: 7 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -58,14 +58,6 @@
5858
)
5959
from physicsnemo.models.vfgn.graph_network_modules import VFGNLearnedSimulator
6060

61-
physical_devices = tf.config.list_physical_devices("GPU")
62-
try:
63-
for device_ in physical_devices:
64-
tf.config.experimental.set_memory_growth(device_, True)
65-
except:
66-
# Invalid device or cannot modify virtual devices once initialized.
67-
pass
68-
6961

7062
def Train(rank_zero_logger, dist, cfg: DictConfig):
7163
"""
@@ -131,18 +123,13 @@ def Train(rank_zero_logger, dist, cfg: DictConfig):
131123
writer = SummaryWriter(log_dir=cfg.data_options.ckpt_path_vfgn)
132124

133125
optimizer = None
126+
scaler = None
134127
# todo : check device
135128
device = "cpu"
136129
step = 0
137130
running_loss = 0.0
138131
best_loss = 1000.0
139132

140-
# Native PyTorch automatic mixed precision (AMP) replaces NVIDIA Apex.
141-
# GradScaler is a no-op when enabled=False, so it is safe to construct
142-
# unconditionally and only activate when fp16 training is requested.
143-
use_amp = cfg.general.fp16
144-
scaler = torch.amp.GradScaler("cuda", enabled=use_amp)
145-
146133
rank_zero_logger.info("Training started...")
147134

148135
for features, targets in tqdm(dataset):
@@ -184,11 +171,10 @@ def Train(rank_zero_logger, dist, cfg: DictConfig):
184171

185172
sampled_noise *= noise_mask
186173

187-
amp_active = (
188-
use_amp and isinstance(device, torch.device) and device.type == "cuda"
189-
)
174+
amp_enabled = cfg.general.fp16 and scaler is not None
190175
with torch.autocast(
191-
device_type="cuda", dtype=torch.float16, enabled=amp_active
176+
device_type=device.type if isinstance(device, torch.device) else "cpu",
177+
enabled=amp_enabled,
192178
):
193179
pred_target = model(
194180
next_positions=targets.to(device),
@@ -219,8 +205,8 @@ def Train(rank_zero_logger, dist, cfg: DictConfig):
219205
model.setMessagePassingDevices(message_passing_devices)
220206
model = model.to(device)
221207
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
222-
# Mixed precision is handled via the torch.amp GradScaler / autocast
223-
# constructed above; no extra optimizer wrapping is required.
208+
if cfg.general.fp16:
209+
scaler = torch.amp.GradScaler(device.type)
224210

225211
scheduler = torch.optim.lr_scheduler.ExponentialLR(
226212
optimizer, gamma=0.1, verbose=True
@@ -398,7 +384,7 @@ def Train(rank_zero_logger, dist, cfg: DictConfig):
398384
rank_zero_logger.info(f"loss: {loss}")
399385
# back propagation
400386
optimizer.zero_grad()
401-
if use_amp:
387+
if cfg.general.fp16:
402388
scaler.scale(loss).backward()
403389
scaler.step(optimizer)
404390
scaler.update()

examples/cfd/external_aerodynamics/xaeronet/README.md

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,13 +75,30 @@ dataset, please refer to their [paper](https://arxiv.org/pdf/2408.11969).
7575

7676
## XAeroNet-S prerequisites
7777

78-
Install the requirements using:
78+
Install the base requirements:
7979

8080
```bash
8181
pip install -r requirements.txt
82-
pip install pyg-lib -f https://data.pyg.org/whl/torch-2.8.0+cu129.html
8382
```
8483

84+
`pyg-lib` and `torch_scatter` ship as compiled CUDA extensions and must be
85+
installed from PyG's pre-built wheel index that matches your installed
86+
`torch` and CUDA versions. The snippet below detects both and
87+
constructs the correct URL:
88+
89+
```bash
90+
TORCH=$(python -c "import torch; print(torch.__version__.split('+')[0])")
91+
CUDA=$(python -c "import torch; v=torch.version.cuda; \
92+
print('cu' + v.replace('.', '') if v else 'cpu')")
93+
pip install pyg-lib torch_scatter -f https://data.pyg.org/whl/torch-${TORCH}+${CUDA}.html
94+
```
95+
96+
If PyG has not published a wheel for your exact torch+CUDA combination
97+
yet, browse <https://data.pyg.org/whl/> to find the closest match, or
98+
build from source with `pip install --no-build-isolation torch_scatter`
99+
(plain `pip install torch_scatter` fails because pip's build isolation
100+
hides the installed `torch` from the build environment).
101+
85102
See `pyg-lib` [installation instructions](https://github.com/pyg-team/pyg-lib?tab=readme-ov-file#installation)
86103
for more details.
87104

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
trimesh>=4.5.0
22
torch_geometric>=2.6.1
3-
torch_scatter>=2.1.2
43
pyvista
54
vtk
65
wandb
6+
scikit-learn
7+
tabulate
8+
matplotlib

0 commit comments

Comments
 (0)