Skip to content

Commit 2b5e754

Browse files
authored
Merge branch 'main' into psharpe/notebook-cleanup
2 parents: 09344d9 + ac72286; commit: 2b5e754

11 files changed

Lines changed: 1388 additions & 1034 deletions

File tree

examples/cfd/external_aerodynamics/globe/airfrans/run.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,13 @@ CPUS_PER_NODE=${SLURM_CPUS_ON_NODE:-$(nproc)}
5353
export OMP_NUM_THREADS=1
5454
echo "OMP_NUM_THREADS=$OMP_NUM_THREADS (process-level parallelism via DataLoader workers; ${CPUS_PER_NODE} CPUs / ${NUM_GPUS_PER_NODE} GPUs)"
5555

56+
### [CUDA Allocator]
57+
# expandable_segments: avoids the synchronizing cudaMalloc/cudaFree round-trips
58+
# that the default segment allocator performs when chunked kernel evaluations
59+
# stress the cache. With this setting, the allocator grows segments instead.
60+
export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"
61+
echo "PYTORCH_CUDA_ALLOC_CONF=$PYTORCH_CUDA_ALLOC_CONF"
62+
5663
### [Sync Dependencies]
5764
# Select the right CUDA extra based on the detected driver version,
5865
# then install both the project deps and example-specific requirements.

examples/cfd/external_aerodynamics/globe/airfrans/train.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ def main(
104104
n_spherical_harmonics: int = 1,
105105
theta: float = 0.0,
106106
leaf_size: int = 1,
107+
tree_build_device: Literal["cpu", "cuda"] | None = None,
107108
airfrans_task: Literal["full", "scarce", "reynolds", "aoa"] = "full",
108109
patience_steps: int = 1600,
109110
use_profiler: bool = True,
@@ -140,6 +141,8 @@ def main(
140141
n_spherical_harmonics: Number of Legendre polynomial terms for angle features.
141142
theta: Barnes-Hut opening angle. Larger = more aggressive approximation.
142143
leaf_size: Maximum sources per leaf node in the Barnes-Hut tree.
144+
tree_build_device: Device on which to build cluster trees and run the
145+
dual-tree Barnes-Hut traversal. ``None`` (default) uses the input's device.
143146
airfrans_task: Which AirFRANS dataset task to train on.
144147
patience_steps: ReduceLROnPlateau patience expressed in gradient
145148
steps (world-size independent). Converted to epochs internally.
@@ -270,6 +273,7 @@ def main(
270273
self_regularization_beta=self_regularization_beta,
271274
latent_compression_scale=latent_compression_scale,
272275
expand_far_targets=expand_far_targets,
276+
tree_build_device=tree_build_device,
273277
).to(device)
274278

275279
logger0.info(f"{output_dir.name=!r}")
@@ -347,16 +351,13 @@ def main(
347351
min_lr=learning_rate / 64,
348352
threshold=1e-3,
349353
)
350-
scaler = torch.amp.GradScaler(device=device.type, enabled=amp)
351-
352354
### [Checkpoint Save/Load]
353355
metadata_dict: dict[str, Any] = {}
354356
epoch = load_checkpoint(
355357
checkpoint_dir,
356358
models=base_model,
357359
optimizer=optimizer,
358360
scheduler=scheduler,
359-
scaler=scaler,
360361
metadata_dict=metadata_dict,
361362
device=dist.device,
362363
)
@@ -430,7 +431,6 @@ def main(
430431
**config_settings,
431432
"optimizer": optimizer.__class__.__name__,
432433
"scheduler": scheduler.__class__.__name__,
433-
"scaler": scaler.__class__.__name__,
434434
"physicsnemo_pkg_info": get_physicsnemo_pkg_info(),
435435
"world_size": dist.world_size,
436436
**{f"n_{split}_samples": len(sample_paths[split]) for split in splits},
@@ -526,15 +526,13 @@ def prepare_sample(sample: AirFRANSSample) -> AirFRANSSample:
526526
if torch.isnan(batch_loss):
527527
warnings.warn(f"{batch_loss=} at: {dist.rank=}, {epoch=}")
528528
with record_function("backward"):
529-
scaler.scale(batch_loss).backward()
529+
batch_loss.backward()
530530
if gradient_clip_norm is not None:
531-
scaler.unscale_(optimizer)
532531
torch.nn.utils.clip_grad_norm_(
533532
model.parameters(), max_norm=gradient_clip_norm
534533
)
535534
with record_function("optimizer_step"):
536-
scaler.step(optimizer)
537-
scaler.update()
535+
optimizer.step()
538536
all_batch_losses.append(batch_loss.detach().clone())
539537
for k, v in batch_loss_components.items():
540538
all_batch_loss_components[k].append(v.detach().clone())
@@ -614,7 +612,6 @@ def save_ckpt() -> None:
614612
models=base_model,
615613
optimizer=optimizer,
616614
scheduler=scheduler,
617-
scaler=scaler,
618615
epoch=epoch,
619616
metadata=checkpoint_metadata(),
620617
)

examples/cfd/external_aerodynamics/globe/drivaer/run.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,13 @@ CPUS_PER_NODE=${SLURM_CPUS_ON_NODE:-$(nproc)}
5151
export OMP_NUM_THREADS=1
5252
echo "OMP_NUM_THREADS=$OMP_NUM_THREADS (process-level parallelism via DataLoader workers; ${CPUS_PER_NODE} CPUs / ${NUM_GPUS_PER_NODE} GPUs)"
5353

54+
### [CUDA Allocator]
55+
# expandable_segments: avoids the synchronizing cudaMalloc/cudaFree round-trips
56+
# that the default segment allocator performs when chunked kernel evaluations
57+
# stress the cache. With this setting, the allocator grows segments instead.
58+
export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"
59+
echo "PYTORCH_CUDA_ALLOC_CONF=$PYTORCH_CUDA_ALLOC_CONF"
60+
5461
### [Sync Dependencies]
5562
# Select the right CUDA extra based on the detected driver version,
5663
# then install both the project deps and example-specific requirements.

examples/cfd/external_aerodynamics/globe/drivaer/train.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ def main(
111111
n_spherical_harmonics: int = 4,
112112
theta: float = 1.0,
113113
leaf_size: int = 1,
114+
tree_build_device: Literal["cpu", "cuda"] | None = None,
114115
n_faces_per_boundary: int = 80_000,
115116
patience_steps: int = 1600,
116117
use_profiler: bool = True,
@@ -152,6 +153,8 @@ def main(
152153
theta: Barnes-Hut opening angle. Larger values are more
153154
aggressive (more approximation, faster). 0 = exact.
154155
leaf_size: Maximum sources per leaf node in the Barnes-Hut tree.
156+
tree_build_device: Device on which to build cluster trees and run the
157+
dual-tree Barnes-Hut traversal. ``None`` (default) uses the input's device.
155158
n_faces_per_boundary: Target boundary mesh face count after decimation.
156159
patience_steps: ReduceLROnPlateau patience expressed in gradient
157160
steps (world-size independent). Converted to epochs internally.
@@ -281,6 +284,7 @@ def main(
281284
self_regularization_beta=self_regularization_beta,
282285
latent_compression_scale=latent_compression_scale,
283286
expand_far_targets=expand_far_targets,
287+
tree_build_device=tree_build_device,
284288
).to(device)
285289

286290
logger0.info(f"{output_dir.name=!r}")
@@ -358,16 +362,13 @@ def main(
358362
min_lr=learning_rate / 64,
359363
threshold=1e-3,
360364
)
361-
scaler = torch.amp.GradScaler(device=device.type, enabled=amp)
362-
363365
### [Checkpoint Save/Load]
364366
metadata_dict: dict[str, Any] = {}
365367
epoch = load_checkpoint(
366368
checkpoint_dir,
367369
models=base_model,
368370
optimizer=optimizer,
369371
scheduler=scheduler,
370-
scaler=scaler,
371372
metadata_dict=metadata_dict,
372373
device=dist.device,
373374
)
@@ -438,7 +439,6 @@ def main(
438439
**config_settings,
439440
"optimizer": optimizer.__class__.__name__,
440441
"scheduler": scheduler.__class__.__name__,
441-
"scaler": scaler.__class__.__name__,
442442
"physicsnemo_pkg_info": get_physicsnemo_pkg_info(),
443443
"world_size": dist.world_size,
444444
**{f"n_{split}_samples": len(sample_paths[split]) for split in splits},
@@ -505,15 +505,13 @@ def run_epoch(split: Split) -> tuple[torch.Tensor, dict[str, torch.Tensor]]:
505505
if torch.isnan(batch_loss):
506506
warnings.warn(f"{batch_loss=} at: {dist.rank=}, {epoch=}")
507507
with record_function("backward"):
508-
scaler.scale(batch_loss).backward()
508+
batch_loss.backward()
509509
if gradient_clip_norm is not None:
510-
scaler.unscale_(optimizer)
511510
torch.nn.utils.clip_grad_norm_(
512511
model.parameters(), max_norm=gradient_clip_norm
513512
)
514513
with record_function("optimizer_step"):
515-
scaler.step(optimizer)
516-
scaler.update()
514+
optimizer.step()
517515

518516
all_batch_losses.append(batch_loss.detach().clone())
519517
for k, v in batch_loss_components.items():
@@ -594,7 +592,6 @@ def save_ckpt() -> None:
594592
models=base_model,
595593
optimizer=optimizer,
596594
scheduler=scheduler,
597-
scaler=scaler,
598595
epoch=epoch,
599596
metadata=checkpoint_metadata(),
600597
)

0 commit comments

Comments
 (0)