From b6f519b28763e19903b2eba1bdf4dde7c58e3b42 Mon Sep 17 00:00:00 2001 From: "Eng.Ahmed ElBamby" Date: Thu, 2 Apr 2026 23:05:15 +0000 Subject: [PATCH 1/3] Add multitask configs, task preflight checks, and flash backend controls --- ultralytics/cfg/models/v13/yolov13-obb.yaml | 52 +++ ultralytics/cfg/models/v13/yolov13-pose.yaml | 53 +++ ultralytics/cfg/models/v13/yolov13-seg.yaml | 52 +++ ultralytics/cfg/models/v13/yolov13l-obb.yaml | 49 +++ ultralytics/cfg/models/v13/yolov13l-pose.yaml | 50 +++ ultralytics/cfg/models/v13/yolov13l-seg.yaml | 49 +++ ultralytics/cfg/models/v13/yolov13l.yaml | 50 +++ ultralytics/cfg/models/v13/yolov13n-obb.yaml | 49 +++ ultralytics/cfg/models/v13/yolov13n-pose.yaml | 50 +++ ultralytics/cfg/models/v13/yolov13n-seg.yaml | 49 +++ ultralytics/cfg/models/v13/yolov13s-obb.yaml | 49 +++ ultralytics/cfg/models/v13/yolov13s-pose.yaml | 50 +++ ultralytics/cfg/models/v13/yolov13s-seg.yaml | 49 +++ ultralytics/cfg/models/v13/yolov13s.yaml | 50 +++ ultralytics/cfg/models/v13/yolov13x-obb.yaml | 49 +++ ultralytics/cfg/models/v13/yolov13x-pose.yaml | 50 +++ ultralytics/cfg/models/v13/yolov13x-seg.yaml | 49 +++ ultralytics/cfg/models/v13/yolov13x.yaml | 50 +++ ultralytics/data/utils.py | 144 ++++++- ultralytics/engine/trainer.py | 25 +- ultralytics/engine/validator.py | 2 +- ultralytics/models/yolo/world/train_world.py | 4 +- ultralytics/nn/modules/block.py | 352 ++++++++++-------- ultralytics/utils/dist.py | 71 +++- ultralytics/utils/flash_turing_interface.py | 68 ++++ ultralytics/utils/metrics.py | 22 +- 26 files changed, 1395 insertions(+), 192 deletions(-) create mode 100644 ultralytics/cfg/models/v13/yolov13-obb.yaml create mode 100644 ultralytics/cfg/models/v13/yolov13-pose.yaml create mode 100644 ultralytics/cfg/models/v13/yolov13-seg.yaml create mode 100644 ultralytics/cfg/models/v13/yolov13l-obb.yaml create mode 100644 ultralytics/cfg/models/v13/yolov13l-pose.yaml create mode 100644 ultralytics/cfg/models/v13/yolov13l-seg.yaml create mode 
100644 ultralytics/cfg/models/v13/yolov13l.yaml create mode 100644 ultralytics/cfg/models/v13/yolov13n-obb.yaml create mode 100644 ultralytics/cfg/models/v13/yolov13n-pose.yaml create mode 100644 ultralytics/cfg/models/v13/yolov13n-seg.yaml create mode 100644 ultralytics/cfg/models/v13/yolov13s-obb.yaml create mode 100644 ultralytics/cfg/models/v13/yolov13s-pose.yaml create mode 100644 ultralytics/cfg/models/v13/yolov13s-seg.yaml create mode 100644 ultralytics/cfg/models/v13/yolov13s.yaml create mode 100644 ultralytics/cfg/models/v13/yolov13x-obb.yaml create mode 100644 ultralytics/cfg/models/v13/yolov13x-pose.yaml create mode 100644 ultralytics/cfg/models/v13/yolov13x-seg.yaml create mode 100644 ultralytics/cfg/models/v13/yolov13x.yaml create mode 100644 ultralytics/utils/flash_turing_interface.py diff --git a/ultralytics/cfg/models/v13/yolov13-obb.yaml b/ultralytics/cfg/models/v13/yolov13-obb.yaml new file mode 100644 index 000000000..24b83154d --- /dev/null +++ b/ultralytics/cfg/models/v13/yolov13-obb.yaml @@ -0,0 +1,52 @@ +nc: 80 # number of classes +scales: # model compound scaling constants + # [depth, width, max_channels] + n: [0.50, 0.25, 1024] # Nano + s: [0.50, 0.50, 1024] # Small + m: [0.75, 0.75, 768] # Medium + l: [1.00, 1.00, 512] # Large + x: [1.00, 1.50, 512] # Extra Large + xl: [1.25, 1.75, 512] # Extra Large Plus + +backbone: + # [from, repeats, module, args] + - [-1, 1, Conv, [64, 3, 2]] + - [-1, 1, Conv, [128, 3, 2, 1, 2]] + - [-1, 2, DSC3k2, [256, False, 0.25]] + - [-1, 1, Conv, [256, 3, 2, 1, 4]] + - [-1, 2, DSC3k2, [512, False, 0.25]] + - [-1, 1, DSConv, [512, 3, 2]] + - [-1, 4, A2C2f, [512, True, 4]] + - [-1, 1, DSConv, [1024, 3, 2]] + - [-1, 4, A2C2f, [1024, True, 1]] + +head: + - [[4, 6, 8], 2, HyperACE, [512, 8, True, True, 0.5, 1, "both"]] + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [ 9, 1, DownsampleConv, []] + - [[6, 9], 1, FullPAD_Tunnel, []] + - [[4, 10], 1, FullPAD_Tunnel, []] + - [[8, 11], 1, FullPAD_Tunnel, []] + + - [-1, 
1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 12], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [17, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 13], 1, Concat, [1]] + - [-1, 2, DSC3k2, [256, True]] + - [10, 1, Conv, [256, 1, 1]] + - [[21, 22], 1, FullPAD_Tunnel, []] + + - [-1, 1, Conv, [256, 3, 2]] + - [[-1, 18], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [26, 1, Conv, [512, 3, 2]] + - [[-1, 14], 1, Concat, [1]] + - [-1, 2, DSC3k2, [1024, True]] + - [[-1, 11], 1, FullPAD_Tunnel, []] + + - [[23, 27, 31], 1, OBB, [nc, 1]] diff --git a/ultralytics/cfg/models/v13/yolov13-pose.yaml b/ultralytics/cfg/models/v13/yolov13-pose.yaml new file mode 100644 index 000000000..b37076573 --- /dev/null +++ b/ultralytics/cfg/models/v13/yolov13-pose.yaml @@ -0,0 +1,53 @@ +nc: 80 # number of classes +kpt_shape: [17, 3] # number of keypoints, number of dims (2 for x,y or 3 for x,y,visible) +scales: # model compound scaling constants + # [depth, width, max_channels] + n: [0.50, 0.25, 1024] # Nano + s: [0.50, 0.50, 1024] # Small + m: [0.75, 0.75, 768] # Medium + l: [1.00, 1.00, 512] # Large + x: [1.00, 1.50, 512] # Extra Large + xl: [1.25, 1.75, 512] # Extra Large Plus + +backbone: + # [from, repeats, module, args] + - [-1, 1, Conv, [64, 3, 2]] + - [-1, 1, Conv, [128, 3, 2, 1, 2]] + - [-1, 2, DSC3k2, [256, False, 0.25]] + - [-1, 1, Conv, [256, 3, 2, 1, 4]] + - [-1, 2, DSC3k2, [512, False, 0.25]] + - [-1, 1, DSConv, [512, 3, 2]] + - [-1, 4, A2C2f, [512, True, 4]] + - [-1, 1, DSConv, [1024, 3, 2]] + - [-1, 4, A2C2f, [1024, True, 1]] + +head: + - [[4, 6, 8], 2, HyperACE, [512, 8, True, True, 0.5, 1, "both"]] + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [ 9, 1, DownsampleConv, []] + - [[6, 9], 1, FullPAD_Tunnel, []] + - [[4, 10], 1, FullPAD_Tunnel, []] + - [[8, 11], 1, FullPAD_Tunnel, []] + + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 12], 1, Concat, [1]] + - [-1, 2, DSC3k2, 
[512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [17, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 13], 1, Concat, [1]] + - [-1, 2, DSC3k2, [256, True]] + - [10, 1, Conv, [256, 1, 1]] + - [[21, 22], 1, FullPAD_Tunnel, []] + + - [-1, 1, Conv, [256, 3, 2]] + - [[-1, 18], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [26, 1, Conv, [512, 3, 2]] + - [[-1, 14], 1, Concat, [1]] + - [-1, 2, DSC3k2, [1024, True]] + - [[-1, 11], 1, FullPAD_Tunnel, []] + + - [[23, 27, 31], 1, Pose, [nc, kpt_shape]] diff --git a/ultralytics/cfg/models/v13/yolov13-seg.yaml b/ultralytics/cfg/models/v13/yolov13-seg.yaml new file mode 100644 index 000000000..33ed2f075 --- /dev/null +++ b/ultralytics/cfg/models/v13/yolov13-seg.yaml @@ -0,0 +1,52 @@ +nc: 80 # number of classes +scales: # model compound scaling constants + # [depth, width, max_channels] + n: [0.50, 0.25, 1024] # Nano + s: [0.50, 0.50, 1024] # Small + m: [0.75, 0.75, 768] # Medium + l: [1.00, 1.00, 512] # Large + x: [1.00, 1.50, 512] # Extra Large + xl: [1.25, 1.75, 512] # Extra Large Plus + +backbone: + # [from, repeats, module, args] + - [-1, 1, Conv, [64, 3, 2]] + - [-1, 1, Conv, [128, 3, 2, 1, 2]] + - [-1, 2, DSC3k2, [256, False, 0.25]] + - [-1, 1, Conv, [256, 3, 2, 1, 4]] + - [-1, 2, DSC3k2, [512, False, 0.25]] + - [-1, 1, DSConv, [512, 3, 2]] + - [-1, 4, A2C2f, [512, True, 4]] + - [-1, 1, DSConv, [1024, 3, 2]] + - [-1, 4, A2C2f, [1024, True, 1]] + +head: + - [[4, 6, 8], 2, HyperACE, [512, 8, True, True, 0.5, 1, "both"]] + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [ 9, 1, DownsampleConv, []] + - [[6, 9], 1, FullPAD_Tunnel, []] + - [[4, 10], 1, FullPAD_Tunnel, []] + - [[8, 11], 1, FullPAD_Tunnel, []] + + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 12], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [17, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 13], 1, Concat, [1]] + - [-1, 2, DSC3k2, [256, True]] + - [10, 
1, Conv, [256, 1, 1]] + - [[21, 22], 1, FullPAD_Tunnel, []] + + - [-1, 1, Conv, [256, 3, 2]] + - [[-1, 18], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [26, 1, Conv, [512, 3, 2]] + - [[-1, 14], 1, Concat, [1]] + - [-1, 2, DSC3k2, [1024, True]] + - [[-1, 11], 1, FullPAD_Tunnel, []] + + - [[23, 27, 31], 1, Segment, [nc, 32, 256]] diff --git a/ultralytics/cfg/models/v13/yolov13l-obb.yaml b/ultralytics/cfg/models/v13/yolov13l-obb.yaml new file mode 100644 index 000000000..b7a71e767 --- /dev/null +++ b/ultralytics/cfg/models/v13/yolov13l-obb.yaml @@ -0,0 +1,49 @@ +nc: 80 # number of classes +scales: # model compound scaling constants + # [depth, width, max_channels] + n: [0.50, 0.25, 1024] # Nano + s: [0.50, 0.50, 1024] # Small + l: [1.00, 1.00, 512] # Large + x: [1.00, 1.50, 512] # Extra Large +backbone: + # [from, repeats, module, args] + - [-1, 1, Conv, [64, 3, 2]] + - [-1, 1, Conv, [128, 3, 2, 1, 2]] + - [-1, 2, DSC3k2, [256, False, 0.25]] + - [-1, 1, Conv, [256, 3, 2, 1, 4]] + - [-1, 2, DSC3k2, [512, False, 0.25]] + - [-1, 1, DSConv, [512, 3, 2]] + - [-1, 4, A2C2f, [512, True, 4]] + - [-1, 1, DSConv, [1024, 3, 2]] + - [-1, 4, A2C2f, [1024, True, 1]] + +head: + - [[4, 6, 8], 2, HyperACE, [512, 8, True, True, 0.5, 1, "both"]] + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [ 9, 1, DownsampleConv, []] + - [[6, 9], 1, FullPAD_Tunnel, []] + - [[4, 10], 1, FullPAD_Tunnel, []] + - [[8, 11], 1, FullPAD_Tunnel, []] + + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 12], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [17, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 13], 1, Concat, [1]] + - [-1, 2, DSC3k2, [256, True]] + - [10, 1, Conv, [256, 1, 1]] + - [[21, 22], 1, FullPAD_Tunnel, []] + + - [-1, 1, Conv, [256, 3, 2]] + - [[-1, 18], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [26, 1, Conv, [512, 3, 2]] + - [[-1, 14], 1, 
Concat, [1]] + - [-1, 2, DSC3k2, [1024, True]] + - [[-1, 11], 1, FullPAD_Tunnel, []] + + - [[23, 27, 31], 1, OBB, [nc, 1]] diff --git a/ultralytics/cfg/models/v13/yolov13l-pose.yaml b/ultralytics/cfg/models/v13/yolov13l-pose.yaml new file mode 100644 index 000000000..1dbb90ba5 --- /dev/null +++ b/ultralytics/cfg/models/v13/yolov13l-pose.yaml @@ -0,0 +1,50 @@ +nc: 80 # number of classes +kpt_shape: [17, 3] # number of keypoints, number of dims (2 for x,y or 3 for x,y,visible) +scales: # model compound scaling constants + # [depth, width, max_channels] + n: [0.50, 0.25, 1024] # Nano + s: [0.50, 0.50, 1024] # Small + l: [1.00, 1.00, 512] # Large + x: [1.00, 1.50, 512] # Extra Large +backbone: + # [from, repeats, module, args] + - [-1, 1, Conv, [64, 3, 2]] + - [-1, 1, Conv, [128, 3, 2, 1, 2]] + - [-1, 2, DSC3k2, [256, False, 0.25]] + - [-1, 1, Conv, [256, 3, 2, 1, 4]] + - [-1, 2, DSC3k2, [512, False, 0.25]] + - [-1, 1, DSConv, [512, 3, 2]] + - [-1, 4, A2C2f, [512, True, 4]] + - [-1, 1, DSConv, [1024, 3, 2]] + - [-1, 4, A2C2f, [1024, True, 1]] + +head: + - [[4, 6, 8], 2, HyperACE, [512, 8, True, True, 0.5, 1, "both"]] + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [ 9, 1, DownsampleConv, []] + - [[6, 9], 1, FullPAD_Tunnel, []] + - [[4, 10], 1, FullPAD_Tunnel, []] + - [[8, 11], 1, FullPAD_Tunnel, []] + + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 12], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [17, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 13], 1, Concat, [1]] + - [-1, 2, DSC3k2, [256, True]] + - [10, 1, Conv, [256, 1, 1]] + - [[21, 22], 1, FullPAD_Tunnel, []] + + - [-1, 1, Conv, [256, 3, 2]] + - [[-1, 18], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [26, 1, Conv, [512, 3, 2]] + - [[-1, 14], 1, Concat, [1]] + - [-1, 2, DSC3k2, [1024, True]] + - [[-1, 11], 1, FullPAD_Tunnel, []] + + - [[23, 27, 31], 1, Pose, [nc, kpt_shape]] diff --git 
a/ultralytics/cfg/models/v13/yolov13l-seg.yaml b/ultralytics/cfg/models/v13/yolov13l-seg.yaml new file mode 100644 index 000000000..d675cc2b9 --- /dev/null +++ b/ultralytics/cfg/models/v13/yolov13l-seg.yaml @@ -0,0 +1,49 @@ +nc: 80 # number of classes +scales: # model compound scaling constants + # [depth, width, max_channels] + n: [0.50, 0.25, 1024] # Nano + s: [0.50, 0.50, 1024] # Small + l: [1.00, 1.00, 512] # Large + x: [1.00, 1.50, 512] # Extra Large +backbone: + # [from, repeats, module, args] + - [-1, 1, Conv, [64, 3, 2]] + - [-1, 1, Conv, [128, 3, 2, 1, 2]] + - [-1, 2, DSC3k2, [256, False, 0.25]] + - [-1, 1, Conv, [256, 3, 2, 1, 4]] + - [-1, 2, DSC3k2, [512, False, 0.25]] + - [-1, 1, DSConv, [512, 3, 2]] + - [-1, 4, A2C2f, [512, True, 4]] + - [-1, 1, DSConv, [1024, 3, 2]] + - [-1, 4, A2C2f, [1024, True, 1]] + +head: + - [[4, 6, 8], 2, HyperACE, [512, 8, True, True, 0.5, 1, "both"]] + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [ 9, 1, DownsampleConv, []] + - [[6, 9], 1, FullPAD_Tunnel, []] + - [[4, 10], 1, FullPAD_Tunnel, []] + - [[8, 11], 1, FullPAD_Tunnel, []] + + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 12], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [17, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 13], 1, Concat, [1]] + - [-1, 2, DSC3k2, [256, True]] + - [10, 1, Conv, [256, 1, 1]] + - [[21, 22], 1, FullPAD_Tunnel, []] + + - [-1, 1, Conv, [256, 3, 2]] + - [[-1, 18], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [26, 1, Conv, [512, 3, 2]] + - [[-1, 14], 1, Concat, [1]] + - [-1, 2, DSC3k2, [1024, True]] + - [[-1, 11], 1, FullPAD_Tunnel, []] + + - [[23, 27, 31], 1, Segment, [nc, 32, 256]] diff --git a/ultralytics/cfg/models/v13/yolov13l.yaml b/ultralytics/cfg/models/v13/yolov13l.yaml new file mode 100644 index 000000000..68babb93e --- /dev/null +++ b/ultralytics/cfg/models/v13/yolov13l.yaml @@ -0,0 +1,50 @@ +nc: 80 # number of classes 
+scales: # model compound scaling constants, i.e. 'model=yolov13n.yaml' will call yolov13.yaml with scale 'n' + # [depth, width, max_channels] + n: [0.50, 0.25, 1024] # Nano + s: [0.50, 0.50, 1024] # Small + l: [1.00, 1.00, 512] # Large + x: [1.00, 1.50, 512] # Extra Large + +backbone: + # [from, repeats, module, args] + - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2 + - [-1, 1, Conv, [128, 3, 2, 1, 2]] # 1-P2/4 + - [-1, 2, DSC3k2, [256, False, 0.25]] + - [-1, 1, Conv, [256, 3, 2, 1, 4]] # 3-P3/8 + - [-1, 2, DSC3k2, [512, False, 0.25]] + - [-1, 1, DSConv, [512, 3, 2]] # 5-P4/16 + - [-1, 4, A2C2f, [512, True, 4]] + - [-1, 1, DSConv, [1024, 3, 2]] # 7-P5/32 + - [-1, 4, A2C2f, [1024, True, 1]] # 8 + +head: + - [[4, 6, 8], 2, HyperACE, [512, 8, True, True, 0.5, 1, "both"]] + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [ 9, 1, DownsampleConv, []] + - [[6, 9], 1, FullPAD_Tunnel, []] #12 + - [[4, 10], 1, FullPAD_Tunnel, []] #13 + - [[8, 11], 1, FullPAD_Tunnel, []] #14 + + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 12], 1, Concat, [1]] # cat backbone P4 + - [-1, 2, DSC3k2, [512, True]] # 17 + - [[-1, 9], 1, FullPAD_Tunnel, []] #18 + + - [17, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 13], 1, Concat, [1]] # cat backbone P3 + - [-1, 2, DSC3k2, [256, True]] # 21 + - [10, 1, Conv, [256, 1, 1]] + - [[21, 22], 1, FullPAD_Tunnel, []] #23 + + - [-1, 1, Conv, [256, 3, 2]] + - [[-1, 18], 1, Concat, [1]] # cat head P4 + - [-1, 2, DSC3k2, [512, True]] # 26 + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [26, 1, Conv, [512, 3, 2]] + - [[-1, 14], 1, Concat, [1]] # cat head P5 + - [-1, 2, DSC3k2, [1024,True]] # 30 (P5/32-large) + - [[-1, 11], 1, FullPAD_Tunnel, []] + + - [[23, 27, 31], 1, Detect, [nc]] # Detect(P3, P4, P5) diff --git a/ultralytics/cfg/models/v13/yolov13n-obb.yaml b/ultralytics/cfg/models/v13/yolov13n-obb.yaml new file mode 100644 index 000000000..b7a71e767 --- /dev/null +++ b/ultralytics/cfg/models/v13/yolov13n-obb.yaml @@ -0,0 +1,49 @@ +nc: 80 # number of classes 
+scales: # model compound scaling constants + # [depth, width, max_channels] + n: [0.50, 0.25, 1024] # Nano + s: [0.50, 0.50, 1024] # Small + l: [1.00, 1.00, 512] # Large + x: [1.00, 1.50, 512] # Extra Large +backbone: + # [from, repeats, module, args] + - [-1, 1, Conv, [64, 3, 2]] + - [-1, 1, Conv, [128, 3, 2, 1, 2]] + - [-1, 2, DSC3k2, [256, False, 0.25]] + - [-1, 1, Conv, [256, 3, 2, 1, 4]] + - [-1, 2, DSC3k2, [512, False, 0.25]] + - [-1, 1, DSConv, [512, 3, 2]] + - [-1, 4, A2C2f, [512, True, 4]] + - [-1, 1, DSConv, [1024, 3, 2]] + - [-1, 4, A2C2f, [1024, True, 1]] + +head: + - [[4, 6, 8], 2, HyperACE, [512, 8, True, True, 0.5, 1, "both"]] + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [ 9, 1, DownsampleConv, []] + - [[6, 9], 1, FullPAD_Tunnel, []] + - [[4, 10], 1, FullPAD_Tunnel, []] + - [[8, 11], 1, FullPAD_Tunnel, []] + + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 12], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [17, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 13], 1, Concat, [1]] + - [-1, 2, DSC3k2, [256, True]] + - [10, 1, Conv, [256, 1, 1]] + - [[21, 22], 1, FullPAD_Tunnel, []] + + - [-1, 1, Conv, [256, 3, 2]] + - [[-1, 18], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [26, 1, Conv, [512, 3, 2]] + - [[-1, 14], 1, Concat, [1]] + - [-1, 2, DSC3k2, [1024, True]] + - [[-1, 11], 1, FullPAD_Tunnel, []] + + - [[23, 27, 31], 1, OBB, [nc, 1]] diff --git a/ultralytics/cfg/models/v13/yolov13n-pose.yaml b/ultralytics/cfg/models/v13/yolov13n-pose.yaml new file mode 100644 index 000000000..1dbb90ba5 --- /dev/null +++ b/ultralytics/cfg/models/v13/yolov13n-pose.yaml @@ -0,0 +1,50 @@ +nc: 80 # number of classes +kpt_shape: [17, 3] # number of keypoints, number of dims (2 for x,y or 3 for x,y,visible) +scales: # model compound scaling constants + # [depth, width, max_channels] + n: [0.50, 0.25, 1024] # Nano + s: [0.50, 0.50, 1024] # Small + l: [1.00, 1.00, 512] 
# Large + x: [1.00, 1.50, 512] # Extra Large +backbone: + # [from, repeats, module, args] + - [-1, 1, Conv, [64, 3, 2]] + - [-1, 1, Conv, [128, 3, 2, 1, 2]] + - [-1, 2, DSC3k2, [256, False, 0.25]] + - [-1, 1, Conv, [256, 3, 2, 1, 4]] + - [-1, 2, DSC3k2, [512, False, 0.25]] + - [-1, 1, DSConv, [512, 3, 2]] + - [-1, 4, A2C2f, [512, True, 4]] + - [-1, 1, DSConv, [1024, 3, 2]] + - [-1, 4, A2C2f, [1024, True, 1]] + +head: + - [[4, 6, 8], 2, HyperACE, [512, 8, True, True, 0.5, 1, "both"]] + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [ 9, 1, DownsampleConv, []] + - [[6, 9], 1, FullPAD_Tunnel, []] + - [[4, 10], 1, FullPAD_Tunnel, []] + - [[8, 11], 1, FullPAD_Tunnel, []] + + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 12], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [17, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 13], 1, Concat, [1]] + - [-1, 2, DSC3k2, [256, True]] + - [10, 1, Conv, [256, 1, 1]] + - [[21, 22], 1, FullPAD_Tunnel, []] + + - [-1, 1, Conv, [256, 3, 2]] + - [[-1, 18], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [26, 1, Conv, [512, 3, 2]] + - [[-1, 14], 1, Concat, [1]] + - [-1, 2, DSC3k2, [1024, True]] + - [[-1, 11], 1, FullPAD_Tunnel, []] + + - [[23, 27, 31], 1, Pose, [nc, kpt_shape]] diff --git a/ultralytics/cfg/models/v13/yolov13n-seg.yaml b/ultralytics/cfg/models/v13/yolov13n-seg.yaml new file mode 100644 index 000000000..d675cc2b9 --- /dev/null +++ b/ultralytics/cfg/models/v13/yolov13n-seg.yaml @@ -0,0 +1,49 @@ +nc: 80 # number of classes +scales: # model compound scaling constants + # [depth, width, max_channels] + n: [0.50, 0.25, 1024] # Nano + s: [0.50, 0.50, 1024] # Small + l: [1.00, 1.00, 512] # Large + x: [1.00, 1.50, 512] # Extra Large +backbone: + # [from, repeats, module, args] + - [-1, 1, Conv, [64, 3, 2]] + - [-1, 1, Conv, [128, 3, 2, 1, 2]] + - [-1, 2, DSC3k2, [256, False, 0.25]] + - [-1, 1, Conv, [256, 3, 2, 1, 4]] + - [-1, 2, 
DSC3k2, [512, False, 0.25]] + - [-1, 1, DSConv, [512, 3, 2]] + - [-1, 4, A2C2f, [512, True, 4]] + - [-1, 1, DSConv, [1024, 3, 2]] + - [-1, 4, A2C2f, [1024, True, 1]] + +head: + - [[4, 6, 8], 2, HyperACE, [512, 8, True, True, 0.5, 1, "both"]] + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [ 9, 1, DownsampleConv, []] + - [[6, 9], 1, FullPAD_Tunnel, []] + - [[4, 10], 1, FullPAD_Tunnel, []] + - [[8, 11], 1, FullPAD_Tunnel, []] + + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 12], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [17, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 13], 1, Concat, [1]] + - [-1, 2, DSC3k2, [256, True]] + - [10, 1, Conv, [256, 1, 1]] + - [[21, 22], 1, FullPAD_Tunnel, []] + + - [-1, 1, Conv, [256, 3, 2]] + - [[-1, 18], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [26, 1, Conv, [512, 3, 2]] + - [[-1, 14], 1, Concat, [1]] + - [-1, 2, DSC3k2, [1024, True]] + - [[-1, 11], 1, FullPAD_Tunnel, []] + + - [[23, 27, 31], 1, Segment, [nc, 32, 256]] diff --git a/ultralytics/cfg/models/v13/yolov13s-obb.yaml b/ultralytics/cfg/models/v13/yolov13s-obb.yaml new file mode 100644 index 000000000..b7a71e767 --- /dev/null +++ b/ultralytics/cfg/models/v13/yolov13s-obb.yaml @@ -0,0 +1,49 @@ +nc: 80 # number of classes +scales: # model compound scaling constants + # [depth, width, max_channels] + n: [0.50, 0.25, 1024] # Nano + s: [0.50, 0.50, 1024] # Small + l: [1.00, 1.00, 512] # Large + x: [1.00, 1.50, 512] # Extra Large +backbone: + # [from, repeats, module, args] + - [-1, 1, Conv, [64, 3, 2]] + - [-1, 1, Conv, [128, 3, 2, 1, 2]] + - [-1, 2, DSC3k2, [256, False, 0.25]] + - [-1, 1, Conv, [256, 3, 2, 1, 4]] + - [-1, 2, DSC3k2, [512, False, 0.25]] + - [-1, 1, DSConv, [512, 3, 2]] + - [-1, 4, A2C2f, [512, True, 4]] + - [-1, 1, DSConv, [1024, 3, 2]] + - [-1, 4, A2C2f, [1024, True, 1]] + +head: + - [[4, 6, 8], 2, HyperACE, [512, 8, True, True, 0.5, 1, "both"]] + - 
[-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [ 9, 1, DownsampleConv, []] + - [[6, 9], 1, FullPAD_Tunnel, []] + - [[4, 10], 1, FullPAD_Tunnel, []] + - [[8, 11], 1, FullPAD_Tunnel, []] + + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 12], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [17, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 13], 1, Concat, [1]] + - [-1, 2, DSC3k2, [256, True]] + - [10, 1, Conv, [256, 1, 1]] + - [[21, 22], 1, FullPAD_Tunnel, []] + + - [-1, 1, Conv, [256, 3, 2]] + - [[-1, 18], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [26, 1, Conv, [512, 3, 2]] + - [[-1, 14], 1, Concat, [1]] + - [-1, 2, DSC3k2, [1024, True]] + - [[-1, 11], 1, FullPAD_Tunnel, []] + + - [[23, 27, 31], 1, OBB, [nc, 1]] diff --git a/ultralytics/cfg/models/v13/yolov13s-pose.yaml b/ultralytics/cfg/models/v13/yolov13s-pose.yaml new file mode 100644 index 000000000..1dbb90ba5 --- /dev/null +++ b/ultralytics/cfg/models/v13/yolov13s-pose.yaml @@ -0,0 +1,50 @@ +nc: 80 # number of classes +kpt_shape: [17, 3] # number of keypoints, number of dims (2 for x,y or 3 for x,y,visible) +scales: # model compound scaling constants + # [depth, width, max_channels] + n: [0.50, 0.25, 1024] # Nano + s: [0.50, 0.50, 1024] # Small + l: [1.00, 1.00, 512] # Large + x: [1.00, 1.50, 512] # Extra Large +backbone: + # [from, repeats, module, args] + - [-1, 1, Conv, [64, 3, 2]] + - [-1, 1, Conv, [128, 3, 2, 1, 2]] + - [-1, 2, DSC3k2, [256, False, 0.25]] + - [-1, 1, Conv, [256, 3, 2, 1, 4]] + - [-1, 2, DSC3k2, [512, False, 0.25]] + - [-1, 1, DSConv, [512, 3, 2]] + - [-1, 4, A2C2f, [512, True, 4]] + - [-1, 1, DSConv, [1024, 3, 2]] + - [-1, 4, A2C2f, [1024, True, 1]] + +head: + - [[4, 6, 8], 2, HyperACE, [512, 8, True, True, 0.5, 1, "both"]] + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [ 9, 1, DownsampleConv, []] + - [[6, 9], 1, FullPAD_Tunnel, []] + - [[4, 10], 1, FullPAD_Tunnel, []] + - [[8, 11], 
1, FullPAD_Tunnel, []] + + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 12], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [17, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 13], 1, Concat, [1]] + - [-1, 2, DSC3k2, [256, True]] + - [10, 1, Conv, [256, 1, 1]] + - [[21, 22], 1, FullPAD_Tunnel, []] + + - [-1, 1, Conv, [256, 3, 2]] + - [[-1, 18], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [26, 1, Conv, [512, 3, 2]] + - [[-1, 14], 1, Concat, [1]] + - [-1, 2, DSC3k2, [1024, True]] + - [[-1, 11], 1, FullPAD_Tunnel, []] + + - [[23, 27, 31], 1, Pose, [nc, kpt_shape]] diff --git a/ultralytics/cfg/models/v13/yolov13s-seg.yaml b/ultralytics/cfg/models/v13/yolov13s-seg.yaml new file mode 100644 index 000000000..d675cc2b9 --- /dev/null +++ b/ultralytics/cfg/models/v13/yolov13s-seg.yaml @@ -0,0 +1,49 @@ +nc: 80 # number of classes +scales: # model compound scaling constants + # [depth, width, max_channels] + n: [0.50, 0.25, 1024] # Nano + s: [0.50, 0.50, 1024] # Small + l: [1.00, 1.00, 512] # Large + x: [1.00, 1.50, 512] # Extra Large +backbone: + # [from, repeats, module, args] + - [-1, 1, Conv, [64, 3, 2]] + - [-1, 1, Conv, [128, 3, 2, 1, 2]] + - [-1, 2, DSC3k2, [256, False, 0.25]] + - [-1, 1, Conv, [256, 3, 2, 1, 4]] + - [-1, 2, DSC3k2, [512, False, 0.25]] + - [-1, 1, DSConv, [512, 3, 2]] + - [-1, 4, A2C2f, [512, True, 4]] + - [-1, 1, DSConv, [1024, 3, 2]] + - [-1, 4, A2C2f, [1024, True, 1]] + +head: + - [[4, 6, 8], 2, HyperACE, [512, 8, True, True, 0.5, 1, "both"]] + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [ 9, 1, DownsampleConv, []] + - [[6, 9], 1, FullPAD_Tunnel, []] + - [[4, 10], 1, FullPAD_Tunnel, []] + - [[8, 11], 1, FullPAD_Tunnel, []] + + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 12], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [17, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 13], 1, Concat, 
[1]] + - [-1, 2, DSC3k2, [256, True]] + - [10, 1, Conv, [256, 1, 1]] + - [[21, 22], 1, FullPAD_Tunnel, []] + + - [-1, 1, Conv, [256, 3, 2]] + - [[-1, 18], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [26, 1, Conv, [512, 3, 2]] + - [[-1, 14], 1, Concat, [1]] + - [-1, 2, DSC3k2, [1024, True]] + - [[-1, 11], 1, FullPAD_Tunnel, []] + + - [[23, 27, 31], 1, Segment, [nc, 32, 256]] diff --git a/ultralytics/cfg/models/v13/yolov13s.yaml b/ultralytics/cfg/models/v13/yolov13s.yaml new file mode 100644 index 000000000..68babb93e --- /dev/null +++ b/ultralytics/cfg/models/v13/yolov13s.yaml @@ -0,0 +1,50 @@ +nc: 80 # number of classes +scales: # model compound scaling constants, i.e. 'model=yolov13n.yaml' will call yolov13.yaml with scale 'n' + # [depth, width, max_channels] + n: [0.50, 0.25, 1024] # Nano + s: [0.50, 0.50, 1024] # Small + l: [1.00, 1.00, 512] # Large + x: [1.00, 1.50, 512] # Extra Large + +backbone: + # [from, repeats, module, args] + - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2 + - [-1, 1, Conv, [128, 3, 2, 1, 2]] # 1-P2/4 + - [-1, 2, DSC3k2, [256, False, 0.25]] + - [-1, 1, Conv, [256, 3, 2, 1, 4]] # 3-P3/8 + - [-1, 2, DSC3k2, [512, False, 0.25]] + - [-1, 1, DSConv, [512, 3, 2]] # 5-P4/16 + - [-1, 4, A2C2f, [512, True, 4]] + - [-1, 1, DSConv, [1024, 3, 2]] # 7-P5/32 + - [-1, 4, A2C2f, [1024, True, 1]] # 8 + +head: + - [[4, 6, 8], 2, HyperACE, [512, 8, True, True, 0.5, 1, "both"]] + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [ 9, 1, DownsampleConv, []] + - [[6, 9], 1, FullPAD_Tunnel, []] #12 + - [[4, 10], 1, FullPAD_Tunnel, []] #13 + - [[8, 11], 1, FullPAD_Tunnel, []] #14 + + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 12], 1, Concat, [1]] # cat backbone P4 + - [-1, 2, DSC3k2, [512, True]] # 17 + - [[-1, 9], 1, FullPAD_Tunnel, []] #18 + + - [17, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 13], 1, Concat, [1]] # cat backbone P3 + - [-1, 2, DSC3k2, [256, True]] # 21 + - [10, 1, Conv, [256, 1, 1]] + - 
[[21, 22], 1, FullPAD_Tunnel, []] #23 + + - [-1, 1, Conv, [256, 3, 2]] + - [[-1, 18], 1, Concat, [1]] # cat head P4 + - [-1, 2, DSC3k2, [512, True]] # 26 + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [26, 1, Conv, [512, 3, 2]] + - [[-1, 14], 1, Concat, [1]] # cat head P5 + - [-1, 2, DSC3k2, [1024,True]] # 30 (P5/32-large) + - [[-1, 11], 1, FullPAD_Tunnel, []] + + - [[23, 27, 31], 1, Detect, [nc]] # Detect(P3, P4, P5) diff --git a/ultralytics/cfg/models/v13/yolov13x-obb.yaml b/ultralytics/cfg/models/v13/yolov13x-obb.yaml new file mode 100644 index 000000000..b7a71e767 --- /dev/null +++ b/ultralytics/cfg/models/v13/yolov13x-obb.yaml @@ -0,0 +1,49 @@ +nc: 80 # number of classes +scales: # model compound scaling constants + # [depth, width, max_channels] + n: [0.50, 0.25, 1024] # Nano + s: [0.50, 0.50, 1024] # Small + l: [1.00, 1.00, 512] # Large + x: [1.00, 1.50, 512] # Extra Large +backbone: + # [from, repeats, module, args] + - [-1, 1, Conv, [64, 3, 2]] + - [-1, 1, Conv, [128, 3, 2, 1, 2]] + - [-1, 2, DSC3k2, [256, False, 0.25]] + - [-1, 1, Conv, [256, 3, 2, 1, 4]] + - [-1, 2, DSC3k2, [512, False, 0.25]] + - [-1, 1, DSConv, [512, 3, 2]] + - [-1, 4, A2C2f, [512, True, 4]] + - [-1, 1, DSConv, [1024, 3, 2]] + - [-1, 4, A2C2f, [1024, True, 1]] + +head: + - [[4, 6, 8], 2, HyperACE, [512, 8, True, True, 0.5, 1, "both"]] + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [ 9, 1, DownsampleConv, []] + - [[6, 9], 1, FullPAD_Tunnel, []] + - [[4, 10], 1, FullPAD_Tunnel, []] + - [[8, 11], 1, FullPAD_Tunnel, []] + + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 12], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [17, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 13], 1, Concat, [1]] + - [-1, 2, DSC3k2, [256, True]] + - [10, 1, Conv, [256, 1, 1]] + - [[21, 22], 1, FullPAD_Tunnel, []] + + - [-1, 1, Conv, [256, 3, 2]] + - [[-1, 18], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [26, 
1, Conv, [512, 3, 2]] + - [[-1, 14], 1, Concat, [1]] + - [-1, 2, DSC3k2, [1024, True]] + - [[-1, 11], 1, FullPAD_Tunnel, []] + + - [[23, 27, 31], 1, OBB, [nc, 1]] diff --git a/ultralytics/cfg/models/v13/yolov13x-pose.yaml b/ultralytics/cfg/models/v13/yolov13x-pose.yaml new file mode 100644 index 000000000..1dbb90ba5 --- /dev/null +++ b/ultralytics/cfg/models/v13/yolov13x-pose.yaml @@ -0,0 +1,50 @@ +nc: 80 # number of classes +kpt_shape: [17, 3] # number of keypoints, number of dims (2 for x,y or 3 for x,y,visible) +scales: # model compound scaling constants + # [depth, width, max_channels] + n: [0.50, 0.25, 1024] # Nano + s: [0.50, 0.50, 1024] # Small + l: [1.00, 1.00, 512] # Large + x: [1.00, 1.50, 512] # Extra Large +backbone: + # [from, repeats, module, args] + - [-1, 1, Conv, [64, 3, 2]] + - [-1, 1, Conv, [128, 3, 2, 1, 2]] + - [-1, 2, DSC3k2, [256, False, 0.25]] + - [-1, 1, Conv, [256, 3, 2, 1, 4]] + - [-1, 2, DSC3k2, [512, False, 0.25]] + - [-1, 1, DSConv, [512, 3, 2]] + - [-1, 4, A2C2f, [512, True, 4]] + - [-1, 1, DSConv, [1024, 3, 2]] + - [-1, 4, A2C2f, [1024, True, 1]] + +head: + - [[4, 6, 8], 2, HyperACE, [512, 8, True, True, 0.5, 1, "both"]] + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [ 9, 1, DownsampleConv, []] + - [[6, 9], 1, FullPAD_Tunnel, []] + - [[4, 10], 1, FullPAD_Tunnel, []] + - [[8, 11], 1, FullPAD_Tunnel, []] + + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 12], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [17, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 13], 1, Concat, [1]] + - [-1, 2, DSC3k2, [256, True]] + - [10, 1, Conv, [256, 1, 1]] + - [[21, 22], 1, FullPAD_Tunnel, []] + + - [-1, 1, Conv, [256, 3, 2]] + - [[-1, 18], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [26, 1, Conv, [512, 3, 2]] + - [[-1, 14], 1, Concat, [1]] + - [-1, 2, DSC3k2, [1024, True]] + - [[-1, 11], 1, FullPAD_Tunnel, []] + + - [[23, 27, 31], 1, Pose, [nc, 
kpt_shape]] diff --git a/ultralytics/cfg/models/v13/yolov13x-seg.yaml b/ultralytics/cfg/models/v13/yolov13x-seg.yaml new file mode 100644 index 000000000..d675cc2b9 --- /dev/null +++ b/ultralytics/cfg/models/v13/yolov13x-seg.yaml @@ -0,0 +1,49 @@ +nc: 80 # number of classes +scales: # model compound scaling constants + # [depth, width, max_channels] + n: [0.50, 0.25, 1024] # Nano + s: [0.50, 0.50, 1024] # Small + l: [1.00, 1.00, 512] # Large + x: [1.00, 1.50, 512] # Extra Large +backbone: + # [from, repeats, module, args] + - [-1, 1, Conv, [64, 3, 2]] + - [-1, 1, Conv, [128, 3, 2, 1, 2]] + - [-1, 2, DSC3k2, [256, False, 0.25]] + - [-1, 1, Conv, [256, 3, 2, 1, 4]] + - [-1, 2, DSC3k2, [512, False, 0.25]] + - [-1, 1, DSConv, [512, 3, 2]] + - [-1, 4, A2C2f, [512, True, 4]] + - [-1, 1, DSConv, [1024, 3, 2]] + - [-1, 4, A2C2f, [1024, True, 1]] + +head: + - [[4, 6, 8], 2, HyperACE, [512, 8, True, True, 0.5, 1, "both"]] + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [ 9, 1, DownsampleConv, []] + - [[6, 9], 1, FullPAD_Tunnel, []] + - [[4, 10], 1, FullPAD_Tunnel, []] + - [[8, 11], 1, FullPAD_Tunnel, []] + + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 12], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [17, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 13], 1, Concat, [1]] + - [-1, 2, DSC3k2, [256, True]] + - [10, 1, Conv, [256, 1, 1]] + - [[21, 22], 1, FullPAD_Tunnel, []] + + - [-1, 1, Conv, [256, 3, 2]] + - [[-1, 18], 1, Concat, [1]] + - [-1, 2, DSC3k2, [512, True]] + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [26, 1, Conv, [512, 3, 2]] + - [[-1, 14], 1, Concat, [1]] + - [-1, 2, DSC3k2, [1024, True]] + - [[-1, 11], 1, FullPAD_Tunnel, []] + + - [[23, 27, 31], 1, Segment, [nc, 32, 256]] diff --git a/ultralytics/cfg/models/v13/yolov13x.yaml b/ultralytics/cfg/models/v13/yolov13x.yaml new file mode 100644 index 000000000..68babb93e --- /dev/null +++ b/ultralytics/cfg/models/v13/yolov13x.yaml @@ -0,0 +1,50 @@ +nc: 80 # 
number of classes +scales: # model compound scaling constants, i.e. 'model=yolov13n.yaml' will call yolov13.yaml with scale 'n' + # [depth, width, max_channels] + n: [0.50, 0.25, 1024] # Nano + s: [0.50, 0.50, 1024] # Small + l: [1.00, 1.00, 512] # Large + x: [1.00, 1.50, 512] # Extra Large + +backbone: + # [from, repeats, module, args] + - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2 + - [-1, 1, Conv, [128, 3, 2, 1, 2]] # 1-P2/4 + - [-1, 2, DSC3k2, [256, False, 0.25]] + - [-1, 1, Conv, [256, 3, 2, 1, 4]] # 3-P3/8 + - [-1, 2, DSC3k2, [512, False, 0.25]] + - [-1, 1, DSConv, [512, 3, 2]] # 5-P4/16 + - [-1, 4, A2C2f, [512, True, 4]] + - [-1, 1, DSConv, [1024, 3, 2]] # 7-P5/32 + - [-1, 4, A2C2f, [1024, True, 1]] # 8 + +head: + - [[4, 6, 8], 2, HyperACE, [512, 8, True, True, 0.5, 1, "both"]] + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [ 9, 1, DownsampleConv, []] + - [[6, 9], 1, FullPAD_Tunnel, []] #12 + - [[4, 10], 1, FullPAD_Tunnel, []] #13 + - [[8, 11], 1, FullPAD_Tunnel, []] #14 + + - [-1, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 12], 1, Concat, [1]] # cat backbone P4 + - [-1, 2, DSC3k2, [512, True]] # 17 + - [[-1, 9], 1, FullPAD_Tunnel, []] #18 + + - [17, 1, nn.Upsample, [None, 2, "nearest"]] + - [[-1, 13], 1, Concat, [1]] # cat backbone P3 + - [-1, 2, DSC3k2, [256, True]] # 21 + - [10, 1, Conv, [256, 1, 1]] + - [[21, 22], 1, FullPAD_Tunnel, []] #23 + + - [-1, 1, Conv, [256, 3, 2]] + - [[-1, 18], 1, Concat, [1]] # cat head P4 + - [-1, 2, DSC3k2, [512, True]] # 26 + - [[-1, 9], 1, FullPAD_Tunnel, []] + + - [26, 1, Conv, [512, 3, 2]] + - [[-1, 14], 1, Concat, [1]] # cat head P5 + - [-1, 2, DSC3k2, [1024,True]] # 30 (P5/32-large) + - [[-1, 11], 1, FullPAD_Tunnel, []] + + - [[23, 27, 31], 1, Detect, [nc]] # Detect(P3, P4, P5) diff --git a/ultralytics/data/utils.py b/ultralytics/data/utils.py index 50b597d86..59ad043dc 100644 --- a/ultralytics/data/utils.py +++ b/ultralytics/data/utils.py @@ -298,7 +298,146 @@ def find_dataset_yaml(path: Path) -> Path: return 
files[0] -def check_det_dataset(dataset, autodownload=True): +def _sample_label_files(train_paths, max_files=200): + """Collect a sample of label files from train image paths.""" + label_files = [] + seen = set() + + def _add_from_dir(d: Path): + if not d.exists() or not d.is_dir(): + return False + added = False + for lb in d.rglob("*.txt"): + s = str(lb) + if s not in seen: + seen.add(s) + label_files.append(lb) + added = True + if len(label_files) >= max_files: + return True + return added + + for p in train_paths: + p = Path(p) + if not p.exists(): + continue + + # train path can be a text file with image paths + if p.is_file() and p.suffix.lower() == ".txt": + try: + im_files = [ + Path(x.strip()) for x in p.read_text(encoding="utf-8", errors="ignore").splitlines() if x.strip() + ] + except Exception: + im_files = [] + for im in im_files: + lb = Path(str(im).replace(f"{os.sep}images{os.sep}", f"{os.sep}labels{os.sep}")).with_suffix(".txt") + if lb.exists(): + s = str(lb) + if s not in seen: + seen.add(s) + label_files.append(lb) + if len(label_files) >= max_files: + return label_files + continue + + if p.is_file(): + continue + + candidates = [ + Path(str(p).replace(f"{os.sep}images{os.sep}", f"{os.sep}labels{os.sep}")), + p.parent / "labels", + p.parent.parent / "labels" / p.name, + p / "labels", + ] + for c in candidates: + if _add_from_dir(c) and len(label_files) >= max_files: + return label_files + + # Fallback: scan nearby roots for any labels subtree + for root in {p, p.parent, p.parent.parent}: + if not root.exists() or not root.is_dir(): + continue + for labels_dir in root.rglob("labels"): + if _add_from_dir(labels_dir) and len(label_files) >= max_files: + return label_files + + return label_files + + +def _validate_task_label_schema(data, task): + """Validate task-specific label line shape for a sample of train labels.""" + if task not in {"segment", "pose", "obb"}: + return + + train = data.get("train") + train_paths = train if isinstance(train, 
(list, tuple)) else [train] + label_files = _sample_label_files(train_paths) + if not label_files: + LOGGER.warning(f"WARNING ⚠️ task={task} preflight skipped: no label files sampled from train paths.") + return + + if task == "pose": + kpt_shape = data.get("kpt_shape") + if not isinstance(kpt_shape, (list, tuple)) or len(kpt_shape) != 2: + raise SyntaxError(emojis(f"Pose dataset requires 'kpt_shape: [num_kpts, dims]' in data YAML. {HELP_URL}")) + nkpt, ndim = int(kpt_shape[0]), int(kpt_shape[1]) + if nkpt <= 0 or ndim not in {2, 3}: + raise SyntaxError( + emojis(f"Invalid kpt_shape={kpt_shape}. Expected [num_kpts>0, dims in {{2,3}}]. {HELP_URL}") + ) + expected = 5 + nkpt * ndim + else: + expected = None + + errors = [] + checked = 0 + for lb in label_files: + try: + lines = lb.read_text(encoding="utf-8", errors="ignore").splitlines() + except Exception: + continue + + for i, line in enumerate(lines, 1): + line = line.strip() + if not line: + continue + parts = line.split() + checked += 1 + n = len(parts) + + try: + _ = [float(x) for x in parts] + except ValueError: + errors.append(f"{lb}:{i} contains non-numeric values") + if len(errors) >= 5: + break + continue + + if task == "pose": + if n != expected: + errors.append(f"{lb}:{i} has {n} columns, expected {expected} for pose") + elif task == "segment": + if n < 7 or n % 2 == 0: + errors.append(f"{lb}:{i} has {n} columns, expected cls + polygon coords (odd >= 7)") + elif task == "obb": + if n < 9 or n % 2 == 0: + errors.append(f"{lb}:{i} has {n} columns, expected cls + 8+ corner coords (odd >= 9)") + + if len(errors) >= 5: + break + if len(errors) >= 5: + break + + if errors: + hint = "\n".join(errors) + raise SyntaxError(emojis(f"Task preflight failed for task={task} ❌\n{hint}\n{HELP_URL}")) + + if checked == 0: + LOGGER.warning(f"WARNING ⚠️ task={task} preflight sampled labels but found no non-empty label rows.") + + +def check_det_dataset(dataset, autodownload=True, task=None): """ Download, verify, and/or 
unzip a dataset if not found locally. @@ -388,6 +527,9 @@ def check_det_dataset(dataset, autodownload=True): LOGGER.info(f"Dataset download {s}\n") check_font("Arial.ttf" if is_ascii(data["names"]) else "Arial.Unicode.ttf") # download fonts + if task in {"segment", "pose", "obb"}: + _validate_task_label_schema(data, task) + return data # dictionary diff --git a/ultralytics/engine/trainer.py b/ultralytics/engine/trainer.py index a373cd825..aa8287788 100644 --- a/ultralytics/engine/trainer.py +++ b/ultralytics/engine/trainer.py @@ -264,13 +264,19 @@ def _setup_train(self, world_size): self.amp = torch.tensor(check_amp(self.model), device=self.device) callbacks.default_callbacks = callbacks_backup # restore callbacks if RANK > -1 and world_size > 1: # DDP - dist.broadcast(self.amp, src=0) # broadcast the tensor from rank 0 to all other ranks (returns None) + self.amp = self.amp.int() # gloo backend may fail on bool tensor broadcast in some torch builds + dist.broadcast(self.amp, src=0) self.amp = bool(self.amp) # as boolean self.scaler = ( torch.amp.GradScaler("cuda", enabled=self.amp) if TORCH_2_4 else torch.cuda.amp.GradScaler(enabled=self.amp) ) if world_size > 1: - self.model = nn.parallel.DistributedDataParallel(self.model, device_ids=[RANK], find_unused_parameters=True) + self.model = nn.parallel.DistributedDataParallel( + self.model, + device_ids=[RANK], + find_unused_parameters=True, + gradient_as_bucket_view=False, + ) self.set_model_attributes() # set again after DDP wrapper # Check imgsz @@ -385,6 +391,19 @@ def _do_train(self, world_size=1): (self.tloss * i + self.loss_items) / (i + 1) if self.tloss is not None else self.loss_items ) + # Non-finite loss guard for DDP stability + loss_is_finite = torch.isfinite(self.loss.detach()).int() + if RANK != -1: + dist.all_reduce(loss_is_finite, op=dist.ReduceOp.MIN) + if not bool(loss_is_finite.item()): + if RANK in {-1, 0}: + loss_value = float(self.loss.detach().float().cpu()) + LOGGER.warning( + "WARNING ⚠️ 
Non-finite loss detected (%.4g). Skipping optimizer step." % loss_value + ) + self.optimizer.zero_grad(set_to_none=True) + continue + # Backward self.scaler.scale(self.loss).backward() @@ -560,7 +579,7 @@ def get_dataset(self): "pose", "obb", }: - data = check_det_dataset(self.args.data) + data = check_det_dataset(self.args.data, task=self.args.task) if "yaml_file" in data: self.args.data = data["yaml_file"] # for validating 'yolo train data=url.zip' usage except Exception as e: diff --git a/ultralytics/engine/validator.py b/ultralytics/engine/validator.py index 6dc8026f2..8757a0224 100644 --- a/ultralytics/engine/validator.py +++ b/ultralytics/engine/validator.py @@ -142,7 +142,7 @@ def __call__(self, trainer=None, model=None): LOGGER.info(f"Setting batch={self.args.batch} input of shape ({self.args.batch}, 3, {imgsz}, {imgsz})") if str(self.args.data).split(".")[-1] in {"yaml", "yml"}: - self.data = check_det_dataset(self.args.data) + self.data = check_det_dataset(self.args.data, task=self.args.task) elif self.args.task == "classify": self.data = check_cls_dataset(self.args.data, split=self.args.split) else: diff --git a/ultralytics/models/yolo/world/train_world.py b/ultralytics/models/yolo/world/train_world.py index 3cbdb2a4e..d344a5947 100644 --- a/ultralytics/models/yolo/world/train_world.py +++ b/ultralytics/models/yolo/world/train_world.py @@ -74,7 +74,9 @@ def get_dataset(self): data_yaml = self.args.data assert data_yaml.get("train", False), "train dataset not found" # object365.yaml assert data_yaml.get("val", False), "validation dataset not found" # lvis.yaml - data = {k: [check_det_dataset(d) for d in v.get("yolo_data", [])] for k, v in data_yaml.items()} + data = { + k: [check_det_dataset(d, task=self.args.task) for d in v.get("yolo_data", [])] for k, v in data_yaml.items() + } assert len(data["val"]) == 1, f"Only support validating on 1 dataset for now, but got {len(data['val'])}." 
import logging
import os

logger = logging.getLogger(__name__)

# Backend state mutated by configure_flash_backend(); "fallback" means manual SDPA attention.
USE_FLASH_ATTN = False
FLASH_BACKEND = "fallback"
FLASH_ERROR = ""


def configure_flash_backend(disable_flash=None, use_turing_flash=None):
    """Select the attention backend and record the choice in module globals.

    Preference order: official flash-attn on compute capability >= 8, the Turing (sm_75) port
    when explicitly enabled, otherwise the fallback attention path.

    Args:
        disable_flash (bool | None): Force the fallback backend. When None, reads the
            Y13_DISABLE_FLASH environment variable.
        use_turing_flash (bool | None): Permit the Turing flash port on sm_75 GPUs. When None,
            reads the Y13_USE_TURING_FLASH environment variable.

    Returns:
        (str): The selected backend: 'flash_attn', 'flash_attn_turing' or 'fallback'.
    """
    global USE_FLASH_ATTN, FLASH_BACKEND, FLASH_ERROR

    USE_FLASH_ATTN, FLASH_BACKEND, FLASH_ERROR = False, "fallback", ""

    try:
        import torch

        # Explicit arguments win over environment variables.
        if disable_flash is None:
            disable_flash = os.getenv("Y13_DISABLE_FLASH", "0") == "1"
        else:
            disable_flash = bool(disable_flash)
        if use_turing_flash is None:
            use_turing_flash = os.getenv("Y13_USE_TURING_FLASH", "0") == "1"
        else:
            use_turing_flash = bool(use_turing_flash)

        if torch.cuda.is_available() and not disable_flash:
            capability = torch.cuda.get_device_capability()

            if capability[0] >= 8:  # Ampere or newer: official flash-attn package
                try:
                    from flash_attn.flash_attn_interface import flash_attn_func as _impl

                    globals()["flash_attn_func"] = _impl
                    USE_FLASH_ATTN, FLASH_BACKEND = True, "flash_attn"
                except Exception as e:
                    FLASH_ERROR = str(e)
            elif capability == (7, 5) and use_turing_flash:  # Turing: opt-in local port
                try:
                    from ultralytics.utils.flash_turing_interface import flash_attn_func as _impl

                    globals()["flash_attn_func"] = _impl
                    USE_FLASH_ATTN, FLASH_BACKEND = True, "flash_attn_turing"
                except Exception as e:
                    FLASH_ERROR = str(e)

        if USE_FLASH_ATTN:
            logger.info(f"Flash backend selected: {FLASH_BACKEND}")
        elif disable_flash:
            logger.info("Flash attention disabled by Y13_DISABLE_FLASH=1, using fallback attention backend.")
        elif FLASH_ERROR:
            logger.warning(
                f"Flash attention backend unavailable ({FLASH_ERROR}). Using fallback attention backend."
            )
        else:
            logger.warning(
                "Flash attention backend unavailable on this device/config. Using fallback attention backend."
            )
    except Exception as e:
        # Any unexpected failure (including a missing torch) degrades to the fallback path.
        FLASH_ERROR = str(e)
        USE_FLASH_ATTN, FLASH_BACKEND = False, "fallback"
        logger.warning(f"Flash attention initialization failed ({FLASH_ERROR}). Using fallback attention backend.")

    return FLASH_BACKEND


configure_flash_backend()
(self.head_dim**-0.5) max_attn = attn.max(dim=-1, keepdim=True).values exp_attn = torch.exp(attn - max_attn) attn = exp_attn / exp_attn.sum(dim=-1, keepdim=True) - x = (v @ attn.transpose(-2, -1)) + x = v @ attn.transpose(-2, -1) - x = x.permute(0, 3, 1, 2) + x = x.permute(0, 3, 1, 2).contiguous() if self.area > 1: x = x.reshape(B // self.area, N * self.area, C) B, N, _ = x.shape - x = x.reshape(B, H, W, C).permute(0, 3, 1, 2) + x = x.reshape(B, H, W, C).permute(0, 3, 1, 2).contiguous() return self.proj(x + pp) - + class ABlock(nn.Module): """ @@ -1287,8 +1341,8 @@ class ABlock(nn.Module): >>> x = torch.randn(2, 64, 128, 128) >>> output = model(x) >>> print(output.shape) - - Notes: + + Notes: recommend that dim//num_heads be a multiple of 32 or 64. """ @@ -1316,7 +1370,7 @@ def forward(self, x): return x -class A2C2f(nn.Module): +class A2C2f(nn.Module): """ A2C2f module with residual enhanced feature extraction using ABlock blocks with area-attention. Also known as R-ELAN @@ -1361,7 +1415,10 @@ def __init__(self, c1, c2, n=1, a2=True, area=1, residual=False, mlp_ratio=2.0, self.gamma = nn.Parameter(init_values * torch.ones((c2)), requires_grad=True) if a2 and residual else None self.m = nn.ModuleList( - nn.Sequential(*(ABlock(c_, num_heads, mlp_ratio, area) for _ in range(2))) if a2 else C3k(c_, c_, 2, shortcut, g) for _ in range(n) + nn.Sequential(*(ABlock(c_, num_heads, mlp_ratio, area) for _ in range(2))) + if a2 + else C3k(c_, c_, 2, shortcut, g) + for _ in range(n) ) def forward(self, x): @@ -1372,12 +1429,13 @@ def forward(self, x): return x + self.gamma.view(1, -1, 1, 1) * self.cv2(torch.cat(y, 1)) return self.cv2(torch.cat(y, 1)) + class DSBottleneck(nn.Module): """ An improved bottleneck block using depthwise separable convolutions (DSConv). This class implements a lightweight bottleneck module that replaces standard convolutions with depthwise - separable convolutions to reduce parameters and computational cost. 
+ separable convolutions to reduce parameters and computational cost. Attributes: c1 (int): Number of input channels. @@ -1399,11 +1457,12 @@ class DSBottleneck(nn.Module): >>> print(output.shape) torch.Size([2, 64, 32, 32]) """ + def __init__(self, c1, c2, shortcut=True, e=0.5, k1=3, k2=5, d2=1): super().__init__() c_ = int(c2 * e) - self.cv1 = DSConv(c1, c_, k1, s=1, p=None, d=1) - self.cv2 = DSConv(c_, c2, k2, s=1, p=None, d=d2) + self.cv1 = DSConv(c1, c_, k1, s=1, p=None, d=1) + self.cv2 = DSConv(c_, c2, k2, s=1, p=None, d=d2) self.add = shortcut and c1 == c2 def forward(self, x): @@ -1440,34 +1499,13 @@ class DSC3k(C3): >>> print(output.shape) torch.Size([2, 128, 64, 64]) """ - def __init__( - self, - c1, - c2, - n=1, - shortcut=True, - g=1, - e=0.5, - k1=3, - k2=5, - d2=1 - ): + + def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, k1=3, k2=5, d2=1): super().__init__(c1, c2, n, shortcut, g, e) - c_ = int(c2 * e) - - self.m = nn.Sequential( - *( - DSBottleneck( - c_, c_, - shortcut=shortcut, - e=1.0, - k1=k1, - k2=k2, - d2=d2 - ) - for _ in range(n) - ) - ) + c_ = int(c2 * e) + + self.m = nn.Sequential(*(DSBottleneck(c_, c_, shortcut=shortcut, e=1.0, k1=k1, k2=k2, d2=d2) for _ in range(n))) + class DSC3k2(C2f): """ @@ -1505,47 +1543,19 @@ class DSC3k2(C2f): >>> print(f"With DSC3k: {output2.shape}") With DSC3k: torch.Size([2, 64, 128, 128]) """ - def __init__( - self, - c1, - c2, - n=1, - dsc3k=False, - e=0.5, - g=1, - shortcut=True, - k1=3, - k2=7, - d2=1 - ): + + def __init__(self, c1, c2, n=1, dsc3k=False, e=0.5, g=1, shortcut=True, k1=3, k2=7, d2=1): super().__init__(c1, c2, n, shortcut, g, e) if dsc3k: self.m = nn.ModuleList( - DSC3k( - self.c, self.c, - n=2, - shortcut=shortcut, - g=g, - e=1.0, - k1=k1, - k2=k2, - d2=d2 - ) - for _ in range(n) + DSC3k(self.c, self.c, n=2, shortcut=shortcut, g=g, e=1.0, k1=k1, k2=k2, d2=d2) for _ in range(n) ) else: self.m = nn.ModuleList( - DSBottleneck( - self.c, self.c, - shortcut=shortcut, - e=1.0, - k1=k1, - 
k2=k2, - d2=d2 - ) - for _ in range(n) + DSBottleneck(self.c, self.c, shortcut=shortcut, e=1.0, k1=k1, k2=k2, d2=d2) for _ in range(n) ) + class AdaHyperedgeGen(nn.Module): """ Generates an adaptive hyperedge participation matrix from a set of vertex features. @@ -1572,6 +1582,7 @@ class AdaHyperedgeGen(nn.Module): >>> print(A.shape) torch.Size([2, 100, 16]) """ + def __init__(self, node_dim, num_hyperedges, num_heads=4, dropout=0.1, context="both"): super().__init__() self.num_heads = num_heads @@ -1582,47 +1593,45 @@ def __init__(self, node_dim, num_hyperedges, num_heads=4, dropout=0.1, context=" self.prototype_base = nn.Parameter(torch.Tensor(num_hyperedges, node_dim)) nn.init.xavier_uniform_(self.prototype_base) if context in ("mean", "max"): - self.context_net = nn.Linear(node_dim, num_hyperedges * node_dim) + self.context_net = nn.Linear(node_dim, num_hyperedges * node_dim) elif context == "both": - self.context_net = nn.Linear(2*node_dim, num_hyperedges * node_dim) + self.context_net = nn.Linear(2 * node_dim, num_hyperedges * node_dim) else: - raise ValueError( - f"Unsupported context '{context}'. " - "Expected one of: 'mean', 'max', 'both'." - ) + raise ValueError(f"Unsupported context '{context}'. 
class AdaHGConv(nn.Module):
    """Adaptive hypergraph convolution over a set of vertex tokens.

    Generates a soft vertex-to-hyperedge incidence matrix, aggregates vertices into hyperedge
    features, projects them, scatters them back to the vertices, and adds a residual connection.

    Examples:
        >>> import torch
        >>> model = AdaHGConv(embed_dim=128, num_hyperedges=16, num_heads=8)
        >>> x = torch.randn(2, 256, 128)  # (Batch, Num_Nodes, Dim)
        >>> output = model(x)
        >>> print(output.shape)
        torch.Size([2, 256, 128])
    """

    def __init__(self, embed_dim, num_hyperedges=16, num_heads=4, dropout=0.1, context="both"):
        """Initialize the incidence generator and the edge/node projection layers.

        Args:
            embed_dim (int): Token feature dimension.
            num_hyperedges (int): Number of adaptive hyperedges.
            num_heads (int): Attention heads used by the incidence generator.
            dropout (float): Dropout rate inside the incidence generator.
            context (str): Context mode for AdaHyperedgeGen ('mean', 'max' or 'both').
        """
        super().__init__()
        self.edge_generator = AdaHyperedgeGen(embed_dim, num_hyperedges, num_heads, dropout, context)
        self.edge_proj = nn.Sequential(nn.Linear(embed_dim, embed_dim), nn.GELU())
        self.node_proj = nn.Sequential(nn.Linear(embed_dim, embed_dim), nn.GELU())

    def forward(self, X):
        """Apply hypergraph message passing to X of shape (B, N, D); output keeps the same shape."""
        incidence = self.edge_generator(X)  # (B, N, num_hyperedges)
        edge_feats = self.edge_proj(torch.bmm(incidence.transpose(1, 2), X))  # vertices -> hyperedges
        node_feats = self.node_proj(torch.bmm(incidence, edge_feats))  # hyperedges -> vertices
        return node_feats + X  # residual connection
class C3AH(nn.Module):
    """CSP-style block running adaptive hypergraph computation on one of two parallel branches.

    One 1x1-projected branch is refined by AdaHGComputation while the other passes through a
    plain 1x1 projection; the two are concatenated and fused by a final 1x1 convolution.

    Examples:
        >>> import torch
        >>> model = C3AH(c1=64, c2=128, num_hyperedges=8)
        >>> x = torch.randn(2, 64, 32, 32)
        >>> output = model(x)
        >>> print(output.shape)
        torch.Size([2, 128, 32, 32])
    """

    def __init__(self, c1, c2, e=1.0, num_hyperedges=8, context="both"):
        """Initialize C3AH.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            e (float): Hidden-channel expansion ratio.
            num_hyperedges (int): Hyperedge count for the hypergraph branch.
            context (str): Context mode forwarded to AdaHGComputation ('mean', 'max' or 'both').
        """
        super().__init__()
        hidden = int(c2 * e)
        # AdaHGComputation uses one attention head per 16 channels.
        assert hidden % 16 == 0, "Dimension of AdaHGComputation should be a multiple of 16."
        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv2 = Conv(c1, hidden, 1, 1)
        self.m = AdaHGComputation(
            embed_dim=hidden, num_hyperedges=num_hyperedges, num_heads=hidden // 16, dropout=0.1, context=context
        )
        self.cv3 = Conv(2 * hidden, c2, 1)

    def forward(self, x):
        """Run both branches and fuse their concatenation with a 1x1 convolution."""
        hyper = self.m(self.cv1(x))
        return self.cv3(torch.cat((hyper, self.cv2(x)), 1))
@@ -1785,10 +1788,11 @@ class FuseModule(nn.Module): >>> print(output.shape) torch.Size([2, 64, 32, 32]) """ + def __init__(self, c_in, channel_adjust): super(FuseModule, self).__init__() self.downsample = nn.AvgPool2d(kernel_size=2) - self.upsample = nn.Upsample(scale_factor=2, mode='nearest') + self.upsample = nn.Upsample(scale_factor=2, mode="nearest") if channel_adjust: self.conv_out = Conv(4 * c_in, c_in, 1) else: @@ -1801,6 +1805,7 @@ def forward(self, x): out = self.conv_out(x_cat) return out + class HyperACE(nn.Module): """ Hypergraph-based Adaptive Correlation Enhancement (HyperACE). @@ -1833,18 +1838,32 @@ class HyperACE(nn.Module): >>> print(output.shape) torch.Size([2, 256, 32, 32]) """ - def __init__(self, c1, c2, n=1, num_hyperedges=8, dsc3k=True, shortcut=False, e1=0.5, e2=1, context="both", channel_adjust=True): + + def __init__( + self, + c1, + c2, + n=1, + num_hyperedges=8, + dsc3k=True, + shortcut=False, + e1=0.5, + e2=1, + context="both", + channel_adjust=True, + ): super().__init__() - self.c = int(c2 * e1) + self.c = int(c2 * e1) self.cv1 = Conv(c1, 3 * self.c, 1, 1) - self.cv2 = Conv((4 + n) * self.c, c2, 1) + self.cv2 = Conv((4 + n) * self.c, c2, 1) self.m = nn.ModuleList( - DSC3k(self.c, self.c, 2, shortcut, k1=3, k2=7) if dsc3k else DSBottleneck(self.c, self.c, shortcut=shortcut) for _ in range(n) + DSC3k(self.c, self.c, 2, shortcut, k1=3, k2=7) if dsc3k else DSBottleneck(self.c, self.c, shortcut=shortcut) + for _ in range(n) ) self.fuse = FuseModule(c1, channel_adjust) self.branch1 = C3AH(self.c, self.c, e2, num_hyperedges, context) self.branch2 = C3AH(self.c, self.c, e2, num_hyperedges, context) - + def forward(self, X): x = self.fuse(X) y = list(self.cv1(x).chunk(3, 1)) @@ -1855,6 +1874,7 @@ def forward(self, X): y.append(out2) return self.cv2(torch.cat(y, 1)) + class DownsampleConv(nn.Module): """ A simple downsampling block with optional channel adjustment. 
@@ -1877,17 +1897,19 @@ class DownsampleConv(nn.Module): >>> print(output.shape) torch.Size([2, 128, 16, 16]) """ + def __init__(self, in_channels, channel_adjust=True): super().__init__() self.downsample = nn.AvgPool2d(kernel_size=2) if channel_adjust: self.channel_adjust = Conv(in_channels, in_channels * 2, 1) else: - self.channel_adjust = nn.Identity() + self.channel_adjust = nn.Identity() def forward(self, x): return self.channel_adjust(self.downsample(x)) + class FullPAD_Tunnel(nn.Module): """ A gated fusion module for the Full-Pipeline Aggregation-and-Distribution (FullPAD) paradigm. @@ -1908,9 +1930,11 @@ class FullPAD_Tunnel(nn.Module): >>> print(output.shape) torch.Size([2, 64, 32, 32]) """ + def __init__(self): super().__init__() self.gate = nn.Parameter(torch.tensor(0.0)) + def forward(self, x): out = x[0] + self.gate * x[1] return out diff --git a/ultralytics/utils/dist.py b/ultralytics/utils/dist.py index 8b7e5bbe4..ffbd37eaf 100644 --- a/ultralytics/utils/dist.py +++ b/ultralytics/utils/dist.py @@ -1,45 +1,82 @@ # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license +from __future__ import annotations + import os import shutil import socket import sys import tempfile +from pathlib import Path from . import USER_CONFIG_DIR from .torch_utils import TORCH_1_9 +PROJECT_ROOT = Path(__file__).resolve().parents[2] -def find_free_network_port() -> int: - """ - Finds a free port on localhost. - It is useful in single-node training when we don't want to connect to a real main node but have to set the - `MASTER_PORT` environment variable. 
- """ +def find_free_network_port() -> int: + """Find a free port on localhost.""" with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind(("127.0.0.1", 0)) - return s.getsockname()[1] # port + return s.getsockname()[1] + + +def _serialize_overrides(overrides: dict) -> dict: + """Serialize trainer overrides for DDP subprocess compatibility.""" + serialized = overrides.copy() + augmentations = serialized.get("augmentations") + if augmentations is not None: + try: + import albumentations as A + + serialized["augmentations"] = [A.to_dict(t) for t in augmentations] + serialized["_augmentations_serialized"] = True + except Exception: + serialized["augmentations"] = None + serialized["_augmentations_serialized"] = False + return serialized def generate_ddp_file(trainer): - """Generates a DDP file and returns its file name.""" + """Generate temporary python entrypoint for DDP subprocess.""" module, name = f"{trainer.__class__.__module__}.{trainer.__class__.__name__}".rsplit(".", 1) + overrides = _serialize_overrides(vars(trainer.args)) content = f""" # Ultralytics Multi-GPU training temp file (should be automatically deleted after use) -overrides = {vars(trainer.args)} +import sys +from pathlib import Path, PosixPath, WindowsPath + +repo_root = Path(r"{PROJECT_ROOT}") +if str(repo_root) not in sys.path: + sys.path.insert(0, str(repo_root)) + +import os + +overrides = {overrides} + +if "use_turing_flash" in overrides: + os.environ["Y13_USE_TURING_FLASH"] = "1" if overrides["use_turing_flash"] else "0" +if "force_disable_flash" in overrides: + os.environ["Y13_DISABLE_FLASH"] = "1" if overrides["force_disable_flash"] else "0" if __name__ == "__main__": from {module} import {name} from ultralytics.utils import DEFAULT_CFG_DICT + if overrides.pop("_augmentations_serialized", False) and overrides.get("augmentations") is not None: + import albumentations as A + + overrides["augmentations"] = [A.from_dict(t) for t in overrides["augmentations"]] + cfg = 
DEFAULT_CFG_DICT.copy() - cfg.update(save_dir='') # handle the extra key 'save_dir' + cfg.update(save_dir='') trainer = {name}(cfg=cfg, overrides=overrides) trainer.args.model = "{getattr(trainer.hub_session, "model_url", trainer.args.model)}" - results = trainer.train() + trainer.train() """ + (USER_CONFIG_DIR / "DDP").mkdir(exist_ok=True) with tempfile.NamedTemporaryFile( prefix="_temp_", @@ -54,11 +91,11 @@ def generate_ddp_file(trainer): def generate_ddp_command(world_size, trainer): - """Generates and returns command for distributed training.""" - import __main__ # noqa local import to avoid https://github.com/Lightning-AI/lightning/issues/15218 + """Generate command tuple for distributed training.""" + import __main__ # noqa: F401 - if not trainer.resume: - shutil.rmtree(trainer.save_dir) # remove the save_dir + if not trainer.resume and trainer.save_dir.exists(): + shutil.rmtree(trainer.save_dir, ignore_errors=True) file = generate_ddp_file(trainer) dist_cmd = "torch.distributed.run" if TORCH_1_9 else "torch.distributed.launch" port = find_free_network_port() @@ -67,6 +104,6 @@ def generate_ddp_command(world_size, trainer): def ddp_cleanup(trainer, file): - """Delete temp file if created.""" - if f"{id(trainer)}.py" in file: # if temp_file suffix in file + """Delete temp DDP file if created.""" + if f"{id(trainer)}.py" in file and os.path.exists(file): os.remove(file) diff --git a/ultralytics/utils/flash_turing_interface.py b/ultralytics/utils/flash_turing_interface.py new file mode 100644 index 000000000..5ada6ce59 --- /dev/null +++ b/ultralytics/utils/flash_turing_interface.py @@ -0,0 +1,68 @@ +# Ultralytics AGPL-3.0 License + +from __future__ import annotations + +from typing import Optional, Tuple + +import torch + +import flash_attn_turing as flash_attn_gpu + + +def _maybe_contiguous(x: Optional[torch.Tensor]) -> Optional[torch.Tensor]: + return x.contiguous() if x is not None and x.stride(-1) != 1 else x + + +def _flash_attn_forward( + q: torch.Tensor, 
+ k: torch.Tensor, + v: torch.Tensor, + softmax_scale: float, + causal: bool, +) -> Tuple[torch.Tensor, torch.Tensor]: + q, k, v = [_maybe_contiguous(x) for x in (q, k, v)] + out, lse = flash_attn_gpu.fwd(q, k, v, softmax_scale, causal) + return out, lse + + +def _flash_attn_backward( + dout: torch.Tensor, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + out: torch.Tensor, + lse: torch.Tensor, + softmax_scale: float, + causal: bool, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + dout, q, k, v, out, lse = [_maybe_contiguous(x) for x in (dout, q, k, v, out, lse)] + dq, dk, dv = flash_attn_gpu.bwd(q, k, v, out, lse, dout, softmax_scale, causal) + return dq, dk, dv + + +class _FlashAttnFunc(torch.autograd.Function): + @staticmethod + def forward(ctx, q, k, v, softmax_scale: Optional[float], causal: bool): + softmax_scale = q.shape[-1] ** (-0.5) if softmax_scale is None else softmax_scale + out, lse = _flash_attn_forward(q, k, v, softmax_scale, causal) + if any(x.requires_grad for x in (q, k, v)): + ctx.save_for_backward(q, k, v, out, lse) + ctx.softmax_scale = softmax_scale + ctx.causal = causal + return out + + @staticmethod + def backward(ctx, dout): + q, k, v, out, lse = ctx.saved_tensors + dq, dk, dv = _flash_attn_backward(dout, q, k, v, out, lse, ctx.softmax_scale, ctx.causal) + return dq, dk, dv, None, None + + +def flash_attn_func( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + softmax_scale: Optional[float] = None, + causal: bool = False, +) -> torch.Tensor: + return _FlashAttnFunc.apply(q, k, v, softmax_scale, causal) diff --git a/ultralytics/utils/metrics.py b/ultralytics/utils/metrics.py index a5fbff886..a742891c8 100644 --- a/ultralytics/utils/metrics.py +++ b/ultralytics/utils/metrics.py @@ -741,7 +741,7 @@ def map(self): def mean_results(self): """Mean of results, return mp, mr, map50, map.""" - return [self.mp, self.mr, self.map50, self.map75,self.map] + return [self.mp, self.mr, self.map50, self.map75, self.map] def 
class_result(self, i): """Class-aware result, return p[i], r[i], ap50[i], ap[i].""" @@ -865,7 +865,13 @@ def process(self, tp, conf, pred_cls, target_cls): @property def keys(self): """Returns a list of keys for accessing specific metrics.""" - return ["metrics/precision(B)", "metrics/recall(B)", "metrics/mAP50(B)", "metrics/mAP75(B)", "metrics/mAP50-95(B)"] + return [ + "metrics/precision(B)", + "metrics/recall(B)", + "metrics/mAP50(B)", + "metrics/mAP75(B)", + "metrics/mAP50-95(B)", + ] def mean_results(self): """Calculate mean of detected objects & return precision, recall, mAP50, and mAP50-95.""" @@ -1267,14 +1273,20 @@ def process(self, tp, conf, pred_cls, target_cls): @property def keys(self): """Returns a list of keys for accessing specific metrics.""" - return ["metrics/precision(B)", "metrics/recall(B)", "metrics/mAP50(B)", "metrics/mAP50-95(B)"] + return [ + "metrics/precision(B)", + "metrics/recall(B)", + "metrics/mAP50(B)", + "metrics/mAP75(B)", + "metrics/mAP50-95(B)", + ] def mean_results(self): - """Calculate mean of detected objects & return precision, recall, mAP50, and mAP50-95.""" + """Calculate mean results and return precision, recall, mAP50, mAP75, and mAP50-95.""" return self.box.mean_results() def class_result(self, i): - """Return the result of evaluating the performance of an object detection model on a specific class.""" + """Return per-class precision, recall, AP50, AP75, and AP50-95.""" return self.box.class_result(i) @property From 4e20b0c44459434bc88d385da0cfde6edd40619e Mon Sep 17 00:00:00 2001 From: ahmedelbamby-aast Date: Fri, 3 Apr 2026 01:19:15 +0200 Subject: [PATCH 2/3] Update ultralytics/utils/metrics.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- ultralytics/utils/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ultralytics/utils/metrics.py b/ultralytics/utils/metrics.py index a742891c8..5579f6a30 100644 --- a/ultralytics/utils/metrics.py +++ 
b/ultralytics/utils/metrics.py @@ -740,7 +740,7 @@ def map(self): return self.all_ap.mean() if len(self.all_ap) else 0.0 def mean_results(self): - """Mean of results, return mp, mr, map50, map.""" + """Mean of results, return mp, mr, map50, map75, map.""" return [self.mp, self.mr, self.map50, self.map75, self.map] def class_result(self, i): From c0bf19091a194bfb6f52c65e4b9211adc4650695 Mon Sep 17 00:00:00 2001 From: ahmedelbamby-aast Date: Fri, 3 Apr 2026 01:20:10 +0200 Subject: [PATCH 3/3] Update ultralytics/data/utils.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- ultralytics/data/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ultralytics/data/utils.py b/ultralytics/data/utils.py index 59ad043dc..ad06f89f6 100644 --- a/ultralytics/data/utils.py +++ b/ultralytics/data/utils.py @@ -331,7 +331,7 @@ def _add_from_dir(d: Path): except Exception: im_files = [] for im in im_files: - lb = Path(str(im).replace(f"{os.sep}images{os.sep}", f"{os.sep}labels{os.sep}")).with_suffix(".txt") + lb = Path(img2label_paths([str(im)])[0]) if lb.exists(): s = str(lb) if s not in seen: