Skip to content

Commit b80e124

Browse files
committed
TL/UCP: topo aware ring algo for allreduce
Add a new monolithic allreduce ring algorithm that fuses reduce_scatter and allgather into a single task, using team->cuda_ring for topology-aware multi-ring transfers (up to 8 parallel rings). Algorithm changes: - Each step receives into scratch, reduces with the local dst block via the GPU executor, then forwards the accumulated result to the next ring peer. - An in-place ring allgather distributes all fully reduced blocks across ranks. - Both phases run in one progress function, with the tagged send/recv counters reset at the transition between the two phases. - Auto-selected for CUDA memory with message sizes above 4KB when cuda_ring is available. Signed-off-by: Juee Himalbhai Desai <jueehimalbha@nvidia.com>
1 parent 79bc267 commit b80e124

5 files changed

Lines changed: 409 additions & 4 deletions

File tree

src/components/tl/ucp/Makefile.am

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,8 @@ allreduce = \
4747
allreduce/allreduce_sliding_window.h \
4848
allreduce/allreduce_sliding_window.c \
4949
allreduce/allreduce_sliding_window_setup.c \
50-
allreduce/allreduce_dbt.c
50+
allreduce/allreduce_dbt.c \
51+
allreduce/allreduce_ring.c
5152

5253
barrier = \
5354
barrier/barrier.h \

src/components/tl/ucp/allreduce/allreduce.c

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
#include "tl_ucp.h"
88
#include "allreduce.h"
99
#include "utils/ucc_coll_utils.h"
10+
#include "utils/ucc_string.h"
11+
12+
#define ALLREDUCE_MAX_PATTERN_SIZE 256
1013

1114
ucc_base_coll_alg_info_t
1215
ucc_tl_ucp_allreduce_algs[UCC_TL_UCP_ALLREDUCE_ALG_LAST + 1] = {
@@ -29,9 +32,59 @@ ucc_base_coll_alg_info_t
2932
{.id = UCC_TL_UCP_ALLREDUCE_ALG_SLIDING_WINDOW,
3033
.name = "sliding_window",
3134
.desc = "sliding window allreduce (optimized for running on DPU)"},
35+
[UCC_TL_UCP_ALLREDUCE_ALG_RING] =
36+
{.id = UCC_TL_UCP_ALLREDUCE_ALG_RING,
37+
.name = "ring",
38+
.desc = "reduce-scatter ring followed by allgather ring "
39+
"(topology-aware, optimized for BW)"},
3240
[UCC_TL_UCP_ALLREDUCE_ALG_LAST] = {
3341
.id = 0, .name = NULL, .desc = NULL}};
3442

43+
/**
 * Build the default allreduce algorithm-selection (score) string for a team.
 *
 * If the team has a cuda_ring and the context reports CUDA and/or CUDA-managed
 * memory support, the string selects:
 *   - knomial for 0-4k,
 *   - ring for 4k-inf on the supported CUDA memory types,
 *   - sra_knomial for 4k-inf on the remaining memory types (listed explicitly
 *     when the context supports any, otherwise as an untyped range).
 * In every other case UCC_TL_UCP_ALLREDUCE_DEFAULT_ALG_SELECT_STR is formatted
 * with sra_knomial as the 4k-inf algorithm.
 *
 * @param team  TL/UCP team; its cuda_ring field and its context's
 *              ucp_memory_types mask are read.
 *
 * @return Buffer of ALLREDUCE_MAX_PATTERN_SIZE bytes obtained from ucc_malloc
 *         holding the score string, or NULL if that allocation fails.
 *         NOTE(review): the caller is presumably expected to release it with
 *         ucc_free and must tolerate NULL - confirm against the call site.
 */
char *ucc_tl_ucp_allreduce_score_str_get(ucc_tl_ucp_team_t *team)
{
    const int             max_size       = ALLREDUCE_MAX_PATTERN_SIZE;
    ucc_tl_ucp_context_t *ctx            = UCC_TL_UCP_TEAM_CTX(team);
    uint64_t              cuda_types     =
        ctx->ucp_memory_types &
        (UCC_BIT(UCC_MEMORY_TYPE_CUDA) |
         UCC_BIT(UCC_MEMORY_TYPE_CUDA_MANAGED));
    uint64_t              non_cuda_types = ctx->ucp_memory_types & (~cuda_types);
    /* Memory-type lists are small and only needed while formatting, so they
     * live on the stack: no extra allocation that could fail or leak. */
    char                  cuda_str[ALLREDUCE_MAX_PATTERN_SIZE];
    char                  non_cuda_str[ALLREDUCE_MAX_PATTERN_SIZE];
    char                 *str;

    str = ucc_malloc(max_size * sizeof(char));
    if (!str) {
        /* Previously the NULL pointer was passed straight to
         * ucc_snprintf_safe; report the failure to the caller instead. */
        return NULL;
    }

    if (!team->cuda_ring || !cuda_types) {
        /* No topology-aware CUDA ring available: keep the generic default. */
        ucc_snprintf_safe(str, max_size,
                          UCC_TL_UCP_ALLREDUCE_DEFAULT_ALG_SELECT_STR,
                          UCC_TL_UCP_ALLREDUCE_ALG_SRA_KNOMIAL);
        return str;
    }

    ucc_mtype_map_to_str(cuda_types, ",", cuda_str, max_size);

    if (non_cuda_types) {
        /* Name the non-CUDA memory types explicitly for the sra_knomial
         * range so it does not overlap the CUDA ring range. */
        ucc_mtype_map_to_str(non_cuda_types, ",", non_cuda_str, max_size);
        ucc_snprintf_safe(str, max_size,
                          "allreduce:0-4k:@%d#allreduce:4k-inf:%s:@%d"
                          "#allreduce:4k-inf:%s:@%d",
                          UCC_TL_UCP_ALLREDUCE_ALG_KNOMIAL,
                          cuda_str, UCC_TL_UCP_ALLREDUCE_ALG_RING,
                          non_cuda_str, UCC_TL_UCP_ALLREDUCE_ALG_SRA_KNOMIAL);
        return str;
    }

    ucc_snprintf_safe(str, max_size,
                      "allreduce:0-4k:@%d#allreduce:4k-inf:%s:@%d"
                      "#allreduce:4k-inf:@%d",
                      UCC_TL_UCP_ALLREDUCE_ALG_KNOMIAL,
                      cuda_str, UCC_TL_UCP_ALLREDUCE_ALG_RING,
                      UCC_TL_UCP_ALLREDUCE_ALG_SRA_KNOMIAL);
    return str;
}
87+
3588
ucc_status_t ucc_tl_ucp_allreduce_init(ucc_tl_ucp_task_t *task)
3689
{
3790
ucc_status_t status;

src/components/tl/ucp/allreduce/allreduce.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ enum {
1313
UCC_TL_UCP_ALLREDUCE_ALG_SRA_KNOMIAL,
1414
UCC_TL_UCP_ALLREDUCE_ALG_SLIDING_WINDOW,
1515
UCC_TL_UCP_ALLREDUCE_ALG_DBT,
16+
UCC_TL_UCP_ALLREDUCE_ALG_RING,
1617
UCC_TL_UCP_ALLREDUCE_ALG_LAST
1718
};
1819

@@ -21,7 +22,9 @@ extern ucc_base_coll_alg_info_t
2122
ucc_status_t ucc_tl_ucp_allreduce_init(ucc_tl_ucp_task_t *task);
2223

2324
#define UCC_TL_UCP_ALLREDUCE_DEFAULT_ALG_SELECT_STR \
24-
"allreduce:0-4k:@0#allreduce:4k-inf:@1"
25+
"allreduce:0-4k:@0#allreduce:4k-inf:@%d"
26+
27+
/* Builds the default allreduce algorithm-selection string for the given team.
 * The returned buffer is allocated by the callee with ucc_malloc.
 * NOTE(review): the caller presumably owns it and releases it with ucc_free -
 * confirm against the call site. */
char *ucc_tl_ucp_allreduce_score_str_get(ucc_tl_ucp_team_t *team);
2528

2629
#define CHECK_SAME_MEMTYPE(_args, _team) \
2730
do { \
@@ -77,6 +80,11 @@ ucc_status_t ucc_tl_ucp_allreduce_dbt_start(ucc_coll_task_t *task);
7780

7881
ucc_status_t ucc_tl_ucp_allreduce_dbt_progress(ucc_coll_task_t *task);
7982

83+
/* Init entry point for the ring allreduce algorithm
 * (UCC_TL_UCP_ALLREDUCE_ALG_RING). Only the declaration is visible here.
 * NOTE(review): presumably implemented in allreduce/allreduce_ring.c and
 * returns the newly created task through task_h - confirm against the
 * implementation. */
ucc_status_t
84+
ucc_tl_ucp_allreduce_ring_init(ucc_base_coll_args_t *coll_args,
85+
ucc_base_team_t *team,
86+
ucc_coll_task_t **task_h);
87+
8088
static inline int ucc_tl_ucp_allreduce_alg_from_str(const char *str)
8189
{
8290
int i;

0 commit comments

Comments
 (0)