Skip to content

Commit 52aaff5

Browse files
authored
Merge pull request #118 from biluriuday/partition-grouping
update GPU partition grouping logic
2 parents 0da9695 + ad2e695 commit 52aaff5

5 files changed

Lines changed: 91 additions & 31 deletions

File tree

internal/amdgpu/amdgpu.go

Lines changed: 60 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,8 @@ func GetAMDGPUsWithFS(fs FileSystem) ([]DeviceInfo, error) {
100100

101101
renderDevIds := GetDevIdsFromTopology(fs)
102102

103-
// Map to store devices by unique_id to maintain grouping
104-
uniqueIdDevices := make(map[string][]DeviceInfo)
103+
// Map to store devices by parent dev ID to maintain grouping
104+
devIdToDevices := make(map[string][]DeviceInfo)
105105
var uniqueIds []string // To maintain order
106106

107107
// Process PCI devices
@@ -156,17 +156,17 @@ func GetAMDGPUsWithFS(fs FileSystem) ([]DeviceInfo, error) {
156156

157157
if len(drmDevs) > 0 && renderMinor > 0 {
158158
if devID, exists := renderDevIds[renderMinor]; exists {
159-
if _, exists := uniqueIdDevices[devID]; !exists {
159+
if _, exists := devIdToDevices[devID]; !exists {
160160
uniqueIds = append(uniqueIds, devID)
161161
}
162-
uniqueIdDevices[devID] = append(uniqueIdDevices[devID], DeviceInfo{DrmDevices: drmDevs, PartitionType: combinedPartitionType})
162+
devIdToDevices[devID] = append(devIdToDevices[devID], DeviceInfo{DrmDevices: drmDevs, PartitionType: combinedPartitionType})
163163
}
164164
}
165165
}
166166

167-
// Sort devices within each unique_id group by render minor number
167+
// Sort devices within each parent dev ID group by render minor number
168168
for _, devID := range uniqueIds {
169-
sort.Slice(uniqueIdDevices[devID], func(i, j int) bool {
169+
sort.Slice(devIdToDevices[devID], func(i, j int) bool {
170170
getRenderID := func(devInfo DeviceInfo) int {
171171
devs := devInfo.DrmDevices
172172
for _, dev := range devs {
@@ -178,14 +178,14 @@ func GetAMDGPUsWithFS(fs FileSystem) ([]DeviceInfo, error) {
178178
}
179179
return 0
180180
}
181-
return getRenderID(uniqueIdDevices[devID][i]) < getRenderID(uniqueIdDevices[devID][j])
181+
return getRenderID(devIdToDevices[devID][i]) < getRenderID(devIdToDevices[devID][j])
182182
})
183183
}
184184

185185
// Combine all devices maintaining the parent dev ID order
186186
var devs []DeviceInfo
187187
for _, devID := range uniqueIds {
188-
devs = append(devs, uniqueIdDevices[devID]...)
188+
devs = append(devs, devIdToDevices[devID]...)
189189
}
190190

191191
return devs, nil
@@ -242,8 +242,10 @@ func GetAMDGPUWithFS(fs FileSystem, dev string) (AMDGPU, error) {
242242

243243
var topoUniqueIdRe = regexp.MustCompile(`unique_id\s(\d+)`)
244244
var renderMinorRe = regexp.MustCompile(`drm_render_minor\s(\d+)`)
245+
var locationIdRe = regexp.MustCompile(`location_id\s(\d+)`)
246+
var domainRe = regexp.MustCompile(`domain\s(\d+)`)
245247

246-
// GetDevIdsFromTopology returns a map of render minor numbers to unique_ids
248+
// GetDevIdsFromTopology returns a map of render minor numbers to parent device IDs formatted as "domain:bus:device:0"
247249
func GetDevIdsFromTopology(fs FileSystem, topoRootParam ...string) map[int]string {
248250
topoRoot := "/sys/class/kfd/kfd"
249251
if len(topoRootParam) == 1 {
@@ -257,6 +259,54 @@ func GetDevIdsFromTopology(fs FileSystem, topoRootParam ...string) map[int]strin
257259
return renderDevIds
258260
}
259261

262+
for _, nodeFile := range nodeFiles {
263+
slog.Debug("Parsing topology node file", "file", nodeFile)
264+
renderMinor, err := ParseTopologyProperties(fs, nodeFile, renderMinorRe)
265+
if err != nil {
266+
slog.Debug("Error parsing render minor", "file", nodeFile, "error", err)
267+
continue
268+
}
269+
270+
if renderMinor <= 0 || renderMinor > math.MaxInt32 {
271+
continue
272+
}
273+
274+
locationId, e := ParseTopologyProperties(fs, nodeFile, locationIdRe)
275+
if e != nil {
276+
slog.Debug("Error parsing location_id", "file", nodeFile, "error", e)
277+
continue
278+
}
279+
280+
domain, e := ParseTopologyProperties(fs, nodeFile, domainRe)
281+
if e != nil {
282+
slog.Debug("Error parsing domain", "file", nodeFile, "error", e)
283+
continue
284+
}
285+
286+
dev := (locationId >> 3) & 0x1f
287+
bus := (locationId >> 8) & 0xff
288+
devID := fmt.Sprintf("%04x:%02x:%02x:0", domain, bus, dev)
289+
290+
renderDevIds[int(renderMinor)] = devID
291+
}
292+
293+
return renderDevIds
294+
}
295+
296+
// GetUniqueIdsFromTopology returns a map of render minor numbers to unique_ids
297+
func GetUniqueIdsFromTopology(fs FileSystem, topoRootParam ...string) map[int]string {
298+
topoRoot := "/sys/class/kfd/kfd"
299+
if len(topoRootParam) == 1 {
300+
topoRoot = topoRootParam[0]
301+
}
302+
303+
renderDevIds := make(map[int]string)
304+
nodeFiles, err := fs.Glob(topoRoot + "/topology/nodes/*/properties")
305+
if err != nil {
306+
slog.Warn("Failed to glob topology nodes", "error", err)
307+
return renderDevIds
308+
}
309+
260310
for _, nodeFile := range nodeFiles {
261311
slog.Debug("Parsing topology node file", "file", nodeFile)
262312
renderMinor, err := ParseTopologyProperties(fs, nodeFile, renderMinorRe)
@@ -331,7 +381,7 @@ func GetUniqueIdToDeviceIndexMapWithFS(fs FileSystem) (map[string][]int, error)
331381
return nil, fmt.Errorf("getting AMD GPUs: %w", err)
332382
}
333383

334-
renderDevIds := GetDevIdsFromTopology(fs)
384+
renderDevIds := GetUniqueIdsFromTopology(fs)
335385
uniqueIdToIndex := make(map[string][]int)
336386

337387
// Process each device group and assign index

internal/amdgpu/amdgpu_test.go

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -527,32 +527,32 @@ func TestGetDevIdsFromTopology(t *testing.T) {
527527
name: "single GPU topology",
528528
testCase: "single_gpu",
529529
expectedResult: map[int]string{
530-
128: "1",
530+
128: "0000:05:00:0",
531531
},
532532
},
533533
{
534534
name: "GPU with partition topology",
535535
testCase: "gpu_with_partition",
536536
expectedResult: map[int]string{
537-
128: "1",
538-
129: "1",
537+
128: "0000:05:00:0",
538+
129: "0000:05:00:0",
539539
},
540540
},
541541
{
542542
name: "multiple GPUs topology",
543543
testCase: "multiple_gpus",
544544
expectedResult: map[int]string{
545-
128: "1",
546-
130: "2",
545+
128: "0000:05:00:0",
546+
130: "0000:48:00:0",
547547
},
548548
},
549549
{
550550
name: "unordered partitions topology",
551551
testCase: "unordered_partitions",
552552
expectedResult: map[int]string{
553-
128: "1",
554-
129: "1",
555-
130: "2",
553+
128: "0000:05:00:0",
554+
129: "0000:05:00:0",
555+
130: "0000:48:00:0",
556556
},
557557
},
558558
}
@@ -589,8 +589,10 @@ func TestGetUniqueIdToDeviceIndexMapWithFS(t *testing.T) {
589589
name: "GPU with partition UUID mapping",
590590
testCase: "gpu_with_partition",
591591
expectedResult: map[string][]int{
592-
"0x1": {0, 1},
593-
"1": {0, 1},
592+
"0x1": {0},
593+
"0x2": {1},
594+
"1": {0},
595+
"2": {1},
594596
},
595597
expectedError: nil,
596598
},
@@ -599,20 +601,22 @@ func TestGetUniqueIdToDeviceIndexMapWithFS(t *testing.T) {
599601
testCase: "multiple_gpus",
600602
expectedResult: map[string][]int{
601603
"0x1": {0},
604+
"0x3": {1},
602605
"1": {0},
603-
"0x2": {1},
604-
"2": {1},
606+
"3": {1},
605607
},
606608
expectedError: nil,
607609
},
608610
{
609611
name: "unordered partitions UUID mapping",
610612
testCase: "unordered_partitions",
611613
expectedResult: map[string][]int{
612-
"0x1": {0, 1},
613-
"1": {0, 1},
614-
"0x2": {2},
615-
"2": {2},
614+
"0x1": {0},
615+
"0x2": {1},
616+
"0x3": {2},
617+
"1": {0},
618+
"2": {1},
619+
"3": {2},
616620
},
617621
expectedError: nil,
618622
},

tests/amdgpu/topology/nodes/0/properties

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,6 @@ mem_banks_count 1
77
caches_count 0
88
io_links_count 1
99
cpu_core_id_base 0
10-
simd_id_base 0
10+
simd_id_base 0
11+
location_id 1280
12+
domain 0
Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
drm_render_minor 129
2-
unique_id 1
2+
unique_id 2
33
gfx_target_version 90402
44
cpu_cores_count 20
55
simd_count 0
66
mem_banks_count 1
77
caches_count 0
88
io_links_count 1
99
cpu_core_id_base 0
10-
simd_id_base 0
10+
simd_id_base 0
11+
location_id 1281
12+
domain 0
Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
drm_render_minor 130
2-
unique_id 2
2+
unique_id 3
33
gfx_target_version 90402
44
cpu_cores_count 20
55
simd_count 0
66
mem_banks_count 1
77
caches_count 0
88
io_links_count 1
99
cpu_core_id_base 0
10-
simd_id_base 0
10+
simd_id_base 0
11+
location_id 18432
12+
domain 0

0 commit comments

Comments
 (0)