@@ -100,8 +100,8 @@ func GetAMDGPUsWithFS(fs FileSystem) ([]DeviceInfo, error) {
100100
101101 renderDevIds := GetDevIdsFromTopology (fs )
102102
103- // Map to store devices by unique_id to maintain grouping
104- uniqueIdDevices := make (map [string ][]DeviceInfo )
103+ // Map to store devices by parent dev ID to maintain grouping
104+ devIdToDevices := make (map [string ][]DeviceInfo )
105105 var uniqueIds []string // To maintain order
106106
107107 // Process PCI devices
@@ -156,17 +156,17 @@ func GetAMDGPUsWithFS(fs FileSystem) ([]DeviceInfo, error) {
156156
157157 if len (drmDevs ) > 0 && renderMinor > 0 {
158158 if devID , exists := renderDevIds [renderMinor ]; exists {
159- if _ , exists := uniqueIdDevices [devID ]; ! exists {
159+ if _ , exists := devIdToDevices [devID ]; ! exists {
160160 uniqueIds = append (uniqueIds , devID )
161161 }
162- uniqueIdDevices [devID ] = append (uniqueIdDevices [devID ], DeviceInfo {DrmDevices : drmDevs , PartitionType : combinedPartitionType })
162+ devIdToDevices [devID ] = append (devIdToDevices [devID ], DeviceInfo {DrmDevices : drmDevs , PartitionType : combinedPartitionType })
163163 }
164164 }
165165 }
166166
167- // Sort devices within each unique_id group by render minor number
167+ // Sort devices within each parent dev ID group by render minor number
168168 for _ , devID := range uniqueIds {
169- sort .Slice (uniqueIdDevices [devID ], func (i , j int ) bool {
169+ sort .Slice (devIdToDevices [devID ], func (i , j int ) bool {
170170 getRenderID := func (devInfo DeviceInfo ) int {
171171 devs := devInfo .DrmDevices
172172 for _ , dev := range devs {
@@ -178,14 +178,14 @@ func GetAMDGPUsWithFS(fs FileSystem) ([]DeviceInfo, error) {
178178 }
179179 return 0
180180 }
181- return getRenderID (uniqueIdDevices [devID ][i ]) < getRenderID (uniqueIdDevices [devID ][j ])
181+ return getRenderID (devIdToDevices [devID ][i ]) < getRenderID (devIdToDevices [devID ][j ])
182182 })
183183 }
184184
185185 // Combine all devices, preserving the order in which each parent dev ID was first seen
186186 var devs []DeviceInfo
187187 for _ , devID := range uniqueIds {
188- devs = append (devs , uniqueIdDevices [devID ]... )
188+ devs = append (devs , devIdToDevices [devID ]... )
189189 }
190190
191191 return devs , nil
@@ -242,8 +242,10 @@ func GetAMDGPUWithFS(fs FileSystem, dev string) (AMDGPU, error) {
242242
243243var topoUniqueIdRe = regexp .MustCompile (`unique_id\s(\d+)` )
244244var renderMinorRe = regexp .MustCompile (`drm_render_minor\s(\d+)` )
245+ var locationIdRe = regexp .MustCompile (`location_id\s(\d+)` )
246+ var domainRe = regexp .MustCompile (`domain\s(\d+)` )
245247
246- // GetDevIdsFromTopology returns a map of render minor numbers to unique_ids
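248+ // GetDevIdsFromTopology returns a map from each topology node's render minor number to its parent device ID, a "domain:bus:device:0" string built from the node's domain and location_id properties.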
247249func GetDevIdsFromTopology (fs FileSystem , topoRootParam ... string ) map [int ]string {
248250 topoRoot := "/sys/class/kfd/kfd"
249251 if len (topoRootParam ) == 1 {
@@ -257,6 +259,54 @@ func GetDevIdsFromTopology(fs FileSystem, topoRootParam ...string) map[int]strin
257259 return renderDevIds
258260 }
259261
262+ for _ , nodeFile := range nodeFiles {
263+ slog .Debug ("Parsing topology node file" , "file" , nodeFile )
264+ renderMinor , err := ParseTopologyProperties (fs , nodeFile , renderMinorRe )
265+ if err != nil {
266+ slog .Debug ("Error parsing render minor" , "file" , nodeFile , "error" , err )
267+ continue
268+ }
269+
270+ if renderMinor <= 0 || renderMinor > math .MaxInt32 {
271+ continue
272+ }
273+
274+ locationId , e := ParseTopologyProperties (fs , nodeFile , locationIdRe )
275+ if e != nil {
276+ slog .Debug ("Error parsing location_id" , "file" , nodeFile , "error" , e )
277+ continue
278+ }
279+
280+ domain , e := ParseTopologyProperties (fs , nodeFile , domainRe )
281+ if e != nil {
282+ slog .Debug ("Error parsing domain" , "file" , nodeFile , "error" , e )
283+ continue
284+ }
285+
286+ dev := (locationId >> 3 ) & 0x1f
287+ bus := (locationId >> 8 ) & 0xff
288+ devID := fmt .Sprintf ("%04x:%02x:%02x:0" , domain , bus , dev )
289+
290+ renderDevIds [int (renderMinor )] = devID
291+ }
292+
293+ return renderDevIds
294+ }
295+
296+ // GetUniqueIdsFromTopology returns a map of render minor numbers to unique_ids
297+ func GetUniqueIdsFromTopology (fs FileSystem , topoRootParam ... string ) map [int ]string {
298+ topoRoot := "/sys/class/kfd/kfd"
299+ if len (topoRootParam ) == 1 {
300+ topoRoot = topoRootParam [0 ]
301+ }
302+
303+ renderDevIds := make (map [int ]string )
304+ nodeFiles , err := fs .Glob (topoRoot + "/topology/nodes/*/properties" )
305+ if err != nil {
306+ slog .Warn ("Failed to glob topology nodes" , "error" , err )
307+ return renderDevIds
308+ }
309+
260310 for _ , nodeFile := range nodeFiles {
261311 slog .Debug ("Parsing topology node file" , "file" , nodeFile )
262312 renderMinor , err := ParseTopologyProperties (fs , nodeFile , renderMinorRe )
@@ -331,7 +381,7 @@ func GetUniqueIdToDeviceIndexMapWithFS(fs FileSystem) (map[string][]int, error)
331381 return nil , fmt .Errorf ("getting AMD GPUs: %w" , err )
332382 }
333383
334- renderDevIds := GetDevIdsFromTopology (fs )
384+ renderDevIds := GetUniqueIdsFromTopology (fs )
335385 uniqueIdToIndex := make (map [string ][]int )
336386
337387 // Process each device group and assign index
0 commit comments