@@ -19,6 +19,7 @@ package manager
1919import (
2020 "fmt"
2121 "sort"
22+ "sync"
2223
2324 "ascend-common/devmanager"
2425 "ascend-common/devmanager/dcmi"
@@ -39,17 +40,18 @@ type Device struct {
3940}
4041
4142type AscendManager struct {
42- mgr * devmanager.DeviceManager
43- config internal.VNPUConfig
43+ mu sync.RWMutex
44+ mgr * devmanager.DeviceManager
45+ config internal.VNPUConfig
4446 globalConfig internal.Config
45- devs []* Device
46- nodeConfig * internal.NodeConfig
47+ devs []* Device
48+ nodeConfig * internal.NodeConfig
4749}
4850
4951func NewAscendManager () (* AscendManager , error ) {
5052 mgr , err := devmanager .AutoInit ("" , 30 )
5153 if err != nil {
52- return nil , err
54+ return nil , fmt . Errorf ( "failed to auto-init device manager: %w" , err )
5355 }
5456 return & AscendManager {
5557 mgr : mgr ,
@@ -58,7 +60,7 @@ func NewAscendManager() (*AscendManager, error) {
5860}
5961
6062func (am * AscendManager ) LoadNodeConfig (nodePath string , nodeName string ) error {
61- nodeConfigList , err := internal .LoadNodeConfig (nodePath )
63+ nodeConfigList , err := internal .LoadNodeConfig (nodePath )
6264 if err != nil {
6365 klog .Warningf ("Failed to load node config from %s: %v" , nodePath , err )
6466 return err
@@ -71,19 +73,19 @@ func (am *AscendManager) LoadNodeConfig(nodePath string, nodeName string) error
7173 return nil
7274 }
7375 }
74-
76+
7577 klog .Infof ("No specific config found for node %s, will use default settings" , nodeName )
7678 return nil
7779}
7880
7981func (am * AscendManager ) LoadConfig (path string ) error {
8082 config , err := internal .LoadConfig (path )
8183 if err != nil {
82- return err
84+ return fmt . Errorf ( "failed to load config from %s: %w" , path , err )
8385 }
8486 chipInfo , err := am .mgr .GetValidChipInfo ()
8587 if err != nil {
86- return err
88+ return fmt . Errorf ( "failed to get valid chip info: %w" , err )
8789 }
8890 if chipInfo .Type != "Ascend" {
8991 return fmt .Errorf ("chip type is not Ascend" )
@@ -129,7 +131,7 @@ func (am *AscendManager) UpdateDevice() error {
129131 return err
130132 }
131133
132- am . devs = make ([]* Device , 0 , len (IDs ))
134+ newDevs : = make ([]* Device , 0 , len (IDs ))
133135 for _ , ID := range IDs {
134136 phyID , err := am .mgr .GetPhysicIDFromLogicID (ID )
135137 if err != nil {
@@ -151,7 +153,7 @@ func (am *AscendManager) UpdateDevice() error {
151153 klog .Errorf ("failed to get device health: %v" , err )
152154 return err
153155 }
154- am . devs = append (am . devs , & Device {
156+ newDevs = append (newDevs , & Device {
155157 UUID : uuid ,
156158 LogicID : ID ,
157159 PhyID : phyID ,
@@ -162,14 +164,21 @@ func (am *AscendManager) UpdateDevice() error {
162164 Health : health == 0 ,
163165 })
164166 }
167+ am .mu .Lock ()
168+ am .devs = newDevs
169+ am .mu .Unlock ()
165170 return nil
166171}
167172
168173func (am * AscendManager ) GetDevices () []* Device {
174+ am .mu .RLock ()
175+ defer am .mu .RUnlock ()
169176 return am .devs
170177}
171178
172179func (am * AscendManager ) GetDeviceByUUID (UUID string ) * Device {
180+ am .mu .RLock ()
181+ defer am .mu .RUnlock ()
173182 for _ , dev := range am .devs {
174183 if dev .UUID == UUID {
175184 return dev
@@ -181,6 +190,7 @@ func (am *AscendManager) GetDeviceByUUID(UUID string) *Device {
181190func (am * AscendManager ) GetIDs () []int32 {
182191 _ , IDs , err := am .mgr .GetDeviceList ()
183192 if err != nil {
193+ klog .Errorf ("failed to get device list: %v" , err )
184194 return nil
185195 }
186196 return IDs
@@ -195,6 +205,7 @@ func (am *AscendManager) GetUnHealthIDs() []int32 {
195205 for _ , d := range IDs {
196206 healthCode , err := am .mgr .GetDeviceHealth (d )
197207 if err != nil {
208+ klog .Warningf ("failed to get device health for %d: %v" , d , err )
198209 continue
199210 }
200211 if healthCode != 0 {
@@ -209,7 +220,7 @@ func (am *AscendManager) CleanupIdleVNPUs() error {
209220
210221 _ , IDs , err := am .mgr .GetDeviceList ()
211222 if err != nil {
212- return fmt .Errorf ("failed to get device list: %v " , err )
223+ return fmt .Errorf ("failed to get device list: %w " , err )
213224 }
214225 klog .Infof ("Found %d devices to check for idle vNPUs,%+v" , len (IDs ), IDs )
215226
@@ -254,14 +265,13 @@ func (am *AscendManager) CleanupIdleVNPUs() error {
254265 return nil
255266}
256267
257-
258268func (am * AscendManager ) GetNodeConfig () * internal.NodeConfig {
259- return am .nodeConfig
269+ return am .nodeConfig
260270}
261271
262272func (am * AscendManager ) IsHamiVnpuCore () bool {
263273 if am .nodeConfig != nil {
264274 return am .nodeConfig .HamiVnpuCore
265275 }
266276 return am .globalConfig .VNPUs .HamiVnpuCore
267- }
277+ }
0 commit comments