Skip to content

Commit d08966d

Browse files
Merge pull request Project-HAMi#77 from peachest/fix/error-handling
fix error handling
2 parents 82f2da2 + 644b6a4 commit d08966d

4 files changed

Lines changed: 375 additions & 255 deletions

File tree

cmd/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ import (
3636
// Command-line flags. Each definition carries its own default value and help text.
// NOTE(review): this is a diff view, so nodeConfigFile appears twice (the removed
// and the added line); the two lines show the same text here.
var (
3737
hwLoglevel = flag.Int("hw_loglevel", 0, "huawei log level, -1-debug, 0-info, 1-warning, 2-error 3-critical default value: 0")
3838
configFile = flag.String("config_file", "", "config file path")
39-
nodeConfigFile = flag.String("node_config_file", "", "node specific config file path")
39+
nodeConfigFile = flag.String("node_config_file", "", "node specific config file path")
4040
// nodeName defaults to the value of the NODE_NAME environment variable.
nodeName = flag.String("node_name", os.Getenv("NODE_NAME"), "node name")
4141
// checkIdleVNPUInterval is expressed in seconds and defaults to 60.
checkIdleVNPUInterval = flag.Int("check_idle_vnpu_interval", 60, "the interval (in seconds) to check idle vNPU and release them")
4242
)

internal/manager/manager.go

Lines changed: 25 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package manager
1919
import (
2020
"fmt"
2121
"sort"
22+
"sync"
2223

2324
"ascend-common/devmanager"
2425
"ascend-common/devmanager/dcmi"
@@ -39,17 +40,18 @@ type Device struct {
3940
}
4041

4142
// AscendManager bundles a devmanager.DeviceManager handle with the loaded
// configuration (VNPU, global and optional node-specific) and the current
// device list.
// NOTE(review): this is a diff view; lines after a "NN-" marker are the removed
// version of a field and lines after a "NN+" marker are the added version.
type AscendManager struct {
42-
mgr *devmanager.DeviceManager
43-
config internal.VNPUConfig
43+
// mu is held for writing when UpdateDevice swaps in a new devs slice and
// for reading in GetDevices and GetDeviceByUUID.
mu sync.RWMutex
44+
mgr *devmanager.DeviceManager
45+
config internal.VNPUConfig
4446
globalConfig internal.Config
45-
devs []*Device
46-
nodeConfig *internal.NodeConfig
47+
devs []*Device
48+
// nodeConfig may be nil; IsHamiVnpuCore falls back to globalConfig in that case.
nodeConfig *internal.NodeConfig
4749
}
4850

4951
func NewAscendManager() (*AscendManager, error) {
5052
mgr, err := devmanager.AutoInit("", 30)
5153
if err != nil {
52-
return nil, err
54+
return nil, fmt.Errorf("failed to auto-init device manager: %w", err)
5355
}
5456
return &AscendManager{
5557
mgr: mgr,
@@ -58,7 +60,7 @@ func NewAscendManager() (*AscendManager, error) {
5860
}
5961

6062
func (am *AscendManager) LoadNodeConfig(nodePath string, nodeName string) error {
61-
nodeConfigList, err := internal.LoadNodeConfig(nodePath)
63+
nodeConfigList, err := internal.LoadNodeConfig(nodePath)
6264
if err != nil {
6365
klog.Warningf("Failed to load node config from %s: %v", nodePath, err)
6466
return err
@@ -71,19 +73,19 @@ func (am *AscendManager) LoadNodeConfig(nodePath string, nodeName string) error
7173
return nil
7274
}
7375
}
74-
76+
7577
klog.Infof("No specific config found for node %s, will use default settings", nodeName)
7678
return nil
7779
}
7880

7981
func (am *AscendManager) LoadConfig(path string) error {
8082
config, err := internal.LoadConfig(path)
8183
if err != nil {
82-
return err
84+
return fmt.Errorf("failed to load config from %s: %w", path, err)
8385
}
8486
chipInfo, err := am.mgr.GetValidChipInfo()
8587
if err != nil {
86-
return err
88+
return fmt.Errorf("failed to get valid chip info: %w", err)
8789
}
8890
if chipInfo.Type != "Ascend" {
8991
return fmt.Errorf("chip type is not Ascend")
@@ -129,7 +131,7 @@ func (am *AscendManager) UpdateDevice() error {
129131
return err
130132
}
131133

132-
am.devs = make([]*Device, 0, len(IDs))
134+
newDevs := make([]*Device, 0, len(IDs))
133135
for _, ID := range IDs {
134136
phyID, err := am.mgr.GetPhysicIDFromLogicID(ID)
135137
if err != nil {
@@ -151,7 +153,7 @@ func (am *AscendManager) UpdateDevice() error {
151153
klog.Errorf("failed to get device health: %v", err)
152154
return err
153155
}
154-
am.devs = append(am.devs, &Device{
156+
newDevs = append(newDevs, &Device{
155157
UUID: uuid,
156158
LogicID: ID,
157159
PhyID: phyID,
@@ -162,14 +164,21 @@ func (am *AscendManager) UpdateDevice() error {
162164
Health: health == 0,
163165
})
164166
}
167+
am.mu.Lock()
168+
am.devs = newDevs
169+
am.mu.Unlock()
165170
return nil
166171
}
167172

168173
// GetDevices returns the manager's current device slice, reading it while
// holding the read lock. The returned value is the stored slice itself, not a
// copy.
func (am *AscendManager) GetDevices() []*Device {
174+
am.mu.RLock()
175+
defer am.mu.RUnlock()
169176
return am.devs
170177
}
171178

172179
func (am *AscendManager) GetDeviceByUUID(UUID string) *Device {
180+
am.mu.RLock()
181+
defer am.mu.RUnlock()
173182
for _, dev := range am.devs {
174183
if dev.UUID == UUID {
175184
return dev
@@ -181,6 +190,7 @@ func (am *AscendManager) GetDeviceByUUID(UUID string) *Device {
181190
func (am *AscendManager) GetIDs() []int32 {
182191
_, IDs, err := am.mgr.GetDeviceList()
183192
if err != nil {
193+
klog.Errorf("failed to get device list: %v", err)
184194
return nil
185195
}
186196
return IDs
@@ -195,6 +205,7 @@ func (am *AscendManager) GetUnHealthIDs() []int32 {
195205
for _, d := range IDs {
196206
healthCode, err := am.mgr.GetDeviceHealth(d)
197207
if err != nil {
208+
klog.Warningf("failed to get device health for %d: %v", d, err)
198209
continue
199210
}
200211
if healthCode != 0 {
@@ -209,7 +220,7 @@ func (am *AscendManager) CleanupIdleVNPUs() error {
209220

210221
_, IDs, err := am.mgr.GetDeviceList()
211222
if err != nil {
212-
return fmt.Errorf("failed to get device list: %v", err)
223+
return fmt.Errorf("failed to get device list: %w", err)
213224
}
214225
klog.Infof("Found %d devices to check for idle vNPUs,%+v", len(IDs), IDs)
215226

@@ -254,14 +265,13 @@ func (am *AscendManager) CleanupIdleVNPUs() error {
254265
return nil
255266
}
256267

257-
258268
// GetNodeConfig returns the stored node-specific configuration pointer as-is.
// The field is read without taking any lock.
// NOTE(review): presumably nil when no node-specific config was loaded — confirm
// against LoadNodeConfig.
func (am *AscendManager) GetNodeConfig() *internal.NodeConfig {
259-
return am.nodeConfig
269+
return am.nodeConfig
260270
}
261271

262272
// IsHamiVnpuCore reports the HamiVnpuCore setting. When a node-specific config
// is present its value is returned; otherwise the value from the global
// config's VNPUs section is used.
func (am *AscendManager) IsHamiVnpuCore() bool {
263273
// A non-nil node config overrides the global setting.
if am.nodeConfig != nil {
264274
return am.nodeConfig.HamiVnpuCore
265275
}
266276
return am.globalConfig.VNPUs.HamiVnpuCore
267-
}
277+
}

0 commit comments

Comments
 (0)