@@ -16,6 +16,12 @@ import (
1616func collectDiskHealthMetrics (cfg DiskHealthMetricsConfig ) []NormalizedSmartData {
1717 var allMetrics []NormalizedSmartData
1818
19+ // Check if nvme-cli is available for enhanced NVMe support
20+ nvmeCliAvailable := checkNVMeCliInstalled ()
21+ if nvmeCliAvailable {
22+ log .Info ().Msg ("nvme-cli detected, enhanced NVMe metrics will be available" )
23+ }
24+
1925 for _ , disk := range cfg .Disks {
2026 //FIXME rawData, err := collectSmartData(fmt.Sprintf("/dev/%s", disk))
2127 rawData , err := collectSmartData (disk )
@@ -26,25 +32,41 @@ func collectDiskHealthMetrics(cfg DiskHealthMetricsConfig) []NormalizedSmartData
2632 continue
2733 }
2834
29- // Normalize the device information
35+ // Enhance NVMe devices with nvme-cli data if available
36+ var nvmeController * NVMeIDControllerOutput
37+ var nvmeErrors * NVMeErrorLogOutput
38+
39+ if nvmeCliAvailable && rawData .Device .Protocol == "NVMe" {
40+ nvmeController , err = collectNVMeControllerData (disk )
41+ if err != nil {
42+ log .Warn ().Err (err ).Str ("disk" , disk ).Msg ("failed to collect NVMe controller data, continuing with smartctl only" )
43+ }
44+
45+ nvmeErrors , err = collectNVMeErrorLog (disk )
46+ if err != nil {
47+ log .Warn ().Err (err ).Str ("disk" , disk ).Msg ("failed to collect NVMe error log, continuing without error log data" )
48+ }
49+
50+ // Enhance the smartctl data with nvme-cli information
51+ enhanceNVMeData (rawData , nvmeController , nvmeErrors )
52+ }
53+
3054 deviceInfo := & DeviceInfo {}
3155 FillDeviceInfoFromSmartData (deviceInfo , rawData )
3256 NormalizeVendor (deviceInfo )
3357 NormalizeDeviceInfo (deviceInfo )
3458
35- // Normalize Smart Attributes
3659 smartAttrs := GetSmartAttributes ()
3760 ProcessAndUpdateSmartAttributes (smartAttrs , rawData )
38- CleanupSmartAttributes (smartAttrs )
3961
40- //FIXME: just for debug Print out the updated smartAttrs
41- // for key, attr := range smartAttrs {
42- // fmt.Printf("%s: %s (Unit: %s, Value: %d, Threshold: %d, Worst: %d, Raw: %d)\n", key, attr.Description, attr.Unit, attr.Value, attr.Threshold, attr.Worst, attr.RawValue )
43- // }
62+ // Process NVMe-specific attributes if we have nvme-cli data
63+ if nvmeController != nil || nvmeErrors != nil {
64+ processNVMeSpecificAttributes ( smartAttrs , nvmeController , nvmeErrors )
65+ }
4466
45- // Normalize the data
46- normalizedData := normalizeSmartData (rawData , deviceInfo , smartAttrs , cfg .NodeName , cfg .InstanceID , cfg .CephOSDBasePath )
67+ CleanupSmartAttributes (smartAttrs )
4768
69+ normalizedData := normalizeSmartData (rawData , deviceInfo , smartAttrs , cfg .NodeName , cfg .InstanceID , cfg .CephOSDBasePath )
4870 allMetrics = append (allMetrics , normalizedData )
4971 }
5072
@@ -86,7 +108,10 @@ func normalizeSmartData(smartData *SmartCtlOutput, deviceInfo *DeviceInfo, attri
86108 reallocatedSectors = & smartData .SCSIGrownDefectList
87109 }
88110
111+ enhanceDeviceInfo (deviceInfo )
112+
89113 osdID , _ := getOSDIDForDisk (smartData .Device .Name , basePath ) // Ignore error as it's handled within the function
114+
90115 return NormalizedSmartData {
91116 NodeName : nodeName ,
92117 InstanceID : instanceID ,
@@ -115,31 +140,6 @@ func findSmartAttributeByID(attributes []SmartCtlATASMARTEntry, id int64) *Smart
115140 return nil
116141}
117142
118- // func findSmartAttributeByName(attributes []SmartCtlATASMARTEntry, name string) *SmartCtlATASMARTEntry {
119- // for _, attr := range attributes {
120- // if attr.Name == name {
121- // return &attr
122- // }
123- // }
124- // return nil
125- // }
126-
127- // func parseSMARTOutput(output []byte, attribute string) uint64 {
128- // lines := strings.Split(string(output), "\n")
129- // for _, line := range lines {
130- // if strings.Contains(line, attribute) {
131- // fields := strings.Fields(line)
132- // value, err := strconv.ParseUint(fields[9], 10, 64)
133- // if err != nil {
134- // log.Printf("Error parsing %s value: %v", attribute, err)
135- // return 0
136- // }
137- // return value
138- // }
139- // }
140- // return 0
141- // }
142-
143143func StartMonitoring (cfg DiskHealthMetricsConfig ) {
144144 if ! checkSmartctlInstalled () {
145145 log .Fatal ().Msg ("smartctl is not installed. please install smartmontools package." )
0 commit comments