Skip to content

Commit be688fb

Browse files
authored
diskhealth logic extended and fixed (#34)
On-behalf-of: SAP <filipp.akinfiev@clyso.com> Signed-off-by: Filipp Akinfiev <filipp.akinfiev@clyso.com>
1 parent a0a26d1 commit be688fb

9 files changed

Lines changed: 717 additions & 78 deletions

File tree

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} GO111MODULE=on \
3434
FROM alpine
3535
LABEL source_repository="https://github.com/cobaltcore-dev/prysm"
3636
# Install smartctl (smartmontools) and nvme-cli
37-
RUN apk add --no-cache smartmontools
37+
RUN apk add --no-cache smartmontools nvme-cli
3838

3939
# copy app binary
4040
COPY --from=builder /out/prysm /bin/prysm
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and prysm contributors
2+
//
3+
// SPDX-License-Identifier: Apache-2.0
4+
5+
package diskhealthmetrics
6+
7+
import (
8+
"fmt"
9+
"strings"
10+
11+
"golang.org/x/text/cases"
12+
"golang.org/x/text/language"
13+
)
14+
15+
// enhanceDeviceInfo adds additional context about OEM relationships
16+
func enhanceDeviceInfo(deviceInfo *DeviceInfo) {
17+
if deviceInfo == nil {
18+
return
19+
}
20+
21+
// Common OEM relationships and rebranding patterns
22+
oem := detectOEMRelationship(deviceInfo.Vendor, deviceInfo.DeviceModel, deviceInfo.Product)
23+
if oem != "" {
24+
// You could add this to a new field in DeviceInfo or include it in existing fields
25+
if deviceInfo.ModelFamily == "" {
26+
deviceInfo.ModelFamily = oem
27+
}
28+
}
29+
}
30+
31+
// detectOEMRelationship detects common OEM relationships
32+
func detectOEMRelationship(vendor, model, product string) string {
33+
vendor = strings.ToLower(vendor)
34+
model = strings.ToLower(model)
35+
product = strings.ToLower(product)
36+
37+
// Create a title caser for English
38+
caser := cases.Title(language.English)
39+
40+
// Common OEM patterns - check all three fields for comprehensive detection
41+
42+
// Lenovo OEM patterns
43+
if strings.Contains(vendor, "lenovo") {
44+
if strings.Contains(model, "toshiba") || strings.Contains(product, "toshiba") {
45+
return "Lenovo (Toshiba OEM)"
46+
}
47+
if strings.Contains(model, "seagate") || strings.Contains(product, "seagate") {
48+
return "Lenovo (Seagate OEM)"
49+
}
50+
if strings.Contains(model, "hgst") || strings.Contains(product, "hgst") {
51+
return "Lenovo (HGST OEM)"
52+
}
53+
}
54+
55+
// Dell OEM patterns
56+
if strings.Contains(vendor, "dell") {
57+
if strings.Contains(model, "seagate") || strings.Contains(product, "seagate") {
58+
return "Dell (Seagate OEM)"
59+
}
60+
if strings.Contains(model, "western digital") || strings.Contains(product, "western digital") || strings.Contains(product, "wd") {
61+
return "Dell (WD OEM)"
62+
}
63+
if strings.Contains(model, "toshiba") || strings.Contains(product, "toshiba") {
64+
return "Dell (Toshiba OEM)"
65+
}
66+
}
67+
68+
// HP/HPE OEM patterns
69+
if strings.Contains(vendor, "hp") || strings.Contains(vendor, "hpe") {
70+
if strings.Contains(model, "western digital") || strings.Contains(product, "western digital") || strings.Contains(product, "wd") {
71+
return "HP (WD OEM)"
72+
}
73+
if strings.Contains(model, "seagate") || strings.Contains(product, "seagate") {
74+
return "HP (Seagate OEM)"
75+
}
76+
if strings.Contains(model, "toshiba") || strings.Contains(product, "toshiba") {
77+
return "HP (Toshiba OEM)"
78+
}
79+
}
80+
81+
// Supermicro OEM patterns
82+
if strings.Contains(vendor, "supermicro") {
83+
if strings.Contains(model, "intel") || strings.Contains(product, "intel") {
84+
return "Supermicro (Intel OEM)"
85+
}
86+
if strings.Contains(model, "samsung") || strings.Contains(product, "samsung") {
87+
return "Supermicro (Samsung OEM)"
88+
}
89+
}
90+
91+
// Generic patterns - sometimes the product field contains the actual manufacturer
92+
if strings.Contains(product, "seagate") && !strings.Contains(vendor, "seagate") {
93+
return fmt.Sprintf("%s (Seagate OEM)", caser.String(vendor))
94+
}
95+
if strings.Contains(product, "western digital") || strings.Contains(product, "wd") && !strings.Contains(vendor, "western digital") {
96+
return fmt.Sprintf("%s (WD OEM)", caser.String(vendor))
97+
}
98+
if strings.Contains(product, "toshiba") && !strings.Contains(vendor, "toshiba") {
99+
return fmt.Sprintf("%s (Toshiba OEM)", caser.String(vendor))
100+
}
101+
if strings.Contains(product, "hgst") && !strings.Contains(vendor, "hgst") {
102+
return fmt.Sprintf("%s (HGST OEM)", caser.String(vendor))
103+
}
104+
if strings.Contains(product, "samsung") && !strings.Contains(vendor, "samsung") {
105+
return fmt.Sprintf("%s (Samsung OEM)", caser.String(vendor))
106+
}
107+
if strings.Contains(product, "intel") && !strings.Contains(vendor, "intel") {
108+
return fmt.Sprintf("%s (Intel OEM)", caser.String(vendor))
109+
}
110+
111+
return ""
112+
}

pkg/producers/diskhealthmetrics/diskhealthmetrics.go

Lines changed: 34 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,12 @@ import (
1616
func collectDiskHealthMetrics(cfg DiskHealthMetricsConfig) []NormalizedSmartData {
1717
var allMetrics []NormalizedSmartData
1818

19+
// Check if nvme-cli is available for enhanced NVMe support
20+
nvmeCliAvailable := checkNVMeCliInstalled()
21+
if nvmeCliAvailable {
22+
log.Info().Msg("nvme-cli detected, enhanced NVMe metrics will be available")
23+
}
24+
1925
for _, disk := range cfg.Disks {
2026
//FIXME rawData, err := collectSmartData(fmt.Sprintf("/dev/%s", disk))
2127
rawData, err := collectSmartData(disk)
@@ -26,25 +32,41 @@ func collectDiskHealthMetrics(cfg DiskHealthMetricsConfig) []NormalizedSmartData
2632
continue
2733
}
2834

29-
// Normalize the device information
35+
// Enhance NVMe devices with nvme-cli data if available
36+
var nvmeController *NVMeIDControllerOutput
37+
var nvmeErrors *NVMeErrorLogOutput
38+
39+
if nvmeCliAvailable && rawData.Device.Protocol == "NVMe" {
40+
nvmeController, err = collectNVMeControllerData(disk)
41+
if err != nil {
42+
log.Warn().Err(err).Str("disk", disk).Msg("failed to collect NVMe controller data, continuing with smartctl only")
43+
}
44+
45+
nvmeErrors, err = collectNVMeErrorLog(disk)
46+
if err != nil {
47+
log.Warn().Err(err).Str("disk", disk).Msg("failed to collect NVMe error log, continuing without error log data")
48+
}
49+
50+
// Enhance the smartctl data with nvme-cli information
51+
enhanceNVMeData(rawData, nvmeController, nvmeErrors)
52+
}
53+
3054
deviceInfo := &DeviceInfo{}
3155
FillDeviceInfoFromSmartData(deviceInfo, rawData)
3256
NormalizeVendor(deviceInfo)
3357
NormalizeDeviceInfo(deviceInfo)
3458

35-
// Normalize Smart Attributes
3659
smartAttrs := GetSmartAttributes()
3760
ProcessAndUpdateSmartAttributes(smartAttrs, rawData)
38-
CleanupSmartAttributes(smartAttrs)
3961

40-
//FIXME: just for debug Print out the updated smartAttrs
41-
// for key, attr := range smartAttrs {
42-
// fmt.Printf("%s: %s (Unit: %s, Value: %d, Threshold: %d, Worst: %d, Raw: %d)\n", key, attr.Description, attr.Unit, attr.Value, attr.Threshold, attr.Worst, attr.RawValue)
43-
// }
62+
// Process NVMe-specific attributes if we have nvme-cli data
63+
if nvmeController != nil || nvmeErrors != nil {
64+
processNVMeSpecificAttributes(smartAttrs, nvmeController, nvmeErrors)
65+
}
4466

45-
// Normalize the data
46-
normalizedData := normalizeSmartData(rawData, deviceInfo, smartAttrs, cfg.NodeName, cfg.InstanceID, cfg.CephOSDBasePath)
67+
CleanupSmartAttributes(smartAttrs)
4768

69+
normalizedData := normalizeSmartData(rawData, deviceInfo, smartAttrs, cfg.NodeName, cfg.InstanceID, cfg.CephOSDBasePath)
4870
allMetrics = append(allMetrics, normalizedData)
4971
}
5072

@@ -86,7 +108,10 @@ func normalizeSmartData(smartData *SmartCtlOutput, deviceInfo *DeviceInfo, attri
86108
reallocatedSectors = &smartData.SCSIGrownDefectList
87109
}
88110

111+
enhanceDeviceInfo(deviceInfo)
112+
89113
osdID, _ := getOSDIDForDisk(smartData.Device.Name, basePath) // Ignore error as it's handled within the function
114+
90115
return NormalizedSmartData{
91116
NodeName: nodeName,
92117
InstanceID: instanceID,
@@ -115,31 +140,6 @@ func findSmartAttributeByID(attributes []SmartCtlATASMARTEntry, id int64) *Smart
115140
return nil
116141
}
117142

118-
// func findSmartAttributeByName(attributes []SmartCtlATASMARTEntry, name string) *SmartCtlATASMARTEntry {
119-
// for _, attr := range attributes {
120-
// if attr.Name == name {
121-
// return &attr
122-
// }
123-
// }
124-
// return nil
125-
// }
126-
127-
// func parseSMARTOutput(output []byte, attribute string) uint64 {
128-
// lines := strings.Split(string(output), "\n")
129-
// for _, line := range lines {
130-
// if strings.Contains(line, attribute) {
131-
// fields := strings.Fields(line)
132-
// value, err := strconv.ParseUint(fields[9], 10, 64)
133-
// if err != nil {
134-
// log.Printf("Error parsing %s value: %v", attribute, err)
135-
// return 0
136-
// }
137-
// return value
138-
// }
139-
// }
140-
// return 0
141-
// }
142-
143143
func StartMonitoring(cfg DiskHealthMetricsConfig) {
144144
if !checkSmartctlInstalled() {
145145
log.Fatal().Msg("smartctl is not installed. please install smartmontools package.")

0 commit comments

Comments
 (0)