Skip to content

Commit 4d8ff36

Browse files
refactor: split NetworkData into sub-structs, bump schema to 1.1.0 (#9)
Break the 60+ field NetworkData monolith into logical sub-structs:

NetworkData
├── TCP *TCPStats (existing — unchanged)
├── Conntrack *ConntrackStats (existing — unchanged)
├── TCPExt *TCPExtendedStats (NEW: ListenOverflows, Drops, RcvQ, ZeroWindow + rates)
├── UDP *UDPStats (NEW: RcvbufErrors, SndbufErrors, InErrors, rate)
├── Softnet *SoftnetData (NEW: Stats[], IRQDistribution[], DropRate, SqueezeRate)
├── SocketMem *SocketMemStats (NEW: TCPInUse, Orphans, MemPages, UDPInUse)
├── Sysctls *NetworkSysctls (NEW: 27 sysctl values)
├── Interfaces []NetworkInterface (existing — unchanged)
├── ListenSockets []ListenSocket (existing — unchanged)
└── BCC fields (TotalConnections, AvgLatencyMs, etc. — unchanged)

This is a breaking JSON schema change: fields that were at the top level of the network data object are now nested under sub-structs.
All consumers updated: anomaly evaluators, recommendation engine, collector, tests.
Schema version bumped from 1.0.0 to 1.1.0.

Verified on production server (37.27.106.246):
- All sub-structs populated correctly (sysctls, tcp_ext, softnet, socket_mem, conntrack)
- 8 recommendations generated for detected suboptimal settings
- MCP server initializes correctly
- 451 unit tests pass

Co-authored-by: dmitriimaksimovdevelop <227611064+dmitriimaksimovdevelop@users.noreply.github.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent e6418f3 commit 4d8ff36

11 files changed

Lines changed: 305 additions & 255 deletions

File tree

README.md

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -309,29 +309,34 @@ The JSON report has four main sections:
309309
| `summary.health_score` | 90-100 = healthy, 70-89 = some issues, <70 = needs attention |
310310
| `summary.anomalies` | Each has `severity` (warning/critical), `metric`, `message` |
311311
| `summary.recommendations` | Copy-paste the `commands` field to fix issues |
312-
| `categories.network[0].data` | Raw metrics — interfaces, TCP, conntrack, softnet, etc. |
312+
| `categories.network[0].data` | Raw metrics grouped into sub-structs: `.sysctls`, `.tcp_ext`, `.softnet`, `.udp`, `.socket_mem`, alongside the existing `.conntrack` and `.interfaces` |
313313

314314
### Network Deep Diagnostics — Manual Inspection
315315

316316
```bash
317317
# Conntrack table usage
318318
jq '.categories.network[0].data.conntrack' report.json
319-
# {"count": 15000, "max": 65536, "usage_pct": 22.9}
320319

321320
# Softnet drops (per-CPU) — any "dropped" > 0 is bad
322-
jq '.categories.network[0].data.softnet_stats[] | select(.dropped > 0)' report.json
321+
jq '.categories.network[0].data.softnet.stats[] | select(.dropped > 0)' report.json
323322

324-
# Listen overflows (accept queue full)
325-
jq '.categories.network[0].data | {listen_overflows, listen_drops}' report.json
323+
# Listen overflows (accept queue full) — cumulative counters plus the per-second overflow rate
324+
jq '.categories.network[0].data.tcp_ext | {listen_overflows, listen_drops, listen_overflow_rate}' report.json
326325

327326
# NIC ring buffer (is it maxed out?)
328327
jq '.categories.network[0].data.interfaces[] | {name, driver, ring_rx_current, ring_rx_max, rx_discards}' report.json
329328

330329
# IRQ imbalance (check if one CPU handles all network interrupts)
331-
jq '.categories.network[0].data.irq_distribution' report.json
330+
jq '.categories.network[0].data.softnet.irq_distribution' report.json
332331

333332
# TCP memory pressure
334-
jq '.categories.network[0].data | {prune_called, tcp_abort_on_memory, tcp_mem}' report.json
333+
jq '.categories.network[0].data.tcp_ext | {prune_called, tcp_abort_on_memory}' report.json
334+
335+
# All sysctls at a glance
336+
jq '.categories.network[0].data.sysctls' report.json
337+
338+
# Socket memory and orphan sockets
339+
jq '.categories.network[0].data.socket_mem' report.json
335340
```
336341

337342
### Useful jq One-Liners

internal/collector/network.go

Lines changed: 97 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -97,96 +97,106 @@ func (c *NetworkCollector) Collect(ctx context.Context, cfg CollectConfig) (*mod
9797
// ss — connection state summary
9898
c.parseSSConnections(ctx, data)
9999

100-
// TCP sysctl tuning parameters
101-
data.CongestionCtrl = readSysctlString(c.procRoot, "sys/net/ipv4/tcp_congestion_control")
102-
data.TCPRmem = readSysctlString(c.procRoot, "sys/net/ipv4/tcp_rmem")
103-
data.TCPWmem = readSysctlString(c.procRoot, "sys/net/ipv4/tcp_wmem")
104-
data.SomaxConn = readSysctlInt(c.procRoot, "sys/net/core/somaxconn")
105-
data.TCPMaxSynBacklog = readSysctlInt(c.procRoot, "sys/net/ipv4/tcp_max_syn_backlog")
106-
data.TCPTWReuse = readSysctlInt(c.procRoot, "sys/net/ipv4/tcp_tw_reuse")
107-
108-
// Deep network diagnostics — sysctls
109-
data.TCPMem = readSysctlString(c.procRoot, "sys/net/ipv4/tcp_mem")
110-
data.TCPMaxTwBuckets = readSysctlInt(c.procRoot, "sys/net/ipv4/tcp_max_tw_buckets")
111-
data.TCPKeepaliveTime = readSysctlInt(c.procRoot, "sys/net/ipv4/tcp_keepalive_time")
112-
data.TCPKeepaliveIntvl = readSysctlInt(c.procRoot, "sys/net/ipv4/tcp_keepalive_intvl")
113-
data.TCPKeepaliveProbes = readSysctlInt(c.procRoot, "sys/net/ipv4/tcp_keepalive_probes")
114-
data.NetdevBudget = readSysctlInt(c.procRoot, "sys/net/core/netdev_budget")
115-
data.NetdevBudgetUsecs = readSysctlInt(c.procRoot, "sys/net/core/netdev_budget_usecs")
116-
data.NetdevMaxBacklog = readSysctlInt(c.procRoot, "sys/net/core/netdev_max_backlog")
117-
data.RmemMax = readSysctlInt(c.procRoot, "sys/net/core/rmem_max")
118-
data.WmemMax = readSysctlInt(c.procRoot, "sys/net/core/wmem_max")
119-
data.IPLocalPortRange = readSysctlString(c.procRoot, "sys/net/ipv4/ip_local_port_range")
120-
data.TCPFinTimeout = readSysctlInt(c.procRoot, "sys/net/ipv4/tcp_fin_timeout")
121-
data.TCPSlowStartAfterIdle = readSysctlInt(c.procRoot, "sys/net/ipv4/tcp_slow_start_after_idle")
122-
data.TCPFastOpen = readSysctlInt(c.procRoot, "sys/net/ipv4/tcp_fastopen")
123-
data.TCPSyncookies = readSysctlInt(c.procRoot, "sys/net/ipv4/tcp_syncookies")
124-
data.TCPNotsentLowat = readSysctlInt(c.procRoot, "sys/net/ipv4/tcp_notsent_lowat")
125-
data.DefaultQdisc = readSysctlString(c.procRoot, "sys/net/core/default_qdisc")
126-
data.TCPMtuProbing = readSysctlInt(c.procRoot, "sys/net/ipv4/tcp_mtu_probing")
127-
data.ARPGcThresh1 = readSysctlInt(c.procRoot, "sys/net/ipv4/neigh/default/gc_thresh1")
128-
data.ARPGcThresh2 = readSysctlInt(c.procRoot, "sys/net/ipv4/neigh/default/gc_thresh2")
129-
data.ARPGcThresh3 = readSysctlInt(c.procRoot, "sys/net/ipv4/neigh/default/gc_thresh3")
100+
// Initialize sub-structs
101+
data.Sysctls = &model.NetworkSysctls{}
102+
data.TCPExt = &model.TCPExtendedStats{}
103+
data.UDP = &model.UDPStats{}
104+
data.Softnet = &model.SoftnetData{}
105+
data.SocketMem = &model.SocketMemStats{}
106+
107+
// TCP/network sysctl tuning parameters
108+
sc := data.Sysctls
109+
sc.CongestionCtrl = readSysctlString(c.procRoot, "sys/net/ipv4/tcp_congestion_control")
110+
sc.TCPRmem = readSysctlString(c.procRoot, "sys/net/ipv4/tcp_rmem")
111+
sc.TCPWmem = readSysctlString(c.procRoot, "sys/net/ipv4/tcp_wmem")
112+
sc.SomaxConn = readSysctlInt(c.procRoot, "sys/net/core/somaxconn")
113+
sc.TCPMaxSynBacklog = readSysctlInt(c.procRoot, "sys/net/ipv4/tcp_max_syn_backlog")
114+
sc.TCPTWReuse = readSysctlInt(c.procRoot, "sys/net/ipv4/tcp_tw_reuse")
115+
sc.TCPMem = readSysctlString(c.procRoot, "sys/net/ipv4/tcp_mem")
116+
sc.TCPMaxTwBuckets = readSysctlInt(c.procRoot, "sys/net/ipv4/tcp_max_tw_buckets")
117+
sc.TCPKeepaliveTime = readSysctlInt(c.procRoot, "sys/net/ipv4/tcp_keepalive_time")
118+
sc.TCPKeepaliveIntvl = readSysctlInt(c.procRoot, "sys/net/ipv4/tcp_keepalive_intvl")
119+
sc.TCPKeepaliveProbes = readSysctlInt(c.procRoot, "sys/net/ipv4/tcp_keepalive_probes")
120+
sc.NetdevBudget = readSysctlInt(c.procRoot, "sys/net/core/netdev_budget")
121+
sc.NetdevBudgetUsecs = readSysctlInt(c.procRoot, "sys/net/core/netdev_budget_usecs")
122+
sc.NetdevMaxBacklog = readSysctlInt(c.procRoot, "sys/net/core/netdev_max_backlog")
123+
sc.RmemMax = readSysctlInt(c.procRoot, "sys/net/core/rmem_max")
124+
sc.WmemMax = readSysctlInt(c.procRoot, "sys/net/core/wmem_max")
125+
sc.IPLocalPortRange = readSysctlString(c.procRoot, "sys/net/ipv4/ip_local_port_range")
126+
sc.TCPFinTimeout = readSysctlInt(c.procRoot, "sys/net/ipv4/tcp_fin_timeout")
127+
sc.TCPSlowStartAfterIdle = readSysctlInt(c.procRoot, "sys/net/ipv4/tcp_slow_start_after_idle")
128+
sc.TCPFastOpen = readSysctlInt(c.procRoot, "sys/net/ipv4/tcp_fastopen")
129+
sc.TCPSyncookies = readSysctlInt(c.procRoot, "sys/net/ipv4/tcp_syncookies")
130+
sc.TCPNotsentLowat = readSysctlInt(c.procRoot, "sys/net/ipv4/tcp_notsent_lowat")
131+
sc.DefaultQdisc = readSysctlString(c.procRoot, "sys/net/core/default_qdisc")
132+
sc.TCPMtuProbing = readSysctlInt(c.procRoot, "sys/net/ipv4/tcp_mtu_probing")
133+
sc.ARPGcThresh1 = readSysctlInt(c.procRoot, "sys/net/ipv4/neigh/default/gc_thresh1")
134+
sc.ARPGcThresh2 = readSysctlInt(c.procRoot, "sys/net/ipv4/neigh/default/gc_thresh2")
135+
sc.ARPGcThresh3 = readSysctlInt(c.procRoot, "sys/net/ipv4/neigh/default/gc_thresh3")
130136

131137
// Conntrack table stats
132138
data.Conntrack = c.parseConntrack()
133139

134140
// Softnet stats (per-CPU packet processing)
135-
data.SoftnetStats = c.parseSoftnetStat()
141+
data.Softnet.Stats = c.parseSoftnetStat()
136142

137143
// IRQ distribution (two-point delta — reuse pre/post interval samples)
138-
data.IRQDistribution = c.computeIRQDistribution(irqSample1)
144+
data.Softnet.IRQDistribution = c.computeIRQDistribution(irqSample1)
139145

140146
// Extended TCP stats from /proc/net/netstat
141147
c.parseNetstat(data)
142148

143149
// Compute rate fields from two-point deltas
144150
secs := interval.Seconds()
151+
ext := data.TCPExt
145152
if secs > 0 {
146153
// Softnet drop/squeeze rates
147-
if softnet1 != nil && len(data.SoftnetStats) == len(softnet1) {
154+
if softnet1 != nil && len(data.Softnet.Stats) == len(softnet1) {
148155
var dropDelta, squeezeDelta int64
149156
for i := range softnet1 {
150-
dd := data.SoftnetStats[i].Dropped - softnet1[i].Dropped
151-
sd := data.SoftnetStats[i].TimeSqueeze - softnet1[i].TimeSqueeze
157+
dd := data.Softnet.Stats[i].Dropped - softnet1[i].Dropped
158+
sd := data.Softnet.Stats[i].TimeSqueeze - softnet1[i].TimeSqueeze
152159
if dd > 0 {
153160
dropDelta += dd
154161
}
155162
if sd > 0 {
156163
squeezeDelta += sd
157164
}
158165
}
159-
data.SoftnetDropRate = float64(dropDelta) / secs
160-
data.SoftnetSqueezeRate = float64(squeezeDelta) / secs
166+
data.Softnet.DropRate = float64(dropDelta) / secs
167+
data.Softnet.SqueezeRate = float64(squeezeDelta) / secs
161168
}
162169
// TCP extended counter rates
163-
if netstat1.ListenOverflows > 0 || data.ListenOverflows > 0 {
164-
d := data.ListenOverflows - netstat1.ListenOverflows
165-
if d > 0 {
166-
data.ListenOverflowRate = float64(d) / secs
170+
if netstat1.TCPExt != nil {
171+
n1 := netstat1.TCPExt
172+
if n1.ListenOverflows > 0 || ext.ListenOverflows > 0 {
173+
d := ext.ListenOverflows - n1.ListenOverflows
174+
if d > 0 {
175+
ext.ListenOverflowRate = float64(d) / secs
176+
}
167177
}
168-
}
169-
if netstat1.TCPAbortOnMemory > 0 || data.TCPAbortOnMemory > 0 {
170-
d := data.TCPAbortOnMemory - netstat1.TCPAbortOnMemory
171-
if d > 0 {
172-
data.TCPAbortMemRate = float64(d) / secs
178+
if n1.TCPAbortOnMemory > 0 || ext.TCPAbortOnMemory > 0 {
179+
d := ext.TCPAbortOnMemory - n1.TCPAbortOnMemory
180+
if d > 0 {
181+
ext.TCPAbortMemRate = float64(d) / secs
182+
}
183+
}
184+
if ext.TCPRcvQDrop > n1.TCPRcvQDrop {
185+
ext.TCPRcvQDropRate = float64(ext.TCPRcvQDrop-n1.TCPRcvQDrop) / secs
186+
}
187+
if ext.TCPZeroWindowDrop > n1.TCPZeroWindowDrop {
188+
ext.TCPZeroWindowDropRate = float64(ext.TCPZeroWindowDrop-n1.TCPZeroWindowDrop) / secs
173189
}
174190
}
175191
// UDP rcvbuf error rate
176-
if snmp1.UDPRcvbufErrors > 0 || data.UDPRcvbufErrors > 0 {
177-
d := data.UDPRcvbufErrors - snmp1.UDPRcvbufErrors
178-
if d > 0 {
179-
data.UDPRcvbufErrRate = float64(d) / secs
192+
if snmp1.UDP != nil && data.UDP != nil {
193+
if snmp1.UDP.RcvbufErrors > 0 || data.UDP.RcvbufErrors > 0 {
194+
d := data.UDP.RcvbufErrors - snmp1.UDP.RcvbufErrors
195+
if d > 0 {
196+
data.UDP.RcvbufErrRate = float64(d) / secs
197+
}
180198
}
181199
}
182-
// TCPRcvQDrop rate (app not reading from ESTAB sockets)
183-
if data.TCPRcvQDrop > netstat1.TCPRcvQDrop {
184-
data.TCPRcvQDropRate = float64(data.TCPRcvQDrop-netstat1.TCPRcvQDrop) / secs
185-
}
186-
// TCPZeroWindowDrop rate
187-
if data.TCPZeroWindowDrop > netstat1.TCPZeroWindowDrop {
188-
data.TCPZeroWindowDropRate = float64(data.TCPZeroWindowDrop-netstat1.TCPZeroWindowDrop) / secs
189-
}
190200
}
191201

192202
// Listen queue depths + ESTABLISHED Recv-Q saturation
@@ -309,6 +319,9 @@ func (c *NetworkCollector) parseSNMP(data *model.NetworkData) {
309319
if udpHeaders == nil {
310320
udpHeaders = fields[1:]
311321
} else {
322+
if data.UDP == nil {
323+
data.UDP = &model.UDPStats{}
324+
}
312325
vals := fields[1:]
313326
for i, header := range udpHeaders {
314327
if i >= len(vals) {
@@ -317,11 +330,11 @@ func (c *NetworkCollector) parseSNMP(data *model.NetworkData) {
317330
v, _ := strconv.ParseInt(vals[i], 10, 64)
318331
switch header {
319332
case "RcvbufErrors":
320-
data.UDPRcvbufErrors = v
333+
data.UDP.RcvbufErrors = v
321334
case "SndbufErrors":
322-
data.UDPSndbufErrors = v
335+
data.UDP.SndbufErrors = v
323336
case "InErrors":
324-
data.UDPInErrors = v
337+
data.UDP.InErrors = v
325338
}
326339
}
327340
}
@@ -492,25 +505,29 @@ func (c *NetworkCollector) parseNetstat(data *model.NetworkData) {
492505
break
493506
}
494507
v, _ := strconv.ParseInt(vals[i], 10, 64)
508+
if data.TCPExt == nil {
509+
data.TCPExt = &model.TCPExtendedStats{}
510+
}
511+
ext := data.TCPExt
495512
switch header {
496513
case "ListenOverflows":
497-
data.ListenOverflows = v
514+
ext.ListenOverflows = v
498515
case "ListenDrops":
499-
data.ListenDrops = v
516+
ext.ListenDrops = v
500517
case "TCPAbortOnMemory":
501-
data.TCPAbortOnMemory = v
518+
ext.TCPAbortOnMemory = v
502519
case "TCPOFOQueue":
503-
data.TCPOFOQueue = v
520+
ext.TCPOFOQueue = v
504521
case "PruneCalled":
505-
data.PruneCalled = v
522+
ext.PruneCalled = v
506523
case "TCPRcvQDrop":
507-
data.TCPRcvQDrop = v
524+
ext.TCPRcvQDrop = v
508525
case "TCPZeroWindowDrop":
509-
data.TCPZeroWindowDrop = v
526+
ext.TCPZeroWindowDrop = v
510527
case "TCPToZeroWindowAdv":
511-
data.TCPToZeroWindowAdv = v
528+
ext.TCPToZeroWindowAdv = v
512529
case "TCPFromZeroWindowAdv":
513-
data.TCPFromZeroWindowAdv = v
530+
ext.TCPFromZeroWindowAdv = v
514531
}
515532
}
516533
break
@@ -732,23 +749,29 @@ func (c *NetworkCollector) parseSockstat(data *model.NetworkData) {
732749
}
733750
// Format: "TCP: inuse 123 orphan 4 tw 567 alloc 890 mem 12"
734751
if fields[0] == "TCP:" {
752+
if data.SocketMem == nil {
753+
data.SocketMem = &model.SocketMemStats{}
754+
}
735755
for i := 1; i+1 < len(fields); i += 2 {
736756
v, _ := strconv.Atoi(fields[i+1])
737757
switch fields[i] {
738758
case "inuse":
739-
data.TCPSocketsInUse = v
759+
data.SocketMem.TCPInUse = v
740760
case "orphan":
741-
data.TCPOrphans = v
761+
data.SocketMem.TCPOrphans = v
742762
case "mem":
743-
data.TCPMemPages = v
763+
data.SocketMem.TCPMemPages = v
744764
}
745765
}
746766
}
747767
if fields[0] == "UDP:" {
768+
if data.SocketMem == nil {
769+
data.SocketMem = &model.SocketMemStats{}
770+
}
748771
for i := 1; i+1 < len(fields); i += 2 {
749772
v, _ := strconv.Atoi(fields[i+1])
750773
if fields[i] == "inuse" {
751-
data.UDPSocketsInUse = v
774+
data.SocketMem.UDPInUse = v
752775
}
753776
}
754777
}

0 commit comments

Comments
 (0)