Skip to content

Commit 4fcdaf4

Browse files
authored
retry network tracer init (#67)
* retry network tracer init * retry tracer tests + configuration * retry tracer tests + configuration
1 parent 8d15e82 commit 4fcdaf4

6 files changed

Lines changed: 165 additions & 3 deletions

File tree

checks/net_common.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,13 @@ package checks
33
import (
44
"fmt"
55
"github.com/StackVista/stackstate-process-agent/model"
6+
"github.com/StackVista/tcptracer-bpf/pkg/tracer"
67
"github.com/StackVista/tcptracer-bpf/pkg/tracer/common"
8+
tracerConfig "github.com/StackVista/tcptracer-bpf/pkg/tracer/config"
9+
log "github.com/cihub/seelog"
710
"net"
811
"strings"
12+
"time"
913
)
1014

1115
type ip struct {
@@ -186,3 +190,32 @@ func calculateDirection(d common.Direction) model.ConnectionDirection {
186190
return model.ConnectionDirection_none
187191
}
188192
}
193+
194+
// retryTracerInit tries to create a network tracer with a given retry duration and retry amount
195+
func retryTracerInit(retryDuration time.Duration, retryAmount int, config *tracerConfig.Config,
196+
makeTracer func(*tracerConfig.Config) (tracer.Tracer, error)) (tracer.Tracer, error) {
197+
198+
retryTicker := time.NewTicker(retryDuration)
199+
retriesLeft := retryAmount
200+
201+
var t tracer.Tracer
202+
var err error
203+
204+
retry:
205+
for {
206+
select {
207+
case <-retryTicker.C:
208+
t, err = makeTracer(config)
209+
if err == nil {
210+
break retry
211+
}
212+
log.Debugf("failed to create network tracer: %s. Retrying..", err)
213+
retriesLeft = retriesLeft - 1
214+
if retriesLeft == 0 {
215+
break retry
216+
}
217+
}
218+
}
219+
220+
return t, err
221+
}

checks/net_linux.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,13 @@ package checks
44

55
import (
66
"bytes"
7-
"github.com/patrickmn/go-cache"
8-
97
"github.com/StackVista/stackstate-process-agent/config"
108
"github.com/StackVista/stackstate-process-agent/model"
119
"github.com/StackVista/stackstate-process-agent/net"
1210
"github.com/StackVista/tcptracer-bpf/pkg/tracer"
1311
tracerConfig "github.com/StackVista/tcptracer-bpf/pkg/tracer/config"
1412
log "github.com/cihub/seelog"
13+
"github.com/patrickmn/go-cache"
1514
"os"
1615
)
1716

@@ -39,7 +38,7 @@ func (c *ConnectionsCheck) Init(cfg *config.AgentConfig, sysInfo *model.SystemIn
3938
conf.MaxConnections = cfg.MaxPerMessage
4039
conf.BackfillFromProc = cfg.NetworkInitialConnectionsFromProc
4140

42-
t, err := tracer.NewTracer(conf)
41+
t, err := retryTracerInit(cfg.NetworkTracerInitRetryDuration, cfg.NetworkTracerInitRetryAmount, conf, tracer.NewTracer)
4342
if err != nil {
4443
log.Errorf("failed to create network tracer: %s. Set the environment STS_NETWORK_TRACING_ENABLED to false to disable network connections reporting", err)
4544
return

checks/net_linux_test.go

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
package checks
2+
3+
import (
4+
"errors"
5+
"github.com/StackVista/tcptracer-bpf/pkg/tracer"
6+
tracerConfig "github.com/StackVista/tcptracer-bpf/pkg/tracer/config"
7+
"github.com/stretchr/testify/assert"
8+
"testing"
9+
"time"
10+
)
11+
12+
func TestConnectionsCheck_retryTracerInit(t *testing.T) {
13+
retryDuration := 40 * time.Millisecond
14+
retryAmount := 3
15+
testRetry := 0
16+
17+
for _, tc := range []struct {
18+
name string
19+
mockMakeTracerFunc func(config *tracerConfig.Config) (tracer.Tracer, error)
20+
expectedTracer tracer.Tracer
21+
expectedError string
22+
}{
23+
{
24+
name: "Returns the tracer when the make tracer function returns one",
25+
mockMakeTracerFunc: func(config *tracerConfig.Config) (tracer.Tracer, error) {
26+
return &tracer.LinuxTracer{}, nil
27+
},
28+
expectedTracer: &tracer.LinuxTracer{},
29+
expectedError: "",
30+
},
31+
{
32+
name: "Returns the tracer when the amount of retries are below the configured amount. ie retrying makes it work.",
33+
mockMakeTracerFunc: func(config *tracerConfig.Config) (tracer.Tracer, error) {
34+
if testRetry < retryAmount-1 {
35+
testRetry = testRetry + 1
36+
return nil, errors.New("failed to create tracer")
37+
}
38+
return &tracer.LinuxTracer{}, nil
39+
},
40+
expectedTracer: &tracer.LinuxTracer{},
41+
expectedError: "",
42+
},
43+
{
44+
name: "Returns an error when max retries are reached.",
45+
mockMakeTracerFunc: func(config *tracerConfig.Config) (tracer.Tracer, error) {
46+
if testRetry <= retryAmount-1 {
47+
testRetry = testRetry + 1
48+
return nil, errors.New("failed to create tracer")
49+
}
50+
return &tracer.LinuxTracer{}, nil
51+
},
52+
expectedTracer: nil,
53+
expectedError: "failed to create tracer",
54+
},
55+
{
56+
name: "Return an error when the make tracer function returns an error",
57+
mockMakeTracerFunc: func(config *tracerConfig.Config) (tracer.Tracer, error) {
58+
return nil, errors.New("failed to create tracer")
59+
},
60+
expectedError: "failed to create tracer",
61+
},
62+
} {
63+
t.Run(tc.name, func(t *testing.T) {
64+
tr, err := retryTracerInit(retryDuration, retryAmount, tracerConfig.DefaultConfig, tc.mockMakeTracerFunc)
65+
if tc.expectedError != "" {
66+
assert.EqualError(t, err, tc.expectedError)
67+
} else {
68+
assert.NoError(t, err)
69+
}
70+
assert.EqualValues(t, tc.expectedTracer, tr)
71+
72+
// reset the test retries
73+
testRetry = 0
74+
})
75+
}
76+
}

config/config.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,8 @@ type AgentConfig struct {
118118
NetworkInitialConnectionsFromProc bool
119119
NetworkTracerSocketPath string
120120
NetworkTracerLogFile string
121+
NetworkTracerInitRetryDuration time.Duration
122+
NetworkTracerInitRetryAmount int
121123

122124
// Check config
123125
EnabledChecks []string
@@ -236,6 +238,8 @@ func NewDefaultAgentConfig() *AgentConfig {
236238
NetworkInitialConnectionsFromProc: true,
237239
NetworkTracerSocketPath: defaultNetworkTracerSocketPath,
238240
NetworkTracerLogFile: defaultNetworkLogFilePath,
241+
NetworkTracerInitRetryDuration: 5 * time.Second,
242+
NetworkTracerInitRetryAmount: 3,
239243

240244
// Check config
241245
EnabledChecks: containerChecks,
@@ -693,6 +697,15 @@ func mergeEnvironmentVariables(c *AgentConfig) *AgentConfig {
693697
c.NetworkRelationCacheDurationMin = time.Duration(durationS) * time.Minute
694698
}
695699

700+
if v := os.Getenv("STS_NETWORK_TRACER_INIT_RETRY_DURATION_SEC"); v != "" {
701+
durationS, _ := strconv.Atoi(v)
702+
c.NetworkTracerInitRetryDuration = time.Duration(durationS) * time.Second
703+
}
704+
705+
if v, err := strconv.Atoi(os.Getenv("STS_NETWORK_TRACER_INIT_RETRY_AMOUNT")); err == nil {
706+
c.NetworkTracerInitRetryAmount = v
707+
}
708+
696709
if v, err := strconv.Atoi(os.Getenv("STS_PROCESS_FILTER_SHORT_LIVED_QUALIFIER_SECS")); err == nil {
697710
setProcessFilters(c, true, v)
698711
}

config/config_test.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,19 @@ func TestSetBlacklistFromEnv(t *testing.T) {
341341
os.Unsetenv("STS_PROCESS_BLACKLIST_INCLUSIONS_MEM_THRESHOLD")
342342
}
343343

344+
func TestSetNetworkTracerInitRetryFromEnv(t *testing.T) {
345+
os.Setenv("STS_NETWORK_TRACER_INIT_RETRY_DURATION_SEC", "30")
346+
os.Setenv("STS_NETWORK_TRACER_INIT_RETRY_AMOUNT", "4")
347+
348+
agentConfig, _ := NewAgentConfig(nil, nil, nil)
349+
350+
assert.Equal(t, 30*time.Second, agentConfig.NetworkTracerInitRetryDuration)
351+
assert.Equal(t, 4, agentConfig.NetworkTracerInitRetryAmount)
352+
353+
os.Unsetenv("STS_NETWORK_TRACER_INIT_RETRY_DURATION_SEC")
354+
os.Unsetenv("STS_NETWORK_TRACER_INIT_RETRY_AMOUNT")
355+
}
356+
344357
func TestOnlyEnvConfig(t *testing.T) {
345358
// setting an API Key should be enough to generate valid config
346359
os.Setenv("DD_API_KEY", "apikey_from_env")
@@ -1280,3 +1293,19 @@ func TestStackStatePreferPROCESS_AGENT_URLOverYamlsts_sts_url(t *testing.T) {
12801293
assert.Equal("apikey_30", ep.APIKey)
12811294
assert.Equal("process-endpoint.test.stackstate.com", ep.Endpoint.Hostname())
12821295
}
1296+
1297+
func TestNetworkTracerInitRetry_FromYaml(t *testing.T) {
1298+
var ddy YamlAgentConfig
1299+
err := yaml.Unmarshal([]byte(strings.Join([]string{
1300+
"network_tracer_config:",
1301+
" network_tracer_retry_init_duration_sec: 50",
1302+
" network_tracer_retry_init_amount: 10",
1303+
}, "\n")), &ddy)
1304+
assert.NoError(t, err)
1305+
1306+
agentConfig, err := NewAgentConfig(nil, &ddy, nil)
1307+
assert.NoError(t, err)
1308+
1309+
assert.Equal(t, 10, agentConfig.NetworkTracerInitRetryAmount)
1310+
assert.Equal(t, 50*time.Second, agentConfig.NetworkTracerInitRetryDuration)
1311+
}

config/yaml_config.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,10 @@ type YamlAgentConfig struct {
121121
UnixSocketPath string `yaml:"nettracer_socket"`
122122
// The full path to the file where network-tracer logs will be written.
123123
LogFile string `yaml:"log_file"`
124+
// An integer giving the retry interval, in seconds, for initializing the network tracer.
125+
NetworkTracerInitRetryDuration int `yaml:"network_tracer_retry_init_duration_sec"`
126+
// An integer giving the number of retries to use for initializing the network tracer.
127+
NetworkTracerInitRetryAmount int `yaml:"network_tracer_retry_init_amount"`
124128
} `yaml:"network_tracer_config"`
125129
}
126130

@@ -247,6 +251,14 @@ func mergeYamlConfig(agentConf *AgentConfig, yc *YamlAgentConfig) (*AgentConfig,
247251
agentConf.NetworkRelationCacheDurationMin = time.Duration(yc.Process.NetworkRelationCacheDurationMin) * time.Minute
248252
}
249253

254+
if yc.Network.NetworkTracerInitRetryDuration > 0 {
255+
agentConf.NetworkTracerInitRetryDuration = time.Duration(yc.Network.NetworkTracerInitRetryDuration) * time.Second
256+
}
257+
258+
if yc.Network.NetworkTracerInitRetryAmount > 0 {
259+
agentConf.NetworkTracerInitRetryAmount = yc.Network.NetworkTracerInitRetryAmount
260+
}
261+
250262
// DataScrubber
251263
if yc.Process.ScrubArgs != nil {
252264
agentConf.Scrubber.Enabled = *yc.Process.ScrubArgs

0 commit comments

Comments
 (0)