Skip to content

Commit f650eb5

Browse files
vparfonov and claude committed
feat(vector): add support for raising file descriptor limits
Add annotation-driven support for Vector's VECTOR_RAISE_FD_LIMIT env var, enabling the collector to raise file descriptor soft limits at startup. This prevents "Too many open files" errors when monitoring large numbers of log files concurrently.

Key changes:
- New annotation: observability.openshift.io/raise-fd-limit (default: false)
- Sets VECTOR_RAISE_FD_LIMIT env var on the collector container
- Vector reads the env var natively, no entrypoint script changes needed
- Input validation rejects invalid values (only "true"/"false" allowed)
- Status condition reports validation errors to the user

Default matches Vector's behavior (false = don't raise fd limits). Users must explicitly set the annotation to "true" to opt in.

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
1 parent 218eb4a commit f650eb5

9 files changed

Lines changed: 151 additions & 0 deletions

File tree

api/observability/v1/conditions.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ const (
3535
// ConditionTypeMaxUnavailable validates the value of the max-unavailable-rollout annotation
3636
ConditionTypeMaxUnavailable = GroupName + "/MaxUnavailableAnnotation"
3737

38+
// ConditionTypeRaiseFdLimit validates the value of the raise-fd-limit annotation
39+
ConditionTypeRaiseFdLimit = GroupName + "/RaiseFdLimitAnnotation"
40+
3841
// ConditionTypeReady indicates the service is ready.
3942
//
4043
// Ready=True means the operands are running and providing some service.
@@ -89,6 +92,9 @@ const (
8992
// ReasonKubeCacheSupported indicates the support for the use-apiserver-cache annotation value
9093
ReasonKubeCacheSupported = "KubeCacheAnnotationSupported"
9194

95+
// ReasonRaiseFdLimitSupported indicates the support for the raise-fd-limit annotation value
96+
ReasonRaiseFdLimitSupported = "RaiseFdLimitAnnotationSupported"
97+
9298
// ReasonReconciliationComplete when the operator has initialized, validated, and deployed the resources for the workload
9399
ReasonReconciliationComplete = "ReconciliationComplete"
94100

docs/features/raise_fd_limit.adoc

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
== Vector Collector: Raising File Descriptor Limits
2+
3+
This feature enables Vector to automatically raise its file descriptor soft limit to match the hard limit at startup. This prevents "Too many open files" errors when Vector is monitoring a large number of log files concurrently.
4+
5+
Systems often default to restrictive file descriptor soft limits (e.g. 1024 on Linux), which can cause Vector to fail when processing many log sources simultaneously. When enabled, the operator sets the `VECTOR_RAISE_FD_LIMIT` environment variable, which Vector reads natively to raise the soft limit at startup without requiring manual sysadmin intervention.
6+
7+
Annotation: `observability.openshift.io/raise-fd-limit`.
8+
9+
The Cluster Logging Operator does not enable this feature by default (`false`), matching Vector's default behavior.
10+
11+
Supported values are:
12+
13+
. `true` - raise the file descriptor soft limit to match the hard limit
14+
. `false` (default) - do not modify file descriptor limits
15+
16+
=== Example
17+
.Enable Raising File Descriptor Limits
18+
[source]
19+
----
20+
apiVersion: "observability.openshift.io/v1"
21+
kind: ClusterLogForwarder
22+
metadata:
23+
name: instance
24+
namespace: openshift-logging
25+
annotations:
26+
observability.openshift.io/raise-fd-limit: "true"
27+
spec:
28+
outputs:
29+
- name: devel
30+
type: elasticsearch
31+
pipelines:
32+
- name: devel-logs
33+
inputRefs:
34+
- application
35+
outputRefs:
36+
- devel
37+
----
38+
This configuration will enable Vector to raise its file descriptor soft limit at startup.

internal/collector/collector.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,7 @@ func (f *Factory) NewCollectorContainer(inputs internalobs.Inputs, outputs inter
208208
{Name: "OPENSHIFT_CLUSTER_ID", Value: clusterID},
209209
{Name: "POD_IP", ValueFrom: &v1.EnvVarSource{FieldRef: &v1.ObjectFieldSelector{APIVersion: "v1", FieldPath: "status.podIP"}}},
210210
{Name: "POD_IPS", ValueFrom: &v1.EnvVarSource{FieldRef: &v1.ObjectFieldSelector{APIVersion: "v1", FieldPath: "status.podIPs"}}},
211+
{Name: "VECTOR_RAISE_FD_LIMIT", Value: RaiseFdLimit(f.annotations)},
211212
}
212213
collector.Env = append(collector.Env, utils.GetProxyEnvVars()...)
213214

@@ -405,3 +406,13 @@ func LogLevel(annotations map[string]string) string {
405406
}
406407
return "warn"
407408
}
409+
410+
func RaiseFdLimit(annotations map[string]string) string {
411+
if value, ok := annotations[constants.AnnotationVectorRaiseFdLimit]; ok {
412+
if value == "true" || value == "false" {
413+
return value
414+
}
415+
log.V(0).Info("Invalid value for annotation, using default", "annotation", constants.AnnotationVectorRaiseFdLimit, "value", value, "default", "false")
416+
}
417+
return "false"
418+
}

internal/collector/collector_test.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,30 @@ var _ = Describe("Factory#Daemonset", func() {
138138
Expect(collector.Env).To(IncludeEnvVar(v1.EnvVar{Name: "VECTOR_LOG", Value: logLevelDebug}))
139139
})
140140

141+
It("should set VECTOR_RAISE_FD_LIMIT to false by default", func() {
142+
Expect(collector.Env).To(IncludeEnvVar(v1.EnvVar{Name: "VECTOR_RAISE_FD_LIMIT", Value: "false"}))
143+
})
144+
145+
It("should set VECTOR_RAISE_FD_LIMIT to true when annotation is set", func() {
146+
factory.annotations = map[string]string{
147+
constants.AnnotationVectorRaiseFdLimit: "true",
148+
}
149+
150+
podSpec = *factory.NewPodSpec(nil, obs.ClusterLogForwarderSpec{}, "1234", tls.GetClusterTLSProfileSpec(nil), constants.OpenshiftNS)
151+
collector = podSpec.Containers[0]
152+
Expect(collector.Env).To(IncludeEnvVar(v1.EnvVar{Name: "VECTOR_RAISE_FD_LIMIT", Value: "true"}))
153+
})
154+
155+
It("should default VECTOR_RAISE_FD_LIMIT to false when annotation has invalid value", func() {
156+
factory.annotations = map[string]string{
157+
constants.AnnotationVectorRaiseFdLimit: "yes",
158+
}
159+
160+
podSpec = *factory.NewPodSpec(nil, obs.ClusterLogForwarderSpec{}, "1234", tls.GetClusterTLSProfileSpec(nil), constants.OpenshiftNS)
161+
collector = podSpec.Containers[0]
162+
Expect(collector.Env).To(IncludeEnvVar(v1.EnvVar{Name: "VECTOR_RAISE_FD_LIMIT", Value: "false"}))
163+
})
164+
141165
Context("the volume mounts", func() {
142166
It("should mount all output configmaps", func() {
143167
Expect(collector.VolumeMounts).To(IncludeVolumeMount(

internal/constants/annotations.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ const (
1313
// CLO's default log level for vector is `warn`: https://issues.redhat.com/browse/LOG-3435
1414
AnnotationVectorLogLevel = "observability.openshift.io/log-level"
1515

16+
// AnnotationVectorRaiseFdLimit controls whether Vector raises its file descriptor soft limit at startup.
17+
// Valid values are "true" or "false". Default is "false" (matching Vector's default).
18+
AnnotationVectorRaiseFdLimit = "observability.openshift.io/raise-fd-limit"
19+
1620
AnnotationSecretHash = "observability.openshift.io/secret-hash"
1721
AnnotationConfigMapHash = "observability.openshift.io/configmap-hash"
1822

internal/validations/observability/validate.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ var (
1414
clfValidators = []func(internalcontext.ForwarderContext){
1515
validateLogLevelAnnotation,
1616
validateMaxUnavailableAnnotation,
17+
validateRaiseFdLimitAnnotation,
1718
ValidatePermissions,
1819
inputs.Validate,
1920
outputs.Validate,

internal/validations/observability/validate_annotations.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ const (
1919
var (
2020
compiledMaxUnavailableRegex = regexp.MustCompile(validMaxUnavailableRegex)
2121
allowedLogLevels = sets.NewString("trace", "debug", "info", "warn", "error", "off")
22+
allowedRaiseFdLimitValues = sets.NewString("true", "false")
2223
enabledValues = sets.NewString("true", "enabled")
2324
)
2425

@@ -53,6 +54,18 @@ func validateLogLevelAnnotation(context internalcontext.ForwarderContext) {
5354
internalobs.RemoveConditionByType(&context.Forwarder.Status.Conditions, obs.ConditionTypeLogLevel)
5455
}
5556

57+
func validateRaiseFdLimitAnnotation(context internalcontext.ForwarderContext) {
58+
if value, ok := context.Forwarder.Annotations[constants.AnnotationVectorRaiseFdLimit]; ok {
59+
if !allowedRaiseFdLimitValues.Has(value) {
60+
condition := internalobs.NewCondition(obs.ConditionTypeRaiseFdLimit, obs.ConditionFalse, obs.ReasonRaiseFdLimitSupported, "")
61+
condition.Message = fmt.Sprintf("raise-fd-limit value %q must be one of [true, false]", value)
62+
internalobs.SetCondition(&context.Forwarder.Status.Conditions, condition)
63+
return
64+
}
65+
}
66+
internalobs.RemoveConditionByType(&context.Forwarder.Status.Conditions, obs.ConditionTypeRaiseFdLimit)
67+
}
68+
5669
func IsEnabledValue(val string) bool {
5770
return enabledValues.Has(strings.ToLower(val))
5871
}

internal/validations/observability/validate_annotations_test.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,27 @@ var _ = Describe("[internal][validations] validate clusterlogforwarder annotatio
4949
Entry("should pass with level off", "off"))
5050
})
5151

52+
Context("#validateRaiseFdLimit", func() {
53+
It("should pass validation if no annotations are set", func() {
54+
validateRaiseFdLimitAnnotation(context)
55+
Expect(clf.Status.Conditions).To(BeEmpty())
56+
})
57+
58+
It("should fail validation if value is not true or false", func() {
59+
clf.Annotations = map[string]string{constants.AnnotationVectorRaiseFdLimit: "yes"}
60+
validateRaiseFdLimitAnnotation(context)
61+
Expect(clf.Status.Conditions).To(HaveCondition(obs.ConditionTypeRaiseFdLimit, false, obs.ReasonRaiseFdLimitSupported, ".*must be one of.*"))
62+
})
63+
64+
DescribeTable("valid raise-fd-limit values", func(value string) {
65+
clf.Annotations = map[string]string{constants.AnnotationVectorRaiseFdLimit: value}
66+
validateRaiseFdLimitAnnotation(context)
67+
Expect(clf.Status.Conditions).To(BeEmpty())
68+
},
69+
Entry("should pass with value true", "true"),
70+
Entry("should pass with value false", "false"))
71+
})
72+
5273
Context("#validateMaxUnavailable", func() {
5374
It("should pass validation if no annotations are set", func() {
5475
validateMaxUnavailableAnnotation(context)
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
package misc
2+
3+
import (
4+
. "github.com/onsi/ginkgo/v2"
5+
. "github.com/onsi/gomega"
6+
obs "github.com/openshift/cluster-logging-operator/api/observability/v1"
7+
"github.com/openshift/cluster-logging-operator/test/framework/functional"
8+
testruntime "github.com/openshift/cluster-logging-operator/test/runtime/observability"
9+
)
10+
11+
var _ = Describe("[Functional][Misc][RaiseFdLimit] Vector raise-fd-limit", func() {
12+
13+
var framework *functional.CollectorFunctionalFramework
14+
15+
BeforeEach(func() {
16+
framework = functional.NewCollectorFunctionalFramework()
17+
testruntime.NewClusterLogForwarderBuilder(framework.Forwarder).
18+
FromInput(obs.InputTypeInfrastructure).
19+
ToHttpOutput()
20+
})
21+
22+
AfterEach(func() {
23+
framework.Cleanup()
24+
})
25+
26+
It("should start successfully with VECTOR_RAISE_FD_LIMIT env var", func() {
27+
Expect(framework.Deploy()).To(BeNil())
28+
29+
logs, err := framework.ReadCollectorLogs()
30+
Expect(err).To(BeNil())
31+
Expect(logs).To(ContainSubstring("Vector has started."))
32+
})
33+
})

0 commit comments

Comments
 (0)