Skip to content

Commit 2978570

Browse files
committed
fix(chart): address open PR review comments for Langfuse retention jobs
1 parent 7f36121 commit 2978570

4 files changed

Lines changed: 87 additions & 21 deletions

File tree

infrastructure/README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -201,13 +201,15 @@ langfuseRetention:
201201
schedule: "30 3 * * *"
202202
mutationSync: 0
203203
clickhouse:
204-
database: "default"
205-
onCluster: true
204+
database: "default" # set this to the same DB your Langfuse deployment uses
205+
onCluster: false # true only for clustered ClickHouse setups
206206
clusterName: "default"
207207
```
208208

209209
Notes:
210210
- ClickHouse connection/auth for retention jobs is taken from `langfuse.clickhouse.*` (same source as Langfuse itself).
211+
- Make sure `langfuseRetention.clickhouse.database` matches the ClickHouse database your Langfuse deployment actually uses; do not rely on the chart default.
212+
- Set `langfuseRetention.clickhouse.onCluster=true` only when your ClickHouse deployment is clustered and `clusterName` refers to a cluster defined in its configuration.
211213
- The CronJob applies idempotent `ALTER TABLE ... MODIFY TTL` statements on Langfuse tables (`traces`, `observations`, `scores`).
212214
- If `hardDelete.enabled=true`, an additional CronJob executes deterministic `ALTER TABLE ... DELETE WHERE ...` mutations.
213215
- Deletion is then handled by ClickHouse background merges (not instant at the exact cutoff timestamp).

infrastructure/rag/templates/langfuse-retention-cronjob.yaml

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,27 +20,37 @@ spec:
2020
app.kubernetes.io/name: rag
2121
app.kubernetes.io/instance: {{ .Release.Name }}
2222
spec:
23+
securityContext:
24+
runAsUser: {{ .Values.langfuseRetention.podSecurityContext.runAsUser }}
25+
runAsNonRoot: {{ .Values.langfuseRetention.podSecurityContext.runAsNonRoot }}
26+
{{- if .Values.shared.imagePullSecret }}
27+
imagePullSecrets:
28+
- name: {{ .Values.shared.imagePullSecret.name }}
29+
{{- end }}
2330
restartPolicy: OnFailure
2431
containers:
2532
- name: apply-clickhouse-ttl
2633
image: {{ $retentionImage | quote }}
2734
imagePullPolicy: {{ .Values.langfuseRetention.image.pullPolicy | quote }}
35+
securityContext:
36+
allowPrivilegeEscalation: {{ .Values.langfuseRetention.securityContext.allowPrivilegeEscalation }}
37+
{{- with .Values.langfuseRetention.resources }}
38+
resources:
39+
{{ toYaml . | nindent 16 }}
40+
{{- end }}
2841
command:
2942
- /bin/bash
3043
- -ec
3144
args:
3245
- |
3346
set -euo pipefail
3447
35-
PASSWORD="${CLICKHOUSE_PASSWORD:-}"
36-
if [ -z "${PASSWORD}" ]; then
37-
PASSWORD="${CLICKHOUSE_PASSWORD_LITERAL:-}"
38-
fi
39-
40-
if [ -z "${PASSWORD}" ]; then
48+
if [ -z "${CLICKHOUSE_PASSWORD:-}" ] && [ -z "${CLICKHOUSE_PASSWORD_LITERAL:-}" ]; then
4149
echo "No ClickHouse password found. Check langfuse.clickhouse.auth settings and secret."
4250
exit 1
4351
fi
52+
export CLICKHOUSE_PASSWORD="${CLICKHOUSE_PASSWORD:-${CLICKHOUSE_PASSWORD_LITERAL:-}}"
53+
unset CLICKHOUSE_PASSWORD_LITERAL
4454
4555
ON_CLUSTER_CLAUSE=""
4656
if [ "${CLICKHOUSE_ON_CLUSTER}" = "true" ]; then
@@ -54,16 +64,29 @@ spec:
5464
EOF_TABLES
5565
)"
5666
67+
IDENTIFIER_REGEX='^[A-Za-z_][A-Za-z0-9_]*$'
68+
5769
while IFS=$'\t' read -r table ts_col; do
5870
[ -z "${table}" ] && continue
5971
72+
if ! [[ "${table}" =~ ${IDENTIFIER_REGEX} ]]; then
73+
echo "Invalid table identifier: ${table}"
74+
exit 1
75+
fi
76+
if ! [[ "${ts_col}" =~ ${IDENTIFIER_REGEX} ]]; then
77+
echo "Invalid timestamp column identifier: ${ts_col}"
78+
exit 1
79+
fi
80+
6081
echo "Applying TTL=${RETENTION_DAYS}d to ${CLICKHOUSE_DATABASE}.${table} (${ts_col})"
61-
clickhouse-client \
82+
if ! clickhouse-client \
6283
--host "${CLICKHOUSE_HOST}" \
6384
--port "${CLICKHOUSE_PORT}" \
6485
--user "${CLICKHOUSE_USER}" \
65-
--password "${PASSWORD}" \
66-
--query "ALTER TABLE ${CLICKHOUSE_DATABASE}.${table}${ON_CLUSTER_CLAUSE} MODIFY TTL toDateTime(${ts_col}) + toIntervalDay(${RETENTION_DAYS})"
86+
--query "ALTER TABLE ${CLICKHOUSE_DATABASE}.${table}${ON_CLUSTER_CLAUSE} MODIFY TTL ${ts_col} + toIntervalDay(${RETENTION_DAYS})"; then
87+
echo "Failed applying TTL on ${CLICKHOUSE_DATABASE}.${table}"
88+
exit 1
89+
fi
6790
done <<< "${TABLE_ROWS}"
6891
env:
6992
{{ include "rag.langfuseRetentionClickhouseEnv" . | nindent 16 }}

infrastructure/rag/templates/langfuse-retention-hard-delete-cronjob.yaml

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,27 +20,37 @@ spec:
2020
app.kubernetes.io/name: rag
2121
app.kubernetes.io/instance: {{ .Release.Name }}
2222
spec:
23+
securityContext:
24+
runAsUser: {{ .Values.langfuseRetention.podSecurityContext.runAsUser }}
25+
runAsNonRoot: {{ .Values.langfuseRetention.podSecurityContext.runAsNonRoot }}
26+
{{- if .Values.shared.imagePullSecret }}
27+
imagePullSecrets:
28+
- name: {{ .Values.shared.imagePullSecret.name }}
29+
{{- end }}
2330
restartPolicy: OnFailure
2431
containers:
2532
- name: delete-expired-rows
2633
image: {{ $retentionImage | quote }}
2734
imagePullPolicy: {{ .Values.langfuseRetention.image.pullPolicy | quote }}
35+
securityContext:
36+
allowPrivilegeEscalation: {{ .Values.langfuseRetention.securityContext.allowPrivilegeEscalation }}
37+
{{- with .Values.langfuseRetention.resources }}
38+
resources:
39+
{{ toYaml . | nindent 16 }}
40+
{{- end }}
2841
command:
2942
- /bin/bash
3043
- -ec
3144
args:
3245
- |
3346
set -euo pipefail
3447
35-
PASSWORD="${CLICKHOUSE_PASSWORD:-}"
36-
if [ -z "${PASSWORD}" ]; then
37-
PASSWORD="${CLICKHOUSE_PASSWORD_LITERAL:-}"
38-
fi
39-
40-
if [ -z "${PASSWORD}" ]; then
48+
if [ -z "${CLICKHOUSE_PASSWORD:-}" ] && [ -z "${CLICKHOUSE_PASSWORD_LITERAL:-}" ]; then
4149
echo "No ClickHouse password found. Check langfuse.clickhouse.auth settings and secret."
4250
exit 1
4351
fi
52+
export CLICKHOUSE_PASSWORD="${CLICKHOUSE_PASSWORD:-${CLICKHOUSE_PASSWORD_LITERAL:-}}"
53+
unset CLICKHOUSE_PASSWORD_LITERAL
4454
4555
ON_CLUSTER_CLAUSE=""
4656
if [ "${CLICKHOUSE_ON_CLUSTER}" = "true" ]; then
@@ -55,17 +65,29 @@ spec:
5565
)"
5666
5767
CUTOFF_UNIX="$(( $(date -u +%s) - RETENTION_DAYS * 86400 ))"
68+
IDENTIFIER_REGEX='^[A-Za-z_][A-Za-z0-9_]*$'
5869
5970
while IFS=$'\t' read -r table ts_col; do
6071
[ -z "${table}" ] && continue
6172
73+
if ! [[ "${table}" =~ ${IDENTIFIER_REGEX} ]]; then
74+
echo "Invalid table identifier: ${table}"
75+
exit 1
76+
fi
77+
if ! [[ "${ts_col}" =~ ${IDENTIFIER_REGEX} ]]; then
78+
echo "Invalid timestamp column identifier: ${ts_col}"
79+
exit 1
80+
fi
81+
6282
echo "Deleting rows older than ${RETENTION_DAYS}d from ${CLICKHOUSE_DATABASE}.${table} (${ts_col})"
63-
clickhouse-client \
83+
if ! clickhouse-client \
6484
--host "${CLICKHOUSE_HOST}" \
6585
--port "${CLICKHOUSE_PORT}" \
6686
--user "${CLICKHOUSE_USER}" \
67-
--password "${PASSWORD}" \
68-
--query "ALTER TABLE ${CLICKHOUSE_DATABASE}.${table}${ON_CLUSTER_CLAUSE} DELETE WHERE toDateTime(${ts_col}) < toDateTime(${CUTOFF_UNIX}) SETTINGS mutations_sync = ${MUTATION_SYNC}"
87+
--query "ALTER TABLE ${CLICKHOUSE_DATABASE}.${table}${ON_CLUSTER_CLAUSE} DELETE WHERE ${ts_col} < toDateTime(${CUTOFF_UNIX}) SETTINGS mutations_sync = ${MUTATION_SYNC}"; then
88+
echo "Failed deleting expired rows from ${CLICKHOUSE_DATABASE}.${table}"
89+
exit 1
90+
fi
6991
done <<< "${TABLE_ROWS}"
7092
env:
7193
- name: MUTATION_SYNC

infrastructure/rag/values.yaml

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -740,6 +740,21 @@ langfuseRetention:
740740
enabled: false
741741
retentionDays: 365
742742
schedule: "15 */6 * * *"
743+
podSecurityContext:
744+
runAsUser: 1001
745+
runAsNonRoot: true
746+
securityContext:
747+
allowPrivilegeEscalation: false
748+
# Optional resources for both retention CronJobs.
749+
# Example:
750+
# resources:
751+
# requests:
752+
# cpu: 100m
753+
# memory: 128Mi
754+
# limits:
755+
# cpu: 500m
756+
# memory: 512Mi
757+
resources: {}
743758
# Optional deterministic deletion in addition to TTL.
744759
# Uses ALTER TABLE ... DELETE WHERE ... and can run nightly.
745760
hardDelete:
@@ -754,10 +769,14 @@ langfuseRetention:
754769
pullPolicy: IfNotPresent
755770
clickhouse:
756771
# Connection/auth are taken from langfuse.clickhouse.*.
772+
# Align this with the database Langfuse actually uses in ClickHouse.
757773
database: "default"
758-
onCluster: true
774+
# Set to true only for clustered ClickHouse deployments where clusterName refers to a cluster defined in the ClickHouse configuration.
775+
# Keep false for single-node/non-clustered deployments.
776+
onCluster: false
759777
clusterName: "default"
760778
tables:
779+
# timestampColumn should be a Date/DateTime/DateTime64 column in the target table.
761780
- name: "traces"
762781
timestampColumn: "timestamp"
763782
- name: "observations"

0 commit comments

Comments
 (0)