Skip to content
This repository was archived by the owner on Aug 7, 2025. It is now read-only.

Commit 696861c

Browse files
askervin
authored and grahamwhaley committed
metrics: change collectd output to host /opt/collectd/run
Currently we loose collectd data from a node when scaling ends to a system failure on the node - yet this data can be very helpful in root causing the failure. This patch changes collectd configuration so that the output will be continuously written to host filesystem instead of the collectd container overlay that will be lost unless scaling reaches graceful exit. Signed-off-by: Antti Kervinen <antti.kervinen@intel.com>
1 parent 07fd841 commit 696861c

2 files changed

Lines changed: 16 additions & 8 deletions

File tree

metrics/collectd/collectd.bash

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,25 @@ collectd_pod="collectd"
1212
init_stats() {
1313
local wait_time=$1
1414

15-
# create collectd-config configmap
15+
# create collectd-config configmap, delete old if there is one
16+
kubectl get configmap collectd-config >/dev/null 2>&1 && kubectl delete configmap collectd-config
1617
kubectl create configmap collectd-config --from-file=${COLLECTD_DIR}/collectd.conf
1718

19+
# if there is collectd daemonset already running, delete it
20+
# to make sure that the latest configmap will be used.
21+
kubectl get daemonset collectd >/dev/null 2>&1 && kubectl delete daemonset --wait=true --timeout=${delete_wait_time}s "${collectd_pod}"
22+
1823
# Launch our stats gathering pod
1924
kubectl apply -f ${COLLECTD_DIR}/${collectd_pod}.yaml
2025
kubectl rollout status --timeout=${wait_time}s daemonset/${collectd_pod}
2126

27+
# clear existing collectd output
28+
while read -u 3 name node; do
29+
kubectl exec -ti $name -- sh -c "rm -rf /mnt/opt/collectd/run/localhost/*"
30+
done 3< <(kubectl get pods --selector name=collectd-pods -o json | jq -r '.items[] | "\(.metadata.name) \(.spec.nodeName)"')
31+
2232
# attempting to provide buffer for collectd to be installed and running,
23-
# and CPU collection to build adequate history
33+
# and CPU collection to build adequate history
2434
sleep 12
2535
}
2636

@@ -30,11 +40,9 @@ cleanup_stats() {
3040

3141
# get logs before shutting down stats daemonset
3242
while read -u 3 name node; do
33-
kubectl exec -ti $name -- sh -c "cd /opt/collectd; tar -czvf localhost.tar.gz localhost"
34-
# make a backup on the host in-case collection fail
35-
kubectl exec -ti $name -- sh -c "mkdir -p /mnt/opt/collectd"
36-
kubectl exec -ti $name -- sh -c "cp /opt/collectd/localhost.tar.gz /mnt/opt/collectd/localhost.tar.gz"
37-
kubectl cp $name:/opt/collectd/localhost.tar.gz ${RESULT_DIR}/${node}.tar.gz
43+
kubectl exec -ti $name -- sh -c "cd /mnt/opt/collectd/run; rm -f ../localhost.tar.gz; tar -czvf ../localhost.tar.gz localhost"
44+
kubectl cp $name:/mnt/opt/collectd/localhost.tar.gz ${RESULT_DIR}/${node}.tar.gz
45+
kubectl exec -ti $name -- sh -c "rm -rf /mnt/opt/collectd/run"
3846
done 3< <(kubectl get pods --selector name=collectd-pods -o json | jq -r '.items[] | "\(.metadata.name) \(.spec.nodeName)"')
3947

4048
kubectl delete daemonset --wait=true --timeout=${delete_wait_time}s "${collectd_pod}" || true

metrics/collectd/collectd.conf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ Hostname localhost
1717
ValuesPercentage true
1818
</Plugin>
1919
<Plugin "csv">
20-
DataDir "/opt/collectd"
20+
DataDir "/mnt/opt/collectd/run"
2121
StoreRates true
2222
</Plugin>
2323
<Plugin "interface">

0 commit comments

Comments
 (0)