@@ -83,19 +83,22 @@ FOLDER_RESPONSE=$(curl -s -X POST -H "Content-Type: application/json" \
8383 -d " {\" title\" : \" $FOLDER_NAME \" }" \
8484 " $GRAFANA_URL /api/folders" )
8585
86+ FOLDER_UID=$( echo " $FOLDER_RESPONSE " | jq -r ' .uid // empty' )
8687FOLDER_ID=$( echo " $FOLDER_RESPONSE " | jq -r ' .id // empty' )
87- if [[ -z " $FOLDER_ID " ]]; then
88+ if [[ -z " $FOLDER_UID " ]]; then
8889 # Try to get existing folder
89- EXISTING_FOLDER=$( curl -s -u " $GRAFANA_USER :$GRAFANA_PASSWORD " " $GRAFANA_URL /api/folders" | jq -r " .[] | select(.title == \" $FOLDER_NAME \" ) | .id // empty " )
90+ EXISTING_FOLDER=$( curl -s -u " $GRAFANA_USER :$GRAFANA_PASSWORD " " $GRAFANA_URL /api/folders" | jq -r " .[] | select(.title == \" $FOLDER_NAME \" )" )
9091 if [[ -n " $EXISTING_FOLDER " ]]; then
91- FOLDER_ID=" $EXISTING_FOLDER "
92- log " 📁 Using existing folder: $FOLDER_ID "
92+ FOLDER_UID=$( echo " $EXISTING_FOLDER " | jq -r ' .uid' )
93+ FOLDER_ID=$( echo " $EXISTING_FOLDER " | jq -r ' .id' )
94+ log " 📁 Using existing folder: $FOLDER_UID "
9395 else
96+ FOLDER_UID=" "
9497 FOLDER_ID=0
95- log " ⚠️ Using General folder (ID: 0) "
98+ log " ⚠️ Using General folder"
9699 fi
97100else
98- log " ✅ Folder created: $FOLDER_ID "
101+ log " ✅ Folder created: $FOLDER_UID "
99102fi
100103
101104log " 📊 Creating JVM dashboard..."
@@ -178,104 +181,101 @@ WEBHOOK_USER="grafana-alerts"
178181SECRET_VALUE=$( aws secretsmanager get-secret-value --secret-id " $SECRET_NAME " --query ' SecretString' --output text)
179182WEBHOOK_PASSWORD=$( echo " $SECRET_VALUE " | jq -r ' .password' )
180183
181- CONTACT_RESPONSE=$( curl -s -X POST -H " Content-Type: application/json" \
182- -u " $GRAFANA_USER :$GRAFANA_PASSWORD " \
183- -d " {
184- \" name\" : \" $CONTACT_POINT_NAME \" ,
185- \" type\" : \" webhook\" ,
186- \" settings\" : {
187- \" url\" : \" $LAMBDA_URL \" ,
188- \" httpMethod\" : \" POST\" ,
189- \" username\" : \" $WEBHOOK_USER \" ,
190- \" password\" : \" $WEBHOOK_PASSWORD \" ,
191- \" authorization_scheme\" : \" basic\"
192- },
193- \" disableResolveMessage\" : false
194- }" \
195- " $GRAFANA_URL /api/v1/provisioning/contact-points" )
184+ # Check if contact point already exists
185+ EXISTING_CONTACT=$( curl -s -u " $GRAFANA_USER :$GRAFANA_PASSWORD " " $GRAFANA_URL /api/v1/provisioning/contact-points" | jq -r " .[] | select(.name == \" $CONTACT_POINT_NAME \" ) | .name // empty" )
196186
197- log " 🚨 Creating alert rule with classic conditions..."
198- ALERT_RESPONSE=$( curl -s -X POST -H " Content-Type: application/json" \
199- -u " $GRAFANA_USER :$GRAFANA_PASSWORD " \
200- -d " {
201- \" title\" : \" $ALERT_TITLE \" ,
202- \" condition\" : \" B\" ,
203- \" data\" : [
204- {
205- \" refId\" : \" A\" ,
206- \" relativeTimeRange\" : {\" from\" : 600, \" to\" : 0},
207- \" datasourceUid\" : \" promds\" ,
208- \" model\" : {
209- \" expr\" : \" sum(jvm_threads_live_threads{job=~\\\" kubernetes-pods|ecs-unicorn-store-spring\\\" }) by (task_pod_id, cluster_type, cluster, container_name, namespace, container_ip)\" ,
210- \" instant\" : true,
211- \" refId\" : \" A\"
212- }
187+ if [[ -z " $EXISTING_CONTACT " ]]; then
188+ CONTACT_RESPONSE=$( curl -s -X POST -H " Content-Type: application/json" \
189+ -u " $GRAFANA_USER :$GRAFANA_PASSWORD " \
190+ -d " {
191+ \" name\" : \" $CONTACT_POINT_NAME \" ,
192+ \" type\" : \" webhook\" ,
193+ \" settings\" : {
194+ \" url\" : \" $LAMBDA_URL \" ,
195+ \" httpMethod\" : \" POST\" ,
196+ \" username\" : \" $WEBHOOK_USER \" ,
197+ \" password\" : \" $WEBHOOK_PASSWORD \" ,
198+ \" authorization_scheme\" : \" basic\"
213199 },
214- {
215- \" refId\" : \" B\" ,
216- \" relativeTimeRange\" : {\" from\" : 0, \" to\" : 0},
217- \" datasourceUid\" : \" -100\" ,
218- \" model\" : {
219- \" conditions\" : [
220- {
221- \" evaluator\" : {\" params\" : [$THREAD_THRESHOLD ], \" type\" : \" gt\" },
222- \" operator\" : {\" type\" : \" and\" },
223- \" query\" : {\" params\" : [\" A\" ]},
224- \" reducer\" : {\" params\" : [], \" type\" : \" last\" },
225- \" type\" : \" query\"
226- }
227- ],
228- \" refId\" : \" B\" ,
229- \" type\" : \" classic_conditions\"
230- }
200+ \" disableResolveMessage\" : false
201+ }" \
202+ " $GRAFANA_URL /api/v1/provisioning/contact-points" )
203+
204+ # Check if contact point creation was successful
205+ if echo " $CONTACT_RESPONSE " | jq -e ' .name' > /dev/null 2>&1 ; then
206+ log " ✅ Contact point created"
207+ else
208+ log " ❌ Contact point creation failed:"
209+ echo " $CONTACT_RESPONSE " | jq .
210+ fi
211+ else
212+ log " ✅ Contact point already exists"
213+ fi
214+
215+ log " 🚨 Creating alert rule with proper label preservation..."
216+ # Note: Using raw metrics without sum() and by() to preserve all original labels
217+ # This ensures both 'pod' (for EKS) and 'task_pod_id' (for ECS) labels are available
218+ # The Lambda function will process ALL metrics in the valueString, handling multiple
219+ # containers (EKS pods + ECS tasks) in a single alert when they exceed the threshold
220+ ALERT_PAYLOAD=" {
221+ \" title\" : \" $ALERT_TITLE \" ,
222+ \" condition\" : \" B\" ,
223+ \" data\" : [
224+ {
225+ \" refId\" : \" A\" ,
226+ \" relativeTimeRange\" : {\" from\" : 600, \" to\" : 0},
227+ \" datasourceUid\" : \" promds\" ,
228+ \" model\" : {
229+ \" expr\" : \" jvm_threads_live_threads{job=~\\\" kubernetes-pods|ecs-unicorn-store-spring\\\" }\" ,
230+ \" instant\" : true,
231+ \" refId\" : \" A\"
231232 }
232- ],
233- \" intervalSeconds\" : 60,
234- \" noDataState\" : \" NoData\" ,
235- \" execErrState\" : \" Alerting\" ,
236- \" for\" : \" 1m\" ,
237- \" annotations\" : {
238- \" summary\" : \" High JVM Threads\" ,
239- \" description\" : \" High number of JVM threads detected. Triggering Lambda thread dump.\" ,
240- \" webhookUrl\" : \" $LAMBDA_URL \"
241233 },
242- \" labels\" : {
243- \" severity\" : \" critical\" ,
244- \" alertname\" : \" High JVM Threads\" ,
245- \" cluster\" : \" {{ \$ labels.cluster }}\" ,
246- \" cluster_type\" : \" {{ \$ labels.cluster_type }}\" ,
247- \" container_name\" : \" {{ \$ labels.container_name }}\" ,
248- \" namespace\" : \" {{ \$ labels.namespace }}\" ,
249- \" task_pod_id\" : \" {{ \$ labels.task_pod_id }}\" ,
250- \" container_ip\" : \" {{ \$ labels.container_ip }}\"
234+ {
235+ \" refId\" : \" B\" ,
236+ \" relativeTimeRange\" : {\" from\" : 0, \" to\" : 0},
237+ \" datasourceUid\" : \" -100\" ,
238+ \" model\" : {
239+ \" conditions\" : [
240+ {
241+ \" evaluator\" : {\" params\" : [$THREAD_THRESHOLD ], \" type\" : \" gt\" },
242+ \" operator\" : {\" type\" : \" and\" },
243+ \" query\" : {\" params\" : [\" A\" ]},
244+ \" reducer\" : {\" params\" : [], \" type\" : \" last\" },
245+ \" type\" : \" query\"
246+ }
247+ ],
248+ \" refId\" : \" B\" ,
249+ \" type\" : \" classic_conditions\"
250+ }
251251 }
252- }" \
253- " $GRAFANA_URL /api/v1/provisioning/alert-rules" )
252+ ],
253+ \" intervalSeconds\" : 60,
254+ \" noDataState\" : \" NoData\" ,
255+ \" execErrState\" : \" Alerting\" ,
256+ \" for\" : \" 1m\" ,
257+ \" annotations\" : {
258+ \" summary\" : \" High JVM Threads\" ,
259+ \" description\" : \" High number of JVM threads detected. Triggering Lambda thread dump.\" ,
260+ \" webhookUrl\" : \" $LAMBDA_URL \"
261+ },
262+ \" labels\" : {
263+ \" severity\" : \" critical\" ,
264+ \" alertname\" : \" High JVM Threads\" ,
265+ \" cluster\" : \" {{ \$ labels.cluster }}\" ,
266+ \" cluster_type\" : \" {{ \$ labels.cluster_type }}\" ,
267+ \" container_name\" : \" {{ \$ labels.container_name }}\" ,
268+ \" namespace\" : \" {{ \$ labels.namespace }}\" ,
269+ \" task_pod_id\" : \" {{ \$ labels.task_pod_id }}\" ,
270+ \" container_ip\" : \" {{ \$ labels.container_ip }}\"
271+ }
272+ }"
254273
255- log " 🚨 Creating notification policy..."
256- POLICY_RESPONSE=$( curl -s -X PUT -H " Content-Type: application/json" \
257- -u " $GRAFANA_USER :$GRAFANA_PASSWORD " \
258- -d " {
259- \" receiver\" : \" $CONTACT_POINT_NAME \" ,
260- \" group_by\" : [\" alertname\" ],
261- \" routes\" : [
262- {
263- \" receiver\" : \" $CONTACT_POINT_NAME \" ,
264- \" group_by\" : [\" alertname\" , \" pod\" ],
265- \" matchers\" : [\" severity = critical\" ],
266- \" group_wait\" : \" 30s\" ,
267- \" group_interval\" : \" 5m\" ,
268- \" repeat_interval\" : \" 4h\"
269- }
270- ],
271- \" group_wait\" : \" 30s\" ,
272- \" group_interval\" : \" 5m\" ,
273- \" repeat_interval\" : \" 1h\"
274- }" \
275- " $GRAFANA_URL /api/v1/provisioning/policies" )
274+ # Add folderUID if we have one
275+ if [[ -n " $FOLDER_UID " ]]; then
276+ ALERT_PAYLOAD=$( echo " $ALERT_PAYLOAD " | jq " . + {\" folderUID\" : \" $FOLDER_UID \" }" )
277+ fi
276278
277- log " ✅ Alert rule created"
278- log " ✅ JVM monitoring setup complete"
279- log " 🌍 Grafana: $GRAFANA_URL "
280- log " 📊 Dashboard shows jvm_threads_live_threads from both EKS and ECS"
281- log " 🚨 Alert triggers Lambda thread dump when threads > $THREAD_THRESHOLD , stops when threads < $THREAD_THRESHOLD "
279+ ALERT_RESPONSE=$( curl -s -X POST -H " Content-Type: application/json" \
280+ -u " $GRAFANA_USER :$GRAFANA_PASSWORD " \
281+ -d " $ALERT_PAYLOAD " \
0 commit comments