Skip to content

Commit 30475c1

Browse files
author
Yuriy Bezsonov
committed
WIP: refactoring - almost done
1 parent ad4d81c commit 30475c1

2 files changed

Lines changed: 183 additions & 165 deletions

File tree

infrastructure/scripts/setup/monitoring-jvm.sh

Lines changed: 100 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -83,19 +83,22 @@ FOLDER_RESPONSE=$(curl -s -X POST -H "Content-Type: application/json" \
8383
-d "{\"title\": \"$FOLDER_NAME\"}" \
8484
"$GRAFANA_URL/api/folders")
8585

86+
FOLDER_UID=$(echo "$FOLDER_RESPONSE" | jq -r '.uid // empty')
8687
FOLDER_ID=$(echo "$FOLDER_RESPONSE" | jq -r '.id // empty')
87-
if [[ -z "$FOLDER_ID" ]]; then
88+
if [[ -z "$FOLDER_UID" ]]; then
8889
# Try to get existing folder
89-
EXISTING_FOLDER=$(curl -s -u "$GRAFANA_USER:$GRAFANA_PASSWORD" "$GRAFANA_URL/api/folders" | jq -r ".[] | select(.title == \"$FOLDER_NAME\") | .id // empty")
90+
EXISTING_FOLDER=$(curl -s -u "$GRAFANA_USER:$GRAFANA_PASSWORD" "$GRAFANA_URL/api/folders" | jq -r ".[] | select(.title == \"$FOLDER_NAME\")")
9091
if [[ -n "$EXISTING_FOLDER" ]]; then
91-
FOLDER_ID="$EXISTING_FOLDER"
92-
log "📁 Using existing folder: $FOLDER_ID"
92+
FOLDER_UID=$(echo "$EXISTING_FOLDER" | jq -r '.uid')
93+
FOLDER_ID=$(echo "$EXISTING_FOLDER" | jq -r '.id')
94+
log "📁 Using existing folder: $FOLDER_UID"
9395
else
96+
FOLDER_UID=""
9497
FOLDER_ID=0
95-
log "⚠️ Using General folder (ID: 0)"
98+
log "⚠️ Using General folder"
9699
fi
97100
else
98-
log "✅ Folder created: $FOLDER_ID"
101+
log "✅ Folder created: $FOLDER_UID"
99102
fi
100103

101104
log "📊 Creating JVM dashboard..."
@@ -178,104 +181,101 @@ WEBHOOK_USER="grafana-alerts"
178181
SECRET_VALUE=$(aws secretsmanager get-secret-value --secret-id "$SECRET_NAME" --query 'SecretString' --output text)
179182
WEBHOOK_PASSWORD=$(echo "$SECRET_VALUE" | jq -r '.password')
180183

181-
CONTACT_RESPONSE=$(curl -s -X POST -H "Content-Type: application/json" \
182-
-u "$GRAFANA_USER:$GRAFANA_PASSWORD" \
183-
-d "{
184-
\"name\": \"$CONTACT_POINT_NAME\",
185-
\"type\": \"webhook\",
186-
\"settings\": {
187-
\"url\": \"$LAMBDA_URL\",
188-
\"httpMethod\": \"POST\",
189-
\"username\": \"$WEBHOOK_USER\",
190-
\"password\": \"$WEBHOOK_PASSWORD\",
191-
\"authorization_scheme\": \"basic\"
192-
},
193-
\"disableResolveMessage\": false
194-
}" \
195-
"$GRAFANA_URL/api/v1/provisioning/contact-points")
184+
# Check if contact point already exists
185+
EXISTING_CONTACT=$(curl -s -u "$GRAFANA_USER:$GRAFANA_PASSWORD" "$GRAFANA_URL/api/v1/provisioning/contact-points" | jq -r ".[] | select(.name == \"$CONTACT_POINT_NAME\") | .name // empty")
196186

197-
log "🚨 Creating alert rule with classic conditions..."
198-
ALERT_RESPONSE=$(curl -s -X POST -H "Content-Type: application/json" \
199-
-u "$GRAFANA_USER:$GRAFANA_PASSWORD" \
200-
-d "{
201-
\"title\": \"$ALERT_TITLE\",
202-
\"condition\": \"B\",
203-
\"data\": [
204-
{
205-
\"refId\": \"A\",
206-
\"relativeTimeRange\": {\"from\": 600, \"to\": 0},
207-
\"datasourceUid\": \"promds\",
208-
\"model\": {
209-
\"expr\": \"sum(jvm_threads_live_threads{job=~\\\"kubernetes-pods|ecs-unicorn-store-spring\\\"}) by (task_pod_id, cluster_type, cluster, container_name, namespace, container_ip)\",
210-
\"instant\": true,
211-
\"refId\": \"A\"
212-
}
187+
if [[ -z "$EXISTING_CONTACT" ]]; then
188+
CONTACT_RESPONSE=$(curl -s -X POST -H "Content-Type: application/json" \
189+
-u "$GRAFANA_USER:$GRAFANA_PASSWORD" \
190+
-d "{
191+
\"name\": \"$CONTACT_POINT_NAME\",
192+
\"type\": \"webhook\",
193+
\"settings\": {
194+
\"url\": \"$LAMBDA_URL\",
195+
\"httpMethod\": \"POST\",
196+
\"username\": \"$WEBHOOK_USER\",
197+
\"password\": \"$WEBHOOK_PASSWORD\",
198+
\"authorization_scheme\": \"basic\"
213199
},
214-
{
215-
\"refId\": \"B\",
216-
\"relativeTimeRange\": {\"from\": 0, \"to\": 0},
217-
\"datasourceUid\": \"-100\",
218-
\"model\": {
219-
\"conditions\": [
220-
{
221-
\"evaluator\": {\"params\": [$THREAD_THRESHOLD], \"type\": \"gt\"},
222-
\"operator\": {\"type\": \"and\"},
223-
\"query\": {\"params\": [\"A\"]},
224-
\"reducer\": {\"params\": [], \"type\": \"last\"},
225-
\"type\": \"query\"
226-
}
227-
],
228-
\"refId\": \"B\",
229-
\"type\": \"classic_conditions\"
230-
}
200+
\"disableResolveMessage\": false
201+
}" \
202+
"$GRAFANA_URL/api/v1/provisioning/contact-points")
203+
204+
# Check if contact point creation was successful
205+
if echo "$CONTACT_RESPONSE" | jq -e '.name' > /dev/null 2>&1; then
206+
log "✅ Contact point created"
207+
else
208+
log "❌ Contact point creation failed:"
209+
echo "$CONTACT_RESPONSE" | jq .
210+
fi
211+
else
212+
log "✅ Contact point already exists"
213+
fi
214+
215+
log "🚨 Creating alert rule with proper label preservation..."
216+
# Note: Using raw metrics without sum() and by() to preserve all original labels
217+
# This ensures both 'pod' (for EKS) and 'task_pod_id' (for ECS) labels are available
218+
# The Lambda function will process ALL metrics in the valueString, handling multiple
219+
# containers (EKS pods + ECS tasks) in a single alert when they exceed the threshold
220+
ALERT_PAYLOAD="{
221+
\"title\": \"$ALERT_TITLE\",
222+
\"condition\": \"B\",
223+
\"data\": [
224+
{
225+
\"refId\": \"A\",
226+
\"relativeTimeRange\": {\"from\": 600, \"to\": 0},
227+
\"datasourceUid\": \"promds\",
228+
\"model\": {
229+
\"expr\": \"jvm_threads_live_threads{job=~\\\"kubernetes-pods|ecs-unicorn-store-spring\\\"}\",
230+
\"instant\": true,
231+
\"refId\": \"A\"
231232
}
232-
],
233-
\"intervalSeconds\": 60,
234-
\"noDataState\": \"NoData\",
235-
\"execErrState\": \"Alerting\",
236-
\"for\": \"1m\",
237-
\"annotations\": {
238-
\"summary\": \"High JVM Threads\",
239-
\"description\": \"High number of JVM threads detected. Triggering Lambda thread dump.\",
240-
\"webhookUrl\": \"$LAMBDA_URL\"
241233
},
242-
\"labels\": {
243-
\"severity\": \"critical\",
244-
\"alertname\": \"High JVM Threads\",
245-
\"cluster\": \"{{ \$labels.cluster }}\",
246-
\"cluster_type\": \"{{ \$labels.cluster_type }}\",
247-
\"container_name\": \"{{ \$labels.container_name }}\",
248-
\"namespace\": \"{{ \$labels.namespace }}\",
249-
\"task_pod_id\": \"{{ \$labels.task_pod_id }}\",
250-
\"container_ip\": \"{{ \$labels.container_ip }}\"
234+
{
235+
\"refId\": \"B\",
236+
\"relativeTimeRange\": {\"from\": 0, \"to\": 0},
237+
\"datasourceUid\": \"-100\",
238+
\"model\": {
239+
\"conditions\": [
240+
{
241+
\"evaluator\": {\"params\": [$THREAD_THRESHOLD], \"type\": \"gt\"},
242+
\"operator\": {\"type\": \"and\"},
243+
\"query\": {\"params\": [\"A\"]},
244+
\"reducer\": {\"params\": [], \"type\": \"last\"},
245+
\"type\": \"query\"
246+
}
247+
],
248+
\"refId\": \"B\",
249+
\"type\": \"classic_conditions\"
250+
}
251251
}
252-
}" \
253-
"$GRAFANA_URL/api/v1/provisioning/alert-rules")
252+
],
253+
\"intervalSeconds\": 60,
254+
\"noDataState\": \"NoData\",
255+
\"execErrState\": \"Alerting\",
256+
\"for\": \"1m\",
257+
\"annotations\": {
258+
\"summary\": \"High JVM Threads\",
259+
\"description\": \"High number of JVM threads detected. Triggering Lambda thread dump.\",
260+
\"webhookUrl\": \"$LAMBDA_URL\"
261+
},
262+
\"labels\": {
263+
\"severity\": \"critical\",
264+
\"alertname\": \"High JVM Threads\",
265+
\"cluster\": \"{{ \$labels.cluster }}\",
266+
\"cluster_type\": \"{{ \$labels.cluster_type }}\",
267+
\"container_name\": \"{{ \$labels.container_name }}\",
268+
\"namespace\": \"{{ \$labels.namespace }}\",
269+
\"task_pod_id\": \"{{ \$labels.task_pod_id }}\",
270+
\"container_ip\": \"{{ \$labels.container_ip }}\"
271+
}
272+
}"
254273

255-
log "🚨 Creating notification policy..."
256-
POLICY_RESPONSE=$(curl -s -X PUT -H "Content-Type: application/json" \
257-
-u "$GRAFANA_USER:$GRAFANA_PASSWORD" \
258-
-d "{
259-
\"receiver\": \"$CONTACT_POINT_NAME\",
260-
\"group_by\": [\"alertname\"],
261-
\"routes\": [
262-
{
263-
\"receiver\": \"$CONTACT_POINT_NAME\",
264-
\"group_by\": [\"alertname\", \"pod\"],
265-
\"matchers\": [\"severity = critical\"],
266-
\"group_wait\": \"30s\",
267-
\"group_interval\": \"5m\",
268-
\"repeat_interval\": \"4h\"
269-
}
270-
],
271-
\"group_wait\": \"30s\",
272-
\"group_interval\": \"5m\",
273-
\"repeat_interval\": \"1h\"
274-
}" \
275-
"$GRAFANA_URL/api/v1/provisioning/policies")
274+
# Add folderUID if we have one
275+
if [[ -n "$FOLDER_UID" ]]; then
276+
ALERT_PAYLOAD=$(echo "$ALERT_PAYLOAD" | jq ". + {\"folderUID\": \"$FOLDER_UID\"}")
277+
fi
276278

277-
log "✅ Alert rule created"
278-
log "✅ JVM monitoring setup complete"
279-
log "🌍 Grafana: $GRAFANA_URL"
280-
log "📊 Dashboard shows jvm_threads_live_threads from both EKS and ECS"
281-
log "🚨 Alert triggers Lambda thread dump when threads > $THREAD_THRESHOLD, stops when threads < $THREAD_THRESHOLD"
279+
ALERT_RESPONSE=$(curl -s -X POST -H "Content-Type: application/json" \
280+
-u "$GRAFANA_USER:$GRAFANA_PASSWORD" \
281+
-d "$ALERT_PAYLOAD" \

infrastructure/scripts/setup/thread-dump-lambda/src/index.py

Lines changed: 83 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -79,45 +79,57 @@ def verify_basic_auth(event: Dict[str, Any]) -> bool:
7979
logger.error(f"Authentication error: {str(e)}")
8080
return False
8181

82-
def extract_pod_info_from_valuestring(value_string: str) -> Dict[str, str]:
83-
"""Extract pod information from Grafana alert valueString"""
82+
def extract_all_metrics_from_valuestring(value_string: str) -> list:
83+
"""Extract all metric information from Grafana alert valueString"""
8484
try:
85-
logger.info(f"Extracting pod info from valueString: {value_string}")
86-
87-
# First, extract the labels section
88-
labels_match = re.search(r'labels=\{([^}]+)\}', value_string)
89-
if not labels_match:
90-
logger.warning("Could not find labels section in valueString")
91-
return {}
92-
93-
labels_str = labels_match.group(1)
94-
logger.info(f"Extracted labels section: {labels_str}")
95-
96-
# Now extract individual labels from the labels section
97-
labels = {}
98-
for label_pair in labels_str.split(', '):
99-
if '=' in label_pair:
100-
key, value = label_pair.split('=', 1)
101-
labels[key.strip()] = value.strip()
102-
logger.info(f"Parsed label: {key.strip()}={value.strip()}")
103-
104-
logger.info(f"All parsed labels: {labels}")
105-
106-
# Map the labels to our expected keys
107-
result = {
108-
'cluster': labels.get('cluster'),
109-
'cluster_type': labels.get('cluster_type'),
110-
'container_name': labels.get('container_name'),
111-
'task_pod_id': labels.get('task_pod_id') or labels.get('pod'), # Use pod if task_pod_id not found
112-
'namespace': labels.get('namespace'),
113-
'container_ip': labels.get('container_ip') or labels.get('exported_instance') # Add container_ip extraction
114-
}
115-
116-
logger.info(f"Final extracted values: {result}")
117-
return result
85+
logger.info(f"Extracting all metrics from valueString: {value_string}")
86+
87+
# Find all metric entries in the valueString
88+
# Pattern: [ var='...' metric='...' labels={...} value=... ]
89+
import re
90+
metric_pattern = r'\[\s*var=\'[^\']+\'\s*metric=\'[^\']+\'\s*labels=\{([^}]+)\}\s*value=(\d+)\s*\]'
91+
matches = re.findall(metric_pattern, value_string)
92+
93+
if not matches:
94+
logger.warning("Could not find any metric entries in valueString")
95+
return []
96+
97+
results = []
98+
for i, (labels_str, value) in enumerate(matches):
99+
logger.info(f"Processing metric {i+1}: labels={{{labels_str}}} value={value}")
100+
101+
# Parse individual labels from the labels section
102+
labels = {}
103+
for label_pair in labels_str.split(', '):
104+
if '=' in label_pair:
105+
key, label_value = label_pair.split('=', 1)
106+
labels[key.strip()] = label_value.strip()
107+
108+
logger.info(f"All parsed labels for metric {i+1}: {labels}")
109+
110+
# Map the labels to our expected keys
111+
task_pod_id = labels.get('task_pod_id')
112+
# Use pod if task_pod_id is missing, empty, or "unknown"
113+
if not task_pod_id or task_pod_id == 'unknown':
114+
task_pod_id = labels.get('pod')
115+
116+
result = {
117+
'cluster': labels.get('cluster'),
118+
'cluster_type': labels.get('cluster_type'),
119+
'container_name': labels.get('container_name'),
120+
'task_pod_id': task_pod_id,
121+
'namespace': labels.get('namespace'),
122+
'container_ip': labels.get('container_ip') or labels.get('exported_instance'),
123+
'thread_count': int(value)
124+
}
125+
126+
logger.info(f"Final extracted values for metric {i+1}: {result}")
127+
results.append(result)
128+
129+
return results
118130
except Exception as e:
119-
logger.error(f"Error extracting pod info from valueString: {str(e)}")
120-
return {}
131+
logger.error(f"Error extracting metrics from valueString: {str(e)}")
132+
return []
121133

122134
def analyze_thread_dump(thread_dump: str) -> str:
123135
region_name = os.environ.get('AWS_REGION', 'us-east-1')
@@ -298,35 +310,41 @@ def lambda_handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
298310
if any(is_invalid(val) for val in [cluster_type, cluster_name, task_pod_id, container_name]):
299311
logger.info("Some labels are missing or invalid, trying to extract from valueString")
300312

301-
# Try to extract from valueString
313+
# Try to extract all metrics from valueString
302314
if 'valueString' in alert:
303-
extracted_info = extract_pod_info_from_valuestring(alert['valueString'])
304-
logger.info(f"Raw extracted info: {extracted_info}")
305-
306-
# Update missing values
307-
if is_invalid(cluster_type) and extracted_info.get('cluster_type'):
308-
cluster_type = extracted_info['cluster_type']
309-
310-
if is_invalid(cluster_name) and extracted_info.get('cluster'):
311-
cluster_name = extracted_info['cluster']
312-
313-
if is_invalid(task_pod_id) and extracted_info.get('task_pod_id'):
314-
task_pod_id = extracted_info['task_pod_id']
315-
316-
if is_invalid(container_name) and extracted_info.get('container_name'):
317-
container_name = extracted_info['container_name']
318-
319-
if is_invalid(container_ip) and extracted_info.get('container_ip'):
320-
container_ip = extracted_info['container_ip']
321-
logger.info(f"Using container_ip from valueString: {container_ip}")
322-
323-
# IMPORTANT: Always override namespace with extracted value if available
324-
if extracted_info.get('namespace'):
325-
namespace = extracted_info['namespace']
326-
logger.info(f"Overriding namespace with extracted value: {namespace}")
327-
328-
logger.info(f"Final extracted values: cluster_type={cluster_type}, cluster={cluster_name}, task_pod_id={task_pod_id}, container_name={container_name}, namespace={namespace}")
329-
315+
extracted_metrics = extract_all_metrics_from_valuestring(alert['valueString'])
316+
logger.info(f"Extracted {len(extracted_metrics)} metrics from valueString")
317+
318+
# Process each metric separately
319+
for i, metric_info in enumerate(extracted_metrics):
320+
logger.info(f"Processing metric {i+1}: {metric_info}")
321+
322+
# Use extracted values for this metric
323+
metric_cluster_type = metric_info.get('cluster_type')
324+
metric_cluster_name = metric_info.get('cluster')
325+
metric_task_pod_id = metric_info.get('task_pod_id')
326+
metric_container_name = metric_info.get('container_name')
327+
metric_namespace = metric_info.get('namespace', 'default')
328+
metric_container_ip = metric_info.get('container_ip')
329+
330+
# Validate this metric's inputs
331+
if any(is_invalid(val) for val in [metric_cluster_type, metric_cluster_name, metric_task_pod_id, metric_container_name]):
332+
logger.warning(f"Skipping metric {i+1} due to missing values: cluster_type={metric_cluster_type}, cluster={metric_cluster_name}, task_pod_id={metric_task_pod_id}, container_name={metric_container_name}")
333+
continue
334+
335+
# Process this metric
336+
try:
337+
result = process_alert(metric_cluster_type, metric_cluster_name, metric_task_pod_id, metric_container_name, metric_namespace, s3_bucket, metric_container_ip)
338+
results.append(result)
339+
logger.info(f"Successfully processed metric {i+1}")
340+
except Exception as e:
341+
logger.error(f"Error processing metric {i+1}: {str(e)}")
342+
continue
343+
344+
# Skip the original single-metric processing since we handled all metrics
345+
continue
346+
347+
# Original single-metric processing (fallback)
330348
# Validate inputs after extraction attempt
331349
if any(is_invalid(val) for val in [cluster_type, cluster_name, task_pod_id, container_name]):
332350
logger.warning(f"Still missing or invalid alert labels after extraction: cluster_type={cluster_type}, cluster={cluster_name}, task_pod_id={task_pod_id}, container_name={container_name}")

0 commit comments

Comments
 (0)