Skip to content

Commit 71ee50c

Browse files
author
Yuriy Bezsonov
committed
refactor(perf-platform): rename Grafana role and consolidate setup ownership
Rename the Grafana pod role from grafana-cloudwatch-pod-role to grafana-eks-pod-role to match the existing <component>-eks-pod-role convention used by ai-jvm-analyzer, perf-analyzer, perf-collector, and pyroscope. Add grafana* and pyroscope* to the workshop IAM policy's PassRole resource list so participants can attach the new roles. Reorganize setup-script ownership so the three analysis modules compose cleanly. monitoring.sh now owns the shared Workshop Dashboards Grafana folder; analysis.sh and perf-platform.sh look it up by title and fail loudly if it is missing. Each downstream script upserts its own notification-policy routes keyed on receiver name, so order between analysis.sh and perf-platform.sh no longer clobbers either side. perf-platform.sh drops the SSM mirror for the internal NLB DNS — consumers look it up at point of need with kubectl get svc. The result is monitoring.sh + analysis.sh = modules 1+2, monitoring.sh + perf-platform.sh = module 3 standalone. Regenerate the CFN template from the updated CDK.
1 parent 5e28304 commit 71ee50c

6 files changed

Lines changed: 217 additions & 111 deletions

File tree

infra/cdk/src/main/java/sample/com/constructs/PerfPlatform.java

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* - perf-analyzer-eks-pod-role (perf-analyzer Spring Boot service)
1313
* - perf-collector-eks-pod-role (perf-collector DaemonSet)
1414
* - pyroscope-eks-pod-role (Pyroscope server, for S3-backed storage)
15-
* - grafana-cloudwatch-pod-role (Grafana, to read ALB metrics from CloudWatch)
15+
* - grafana-eks-pod-role (Grafana, to read ALB metrics from CloudWatch)
1616
*
1717
* On Amazon ECS Fargate the collector sidecar runs inside the target task and
1818
* reuses that task's existing role — we add S3-write for profiling artifacts to
@@ -29,7 +29,7 @@ public class PerfPlatform extends Construct {
2929
private final Role perfAnalyzerEksPodRole;
3030
private final Role perfCollectorEksPodRole;
3131
private final Role pyroscopeEksPodRole;
32-
private final Role grafanaCloudwatchPodRole;
32+
private final Role grafanaEksPodRole;
3333

3434
public static class PerfPlatformProps {
3535
private Bucket workshopBucket;
@@ -59,7 +59,7 @@ public PerfPlatform(final Construct scope, final String id, final PerfPlatformPr
5959
this.perfAnalyzerEksPodRole = createAnalyzerEksPodRole(props);
6060
this.perfCollectorEksPodRole = createCollectorEksPodRole(props);
6161
this.pyroscopeEksPodRole = createPyroscopeEksPodRole(props);
62-
this.grafanaCloudwatchPodRole = createGrafanaCloudwatchPodRole();
62+
this.grafanaEksPodRole = createGrafanaEksPodRole();
6363
grantProfilingWriteToUnicornEcsTaskRole(props);
6464
}
6565

@@ -142,11 +142,11 @@ private Role createPyroscopeEksPodRole(PerfPlatformProps props) {
142142
* and HTTPCode_Target_5XX_Count for whichever ALB(s) participants deploy
143143
* during the workshop.
144144
*/
145-
private Role createGrafanaCloudwatchPodRole() {
145+
private Role createGrafanaEksPodRole() {
146146
ServicePrincipal podsPrincipal = ServicePrincipal.Builder.create("pods.eks.amazonaws.com").build();
147147

148-
Role role = Role.Builder.create(this, "GrafanaCloudwatchPodRole")
149-
.roleName("grafana-cloudwatch-pod-role")
148+
Role role = Role.Builder.create(this, "GrafanaEksPodRole")
149+
.roleName("grafana-eks-pod-role")
150150
.assumedBy(podsPrincipal)
151151
.description("Role for Grafana to read CloudWatch metrics for the perf-platform Latency Metrics dashboard and ServiceLatency alert")
152152
.build();
@@ -311,7 +311,7 @@ public Role getPyroscopeEksPodRole() {
311311
return pyroscopeEksPodRole;
312312
}
313313

314-
public Role getGrafanaCloudwatchPodRole() {
315-
return grafanaCloudwatchPodRole;
314+
public Role getGrafanaEksPodRole() {
315+
return grafanaEksPodRole;
316316
}
317317
}

infra/cdk/src/main/resources/iam-policy.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,8 @@
7676
"arn:aws:iam::{{.AccountId}}:role/ai-jvm-analyzer*",
7777
"arn:aws:iam::{{.AccountId}}:role/perf-analyzer*",
7878
"arn:aws:iam::{{.AccountId}}:role/perf-collector*",
79+
"arn:aws:iam::{{.AccountId}}:role/pyroscope*",
80+
"arn:aws:iam::{{.AccountId}}:role/grafana*",
7981
"arn:aws:iam::{{.AccountId}}:role/workshop*",
8082
"arn:aws:iam::{{.AccountId}}:role/aiagent*",
8183
"arn:aws:iam::{{.AccountId}}:role/mcpserver*"

infra/cfn/java-on-aws-stack.yaml

Lines changed: 87 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -422,7 +422,7 @@ Resources:
422422
Fn::GetAtt:
423423
- CodeBuildRoleE9A44575
424424
- Arn
425-
ContentHash: "1778416824589"
425+
ContentHash: "1778821434343"
426426
ProjectName:
427427
Ref: CodeBuildProjectA0FF5539
428428
ServiceToken:
@@ -2063,9 +2063,11 @@ Resources:
20632063
Resource:
20642064
- !Sub arn:aws:iam::${AWS::AccountId}:role/ai-jvm-analyzer*
20652065
- !Sub arn:aws:iam::${AWS::AccountId}:role/aiagent*
2066+
- !Sub arn:aws:iam::${AWS::AccountId}:role/grafana*
20662067
- !Sub arn:aws:iam::${AWS::AccountId}:role/mcpserver*
20672068
- !Sub arn:aws:iam::${AWS::AccountId}:role/perf-analyzer*
20682069
- !Sub arn:aws:iam::${AWS::AccountId}:role/perf-collector*
2070+
- !Sub arn:aws:iam::${AWS::AccountId}:role/pyroscope*
20692071
- !Sub arn:aws:iam::${AWS::AccountId}:role/service-role/unicorn*
20702072
- !Sub arn:aws:iam::${AWS::AccountId}:role/unicorn*
20712073
- !Sub arn:aws:iam::${AWS::AccountId}:role/workshop*
@@ -2252,19 +2254,21 @@ Resources:
22522254
Roles:
22532255
- Ref: PerfPlatformAnalyzerEksPodRoleA97B019B
22542256
Type: AWS::IAM::Policy
2255-
PerfPlatformCollectorEcsTaskRoleA67C7D99:
2257+
PerfPlatformCollectorEksPodRole3090D9EA:
22562258
Properties:
22572259
AssumeRolePolicyDocument:
22582260
Statement:
2259-
- Action: sts:AssumeRole
2261+
- Action:
2262+
- sts:AssumeRole
2263+
- sts:TagSession
22602264
Effect: Allow
22612265
Principal:
2262-
Service: ecs-tasks.amazonaws.com
2266+
Service: pods.eks.amazonaws.com
22632267
Version: "2012-10-17"
2264-
Description: Role for perf-collector ECS Fargate sidecar to upload profiling artifacts to S3
2265-
RoleName: perf-collector-ecs-task-role
2268+
Description: Role for perf-collector EKS DaemonSet pod to upload profiling artifacts to S3
2269+
RoleName: perf-collector-eks-pod-role
22662270
Type: AWS::IAM::Role
2267-
PerfPlatformCollectorEcsTaskRoleDefaultPolicy698DCB5D:
2271+
PerfPlatformCollectorEksPodRoleDefaultPolicy102C5ADB:
22682272
Properties:
22692273
PolicyDocument:
22702274
Statement:
@@ -2280,11 +2284,11 @@ Resources:
22802284
- Arn
22812285
- /perf-platform/profiling/*
22822286
Version: "2012-10-17"
2283-
PolicyName: PerfPlatformCollectorEcsTaskRoleDefaultPolicy698DCB5D
2287+
PolicyName: PerfPlatformCollectorEksPodRoleDefaultPolicy102C5ADB
22842288
Roles:
2285-
- Ref: PerfPlatformCollectorEcsTaskRoleA67C7D99
2289+
- Ref: PerfPlatformCollectorEksPodRole3090D9EA
22862290
Type: AWS::IAM::Policy
2287-
PerfPlatformCollectorEksPodRole3090D9EA:
2291+
PerfPlatformGrafanaEksPodRole8BAC861C:
22882292
Properties:
22892293
AssumeRolePolicyDocument:
22902294
Statement:
@@ -2295,15 +2299,66 @@ Resources:
22952299
Principal:
22962300
Service: pods.eks.amazonaws.com
22972301
Version: "2012-10-17"
2298-
Description: Role for perf-collector EKS DaemonSet pod to upload profiling artifacts to S3
2299-
RoleName: perf-collector-eks-pod-role
2302+
Description: Role for Grafana to read CloudWatch metrics for the perf-platform Latency Metrics dashboard and ServiceLatency alert
2303+
RoleName: grafana-eks-pod-role
23002304
Type: AWS::IAM::Role
2301-
PerfPlatformCollectorEksPodRoleDefaultPolicy102C5ADB:
2305+
PerfPlatformGrafanaEksPodRoleDefaultPolicyBFFD5487:
23022306
Properties:
23032307
PolicyDocument:
23042308
Statement:
23052309
- Action:
2306-
- s3:HeadObject
2310+
- cloudwatch:DescribeAlarmHistory
2311+
- cloudwatch:DescribeAlarms
2312+
- cloudwatch:DescribeAlarmsForMetric
2313+
- cloudwatch:GetMetricData
2314+
- cloudwatch:GetMetricStatistics
2315+
- cloudwatch:ListMetrics
2316+
- ec2:DescribeRegions
2317+
- ec2:DescribeTags
2318+
- tag:GetResources
2319+
Effect: Allow
2320+
Resource: "*"
2321+
Version: "2012-10-17"
2322+
PolicyName: PerfPlatformGrafanaEksPodRoleDefaultPolicyBFFD5487
2323+
Roles:
2324+
- Ref: PerfPlatformGrafanaEksPodRole8BAC861C
2325+
Type: AWS::IAM::Policy
2326+
PerfPlatformPyroscopeEksPodRole01200CAC:
2327+
Properties:
2328+
AssumeRolePolicyDocument:
2329+
Statement:
2330+
- Action:
2331+
- sts:AssumeRole
2332+
- sts:TagSession
2333+
Effect: Allow
2334+
Principal:
2335+
Service: pods.eks.amazonaws.com
2336+
Version: "2012-10-17"
2337+
Description: Role for Pyroscope server pod to read/write blocks in S3 under pyroscope/*
2338+
RoleName: pyroscope-eks-pod-role
2339+
Type: AWS::IAM::Role
2340+
PerfPlatformPyroscopeEksPodRoleDefaultPolicy133E4C48:
2341+
Properties:
2342+
PolicyDocument:
2343+
Statement:
2344+
- Action:
2345+
- s3:GetBucketLocation
2346+
- s3:ListBucket
2347+
Condition:
2348+
StringLike:
2349+
s3:prefix:
2350+
- pyroscope/*
2351+
- pyroscope
2352+
Effect: Allow
2353+
Resource:
2354+
Fn::GetAtt:
2355+
- WorkshopBucketFD5BC43F
2356+
- Arn
2357+
- Action:
2358+
- s3:AbortMultipartUpload
2359+
- s3:DeleteObject
2360+
- s3:GetObject
2361+
- s3:ListMultipartUploadParts
23072362
- s3:PutObject
23082363
Effect: Allow
23092364
Resource:
@@ -2312,11 +2367,11 @@ Resources:
23122367
- - Fn::GetAtt:
23132368
- WorkshopBucketFD5BC43F
23142369
- Arn
2315-
- /perf-platform/profiling/*
2370+
- /pyroscope/*
23162371
Version: "2012-10-17"
2317-
PolicyName: PerfPlatformCollectorEksPodRoleDefaultPolicy102C5ADB
2372+
PolicyName: PerfPlatformPyroscopeEksPodRoleDefaultPolicy133E4C48
23182373
Roles:
2319-
- Ref: PerfPlatformCollectorEksPodRole3090D9EA
2374+
- Ref: PerfPlatformPyroscopeEksPodRole01200CAC
23202375
Type: AWS::IAM::Policy
23212376
ThreadAnalysisLambda3EE9B29D:
23222377
DependsOn:
@@ -2972,6 +3027,20 @@ Resources:
29723027
Fn::GetAtt:
29733028
- UnicornUnicornEventBusB728845C
29743029
- Arn
3030+
- Action:
3031+
- s3:HeadObject
3032+
- s3:PutObject
3033+
Effect: Allow
3034+
Resource:
3035+
Fn::Join:
3036+
- ""
3037+
- - Fn::GetAtt:
3038+
- WorkshopBucketFD5BC43F
3039+
- Arn
3040+
- /perf-platform/profiling/*
3041+
- Action: ecs:DescribeTasks
3042+
Effect: Allow
3043+
Resource: "*"
29753044
Version: "2012-10-17"
29763045
PolicyName: UnicornUnicornStoreEcsTaskRoleDefaultPolicy477138EA
29773046
Roles:
@@ -3312,7 +3381,7 @@ Resources:
33123381
- Ref: AWS::AccountId
33133382
- "-"
33143383
- Ref: AWS::Region
3315-
- "-20260510144024"
3384+
- "-20260515070354"
33163385
PublicAccessBlockConfiguration:
33173386
BlockPublicAcls: true
33183387
BlockPublicPolicy: true

infra/scripts/setup/analysis.sh

Lines changed: 60 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -63,29 +63,19 @@ for i in {1..20}; do
6363
sleep 5
6464
done
6565

66-
# Create shared folder
67-
log_info "Creating shared folder '$FOLDER_NAME'..."
68-
FOLDER_RESPONSE=$(curl -s -X POST -H "Content-Type: application/json" \
69-
-u "$GRAFANA_USER:$GRAFANA_PASSWORD" \
70-
-d "{\"title\": \"$FOLDER_NAME\"}" \
71-
"$GRAFANA_URL/api/folders")
72-
73-
FOLDER_UID=$(echo "$FOLDER_RESPONSE" | jq -r '.uid // empty')
74-
FOLDER_ID=$(echo "$FOLDER_RESPONSE" | jq -r '.id // empty')
75-
if [[ -z "$FOLDER_UID" ]]; then
76-
EXISTING_FOLDER=$(curl -s -u "$GRAFANA_USER:$GRAFANA_PASSWORD" "$GRAFANA_URL/api/folders" | jq -r ".[] | select(.title == \"$FOLDER_NAME\")")
77-
if [[ -n "$EXISTING_FOLDER" ]]; then
78-
FOLDER_UID=$(echo "$EXISTING_FOLDER" | jq -r '.uid')
79-
FOLDER_ID=$(echo "$EXISTING_FOLDER" | jq -r '.id')
80-
log_info "Using existing folder: $FOLDER_UID"
81-
else
82-
FOLDER_UID=""
83-
FOLDER_ID=0
84-
log_warning "Using General folder"
85-
fi
86-
else
87-
log_success "Folder created: $FOLDER_UID"
66+
# Look up the shared "Workshop Dashboards" folder created by monitoring.sh.
67+
# Both this script and perf-platform.sh consume it — monitoring.sh owns
68+
# the create, downstream scripts only read.
69+
log_info "Looking up shared folder '$FOLDER_NAME'..."
70+
SHARED_FOLDER=$(curl -s -u "$GRAFANA_USER:$GRAFANA_PASSWORD" "$GRAFANA_URL/api/folders" \
71+
| jq -r ".[] | select(.title == \"$FOLDER_NAME\")")
72+
if [[ -z "$SHARED_FOLDER" ]]; then
73+
log_error "Shared folder '$FOLDER_NAME' not found. Run monitoring.sh first."
74+
exit 1
8875
fi
76+
FOLDER_UID=$(echo "$SHARED_FOLDER" | jq -r '.uid')
77+
FOLDER_ID=$(echo "$SHARED_FOLDER" | jq -r '.id')
78+
log_info "Using folder: $FOLDER_UID"
8979

9080
# Get Lambda Function URL for thread dump Lambda
9181
FUNCTION_URL=$(aws lambda get-function-url-config --function-name "$LAMBDA_FUNCTION_NAME" --query "FunctionUrl" --output text 2>/dev/null || echo "")
@@ -620,48 +610,67 @@ fi
620610

621611

622612
# =============================================================================
623-
# SHARED NOTIFICATION POLICY
613+
# NOTIFICATION POLICY ROUTES (this script's two routes only)
614+
# =============================================================================
615+
#
616+
# The notification policy is shared across all analysis modules. Each module's
617+
# setup script owns its own routes (keyed by receiver name) and upserts them
618+
# idempotently into the existing policy. This script owns:
619+
# - thread-dump-lambda-webhook
620+
# - ai-jvm-analyzer-webhook
621+
# perf-platform.sh owns its own (perf-analyzer-webhook). Whoever runs last
622+
# does not clobber the other modules' routes any more.
624623
# =============================================================================
625624

626625
log_info "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
627-
log_info "Configuring unified notification policy..."
626+
log_info "Upserting notification policy routes for thread + profiling..."
628627
log_info "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
629628

630-
# Configure notification policy with nested routes for both contact points
631-
POLICY_PAYLOAD="{
632-
\"receiver\": \"grafana-default-email\",
633-
\"group_by\": [\"alertname\"],
634-
\"group_wait\": \"30s\",
635-
\"group_interval\": \"5m\",
636-
\"repeat_interval\": \"1h\",
637-
\"routes\": [
638-
{
639-
\"receiver\": \"$THREAD_CONTACT_POINT\",
640-
\"matchers\": [\"analysis_type=thread\"],
641-
\"group_wait\": \"30s\",
642-
\"group_interval\": \"5m\",
643-
\"repeat_interval\": \"1h\"
644-
},
645-
{
646-
\"receiver\": \"$PROFILING_CONTACT_POINT\",
647-
\"matchers\": [\"analysis_type=profiling\"],
648-
\"group_by\": [\"alertname\", \"pod\"],
649-
\"group_wait\": \"10s\",
650-
\"group_interval\": \"30s\",
651-
\"repeat_interval\": \"2m\"
652-
}
653-
]
654-
}"
629+
# Read the current policy, drop any route whose receiver is one we own, append
630+
# our own routes, PUT the merged result.
631+
EXISTING_POLICY=$(curl -s -u "$GRAFANA_USER:$GRAFANA_PASSWORD" \
632+
"$GRAFANA_URL/api/v1/provisioning/policies")
633+
634+
NEW_ROUTES='[
635+
{
636+
"receiver": "'"$THREAD_CONTACT_POINT"'",
637+
"matchers": ["analysis_type=thread"],
638+
"group_wait": "30s",
639+
"group_interval": "5m",
640+
"repeat_interval": "1h"
641+
},
642+
{
643+
"receiver": "'"$PROFILING_CONTACT_POINT"'",
644+
"matchers": ["analysis_type=profiling"],
645+
"group_by": ["alertname", "pod"],
646+
"group_wait": "10s",
647+
"group_interval": "30s",
648+
"repeat_interval": "2m"
649+
}
650+
]'
651+
652+
POLICY_PAYLOAD=$(echo "$EXISTING_POLICY" | jq \
653+
--argjson new "$NEW_ROUTES" \
654+
--arg t "$THREAD_CONTACT_POINT" \
655+
--arg p "$PROFILING_CONTACT_POINT" '
656+
# Default the policy fields if the existing payload was empty.
657+
.receiver = (.receiver // "grafana-default-email")
658+
| .group_by = (.group_by // ["alertname"])
659+
| .group_wait = (.group_wait // "30s")
660+
| .group_interval = (.group_interval // "5m")
661+
| .repeat_interval = (.repeat_interval // "1h")
662+
| .routes = ((.routes // []) | map(select(.receiver != $t and .receiver != $p))) + $new
663+
')
655664

656665
POLICY_RESPONSE=$(curl -s -X PUT -H "Content-Type: application/json" \
657666
-u "$GRAFANA_USER:$GRAFANA_PASSWORD" \
658667
-d "$POLICY_PAYLOAD" \
659668
"$GRAFANA_URL/api/v1/provisioning/policies")
660669

661670
if echo "$POLICY_RESPONSE" | grep -q "policies updated"; then
662-
log_success "Unified notification policy configured"
671+
log_success "Thread + profiling routes upserted into the notification policy"
663672
else
664-
log_error "Notification policy configuration failed:"
673+
log_error "Notification policy update failed:"
665674
echo "$POLICY_RESPONSE"
666675
fi
667676

0 commit comments

Comments
 (0)