Skip to content

Commit 7f5fb49

Browse files
arielr-lt and Ariel Rolfo authored
ES cluster resizing (#998)
* ES cluster resizing * edit rolling update * add exporter * Resize ES cluster --------- Co-authored-by: Ariel Rolfo <arielr-lt+username@users.noreply.github.com>
1 parent 8e797ee commit 7f5fb49

13 files changed

Lines changed: 485 additions & 58 deletions

File tree

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# Manually triggered workflow that appends one IPv4 address (as a /32) to the
# whitelist-source-range annotation of the sandbox Elasticsearch Ingress.
name: Update ES Ingress IP Whitelist

on:
  workflow_dispatch:
    inputs:
      ip:
        description: 'Your current IP address (without /32)'
        type: string
        required: true
      label:
        # Optional free-text tag; it is only echoed in the job log and in the
        # Slack notification, never stored on the Ingress.
        description: "Label for this IP (e.g. 'Ariel home')"
        type: string
        required: false
        default: ''

# id-token: write allows the job to request an OIDC token (consumed by the AWS
# credentials step); repository contents stay read-only.
permissions:
  id-token: write
  contents: read

# Workflow-wide constants, referenced by the job steps as ${{ env.* }}.
env:
  AWS_REGION: us-east-1
  EKS_CLUSTER: ce-registry-eks
  NAMESPACE: credreg-sandbox
  INGRESS_NAME: elasticsearch
25+
26+
jobs:
  # Adds the dispatched IP (as a /32) to the Ingress whitelist annotation and
  # reports the outcome to Slack.
  update-whitelist:
    # Only run in the upstream organisation, never in forks.
    if: ${{ github.repository_owner == 'CredentialEngine' }}
    runs-on: ubuntu-latest
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT }}:role/github-oidc-widget
          aws-region: ${{ env.AWS_REGION }}

      - name: Install kubectl
        uses: azure/setup-kubectl@v4
        with:
          version: v1.29.6

      - name: Update kubeconfig
        run: |
          aws eks update-kubeconfig --name "${{ env.EKS_CLUSTER }}" --region "${{ env.AWS_REGION }}"

      - name: Add IP to whitelist
        # User-supplied inputs are passed through the environment instead of
        # being expanded with ${{ }} inside the script: an expression is pasted
        # into the script text before bash runs, so a crafted input could
        # otherwise execute arbitrary commands with the assumed AWS role.
        env:
          WHITELIST_IP: ${{ inputs.ip }}
          WHITELIST_LABEL: ${{ inputs.label }}
        run: |
          # Accept only a bare dotted-quad IPv4 address (no CIDR suffix).
          if ! [[ "$WHITELIST_IP" =~ ^([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})$ ]]; then
            echo "::error::Input 'ip' must be a bare IPv4 address such as 203.0.113.7 (without /32)."
            exit 1
          fi
          for octet in "${BASH_REMATCH[@]:1}"; do
            # 10# forces base 10 so octets with a leading zero are not read as octal.
            if (( 10#$octet > 255 )); then
              echo "::error::Input 'ip' has an octet greater than 255."
              exit 1
            fi
          done

          NEW_IP="${WHITELIST_IP}/32"

          CURRENT=$(kubectl get ingress "${{ env.INGRESS_NAME }}" \
            -n "${{ env.NAMESPACE }}" \
            -o jsonpath='{.metadata.annotations.nginx\.ingress\.kubernetes\.io/whitelist-source-range}')

          echo "Current whitelist: $CURRENT"

          # Compare whole comma-separated entries. A plain substring search
          # would treat 1.2.3.4/32 as present when only 11.2.3.4/32 is listed.
          case ",${CURRENT// /}," in
            *",${NEW_IP},"*)
              echo "IP $NEW_IP is already in the whitelist, nothing to do."
              exit 0
              ;;
          esac

          # Avoid a leading comma when the annotation is empty or missing.
          UPDATED="${CURRENT:+${CURRENT},}${NEW_IP}"

          # NOTE(review): this changes the live Ingress only; the checked-in
          # Ingress manifest is not updated, so re-applying that manifest may
          # drop the entry again. Confirm this is the intended workflow.
          kubectl annotate ingress "${{ env.INGRESS_NAME }}" \
            -n "${{ env.NAMESPACE }}" \
            --overwrite \
            "nginx.ingress.kubernetes.io/whitelist-source-range=${UPDATED}"

          echo "Updated whitelist: $UPDATED"
          if [ -n "$WHITELIST_LABEL" ]; then
            echo "Label: $WHITELIST_LABEL"
          fi

      - name: Notify Slack
        # Runs even when an earlier step failed so failures are reported too.
        if: always()
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
          JOB_STATUS: ${{ job.status }}
          WHITELIST_IP: ${{ inputs.ip }}
          WHITELIST_LABEL: ${{ inputs.label }}
          TRIGGERED_BY: ${{ github.actor }}
          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
        run: |
          if [ -z "${SLACK_WEBHOOK_URL}" ]; then
            echo "SLACK_WEBHOOK_URL not set; skipping notification"
            exit 0
          fi
          EMOJI=✅
          if [ "$JOB_STATUS" = "failure" ]; then
            EMOJI=❌
          fi
          MSG="$EMOJI ES ingress whitelist update ${JOB_STATUS}: ${WHITELIST_IP}/32${WHITELIST_LABEL:+ ($WHITELIST_LABEL)} triggered by ${TRIGGERED_BY} — ${RUN_URL}"
          # jq builds the JSON payload so quotes in the label cannot break it.
          # The notification is best-effort: a failed POST must not fail the job.
          curl -sS -X POST -H 'Content-type: application/json' \
            --data "$(jq -nc --arg text "$MSG" '{text:$text}')" \
            "$SLACK_WEBHOOK_URL" || true
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
---
# Service account referenced by the event-exporter Deployment
# (serviceAccountName) and by the event-exporter ClusterRoleBinding.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: amazon-cloudwatch
  name: event-exporter
7+
---
# Cluster-wide read-only access (get/list/watch) to core-group events,
# namespaces and pods.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: event-exporter
rules:
  - apiGroups:
      - ''  # the empty string selects the core API group
    resources:
      - events
      - namespaces
      - pods
    verbs:
      - get
      - list
      - watch
16+
---
# Grants the event-exporter ClusterRole to the event-exporter service account
# in the amazon-cloudwatch namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: event-exporter
subjects:
  - kind: ServiceAccount
    namespace: amazon-cloudwatch
    name: event-exporter
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: event-exporter
29+
---
# Exporter configuration, mounted into the event-exporter pod as config.yaml.
# It defines a single route that sends events to a receiver named "stdout",
# and a "stdout" receiver of type stdout.
# NOTE(review): the nesting inside config.yaml was reconstructed from a
# flattened view of the file; verify it against the deployed manifest.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: amazon-cloudwatch
  name: event-exporter-config
data:
  config.yaml: |
    logLevel: warn
    logFormat: json
    route:
      routes:
        - match:
            - receiver: stdout
    receivers:
      - name: stdout
        stdout: {}
46+
---
# Single-replica Deployment of kubernetes-event-exporter. Its configuration
# comes from the event-exporter-config ConfigMap mounted at /data.
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: amazon-cloudwatch
  name: event-exporter
spec:
  replicas: 1
  selector:
    matchLabels:
      app: event-exporter
  template:
    metadata:
      labels:
        app: event-exporter
    spec:
      serviceAccountName: event-exporter
      volumes:
        # Backs the /data mount of the container below.
        - name: config
          configMap:
            name: event-exporter-config
      containers:
        - name: event-exporter
          image: ghcr.io/resmoio/kubernetes-event-exporter:v1.7
          args:
            # Path = volume mountPath + the ConfigMap key name.
            - -conf=/data/config.yaml
          volumeMounts:
            - name: config
              mountPath: /data
          resources:
            limits:
              cpu: 100m
              memory: 128Mi
            requests:
              cpu: 50m
              memory: 64Mi
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
---
# Single-replica Prometheus exporter for the sandbox Elasticsearch service.
# It is pinned to nodes labelled env=sandbox and tolerates the matching
# env=sandbox:NoSchedule taint.
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: credreg-sandbox
  name: elasticsearch-exporter
  labels:
    app: elasticsearch-exporter
spec:
  replicas: 1
  selector:
    matchLabels:
      app: elasticsearch-exporter
  template:
    metadata:
      labels:
        app: elasticsearch-exporter
    spec:
      nodeSelector:
        env: sandbox
      tolerations:
        - key: env
          operator: Equal
          value: sandbox
          effect: NoSchedule
      containers:
        - name: elasticsearch-exporter
          image: prometheuscommunity/elasticsearch-exporter:v1.8.0
          args:
            # In-cluster DNS name of the Elasticsearch service, plain HTTP.
            - --es.uri=http://elasticsearch.credreg-sandbox.svc.cluster.local:9200
            - --es.all
            - --es.indices
            - --es.shards
          ports:
            # Named port; the elasticsearch-exporter Service targets 9114.
            - name: metrics
              containerPort: 9114
          resources:
            limits:
              cpu: 100m
              memory: 128Mi
            requests:
              cpu: 50m
              memory: 64Mi
44+
---
# ClusterIP Service exposing port 9114 of the pods labelled
# app=elasticsearch-exporter under the port name "metrics".
apiVersion: v1
kind: Service
metadata:
  namespace: credreg-sandbox
  name: elasticsearch-exporter
  labels:
    app: elasticsearch-exporter
spec:
  type: ClusterIP
  selector:
    app: elasticsearch-exporter
  ports:
    - name: metrics
      port: 9114
      targetPort: 9114

terraform/environments/eks/k8s-manifests-sandbox/elasticsearch-ingress.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,13 @@ metadata:
88
nginx.ingress.kubernetes.io/auth-type: "basic"
99
nginx.ingress.kubernetes.io/auth-secret: "es-basic-auth"
1010
nginx.ingress.kubernetes.io/auth-realm: "Authentication Required"
11-
nginx.ingress.kubernetes.io/whitelist-source-range: "98.97.134.132/32,71.212.64.155/32,98.13.197.1/32,98.193.126.147/32"
11+
nginx.ingress.kubernetes.io/proxy-body-size: "50m"
12+
nginx.ingress.kubernetes.io/whitelist-source-range: "192.140.91.9/32,98.97.134.132/32,71.212.64.155/32,98.13.197.1/32,98.193.126.147/32,128.203.139.2/32"
1213
# 71.212.64.155 – Rohit
1314
# 98.13.197.1 – Jenna
1415
# 98.193.126.147 – Mike P.
1516
# 98.97.134.132/32 - Ariel
17+
# 128.203.139.2/32 - Azure's cluster
1618
spec:
1719
ingressClassName: nginx
1820
tls:
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Keeps at least one pod labelled app=elasticsearch available during
# voluntary disruptions in the credreg-sandbox namespace.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  namespace: credreg-sandbox
  name: elasticsearch-pdb
spec:
  selector:
    matchLabels:
      app: elasticsearch
  minAvailable: 1

terraform/environments/eks/k8s-manifests-sandbox/elasticsearch-statefulset.yaml

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,21 @@ spec:
1717
app: elasticsearch
1818
spec:
1919
subdomain: elasticsearch-discovery
20-
priorityClassName: sandbox-medium
20+
priorityClassName: prod-high
2121
nodeSelector:
2222
env: sandbox
2323
tolerations:
2424
- key: "env"
2525
operator: "Equal"
2626
value: "sandbox"
27-
effect: "NoSchedule"
27+
effect: "NoSchedule"
28+
affinity:
29+
podAntiAffinity:
30+
requiredDuringSchedulingIgnoredDuringExecution:
31+
- labelSelector:
32+
matchLabels:
33+
app: elasticsearch
34+
topologyKey: kubernetes.io/hostname
2835
securityContext:
2936
fsGroup: 1000
3037
runAsUser: 1000
@@ -37,16 +44,32 @@ spec:
3744
- containerPort: 9300
3845
resources:
3946
requests:
40-
cpu: "256m"
47+
cpu: "500m"
4148
memory: "2Gi"
4249
limits:
4350
cpu: "1000m"
4451
memory: "4Gi"
52+
readinessProbe:
53+
httpGet:
54+
path: /_cluster/health?wait_for_status=yellow&timeout=5s
55+
port: 9200
56+
initialDelaySeconds: 30
57+
periodSeconds: 10
58+
failureThreshold: 3
59+
livenessProbe:
60+
httpGet:
61+
path: /_cluster/health
62+
port: 9200
63+
initialDelaySeconds: 120
64+
periodSeconds: 20
65+
failureThreshold: 5
4566
env:
4667
- name: ES_JAVA_OPTS
4768
value: "-Xms2g -Xmx2g"
4869
- name: cluster.name
4970
value: "elasticsearch"
71+
- name: discovery.type
72+
value: "multi-node"
5073
- name: xpack.security.enabled
5174
value: "false"
5275
- name: network.host

terraform/environments/eks/main.tf

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -118,15 +118,18 @@ module "eks" {
118118
app_namespace_prod = var.app_namespace_prod
119119
app_service_account_prod = var.app_service_account_prod
120120
# Env node group scaling
121-
ng_staging_min_size = var.ng_staging_min_size
122-
ng_staging_desired_size = var.ng_staging_desired_size
123-
ng_staging_max_size = var.ng_staging_max_size
124-
ng_sandbox_min_size = var.ng_sandbox_min_size
125-
ng_sandbox_desired_size = var.ng_sandbox_desired_size
126-
ng_sandbox_max_size = var.ng_sandbox_max_size
127-
ng_prod_min_size = var.ng_prod_min_size
128-
ng_prod_desired_size = var.ng_prod_desired_size
129-
ng_prod_max_size = var.ng_prod_max_size
121+
ng_staging_min_size = var.ng_staging_min_size
122+
ng_staging_desired_size = var.ng_staging_desired_size
123+
ng_staging_max_size = var.ng_staging_max_size
124+
ng_sandbox_min_size = var.ng_sandbox_min_size
125+
ng_sandbox_desired_size = var.ng_sandbox_desired_size
126+
ng_sandbox_max_size = var.ng_sandbox_max_size
127+
ng_sandbox_large_min_size = var.ng_sandbox_large_min_size
128+
ng_sandbox_large_desired_size = var.ng_sandbox_large_desired_size
129+
ng_sandbox_large_max_size = var.ng_sandbox_large_max_size
130+
ng_prod_min_size = var.ng_prod_min_size
131+
ng_prod_desired_size = var.ng_prod_desired_size
132+
ng_prod_max_size = var.ng_prod_max_size
130133
}
131134

132135
module "application_secret" {
@@ -231,9 +234,14 @@ module "cloudwatch_slack_forwarder" {
231234

232235
log_filters = [
233236
{
234-
name = "es-warn-prod"
237+
name = "es-warn-sandbox"
235238
log_group_name = "/aws/containerinsights/ce-registry-eks/application"
236-
filter_pattern = "\"WARN\" \"elasticsearch\" \"credreg-prod\""
239+
filter_pattern = "\"WARN\" \"elasticsearch\" \"credreg-sandbox\""
240+
},
241+
{
242+
name = "k8s-backoff-prod-staging"
243+
log_group_name = "/aws/containerinsights/ce-registry-eks/application"
244+
filter_pattern = "{ ($.log_processed.reason = \"BackOff\") && (($.log_processed.involvedObject.namespace = \"credreg-prod\") || ($.log_processed.involvedObject.namespace = \"credreg-sandbox\")) }"
237245
},
238246
]
239247

terraform/environments/eks/terraform.tfvars

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,13 @@ route53_hosted_zone_id = "Z1N75467P1FUL5"
3030
# Env node group scaling
3131
ng_staging_min_size = 1
3232
ng_staging_desired_size = 1
33-
ng_staging_max_size = 4
33+
ng_staging_max_size = 6
3434
ng_sandbox_min_size = 1
3535
ng_sandbox_desired_size = 1
3636
ng_sandbox_max_size = 5
37+
ng_sandbox_large_min_size = 1
38+
ng_sandbox_large_desired_size = 2
39+
ng_sandbox_large_max_size = 4
3740
ng_prod_min_size = 2
3841
ng_prod_desired_size = 2
3942
ng_prod_max_size = 8

terraform/environments/eks/variables.tf

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,21 @@ variable "ng_sandbox_max_size" {
136136
description = "Sandbox node group max size"
137137
}
138138

139+
# Lower bound on the node count of the sandbox "large" node group.
variable "ng_sandbox_large_min_size" {
  description = "Sandbox large (t3.large) node group min size"
  type        = number
}
143+
144+
# Desired node count of the sandbox "large" node group.
variable "ng_sandbox_large_desired_size" {
  description = "Sandbox large (t3.large) node group desired size"
  type        = number
}
148+
149+
# Upper bound on the node count of the sandbox "large" node group.
variable "ng_sandbox_large_max_size" {
  description = "Sandbox large (t3.large) node group max size"
  type        = number
}
153+
139154
variable "ng_prod_min_size" {
140155
type = number
141156
description = "Production node group min size"

0 commit comments

Comments
 (0)