# Private AKS PoC - Deploy, Log, Teardown
# Workflow definition as displayed for run #2 of this workflow.
name: Private AKS PoC - Deploy, Log, Teardown

# Manually triggered only. Both inputs are free-form strings; jobs that
# consume them must treat the values as untrusted text.
on:
  workflow_dispatch:
    inputs:
      location:
        description: 'Azure region (canadacentral or canadaeast)'
        default: 'canadacentral'
        type: string
      wait_minutes:
        description: 'Minutes to wait before teardown (cost control)'
        default: '30'
        type: string

permissions:
  id-token: write  # OIDC token for azure/login from GitHub-hosted runners
  contents: read

# Workflow-wide environment. Every per-run resource name embeds the run ID
# so that each run creates (and later deletes) its own resources.
env:
  LOCATION: ${{ github.event.inputs.location || 'canadacentral' }}
  # Runner VM infrastructure (created/destroyed each run)
  RUNNER_RG: rg-aks-poc-runner-${{ github.run_id }}
  RUNNER_VM: vm-runner-${{ github.run_id }}
  RUNNER_LABEL: aks-poc-runner-${{ github.run_id }}
  MI_NAME: mi-aks-poc-deployer
  # AKS deployment resources
  AKS_RG: rg-aks-poc-${{ github.run_id }}
  CLUSTER_NAME: aks-poc-${{ github.run_id }}
  VNET_NAME: vnet-aks-poc
  SUBNET_NAME: subnet-aks
jobs:
# ═══════════════════════════════════════════════════════════════
# Job 1: Provision runner VM on GitHub-hosted runner
# ═══════════════════════════════════════════════════════════════
# Uses ubuntu-latest (always available) to create an Azure VM,
# assign a managed identity, and register it as a self-hosted
# GitHub Actions runner for this repository.
# ═══════════════════════════════════════════════════════════════
  setup-runner:
    runs-on: ubuntu-latest
    timeout-minutes: 30
    outputs:
      # Client ID of the managed identity; consumed by the deploy job's
      # azure/login step.
      mi_client_id: ${{ steps.identity.outputs.mi_client_id }}
    steps:
      # ── 1a. Azure Login (OIDC from GitHub-hosted runner) ───────
      - name: Azure Login
        uses: azure/login@v2
        with:
          client-id: ${{ secrets.AZURE_CLIENT_ID }}
          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}

      # ── 1b. Create runner resource group ───────────────────────
      - name: Create runner resource group
        run: |
          az group create \
            --name "$RUNNER_RG" \
            --location "$LOCATION" \
            --tags purpose=aks-poc component=runner run="$GITHUB_RUN_ID"

      # ── 1c. Create user-assigned managed identity ──────────────
      # Exposes the client ID as a job output and the principal/resource
      # IDs as environment variables for the following steps.
      - name: Create managed identity
        id: identity
        run: |
          az identity create \
            --name "$MI_NAME" \
            --resource-group "$RUNNER_RG" \
            --location "$LOCATION"
          MI_CLIENT_ID=$(az identity show \
            --name "$MI_NAME" \
            --resource-group "$RUNNER_RG" \
            --query clientId -o tsv)
          MI_PRINCIPAL_ID=$(az identity show \
            --name "$MI_NAME" \
            --resource-group "$RUNNER_RG" \
            --query principalId -o tsv)
          MI_RESOURCE_ID=$(az identity show \
            --name "$MI_NAME" \
            --resource-group "$RUNNER_RG" \
            --query id -o tsv)
          echo "mi_client_id=$MI_CLIENT_ID" >> "$GITHUB_OUTPUT"
          echo "MI_PRINCIPAL_ID=$MI_PRINCIPAL_ID" >> "$GITHUB_ENV"
          echo "MI_RESOURCE_ID=$MI_RESOURCE_ID" >> "$GITHUB_ENV"

      # ── 1d. Assign RBAC roles to the managed identity ─────────
      # Both roles are granted at subscription scope because the deploy
      # job creates its own resource group. The object ID and principal
      # type are passed explicitly to the role assignment commands.
      - name: Assign RBAC roles
        run: |
          SUBSCRIPTION_ID=$(az account show --query id -o tsv)
          az role assignment create \
            --assignee-object-id "$MI_PRINCIPAL_ID" \
            --assignee-principal-type ServicePrincipal \
            --role "Contributor" \
            --scope "/subscriptions/$SUBSCRIPTION_ID"
          az role assignment create \
            --assignee-object-id "$MI_PRINCIPAL_ID" \
            --assignee-principal-type ServicePrincipal \
            --role "Monitoring Reader" \
            --scope "/subscriptions/$SUBSCRIPTION_ID"

      # ── 1e. Create the runner VM with cloud-init ───────────────
      # cloud-init installs the Azure CLI and unpacks the latest
      # actions/runner release into /home/azureuser/actions-runner.
      - name: Create runner VM
        run: |
          cat > /tmp/cloud-init.yaml <<'CLOUD_INIT'
          #cloud-config
          package_update: true
          package_upgrade: true
          packages:
            - curl
            - jq
            - unzip
            - apt-transport-https
            - ca-certificates
            - gnupg
            - lsb-release
          runcmd:
            - curl -sL https://aka.ms/InstallAzureCLIDeb | bash
            - mkdir -p /home/azureuser/actions-runner
            - chown azureuser:azureuser /home/azureuser/actions-runner
            - |
              RUNNER_VERSION=$(curl -s https://api.github.com/repos/actions/runner/releases/latest | jq -r '.tag_name' | sed 's/^v//')
              curl -sL "https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz" \
                -o /tmp/actions-runner.tar.gz
              tar xzf /tmp/actions-runner.tar.gz -C /home/azureuser/actions-runner
              chown -R azureuser:azureuser /home/azureuser/actions-runner
              rm -f /tmp/actions-runner.tar.gz
            - /home/azureuser/actions-runner/bin/installdependencies.sh
          CLOUD_INIT
          az vm create \
            --resource-group "$RUNNER_RG" \
            --name "$RUNNER_VM" \
            --image Ubuntu2204 \
            --size Standard_B2s \
            --admin-username azureuser \
            --generate-ssh-keys \
            --assign-identity "$MI_RESOURCE_ID" \
            --tags purpose=aks-poc component=runner \
            --custom-data /tmp/cloud-init.yaml

      # ── 1f. Wait for cloud-init to complete ────────────────────
      # Polls up to 30 times. The step fails if cloud-init never reports
      # "done", so the runner is not registered on a half-provisioned VM.
      - name: Wait for cloud-init
        run: |
          echo "Waiting for cloud-init to finish on the VM..."
          for i in $(seq 1 30); do
            STATUS=$(az vm run-command invoke \
              --resource-group "$RUNNER_RG" \
              --name "$RUNNER_VM" \
              --command-id RunShellScript \
              --scripts "cloud-init status 2>/dev/null | grep -q 'done' && echo 'DONE' || echo 'WAITING'" \
              --query "value[0].message" -o tsv 2>/dev/null || echo "WAITING")
            if echo "$STATUS" | grep -q "DONE"; then
              echo "Cloud-init completed."
              exit 0
            fi
            echo "Attempt $i/30: still waiting..."
            sleep 30
          done
          echo "::error::cloud-init did not complete within expected time"
          exit 1

      # ── 1g. Register as a GitHub Actions self-hosted runner ────
      - name: Register GitHub Actions runner
        env:
          GH_PAT: ${{ secrets.GH_PAT }}
        run: |
          # Get a registration token from the GitHub API
          REG_TOKEN=$(curl -s -X POST \
            -H "Authorization: token $GH_PAT" \
            -H "Accept: application/vnd.github+json" \
            "https://api.github.com/repos/$GITHUB_REPOSITORY/actions/runners/registration-token" \
            | jq -r '.token')
          if [ -z "$REG_TOKEN" ] || [ "$REG_TOKEN" = "null" ]; then
            echo "::error::Failed to obtain runner registration token"
            exit 1
          fi
          # The token is embedded in the command line below; mask it so it
          # cannot leak into the job log.
          echo "::add-mask::$REG_TOKEN"
          # Configure and start the runner on the VM via run-command
          az vm run-command invoke \
            --resource-group "$RUNNER_RG" \
            --name "$RUNNER_VM" \
            --command-id RunShellScript \
            --scripts "
              cd /home/azureuser/actions-runner
              sudo -u azureuser ./config.sh \
                --url https://github.com/$GITHUB_REPOSITORY \
                --token $REG_TOKEN \
                --name $RUNNER_VM \
                --labels $RUNNER_LABEL \
                --unattended \
                --replace
              ./svc.sh install azureuser
              ./svc.sh start
            "

      # ── 1h. Wait for runner to come online ─────────────────────
      # Polls the repository runner list up to 20 times, 15s apart.
      - name: Wait for runner to come online
        env:
          GH_PAT: ${{ secrets.GH_PAT }}
        run: |
          echo "Waiting for self-hosted runner '$RUNNER_VM' to come online..."
          for i in $(seq 1 20); do
            # '.runners[]?' and '|| true' keep a transient API error (a
            # response without .runners) from aborting the loop under
            # 'bash -e'.
            ONLINE=$(curl -s \
              -H "Authorization: token $GH_PAT" \
              -H "Accept: application/vnd.github+json" \
              "https://api.github.com/repos/$GITHUB_REPOSITORY/actions/runners?per_page=100" \
              | jq -r --arg name "$RUNNER_VM" \
                  '.runners[]? | select(.name == $name) | .status' || true)
            if [ "$ONLINE" = "online" ]; then
              echo "Runner is online!"
              exit 0
            fi
            echo "Attempt $i/20: runner status=${ONLINE:-not found}, waiting 15s..."
            sleep 15
          done
          echo "::error::Runner did not come online within expected time"
          exit 1
# ═══════════════════════════════════════════════════════════════
# Job 2: Deploy Private AKS on the self-hosted runner
# ═══════════════════════════════════════════════════════════════
# Runs on the VM provisioned in Job 1. The runner VM has a
# managed identity — tokens are acquired via IMDS, so (per this
# PoC's premise) Conditional Access location policies do not apply.
# ═══════════════════════════════════════════════════════════════
  deploy-and-log:
    needs: setup-runner
    # The second label must match RUNNER_LABEL from the workflow env.
    runs-on: [self-hosted, "aks-poc-runner-${{ github.run_id }}"]
    timeout-minutes: 60
    steps:
      # ── 2a. Checkout ───────────────────────────────────────────
      - name: Checkout repository
        uses: actions/checkout@v4

      # ── 2b. Azure Login (Managed Identity on the runner VM) ────
      - name: Azure Login (Managed Identity)
        uses: azure/login@v2
        with:
          auth-type: IDENTITY
          client-id: ${{ needs.setup-runner.outputs.mi_client_id }}
          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}

      # ── 2c. Record Runner IP ──────────────────────────────────
      # Best effort: a failed lookup records "unknown" instead of
      # aborting the deployment.
      - name: Record runner IP
        run: |
          RUNNER_IP=$(curl -s --max-time 15 ifconfig.me || echo "unknown")
          echo "RUNNER_IP=$RUNNER_IP" >> "$GITHUB_ENV"
          echo "Runner public IP: $RUNNER_IP"

      # ── 2d. Create AKS Resource Group ─────────────────────────
      - name: Create Resource Group
        run: |
          az group create \
            --name "$AKS_RG" \
            --location "$LOCATION" \
            --tags purpose=aks-poc created=$(date -u +%Y-%m-%dT%H:%M:%SZ) run="$GITHUB_RUN_ID"

      # ── 2e. Create VNet + Subnet ──────────────────────────────
      - name: Create VNet and Subnet
        run: |
          az network vnet create \
            --resource-group "$AKS_RG" \
            --name "$VNET_NAME" \
            --address-prefixes 10.224.0.0/16 \
            --subnet-name "$SUBNET_NAME" \
            --subnet-prefixes 10.224.0.0/24
          SUBNET_ID=$(az network vnet subnet show \
            --resource-group "$AKS_RG" \
            --vnet-name "$VNET_NAME" \
            --name "$SUBNET_NAME" \
            --query id -o tsv)
          echo "SUBNET_ID=$SUBNET_ID" >> "$GITHUB_ENV"

      # ── 2f. Record Start Time ─────────────────────────────────
      # Used as the lower bound for the log queries in 2i and 2j.
      - name: Record start time
        run: |
          echo "DEPLOY_START_TIME=$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "$GITHUB_ENV"

      # ── 2g. Deploy Private AKS ────────────────────────────────
      # A failed submission does not fail the step: it sets DEPLOY_FAILED
      # so the wait steps are skipped while logging and teardown still
      # run. The failure is surfaced as an error annotation.
      - name: Deploy Private AKS Cluster
        run: |
          if ! az aks create \
            --resource-group "$AKS_RG" \
            --name "$CLUSTER_NAME" \
            --node-count 1 \
            --node-vm-size Standard_B2s \
            --network-plugin azure \
            --vnet-subnet-id "$SUBNET_ID" \
            --enable-private-cluster \
            --enable-managed-identity \
            --generate-ssh-keys \
            --tier free \
            --no-wait; then
            echo "::error::az aks create failed; skipping provisioning wait"
            echo "DEPLOY_FAILED=true" >> "$GITHUB_ENV"
          fi

      # ── 2h. Wait for Provisioning ─────────────────────────────
      - name: Wait for AKS provisioning
        if: env.DEPLOY_FAILED != 'true'
        run: |
          az aks wait \
            --resource-group "$AKS_RG" \
            --name "$CLUSTER_NAME" \
            --created \
            --timeout 1200

      # ── 2i. Log IPs (Activity Log) ────────────────────────────
      # Runs even after failures; each query is best effort.
      - name: Log IPs (Activity Log)
        if: always()
        run: |
          echo "=== Runner VM Outbound IP ==="
          echo "Runner IP: $RUNNER_IP"
          echo ""
          echo "Waiting 60s for Activity Log propagation..."
          sleep 60
          echo "=== ARM Operation Caller IPs (ContainerService) ==="
          az monitor activity-log list \
            --resource-group "$AKS_RG" \
            --start-time "$DEPLOY_START_TIME" \
            --query "[?contains(operationName.value, 'Microsoft.ContainerService')].{op:operationName.value, caller:caller, clientIp:httpRequest.clientIpAddress, status:status.value, time:eventTimestamp}" \
            -o table || echo "Activity log query failed for ContainerService"
          echo ""
          echo "=== ARM Operation Caller IPs (Network) ==="
          az monitor activity-log list \
            --resource-group "$AKS_RG" \
            --start-time "$DEPLOY_START_TIME" \
            --query "[?contains(operationName.value, 'Microsoft.Network')].{op:operationName.value, caller:caller, clientIp:httpRequest.clientIpAddress, status:status.value, time:eventTimestamp}" \
            -o table || echo "Activity log query failed for Network"
          echo ""
          echo "=== IP Comparison ==="
          echo "Runner IP: $RUNNER_IP"
          echo "Compare the clientIp values above against the runner IP to verify traffic routes."

      # ── 2j. Log IPs (Entra Sign-In) ───────────────────────────
      # NOTE(review): the v1.0 signIns endpoint may not return
      # managed-identity sign-ins, and the identity may lack the Graph
      # permission to read audit logs. Confirm before relying on this
      # output.
      - name: Log IPs (Entra Sign-In Logs)
        if: always()
        continue-on-error: true
        env:
          MI_CLIENT_ID: ${{ needs.setup-runner.outputs.mi_client_id }}
        run: |
          echo "=== Entra ID Sign-In IPs (requires P1/P2) ==="
          az rest --method get \
            --url "https://graph.microsoft.com/v1.0/auditLogs/signIns?\$filter=createdDateTime ge $DEPLOY_START_TIME and appId eq '${MI_CLIENT_ID}'" \
            --query "value[].{ip:ipAddress, app:appDisplayName, time:createdDateTime, status:status.errorCode}" \
            -o table || echo "Sign-in log query failed (may require Entra P1/P2)"

      # ── 2k. Wait Before Teardown ──────────────────────────────
      # The user-supplied input reaches the script through an environment
      # variable and is validated as digits only, so it cannot inject
      # shell code. '10#' forces base 10 so values like "08" work.
      - name: Wait before teardown
        if: env.DEPLOY_FAILED != 'true'
        env:
          WAIT_MINUTES: ${{ github.event.inputs.wait_minutes || '30' }}
        run: |
          case "$WAIT_MINUTES" in
            '' | *[!0-9]*)
              echo "::error::wait_minutes must be a non-negative integer, got '$WAIT_MINUTES'"
              exit 1
              ;;
          esac
          echo "Waiting ${WAIT_MINUTES} minutes before teardown..."
          sleep $((10#$WAIT_MINUTES * 60))

      # ── 2l. Teardown AKS Resources ────────────────────────────
      - name: Teardown AKS resources
        if: always()
        run: |
          echo "Deleting resource group $AKS_RG..."
          az group delete --name "$AKS_RG" --yes --no-wait
          echo "Resource group deletion initiated."

      # ── 2m. Azure Logout ───────────────────────────────────────
      # Tolerates the case where no session exists (e.g. login failed).
      - name: Azure Logout
        if: always()
        run: az logout || true
# ═══════════════════════════════════════════════════════════════
# Job 3: Teardown runner VM (GitHub-hosted runner)
# ═══════════════════════════════════════════════════════════════
# Always runs — deregisters the self-hosted runner from the repo,
# removes the managed identity's role assignments, and deletes the
# runner VM resource group.
# ═══════════════════════════════════════════════════════════════
  teardown-runner:
    needs: [setup-runner, deploy-and-log]
    if: always()
    runs-on: ubuntu-latest
    timeout-minutes: 15
    steps:
      # ── 3a. Azure Login ────────────────────────────────────────
      - name: Azure Login
        uses: azure/login@v2
        with:
          client-id: ${{ secrets.AZURE_CLIENT_ID }}
          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}

      # ── 3b. Deregister runner from GitHub ──────────────────────
      - name: Deregister self-hosted runner
        continue-on-error: true
        env:
          GH_PAT: ${{ secrets.GH_PAT }}
        run: |
          # '.runners[]?' tolerates an error response without .runners.
          RUNNER_ID=$(curl -s \
            -H "Authorization: token $GH_PAT" \
            -H "Accept: application/vnd.github+json" \
            "https://api.github.com/repos/$GITHUB_REPOSITORY/actions/runners?per_page=100" \
            | jq -r --arg name "$RUNNER_VM" \
                '.runners[]? | select(.name == $name) | .id' || true)
          if [ -n "$RUNNER_ID" ] && [ "$RUNNER_ID" != "null" ]; then
            curl -s -X DELETE \
              -H "Authorization: token $GH_PAT" \
              -H "Accept: application/vnd.github+json" \
              "https://api.github.com/repos/$GITHUB_REPOSITORY/actions/runners/$RUNNER_ID"
            echo "Runner deregistered (ID: $RUNNER_ID)."
          else
            echo "Runner not found or already removed."
          fi

      # ── 3c. Safety-net: ensure AKS RG is deleted ──────────────
      - name: Ensure AKS resources are deleted
        continue-on-error: true
        run: |
          if az group show --name "$AKS_RG" --output none 2>/dev/null; then
            echo "AKS resource group still exists — deleting..."
            az group delete --name "$AKS_RG" --yes --no-wait
          fi

      # ── 3d. Remove the managed identity's role assignments ─────
      # The identity is granted roles at subscription scope during
      # setup. Deleting the resource group does not delete those
      # assignments, so remove them while the identity can still be
      # looked up.
      - name: Remove managed identity role assignments
        continue-on-error: true
        run: |
          MI_PRINCIPAL_ID=$(az identity show \
            --name "$MI_NAME" \
            --resource-group "$RUNNER_RG" \
            --query principalId -o tsv 2>/dev/null || true)
          if [ -n "$MI_PRINCIPAL_ID" ]; then
            SUBSCRIPTION_ID=$(az account show --query id -o tsv)
            az role assignment delete \
              --assignee "$MI_PRINCIPAL_ID" \
              --scope "/subscriptions/$SUBSCRIPTION_ID"
            echo "Role assignments removed for principal $MI_PRINCIPAL_ID."
          else
            echo "Managed identity not found; no role assignments to remove."
          fi

      # ── 3e. Delete runner resource group ───────────────────────
      # Guarded so the job does not fail when setup never got as far
      # as creating the group.
      - name: Delete runner resource group
        run: |
          if [ "$(az group exists --name "$RUNNER_RG")" = "true" ]; then
            az group delete --name "$RUNNER_RG" --yes --no-wait
            echo "Runner resource group deletion initiated."
          else
            echo "Runner resource group not found; nothing to delete."
          fi

      # ── 3f. Azure Logout ───────────────────────────────────────
      # Tolerates the case where no session exists (e.g. login failed).
      - name: Azure Logout
        if: always()
        run: az logout || true