Skip to content

Commit da2c1d1

Browse files
authored
Automate service coverage data update for Azure (#474)
1 parent 1285a25 commit da2c1d1

File tree

3 files changed

+314
-0
lines changed

3 files changed

+314
-0
lines changed
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# Regenerates the Azure service coverage JSON data from the latest
# LocalStack Pro parity metrics artifact and opens a PR with any changes.
name: Update Azure Coverage Data

on:
  schedule:
    # Weekly refresh, Mondays at 05:00 UTC.
    - cron: "0 5 * * MON"
  workflow_dispatch:
    inputs:
      targetBranch:
        required: true
        type: string
        description: "Branch to checkout and compare against (e.g. harshmishra/doc-91)"

jobs:
  update-azure-coverage:
    name: Update Azure coverage data
    runs-on: ubuntu-latest
    permissions:
      contents: write
      pull-requests: write
    steps:
      - name: Checkout docs
        uses: actions/checkout@v4
        with:
          # Full history so the branch diffs in "Check for changes" can
          # resolve remote refs.
          fetch-depth: 0
          path: docs
          ref: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.targetBranch || 'main' }}

      - name: Set up system wide dependencies
        run: |
          # Refresh the package index and install non-interactively: a bare
          # `apt-get install` may prompt and abort when stdin is not a TTY.
          sudo apt-get update
          sudo apt-get install -y jq wget

      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: Download Azure implementation metrics artifact
        working-directory: docs
        run: bash ./scripts/get_latest_github_metrics.sh ./target main
        env:
          GITHUB_TOKEN: ${{ secrets.PRO_ACCESS_TOKEN }}
          REPOSITORY_NAME: localstack-pro
          ARTIFACT_ID: implemented_features_python-amd64
          WORKFLOW: "Az / Build, Test, Push"

      - name: Generate Azure coverage JSON data
        working-directory: docs
        run: |
          python3 scripts/create_azure_coverage.py -i target/implemented_features.csv -o target/updated_azure_coverage
          # Fail loudly when the generator produced nothing, instead of
          # silently opening an empty PR.
          if ls target/updated_azure_coverage/*.json > /dev/null 2>&1; then
            mv -f target/updated_azure_coverage/*.json src/data/azure-coverage/
          else
            echo "No JSON files generated in target/updated_azure_coverage."
            exit 1
          fi

      - name: Check for changes
        id: check-for-changes
        working-directory: docs
        env:
          TARGET_BRANCH: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.targetBranch || 'main' }}
        run: |
          mkdir -p resources
          # Prefer diffing against the long-lived automation branch; fall back
          # to the target branch when the automation branch does not exist yet.
          (git diff --name-only origin/automated-azure-coverage-updates src/data/azure-coverage/ 2>/dev/null || git diff --name-only "origin/$TARGET_BRANCH" src/data/azure-coverage/ 2>/dev/null) | tee -a resources/diff-check.log
          echo "diff-count=$(cat resources/diff-check.log | wc -l)" >> "$GITHUB_OUTPUT"
          cat resources/diff-check.log

      - name: Create PR
        uses: peter-evans/create-pull-request@v7
        # Only open a PR when the diff check actually reported changed files.
        if: ${{ success() && steps.check-for-changes.outputs.diff-count != '0' && steps.check-for-changes.outputs.diff-count != '' }}
        with:
          path: docs
          title: "Update Azure coverage data"
          body: "Update generated Azure coverage JSON data from the latest LocalStack Pro parity metrics artifact."
          branch: "automated-azure-coverage-updates"
          author: "LocalStack Bot <localstack-bot@users.noreply.github.com>"
          committer: "LocalStack Bot <localstack-bot@users.noreply.github.com>"
          commit-message: "update generated azure coverage data"
          token: ${{ secrets.PRO_ACCESS_TOKEN }}

scripts/create_azure_coverage.py

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
"""
2+
Generate Azure coverage JSON files from implementation CSV data.
3+
"""
4+
5+
import argparse
6+
import csv
7+
import json
8+
from pathlib import Path
9+
from typing import Any
10+
11+
12+
def _as_bool(value: Any, default: bool = True) -> bool:
13+
if value is None:
14+
return default
15+
if isinstance(value, bool):
16+
return value
17+
return str(value).strip().lower() in {"1", "true", "yes", "y"}
18+
19+
20+
def _group_name(service_name: str, category: str) -> str:
21+
service_name = (service_name or "").strip()
22+
category = (category or "").strip()
23+
if not category:
24+
return service_name
25+
if category.lower() in {"none", "null", "n/a"}:
26+
return service_name
27+
if category == service_name:
28+
return service_name
29+
return f"{service_name} ({category})"
30+
31+
32+
def _normalize_provider(value: str) -> str:
33+
return (value or "").strip().replace("_", ".")
34+
35+
36+
def _resolve_input_csv(path: Path) -> Path:
37+
if path.exists():
38+
if path.is_file():
39+
return path
40+
# Support passing a directory that contains the extracted artifact.
41+
nested_csv = path / "implemented_features.csv"
42+
if nested_csv.exists():
43+
return nested_csv
44+
matches = sorted(path.rglob("implemented_features.csv"))
45+
if matches:
46+
return matches[0]
47+
raise FileNotFoundError(f"No implemented_features.csv found under: {path}")
48+
49+
# Backward-compatible fallback for target/implemented_features.csv.
50+
if path.name == "implemented_features.csv" and path.parent.exists():
51+
matches = sorted(path.parent.rglob("implemented_features.csv"))
52+
if matches:
53+
return matches[0]
54+
55+
raise FileNotFoundError(f"Input CSV not found: {path}")
56+
57+
58+
def _load_csv(path: Path) -> dict[str, dict[str, dict[str, dict[str, Any]]]]:
    """Parse the implementation CSV into a nested coverage mapping.

    Returns ``{provider: {group: {feature: {"implemented": bool, "pro": bool}}}}``.
    Rows without a resource provider or a feature name are skipped.

    Raises ValueError when the CSV lacks the expected headers or yields no
    records at all (e.g. the artifact contained an error payload).
    """
    csv_path = _resolve_input_csv(path)

    coverage: dict[str, dict[str, dict[str, dict[str, Any]]]] = {}
    with csv_path.open(mode="r", encoding="utf-8") as handle:
        reader = csv.DictReader(handle)
        if not reader.fieldnames:
            raise ValueError("Input CSV has no headers.")
        required_headers = {"resource_provider", "service", "feature"}
        if required_headers - set(reader.fieldnames):
            raise ValueError(
                "Unexpected CSV schema. Expected headers including "
                f"{sorted(required_headers)}, got {reader.fieldnames}. "
                "The downloaded artifact may contain an error payload instead of CSV data."
            )

        for record in reader:
            provider = _normalize_provider(record.get("resource_provider", ""))
            feature = (record.get("feature") or record.get("operation") or "").strip()
            if not provider or not feature:
                # Rows lacking a provider or feature carry no coverage info.
                continue

            group = _group_name(record.get("service", ""), record.get("category", "")) or "General"

            # Flag columns appear under several historical names; missing
            # values default to True.
            impl_raw = record.get("implemented", record.get("is_implemented", record.get("isImplemented")))
            pro_raw = record.get("pro", record.get("is_pro", record.get("isPro")))
            entry = {
                "implemented": _as_bool(impl_raw, default=True),
                "pro": _as_bool(pro_raw, default=True),
            }
            coverage.setdefault(provider, {}).setdefault(group, {})[feature] = entry

    if not coverage:
        raise ValueError(
            "No Azure coverage records were parsed from the input CSV. "
            "Please verify the artifact content is valid and non-empty."
        )

    return coverage
107+
108+
109+
def _sorted_details(details: dict[str, dict[str, dict[str, Any]]]) -> dict[str, dict[str, dict[str, Any]]]:
110+
sorted_details: dict[str, dict[str, dict[str, Any]]] = {}
111+
for group_name in sorted(details.keys()):
112+
operations = details[group_name]
113+
sorted_details[group_name] = dict(sorted(operations.items(), key=lambda item: item[0]))
114+
return sorted_details
115+
116+
117+
def write_coverage_files(coverage: dict[str, dict[str, dict[str, dict[str, Any]]]], output_dir: Path) -> None:
    """Write one ``<provider>.json`` coverage file per provider into *output_dir*.

    Each payload carries the provider name, an empty ``operations`` list
    (kept for schema compatibility), and the deterministically sorted
    details mapping. The directory is created when missing.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    for provider in sorted(coverage):
        document = {
            "service": provider,
            "operations": [],
            "details": _sorted_details(coverage[provider]),
        }
        destination = output_dir / f"{provider}.json"
        with destination.open(mode="w", encoding="utf-8") as handle:
            json.dump(document, handle, indent=2)
            handle.write("\n")
129+
130+
131+
def main() -> None:
    """CLI entry point: parse arguments and generate the coverage JSON files."""
    parser = argparse.ArgumentParser(description="Generate Azure coverage JSON data.")
    parser.add_argument(
        "-i",
        "--implementation-details",
        required=True,
        help="Path to implementation details CSV.",
    )
    parser.add_argument(
        "-o",
        "--output-dir",
        required=True,
        help="Directory where generated JSON files will be written.",
    )
    options = parser.parse_args()

    data = _load_csv(Path(options.implementation_details))
    write_coverage_files(data, Path(options.output_dir))


if __name__ == "__main__":
    main()
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#!/bin/bash
# Download the most recent implementation-metrics artifact from a GitHub
# Actions workflow run and place it under the target folder.
set -euo pipefail

# input params
PARENT_FOLDER=${1:-target}
METRICS_ARTIFACTS_BRANCH=${2:-main}

# env vars
REPOSITORY_NAME=${REPOSITORY_NAME:-localstack-pro}
ARTIFACT_ID=${ARTIFACT_ID:-implemented_features_python-amd64}
WORKFLOW=${WORKFLOW:-"Az / Build, Test, Push"}
PREFIX_ARTIFACT=${PREFIX_ARTIFACT:-}
FILTER_SUCCESS=${FILTER_SUCCESS:-1}
LIMIT=${LIMIT:-20}

RESOURCE_FOLDER=${RESOURCE_FOLDER:-}
REPOSITORY_OWNER=${REPOSITORY_OWNER:-localstack}
TARGET_FOLDER="$PARENT_FOLDER/$RESOURCE_FOLDER"

# Stage the download in a scratch folder so a partial fetch never pollutes
# the target folder.
TMP_FOLDER="$PARENT_FOLDER/tmp_download"
mkdir -p "$TMP_FOLDER"

echo "Searching for artifact '$ARTIFACT_ID' in workflow '$WORKFLOW' on branch '$METRICS_ARTIFACTS_BRANCH' in repo '$REPOSITORY_OWNER/$REPOSITORY_NAME'."

# Build the jq selector that picks acceptable runs.
if [[ "$FILTER_SUCCESS" == "1" ]]; then
  echo "Filtering runs by conclusion=success"
  SELECTOR='.[] | select(.conclusion=="success")'
else
  echo "Filtering runs by completed status (success/failure)"
  SELECTOR='.[] | select(.status=="completed" and (.conclusion=="failure" or .conclusion=="success"))'
fi

# Collect candidate run ids, newest first.
mapfile -t RUN_IDS < <(
  gh run list \
    --limit "$LIMIT" \
    --branch "$METRICS_ARTIFACTS_BRANCH" \
    --repo "$REPOSITORY_OWNER/$REPOSITORY_NAME" \
    --workflow "$WORKFLOW" \
    --json databaseId,conclusion,status \
    --jq "$SELECTOR | .databaseId"
)

if [[ "${#RUN_IDS[@]}" -eq 0 ]]; then
  echo "No matching workflow runs found."
  exit 1
fi

# Walk the candidates until one of them yields the artifact; not every run
# uploads it, so individual download failures are tolerated.
for candidate in "${RUN_IDS[@]}"; do
  if [[ -z "$candidate" || "$candidate" == "null" ]]; then
    continue
  fi
  echo "Trying run id: $candidate"

  gh run download "$candidate" --repo "$REPOSITORY_OWNER/$REPOSITORY_NAME" -p "$ARTIFACT_ID" -D "$TMP_FOLDER" || true

  if [[ "$(ls -1 "$TMP_FOLDER" 2>/dev/null | wc -l)" -gt 0 ]]; then
    echo "Downloaded artifact successfully."
    break
  fi
done

if [[ "$(ls -1 "$TMP_FOLDER" 2>/dev/null | wc -l)" -eq 0 ]]; then
  echo "Failed to download artifact '$ARTIFACT_ID' from the checked workflow runs."
  exit 1
fi

echo "Moving artifact to $TARGET_FOLDER"
mkdir -p "$TARGET_FOLDER"
if [[ -z "${PREFIX_ARTIFACT}" ]]; then
  # No prefix requested: copy the artifact contents verbatim.
  cp -R "$TMP_FOLDER"/. "$TARGET_FOLDER"/
else
  # Prefix each CSV file name so multiple artifacts can coexist.
  while IFS= read -r csv_file; do
    mv -- "$csv_file" "$TARGET_FOLDER/$PREFIX_ARTIFACT-$(basename "$csv_file")"
  done < <(find "$TMP_FOLDER" -type f -name "*.csv")
fi

rm -rf "$TMP_FOLDER"
echo "Contents of $TARGET_FOLDER:"
ls -la "$TARGET_FOLDER"

0 commit comments

Comments
 (0)