diff --git a/README.fr.md b/README.fr.md index e5ab221..ab143ed 100644 --- a/README.fr.md +++ b/README.fr.md @@ -100,7 +100,7 @@ Gaspillage minimum estimé : ~$25 944/mois - Détecte le gaspillage IA/ML coûteux : SageMaker, AML, Vertex AI — ressources GPU signalées comme candidats à risque plus élevé (500–23 000 $/mois) - Fonctionne sur AWS, Azure et GCP en un seul outil - S'exécute entièrement dans votre environnement — aucun agent, pas de SaaS, aucun credential stocké -- 48 règles de détection sélectives et haut signal, conçues pour éviter les faux positifs en environnements IaC +- 49 règles de détection sélectives et haut signal, conçues pour éviter les faux positifs en environnements IaC - Prêt pour CI/CD — codes de sortie d'application + sorties JSON/CSV/markdown ### Ce que CleanCloud ne fait PAS @@ -434,13 +434,13 @@ Oui. CleanCloud n'a besoin d'accès réseau qu'aux endpoints API de votre cloud ## Ce que CleanCloud détecte -48 règles pour AWS, Azure et GCP — conservatrices, haut signal, conçues pour éviter les faux positifs en environnements IaC. +49 règles pour AWS, Azure et GCP — conservatrices, haut signal, conçues pour éviter les faux positifs en environnements IaC. 
**AWS :** - Compute : instances arrêtées 30+ jours (charges EBS continuent) - Stockage : volumes EBS non attachés (HIGH), anciens snapshots EBS, anciennes AMIs, anciens snapshots RDS 90+ jours - Réseau : Elastic IPs non attachées (HIGH), ENI détachées, NAT Gateways inactives, Load Balancers inactifs (HIGH) -- Plateforme : instances RDS inactives (HIGH) +- Plateforme : instances RDS inactives (HIGH), clusters Redshift inactifs (zéro connexion 14+ jours) - Observabilité : logs CloudWatch à rétention infinie - Gouvernance : ressources sans tags, security groups inutilisés - IA/ML *(opt-in : `--category ai`)* : Bedrock Provisioned Throughput (Model Units) inactifs avec zéro invocation depuis 7+ jours ; endpoints SageMaker sans trafic `InvokeEndpoint` observé depuis 14+ jours ; instances Notebook SageMaker avec timestamps de contrôle inactifs depuis 14+ jours ; Domaines SageMaker sans apps en cours d'exécution sur tous les profils et espaces depuis 30+ jours (coût de stockage EFS continu) ; Studio Apps SageMaker (`KernelGateway`/`JupyterLab`/`CodeEditor`) sans signal d'activité récent exploitable depuis 7+ jours ; training jobs SageMaker toujours `InProgress` au-delà du seuil de 24h ; processing jobs SageMaker toujours `InProgress` au-delà du seuil de 24h diff --git a/README.md b/README.md index 86bf498..6106fa2 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ Minimum estimated waste: ~$25,944/month - Catches expensive idle AI/ML waste: SageMaker, AML, Vertex AI — GPU-backed resources flagged as higher-risk review candidates ($500–$23K/month) - Works across AWS, Azure, and GCP in one tool - Runs entirely in your environment — no agents, no SaaS, no credentials stored -- 48 curated, high-signal detection rules designed to avoid false positives in IaC environments +- 49 curated, high-signal detection rules designed to avoid false positives in IaC environments - CI/CD-ready — enforcement exit codes + JSON/CSV/markdown output ### What CleanCloud does NOT do @@ 
-434,13 +434,13 @@ Yes. CleanCloud only needs network access to your cloud provider's API endpoints ## What CleanCloud Detects -48 rules across AWS, Azure, and GCP — conservative, high-signal, designed to avoid false positives in IaC environments. +49 rules across AWS, Azure, and GCP — conservative, high-signal, designed to avoid false positives in IaC environments. **AWS:** - Compute: stopped instances 30+ days (EBS charges continue) - Storage: unattached EBS volumes (HIGH), old EBS snapshots, old AMIs, old RDS snapshots 90+ days - Network: unattached Elastic IPs (HIGH), detached ENIs, idle NAT Gateways, idle load balancers (HIGH) -- Platform: idle RDS instances (HIGH) +- Platform: idle RDS instances (HIGH), idle Redshift clusters (zero connections 14+ days) - Observability: infinite retention CloudWatch Logs - Governance: untagged resources, unused security groups - AI/ML *(opt-in: `--category ai`)*: idle Bedrock Provisioned Throughput (Model Units) with zero invocations 7+ days; idle SageMaker endpoints with no observed `InvokeEndpoint` traffic 14+ days; SageMaker Notebook Instances with stale control-plane timestamps 14+ days; SageMaker Domains with no running apps across all user profiles and spaces 30+ days (continuous EFS storage cost); SageMaker Studio apps (`KernelGateway`/`JupyterLab`/`CodeEditor`) with no usable recent activity signal 7+ days; SageMaker training jobs still `InProgress` beyond the 24h threshold; SageMaker processing jobs still `InProgress` beyond the 24h threshold diff --git a/cleancloud/doctor/aws.py b/cleancloud/doctor/aws.py index 73fc70b..ce2c8c6 100644 --- a/cleancloud/doctor/aws.py +++ b/cleancloud/doctor/aws.py @@ -242,6 +242,7 @@ def run_aws_doctor(profile: Optional[str], region: Optional[str] = None) -> None info(" rds:DescribeDBSnapshots") info(" rds:DescribeDBSnapshotAttributes") info(" cloudtrail:LookupEvents") + info(" redshift:DescribeClusters") info(" elasticloadbalancing:DescribeLoadBalancers") info(" 
elasticloadbalancing:DescribeTargetGroups") info(" logs:DescribeLogGroups") @@ -520,6 +521,16 @@ def run_aws_doctor(profile: Optional[str], region: Optional[str] = None) -> None permissions_tested.append("rds:DescribeDBSnapshotAttributes") success("rds:DescribeDBSnapshotAttributes") + # Test Redshift permissions + try: + redshift = session.client("redshift", region_name=region) + redshift.describe_clusters(MaxRecords=20) + permissions_tested.append("redshift:DescribeClusters") + success("redshift:DescribeClusters") + except Exception as e: + permissions_failed.append(("redshift:DescribeClusters", str(e))) + warn(f"redshift:DescribeClusters - {e}") + # Test ELB permissions try: elbv2 = session.client("elbv2", region_name=region) diff --git a/cleancloud/providers/aws/rules/redshift_idle.py b/cleancloud/providers/aws/rules/redshift_idle.py new file mode 100644 index 0000000..d9090f6 --- /dev/null +++ b/cleancloud/providers/aws/rules/redshift_idle.py @@ -0,0 +1,393 @@ +""" +Rule: aws.redshift.cluster.idle + + (spec — docs/specs/aws/redshift_cluster_idle.md) + +Intent: + Detect provisioned Redshift clusters that are in 'available' status but have + had zero observed database connections over the configured idle window, so + they can be reviewed as candidates for pausing or deletion. + + This is a CleanCloud-derived idle heuristic based on Redshift cluster + metadata and CloudWatch activity metrics. It is a read-only + review-candidate rule — not a stop-safe or delete-safe rule. 
+ +Exclusions: + - cluster_identifier absent (malformed inventory item) + - cluster_status absent or not "available" + - cluster_availability_status is Unavailable, Maintenance, or Failed + - ClusterCreateTime absent, naive, or future beyond clock_skew_tolerance + - cluster younger than idle_days_threshold + - CloudWatch returned no datapoints for DatabaseConnections + - DatabaseConnections Sum > 0 + +Detection: + - ClusterStatus == "available" + - cluster_age_days >= idle_days_threshold + - DatabaseConnections Sum == 0 over the full evaluation window + +Key rules: + - DatabaseConnections Sum is the sole required activity metric. + - Missing CloudWatch datapoints → SKIP ITEM (not zero). + - CloudWatch API failure → FAIL RULE. + - ReadIOPS / WriteIOPS are best-effort secondary signals for confidence. + - estimated_monthly_cost_usd = None. + - Confidence: HIGH when connections + IOPS all zero; MEDIUM otherwise. + - Risk: HIGH when number_of_nodes >= 4; MEDIUM otherwise. + - Paused clusters are excluded (already cost-optimized). + - Redshift Serverless is out of scope (separate API). + - clock_skew_tolerance_seconds = 300. 
# --- Module-level constants ---

# Default length of the idle evaluation window, in days.
_DEFAULT_IDLE_DAYS_THRESHOLD = 14
# The only ClusterStatus value eligible for idle evaluation.
_ELIGIBLE_STATUS = "available"
# ClusterCreateTime may be this far in the future before the item is rejected.
_CLOCK_SKEW_TOLERANCE_SECONDS = 300
_CW_NAMESPACE = "AWS/Redshift"
_CW_DIM = "ClusterIdentifier"

# ClusterAvailabilityStatus values that exclude a cluster from evaluation.
_UNAVAILABLE_AVAILABILITY_STATUSES = frozenset({"Unavailable", "Maintenance", "Failed"})

_FINDING_TITLE = "Idle Redshift cluster review candidate"
_FINDING_REASON = (
    "Available Redshift cluster has had zero database connections "
    "over the configured idle window"
)

_SIGNALS_NOT_CHECKED = (
    "Business value or planned future use",
    "Whether pausing or deleting is safe",
    "Disaster recovery or compliance retention purpose",
    "Exact price impact or savings impact",
)

RULE_METADATA = {
    "id": "aws.redshift.cluster.idle",
    "category": "hygiene",
    "service": "redshift",
    "cost_impact": "high",
}


def _str(value: object) -> Optional[str]:
    """Return value as str only when it is a non-empty string; else None."""
    return value if isinstance(value, str) and value else None


def _choose_period(idle_days: int) -> int:
    """Return a single full-window period in seconds (``idle_days`` whole days)."""
    return idle_days * 86400


def _normalize_cluster(item: object, now_utc: datetime) -> Optional[dict]:
    """Normalize a raw DescribeClusters item to canonical fields.

    Args:
        item: One raw entry from a DescribeClusters ``Clusters`` list.
        now_utc: Timezone-aware reference time used for the age computation
            and the future-timestamp check.

    Returns:
        A dict of canonical fields, or None when a required field
        (``ClusterIdentifier``, ``ClusterStatus``, ``ClusterCreateTime``) is
        absent or invalid — the caller must skip the item. Optional fields
        that are absent or of the wrong type are normalized to None.
    """
    if not isinstance(item, dict):
        return None

    skew_tol = timedelta(seconds=_CLOCK_SKEW_TOLERANCE_SECONDS)

    # --- Identity (required) ---
    cluster_identifier = _str(item.get("ClusterIdentifier"))
    if cluster_identifier is None:
        return None

    # --- Status (required) ---
    cluster_status = _str(item.get("ClusterStatus"))
    if cluster_status is None:
        return None

    # --- ClusterAvailabilityStatus (optional) ---
    cluster_availability_status = _str(item.get("ClusterAvailabilityStatus"))

    # --- ClusterCreateTime (required, must be timezone-aware) ---
    raw_ct = item.get("ClusterCreateTime")
    if not isinstance(raw_ct, datetime):
        return None
    if raw_ct.tzinfo is None:
        # A naive timestamp cannot be compared safely against now_utc.
        return None
    cluster_create_time_utc = raw_ct.astimezone(timezone.utc)
    if cluster_create_time_utc > now_utc + skew_tol:
        # Creation time in the future beyond the skew tolerance: invalid.
        return None

    # Whole days since creation; clamped at 0 because a timestamp inside the
    # skew tolerance can still be slightly in the future.
    cluster_age_days = max(0, int((now_utc - cluster_create_time_utc).total_seconds() // 86400))

    # --- Optional fields ---
    node_type = _str(item.get("NodeType"))

    # bool is a subclass of int, so it is rejected explicitly.
    raw_num_nodes = item.get("NumberOfNodes")
    number_of_nodes = (
        raw_num_nodes
        if isinstance(raw_num_nodes, int)
        and not isinstance(raw_num_nodes, bool)
        and raw_num_nodes > 0
        else None
    )

    cluster_namespace_arn = _str(item.get("ClusterNamespaceArn"))

    endpoint = item.get("Endpoint")
    if isinstance(endpoint, dict):
        cluster_endpoint_address = _str(endpoint.get("Address"))
        raw_port = endpoint.get("Port")
        cluster_endpoint_port = (
            raw_port if isinstance(raw_port, int) and not isinstance(raw_port, bool) else None
        )
    else:
        cluster_endpoint_address = None
        cluster_endpoint_port = None

    raw_storage = item.get("TotalStorageCapacityInMegaBytes")
    total_storage_capacity_mb = (
        raw_storage
        if isinstance(raw_storage, (int, float)) and not isinstance(raw_storage, bool)
        else None
    )

    # Prefer the namespace ARN as the resource id; fall back to the identifier.
    resource_id = cluster_namespace_arn or cluster_identifier

    return {
        "cluster_identifier": cluster_identifier,
        "cluster_status": cluster_status,
        "cluster_availability_status": cluster_availability_status,
        "cluster_create_time_utc": cluster_create_time_utc,
        "cluster_age_days": cluster_age_days,
        "node_type": node_type,
        "number_of_nodes": number_of_nodes,
        "cluster_namespace_arn": cluster_namespace_arn,
        "cluster_endpoint_address": cluster_endpoint_address,
        "cluster_endpoint_port": cluster_endpoint_port,
        "total_storage_capacity_mb": total_storage_capacity_mb,
        "resource_id": resource_id,
    }


def _get_cw_sum(
    cloudwatch,
    metric_name: str,
    cluster_identifier: str,
    start_time: datetime,
    end_time: datetime,
    period: int,
) -> Optional[float]:
    """Fetch a CloudWatch metric Sum over the observation window.

    Args:
        cloudwatch: Client object exposing ``get_metric_statistics``.
        metric_name: Metric to query in the ``AWS/Redshift`` namespace.
        cluster_identifier: Value for the ``ClusterIdentifier`` dimension.
        start_time: Start of the observation window.
        end_time: End of the observation window.
        period: Aggregation period in seconds.

    Returns:
        None when no datapoint carries a usable ``Sum`` (insufficient
        evidence → caller must SKIP ITEM); otherwise the total of the
        returned ``Sum`` values.

    Raises:
        PermissionError: when the API call fails with a permission error.
        Any other client error propagates unchanged (caller → FAIL RULE).
    """
    try:
        resp = cloudwatch.get_metric_statistics(
            Namespace=_CW_NAMESPACE,
            MetricName=metric_name,
            Dimensions=[{"Name": _CW_DIM, "Value": cluster_identifier}],
            StartTime=start_time,
            EndTime=end_time,
            Period=period,
            Statistics=["Sum"],
        )
    except ClientError as exc:
        if is_permission_error(exc):
            raise PermissionError(
                "Missing required IAM permission: cloudwatch:GetMetricStatistics"
            ) from exc
        raise

    # Only datapoints that actually carry a numeric Sum count as evidence.
    # Defaulting a missing Sum to 0.0 would turn "no data" into "zero
    # activity" and could flag a cluster as idle without any measurement.
    sums = []
    for dp in resp.get("Datapoints") or []:
        value = dp.get("Sum") if isinstance(dp, dict) else None
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            sums.append(value)

    if not sums:
        return None

    return sum(sums)
def _get_secondary_sum(
    cloudwatch,
    metric_name: str,
    cluster_identifier: str,
    start_time: datetime,
    end_time: datetime,
    period: int,
) -> Optional[float]:
    """Best-effort fetch of a secondary CloudWatch metric. Returns None on any failure."""
    try:
        return _get_cw_sum(
            cloudwatch, metric_name, cluster_identifier, start_time, end_time, period
        )
    except Exception:
        # Deliberately broad: secondary metrics only feed the confidence
        # level, so a failure here must not fail the rule or skip the item.
        return None


def find_idle_redshift_clusters(
    session: boto3.Session,
    region: str,
    idle_days_threshold: int = _DEFAULT_IDLE_DAYS_THRESHOLD,
) -> List[Finding]:
    """Find available Redshift clusters with zero database connections.

    A cluster is reported when its status is ``available``, its availability
    status is not Unavailable/Maintenance/Failed, it is at least
    ``idle_days_threshold`` days old, and the CloudWatch
    ``DatabaseConnections`` Sum over the window is 0 with data present.

    Args:
        session: boto3 session used to create the Redshift and CloudWatch clients.
        region: AWS region to evaluate.
        idle_days_threshold: Length of the evaluation window in days; must be >= 1.

    Returns:
        One Finding per idle cluster (possibly empty).

    Raises:
        ValueError: when ``idle_days_threshold`` is less than 1.
        PermissionError: when DescribeClusters or the primary
            GetMetricStatistics call is denied.
        Other client errors from those calls propagate unchanged.
    """
    # Fail fast before any API call: 0 would produce Period=0, and a negative
    # value would put the window start in the future and disable the age filter.
    if idle_days_threshold < 1:
        raise ValueError(f"idle_days_threshold must be >= 1, got {idle_days_threshold!r}")

    redshift = session.client("redshift", region_name=region)
    cloudwatch = session.client("cloudwatch", region_name=region)

    # Spec 8: paginate DescribeClusters. Pages are materialized inside the
    # try so that errors raised while paginating reach the handler below.
    try:
        paginator = redshift.get_paginator("describe_clusters")
        pages = list(paginator.paginate())
    except ClientError as exc:
        if is_permission_error(exc):
            raise PermissionError(
                "Missing required IAM permission: redshift:DescribeClusters"
            ) from exc
        raise

    now = datetime.now(timezone.utc)
    window_start = now - timedelta(seconds=idle_days_threshold * 86400)
    period = _choose_period(idle_days_threshold)
    findings: List[Finding] = []

    for page in pages:
        for raw_item in page.get("Clusters", []):
            # --- Step 1: Normalize ---
            cluster = _normalize_cluster(raw_item, now)
            if cluster is None:
                continue

            # --- Step 2: Exclusion rules ---

            # Status must be "available"
            if cluster["cluster_status"] != _ELIGIBLE_STATUS:
                continue

            # Exclude transient availability states (None is never in the set)
            if cluster["cluster_availability_status"] in _UNAVAILABLE_AVAILABILITY_STATUSES:
                continue

            # Too young to evaluate
            if cluster["cluster_age_days"] < idle_days_threshold:
                continue

            # --- Step 3: Primary CloudWatch signal (FAIL RULE on error) ---
            database_connections_sum = _get_cw_sum(
                cloudwatch,
                "DatabaseConnections",
                cluster["cluster_identifier"],
                window_start,
                now,
                period,
            )

            # No datapoints → insufficient evidence → SKIP ITEM
            if database_connections_sum is None:
                continue

            # Any connections → not idle
            if database_connections_sum > 0:
                continue

            # --- Step 4: Secondary signals (best-effort) ---
            read_iops_sum = _get_secondary_sum(
                cloudwatch, "ReadIOPS", cluster["cluster_identifier"], window_start, now, period
            )
            write_iops_sum = _get_secondary_sum(
                cloudwatch, "WriteIOPS", cluster["cluster_identifier"], window_start, now, period
            )

            # --- Step 5: Confidence and Risk ---
            # HIGH confidence needs both secondary metrics present and zero.
            all_secondary_present = read_iops_sum is not None and write_iops_sum is not None
            all_secondary_zero = (
                all_secondary_present and read_iops_sum == 0 and write_iops_sum == 0
            )
            confidence = ConfidenceLevel.HIGH if all_secondary_zero else ConfidenceLevel.MEDIUM

            risk = (
                RiskLevel.HIGH
                if cluster["number_of_nodes"] is not None and cluster["number_of_nodes"] >= 4
                else RiskLevel.MEDIUM
            )

            # --- Step 6: Emit ---
            signals_used = [
                f"Cluster status is '{_ELIGIBLE_STATUS}'",
                f"Cluster age is {cluster['cluster_age_days']} days, meeting the "
                f"{idle_days_threshold}-day threshold",
                f"DatabaseConnections Sum was 0 over the {idle_days_threshold}-day "
                "evaluation window",
            ]

            findings.append(
                Finding(
                    provider="aws",
                    rule_id="aws.redshift.cluster.idle",
                    resource_type="aws.redshift.cluster",
                    resource_id=cluster["resource_id"],
                    region=region,
                    estimated_monthly_cost_usd=None,
                    title=_FINDING_TITLE,
                    summary=(
                        f"Redshift cluster '{cluster['cluster_identifier']}' has had zero "
                        f"database connections for {idle_days_threshold} days"
                    ),
                    reason=_FINDING_REASON,
                    risk=risk,
                    confidence=confidence,
                    detected_at=now,
                    evidence=Evidence(
                        signals_used=signals_used,
                        signals_not_checked=list(_SIGNALS_NOT_CHECKED),
                        time_window=f"{idle_days_threshold} days",
                    ),
                    details={
                        # Required fields
                        "evaluation_path": "idle-redshift-cluster-review-candidate",
                        "cluster_identifier": cluster["cluster_identifier"],
                        "resource_id": cluster["resource_id"],
                        "cluster_status": _ELIGIBLE_STATUS,
                        "cluster_create_time": cluster["cluster_create_time_utc"].isoformat(),
                        "cluster_age_days": cluster["cluster_age_days"],
                        "node_type": cluster["node_type"],
                        "number_of_nodes": cluster["number_of_nodes"],
                        "idle_days_threshold": idle_days_threshold,
                        "evaluation_window_start": window_start.isoformat(),
                        "evaluation_window_end": now.isoformat(),
                        "database_connections_sum": database_connections_sum,
                        "is_idle": True,
                        # Optional context
                        "cluster_availability_status": cluster["cluster_availability_status"],
                        "cluster_endpoint_address": cluster["cluster_endpoint_address"],
                        "cluster_endpoint_port": cluster["cluster_endpoint_port"],
                        "read_iops_sum": read_iops_sum,
                        "write_iops_sum": write_iops_sum,
                        "total_storage_capacity_mb": cluster["total_storage_capacity_mb"],
                    },
                )
            )

    return findings
Gateways, or ELBs)" + " (Regions with EBS volumes, snapshots, logs, Elastic IPs, ENIs, RDS, Redshift, NAT Gateways, or ELBs)" ) else: click.echo("No active regions detected") @@ -286,6 +288,15 @@ def _region_has_cleancloud_resources( if lbs.get("LoadBalancers"): return True, None + # 9. Check Redshift clusters + try: + redshift = session.client("redshift", region_name=region, config=BOTO_CONFIG) + clusters = redshift.describe_clusters(MaxRecords=20) + if clusters.get("Clusters"): + return True, None + except Exception: + pass # no Redshift perms or service not available — continue + # AI resource probes — only when running AI/ML rules. # Wrapped individually so a missing permission for one service doesn't # prevent the other from being checked. diff --git a/deploy/cloudformation/cleancloud-role.yaml b/deploy/cloudformation/cleancloud-role.yaml index 12af5cf..2f3e7df 100644 --- a/deploy/cloudformation/cleancloud-role.yaml +++ b/deploy/cloudformation/cleancloud-role.yaml @@ -86,6 +86,11 @@ Resources: - rds:DescribeDBSnapshots - rds:DescribeDBSnapshotAttributes Resource: "*" + - Sid: RedshiftReadOnly + Effect: Allow + Action: + - redshift:DescribeClusters + Resource: "*" - Sid: CloudTrailReadOnly Effect: Allow Action: diff --git a/deploy/terraform/aws/main.tf b/deploy/terraform/aws/main.tf index c1d2a1a..0ec9f38 100644 --- a/deploy/terraform/aws/main.tf +++ b/deploy/terraform/aws/main.tf @@ -135,6 +135,14 @@ resource "aws_iam_role_policy" "cleancloud" { ] Resource = "*" }, + { + Sid = "RedshiftReadOnly" + Effect = "Allow" + Action = [ + "redshift:DescribeClusters", + ] + Resource = "*" + }, { Sid = "CloudTrailReadOnly" Effect = "Allow" diff --git a/docs/aws.md b/docs/aws.md index 235dd5d..2f456d5 100644 --- a/docs/aws.md +++ b/docs/aws.md @@ -358,6 +358,14 @@ Attach this policy to your IAM role or user for the default hygiene scan path (c ], "Resource": "*" }, + { + "Sid": "RedshiftReadOnly", + "Effect": "Allow", + "Action": [ + "redshift:DescribeClusters" + ], + 
"Resource": "*" + }, { "Sid": "CloudWatchReadOnly", "Effect": "Allow", diff --git a/docs/rules/aws.md b/docs/rules/aws.md index 648d4e0..96be0ae 100644 --- a/docs/rules/aws.md +++ b/docs/rules/aws.md @@ -16,6 +16,7 @@ | `aws.ec2.nat_gateway.idle` | Network | NAT Gateways with zero traffic 14+ days | | `aws.elbv2.alb.idle` / `aws.elbv2.nlb.idle` / `aws.elb.clb.idle` | Network | Load balancers with zero traffic 14+ days | | `aws.rds.instance.idle` | Platform | RDS instances with zero connections 14+ days | +| `aws.redshift.cluster.idle` | Platform | Redshift clusters with zero connections 14+ days | | `aws.rds.snapshot.old` | Storage | Manual RDS snapshots older than 90 days | | `aws.cloudwatch.logs.infinite_retention` | Observability | Log groups with no retention policy | | `aws.resource.untagged` | Governance | EC2/S3/CloudWatch resources with zero tags | @@ -203,6 +204,19 @@ **Spec:** [specs/aws/rds_idle.md](../specs/aws/rds_idle.md) +#### `aws.redshift.cluster.idle` +**Detects:** Provisioned Redshift clusters with zero `DatabaseConnections` for `idle_days_threshold` + +**Confidence / Risk:** HIGH (zero connections + zero IOPS); MEDIUM (zero connections only) / HIGH (4+ nodes); MEDIUM otherwise + +**Permissions:** `redshift:DescribeClusters`, `cloudwatch:GetMetricStatistics` + +**Params:** `idle_days_threshold` (default: 14) + +**Exclusions:** paused clusters, non-`available` status, `ClusterAvailabilityStatus` of Unavailable/Maintenance/Failed, clusters younger than threshold, Redshift Serverless workgroups + +**Spec:** [specs/aws/redshift_cluster_idle.md](../specs/aws/redshift_cluster_idle.md) + --- ## Observability diff --git a/docs/specs/aws/redshift_cluster_idle.md b/docs/specs/aws/redshift_cluster_idle.md new file mode 100644 index 0000000..c764d2a --- /dev/null +++ b/docs/specs/aws/redshift_cluster_idle.md @@ -0,0 +1,336 @@ +# aws.redshift.cluster.idle — Canonical Rule Specification + +## 1. 
Intent + +Detect provisioned Redshift clusters that are in `available` status but have had zero observed +database connections over the configured idle window, so they can be reviewed as candidates for +pausing or deletion. + +This is a **CleanCloud-derived idle heuristic** based on Redshift cluster metadata and CloudWatch +activity metrics. It is a **read-only review-candidate rule** — not a stop-safe or delete-safe +rule. + +--- + +## 2. AWS API Grounding + +Based on official Redshift provisioned cluster API and CloudWatch documentation. + +### Key facts + +1. `DescribeClusters` is the canonical inventory API for provisioned Redshift clusters and supports + pagination via `Marker` (max 100 records per page). +2. `DescribeClusters` returns `Cluster` objects including `ClusterIdentifier`, + `ClusterNamespaceArn`, `ClusterStatus`, `ClusterAvailabilityStatus`, `ClusterCreateTime`, + `NodeType`, `NumberOfNodes`, `Endpoint`, `Tags`, and `TotalStorageCapacityInMegaBytes`. +3. `ClusterStatus` has 20 valid values including `available`, `paused`, `creating`, `deleting`, + `modifying`, `resizing`, `rebooting`, `renaming`, and various `incompatible-*` states. The + value `paused` indicates compute billing is suspended (storage only). +4. `ClusterAvailabilityStatus` has 5 valid values: `Available`, `Unavailable`, `Maintenance`, + `Modifying`, `Failed`. +5. There is **no** `LastQueryTime`, `LastActivityTime`, or equivalent field in the API response. + Idleness must be inferred from CloudWatch metrics. +6. Paused clusters do **not** publish hardware CloudWatch metrics. `DatabaseConnections`, + `CPUUtilization`, `ReadIOPS`, etc. return no datapoints while a cluster is paused. +7. Paused clusters pay storage only — they are already cost-optimized from a compute perspective + and should be excluded from this rule. +8. Redshift Serverless uses a separate boto3 client (`redshift-serverless`) and separate APIs + (`list_workgroups`, `get_workgroup`). 
`DescribeClusters` does **not** return serverless + workgroups. Serverless is out of scope for this rule. +9. CloudWatch namespace `AWS/Redshift` publishes metrics automatically at 1-minute intervals. + Key idle-detection metrics: + - `DatabaseConnections` — number of database connections to the cluster. Dimension: + `ClusterIdentifier` only. 1-minute interval. + - `ReadIOPS` / `WriteIOPS` — average disk read/write operations per second. Dimension: + `ClusterIdentifier` or `ClusterIdentifier` + `NodeID`. 1-minute interval. +10. `QueriesCompletedPerSecond` is **not supported on single-node clusters** and reports at + 5-minute intervals. It must not be used as the primary idle signal. +11. Fixed monthly USD cost estimates are not canonical from the fetched AWS docs. + +### Implications + +- Inventory must be built by fully paginating `DescribeClusters`. +- Idleness is determined by CloudWatch `DatabaseConnections` (Sum = 0) over the idle window. + `ReadIOPS` and `WriteIOPS` (Sum ≈ 0) provide secondary confirmation. +- `ClusterStatus == "paused"` clusters must be excluded — they already pay storage only. +- Only `ClusterStatus == "available"` clusters are eligible for idle evaluation. +- `QueriesCompletedPerSecond` must not be a required signal due to the single-node limitation. +- `estimated_monthly_cost_usd = null`. + +--- + +## 3. Scope and Terminology + +- **Cluster** — an item returned by `DescribeClusters`. +- **Eligible status** — `ClusterStatus == "available"`. +- `idle_days_threshold` — operator-configurable integer >= 1, default 14. +- `idle_window_seconds` — `idle_days_threshold × 86400`. +- **evaluation_window_start_utc** — `now_utc - idle_window_seconds`. +- **evaluation_window_end_utc** — `now_utc`. +- **idle** — `DatabaseConnections` Sum = 0 over a single full-window aggregate period, with at + least one datapoint returned (missing data = inconclusive, not idle). 
+ +### Explicit scope boundary + +This rule applies only to provisioned Redshift clusters whose `ClusterStatus` is `available`. + +Out of scope: + +- `paused` clusters (already cost-optimized, storage only) +- `creating`, `deleting`, `modifying`, `resizing`, `rebooting`, `renaming`, `final-snapshot` +- All `incompatible-*` and `hardware-failure` statuses +- `storage-full`, `rotating-keys`, `updating-hsm`, `cancelling-resize` +- `available, prep-for-resize` and `available, resize-cleanup` (transient states) +- Redshift Serverless workgroups (separate service, separate APIs) +- exact price estimation, accrued USD estimation, or savings estimation + +--- + +## 4. Canonical Rule Statement + +A provisioned Redshift cluster is flagged as idle only when **all** of the following are true: + +- stable cluster identity exists (`ClusterIdentifier`) +- `ClusterStatus == "available"` +- `ClusterCreateTime` is valid and the cluster is at least `idle_days_threshold` days old (`cluster_age_days >= idle_days_threshold`) +- CloudWatch `DatabaseConnections` Sum = 0 over a single full-window aggregate period, with at + least one datapoint returned (no datapoints = inconclusive, skip) + +No additional predicate may be required for baseline eligibility, including node type, node count, +or static cost heuristics. + +--- + +## 5. Normalization Contract + +All rule logic must operate on normalized fields only. 
+ +### 5.1 Describe-Level Fields + +| Canonical field | Source field | Absent / invalid | +|---|---|---| +| `cluster_identifier` | `ClusterIdentifier` | skip item | +| `cluster_status` | `ClusterStatus` | skip item | +| `cluster_availability_status` | `ClusterAvailabilityStatus` | null | +| `cluster_create_time_utc` | `ClusterCreateTime` (tz-aware UTC) | skip item | +| `node_type` | `NodeType` | null | +| `number_of_nodes` | `NumberOfNodes` | null | +| `cluster_namespace_arn` | `ClusterNamespaceArn` | null | +| `cluster_endpoint_address` | `Endpoint.Address` | null | +| `cluster_endpoint_port` | `Endpoint.Port` | null | +| `total_storage_capacity_mb` | `TotalStorageCapacityInMegaBytes` | null | + +### 5.2 CloudWatch-Derived Fields + +| Canonical field | Derivation | +|---|---| +| `database_connections_sum` | Sum of `DatabaseConnections` over evaluation window | +| `read_iops_sum` | Sum of `ReadIOPS` over evaluation window | +| `write_iops_sum` | Sum of `WriteIOPS` over evaluation window | +| `is_idle` | `true` when `database_connections_sum == 0` | + +### 5.3 Derived Fields + +| Canonical field | Derivation | +|---|---| +| `cluster_age_days` | `max(0, floor((now_utc - cluster_create_time_utc).total_seconds() / 86400))` | +| `resource_id` | `cluster_namespace_arn` when present; else `cluster_identifier` | + +Normalization requirements: + +- String-valued fields: normalize only from non-empty strings. +- Timestamp fields: must be timezone-aware UTC before use; naive timestamps must skip the item. +- `ClusterCreateTime` future beyond `clock_skew_tolerance_seconds` (300) must skip the item. +- Clusters younger than `idle_days_threshold` must be skipped — insufficient evaluation history. + +--- + +## 6. Idle Signal Contract + +This rule evaluates **connection and I/O activity**, not query correctness or business value. 
+ +### 6.1 Primary idle signal + +- Query CloudWatch `DatabaseConnections` with dimension `ClusterIdentifier` over the evaluation + window using `Sum` statistic with `Period = idle_window_seconds` (single full-window aggregate). +- A single-period aggregate avoids the gap problem: if CloudWatch returns no datapoints, the + cluster may have been paused or otherwise unavailable during the window — treat missing data as + **inconclusive** and **SKIP ITEM** (do not emit). +- If the returned Sum is 0, the cluster is idle — no client has connected. +- If CloudWatch returns no datapoints at all, **SKIP ITEM** — insufficient evidence. + +### 6.2 Secondary confirmation signals (best-effort) + +- `ReadIOPS` Sum ≈ 0 and `WriteIOPS` Sum ≈ 0 over the same window confirms no disk activity. +- These are **optional context only** — the primary idle decision is based on + `DatabaseConnections` alone and must not change based on secondary signal availability. +- If secondary metric retrieval fails or returns no datapoints: keep the key in details, set the + corresponding field to `null`, and **degrade confidence to MEDIUM**. Do not change the primary + idle decision or skip the item. + +### 6.3 Explicit blind spots + +This rule does **not** prove: + +- that the cluster has no business value or planned future use +- that pausing or deleting the cluster is safe +- that the cluster is not used for disaster recovery or compliance retention +- exact price impact or savings impact + +--- + +## 7. Pricing / Cost Boundary + +- `estimated_monthly_cost_usd = null` +- Do not hardcode instance-price tables, accrued USD estimates, or regional billing assumptions. +- `NodeType` and `NumberOfNodes` are emitted as context for the reviewer to assess cost impact. + +--- + +## 8. Deterministic Evaluation Order + +1. Retrieve and fully paginate `DescribeClusters`. +2. Normalize each cluster. +3. 
For each normalized cluster: + - identity absent → **SKIP ITEM** + - `cluster_status` absent → **SKIP ITEM** + - `cluster_status != "available"` → **SKIP ITEM** + - `cluster_availability_status` is `Unavailable`, `Maintenance`, or `Failed` (when + present) → **SKIP ITEM** + - `cluster_create_time_utc` absent / naive / future beyond skew tolerance → **SKIP ITEM** + - `cluster_age_days < idle_days_threshold` → **SKIP ITEM** +4. Query CloudWatch `DatabaseConnections` for the cluster over the evaluation window using a + single full-window aggregate period. +5. CloudWatch permission or request failure → **FAIL RULE**. +6. CloudWatch returned no datapoints → **SKIP ITEM** (insufficient evidence). +7. `DatabaseConnections` Sum > 0 → **SKIP ITEM** (not idle). +8. Otherwise → **EMIT**. + +No raw AWS field access after normalization. + +--- + +## 9. Exclusion Rules + +1. identity absent (`cluster_identifier`) → malformed inventory item +2. status absent → missing primary state +3. status not `available` → out of scope (includes `paused`, `creating`, `deleting`, etc.) +4. `cluster_availability_status` is `Unavailable`, `Maintenance`, or `Failed` (when present) → transient state +5. `ClusterCreateTime` absent / naive / future beyond `clock_skew_tolerance_seconds` → missing or invalid timestamp +6. cluster younger than `idle_days_threshold` → insufficient evaluation history +7. CloudWatch returned no datapoints → insufficient evidence +8. `DatabaseConnections` Sum > 0 → not idle + +--- + +## 10. 
Failure Model + +**Rule-level failures (FAIL RULE):** + +- `DescribeClusters` request or pagination failure +- `DescribeClusters` permission failure +- CloudWatch `GetMetricStatistics` permission failure +- CloudWatch `GetMetricStatistics` request failure for the primary `DatabaseConnections` metric + (any non-permission error is still a rule failure — this is a required signal, not optional + context) + +**Item-level skips (SKIP ITEM):** + +- malformed identity or missing/invalid required fields (e.g. `ClusterCreateTime` absent, naive, + or future beyond skew tolerance) +- non-`available` status +- `cluster_availability_status` is `Unavailable`, `Maintenance`, or `Failed` (when present) +- cluster too young for evaluation +- non-zero database connections +- CloudWatch returned no datapoints for `DatabaseConnections` (insufficient evidence) + +--- + +## 11. Evidence / Details Contract + +### Required details fields + +``` +evaluation_path = "idle-redshift-cluster-review-candidate" +cluster_identifier +resource_id +cluster_status = "available" +cluster_create_time +cluster_age_days +node_type +number_of_nodes +idle_days_threshold +evaluation_window_start +evaluation_window_end +database_connections_sum +is_idle = true +``` + +### Optional context fields + +``` +cluster_availability_status +cluster_endpoint_address +cluster_endpoint_port +read_iops_sum +write_iops_sum +total_storage_capacity_mb +``` + +### Required evidence wording + +**Signals used** must state: + +- cluster status is `available` +- `DatabaseConnections` Sum was 0 over the evaluation window +- the idle window duration + +**Signals not checked** must state major blind spots: + +- business value or planned future use +- whether pausing or deleting is safe +- disaster recovery or compliance retention purpose +- exact price impact or savings impact + +--- + +## 12. 
Confidence Model + +| Condition | Confidence | +|---|---| +| `database_connections_sum == 0` AND `read_iops_sum == 0` AND `write_iops_sum == 0` (all present) | `HIGH` | +| `database_connections_sum == 0` AND either secondary signal is `null` (missing/failed) | `MEDIUM` | +| `database_connections_sum == 0` only (secondary signals not zero) | `MEDIUM` | + +No LOW finding should be emitted. + +--- + +## 13. Risk Model + +| Condition | Risk | +|---|---| +| `number_of_nodes >= 4` | `HIGH` | +| all other emitted findings | `MEDIUM` | + +Risk is about likely waste severity based on cluster size, not proof of safe action. Node count +is a stable shape signal that does not require maintaining a list of instance types. + +--- + +## 14. Title and Reason Contract + +| Condition | Title | Reason | +|---|---|---| +| Idle Redshift cluster finding | `"Idle Redshift cluster review candidate"` | `"Available Redshift cluster has had zero database connections over the configured idle window"` | + +--- + +## 15. 
Non-Goals + +This rule does **not**: + +- infer exact billing from static node-price tables +- cover Redshift Serverless workgroups (separate service, separate APIs) +- cover paused clusters (already cost-optimized) +- determine whether a cluster should be paused or deleted automatically +- use `QueriesCompletedPerSecond` as a required signal (not supported on single-node clusters) diff --git a/security/aws/hygiene-readonly.json b/security/aws/hygiene-readonly.json index a1c7107..f953769 100644 --- a/security/aws/hygiene-readonly.json +++ b/security/aws/hygiene-readonly.json @@ -38,6 +38,14 @@ ], "Resource": "*" }, + { + "Sid": "RedshiftReadOnly", + "Effect": "Allow", + "Action": [ + "redshift:DescribeClusters" + ], + "Resource": "*" + }, { "Sid": "CloudTrailReadOnly", "Effect": "Allow", diff --git a/tests/cleancloud/providers/aws/test_aws_redshift_idle.py b/tests/cleancloud/providers/aws/test_aws_redshift_idle.py new file mode 100644 index 0000000..575e5fa --- /dev/null +++ b/tests/cleancloud/providers/aws/test_aws_redshift_idle.py @@ -0,0 +1,501 @@ +""" +Tests for aws.redshift.cluster.idle rule. 
+ +Test class overview: + TestMustEmit — canonical detection path + TestMustSkip — all exclusion rules + TestMustFailRule — required API failure behaviour + TestNormalization — _normalize_cluster field extraction + TestConfidenceModel — HIGH with corroboration, MEDIUM without + TestRiskModel — HIGH for 4+ nodes, MEDIUM otherwise + TestEvidenceContract — signals_used, signals_not_checked, evaluation_path + TestRuleMetadata — rule_id, category, service, cost_impact +""" + +from datetime import datetime, timedelta, timezone +from unittest.mock import MagicMock + +import pytest +from botocore.exceptions import BotoCoreError, ClientError + +from cleancloud.providers.aws.rules.redshift_idle import ( + _normalize_cluster, + find_idle_redshift_clusters, +) + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + +_REGION = "us-east-1" + + +def _now() -> datetime: + return datetime.now(timezone.utc) + + +def _old() -> datetime: + """30 days ago — always older than the default 14-day threshold.""" + return datetime.now(timezone.utc) - timedelta(days=30) + + +def _young() -> datetime: + """5 days ago — always younger than the default 14-day threshold.""" + return datetime.now(timezone.utc) - timedelta(days=5) + + +def _client_error(code: str) -> ClientError: + return ClientError({"Error": {"Code": code, "Message": code}}, "op") + + +def _make_cluster(**overrides) -> dict: + """Return a minimal valid DescribeClusters item.""" + base = { + "ClusterIdentifier": "test-cluster", + "ClusterStatus": "available", + "ClusterAvailabilityStatus": "Available", + "ClusterCreateTime": _old(), + "NodeType": "dc2.large", + "NumberOfNodes": 2, + "ClusterNamespaceArn": "arn:aws:redshift:us-east-1:123456789012:namespace:test", + "Endpoint": {"Address": "test.us-east-1.redshift.amazonaws.com", "Port": 5439}, + "TotalStorageCapacityInMegaBytes": 640000, + } + 
base.update(overrides) + return base + + +def _zero_connections_response() -> dict: + return {"Datapoints": [{"Sum": 0.0}]} + + +def _nonzero_connections_response(val: float = 5.0) -> dict: + return {"Datapoints": [{"Sum": val}]} + + +def _no_datapoints_response() -> dict: + return {"Datapoints": []} + + +def _setup( + mock_boto3_session, + clusters: list, + cw_responses=None, + cw_side_effect=None, +): + """Wire up Redshift paginator and CloudWatch mock.""" + redshift = MagicMock() + paginator = MagicMock() + paginator.paginate.return_value = [{"Clusters": clusters}] + redshift.get_paginator.return_value = paginator + + cloudwatch = MagicMock() + if cw_side_effect is not None: + cloudwatch.get_metric_statistics.side_effect = cw_side_effect + elif cw_responses is not None: + cloudwatch.get_metric_statistics.side_effect = cw_responses + else: + # Default: zero connections, zero IOPS + cloudwatch.get_metric_statistics.return_value = _zero_connections_response() + + def client_side_effect(service, **kwargs): + if service == "redshift": + return redshift + if service == "cloudwatch": + return cloudwatch + raise ValueError(f"Unexpected service: {service}") + + mock_boto3_session.client.side_effect = client_side_effect + return redshift, cloudwatch + + +# --------------------------------------------------------------------------- +# TestMustEmit +# --------------------------------------------------------------------------- + + +class TestMustEmit: + def test_canonical_idle_cluster_emits(self, mock_boto3_session): + _setup(mock_boto3_session, [_make_cluster()]) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert len(findings) == 1 + f = findings[0] + assert f.provider == "aws" + assert f.rule_id == "aws.redshift.cluster.idle" + assert f.resource_type == "aws.redshift.cluster" + assert f.region == _REGION + + def test_resource_id_uses_namespace_arn(self, mock_boto3_session): + _setup(mock_boto3_session, [_make_cluster()]) + findings = 
find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert findings[0].resource_id == ("arn:aws:redshift:us-east-1:123456789012:namespace:test") + + def test_resource_id_falls_back_to_identifier(self, mock_boto3_session): + _setup( + mock_boto3_session, + [_make_cluster(ClusterNamespaceArn=None)], + ) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert findings[0].resource_id == "test-cluster" + + def test_details_required_fields_present(self, mock_boto3_session): + _setup(mock_boto3_session, [_make_cluster()]) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + d = findings[0].details + for key in ( + "evaluation_path", + "cluster_identifier", + "resource_id", + "cluster_status", + "cluster_create_time", + "cluster_age_days", + "node_type", + "number_of_nodes", + "idle_days_threshold", + "evaluation_window_start", + "evaluation_window_end", + "database_connections_sum", + "is_idle", + ): + assert key in d, f"Missing required detail key: {key}" + + def test_details_optional_fields_present(self, mock_boto3_session): + _setup(mock_boto3_session, [_make_cluster()]) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + d = findings[0].details + for key in ( + "cluster_availability_status", + "cluster_endpoint_address", + "cluster_endpoint_port", + "read_iops_sum", + "write_iops_sum", + "total_storage_capacity_mb", + ): + assert key in d, f"Missing optional detail key: {key}" + + +# --------------------------------------------------------------------------- +# TestMustSkip +# --------------------------------------------------------------------------- + + +class TestMustSkip: + def test_skip_paused_cluster(self, mock_boto3_session): + _setup(mock_boto3_session, [_make_cluster(ClusterStatus="paused")]) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert len(findings) == 0 + + def test_skip_creating_cluster(self, mock_boto3_session): + _setup(mock_boto3_session, 
[_make_cluster(ClusterStatus="creating")]) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert len(findings) == 0 + + def test_skip_deleting_cluster(self, mock_boto3_session): + _setup(mock_boto3_session, [_make_cluster(ClusterStatus="deleting")]) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert len(findings) == 0 + + def test_skip_modifying_cluster(self, mock_boto3_session): + _setup(mock_boto3_session, [_make_cluster(ClusterStatus="modifying")]) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert len(findings) == 0 + + def test_skip_unavailable_availability_status(self, mock_boto3_session): + _setup( + mock_boto3_session, + [_make_cluster(ClusterAvailabilityStatus="Unavailable")], + ) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert len(findings) == 0 + + def test_skip_maintenance_availability_status(self, mock_boto3_session): + _setup( + mock_boto3_session, + [_make_cluster(ClusterAvailabilityStatus="Maintenance")], + ) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert len(findings) == 0 + + def test_skip_failed_availability_status(self, mock_boto3_session): + _setup( + mock_boto3_session, + [_make_cluster(ClusterAvailabilityStatus="Failed")], + ) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert len(findings) == 0 + + def test_skip_young_cluster(self, mock_boto3_session): + _setup(mock_boto3_session, [_make_cluster(ClusterCreateTime=_young())]) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert len(findings) == 0 + + def test_skip_nonzero_connections(self, mock_boto3_session): + _setup( + mock_boto3_session, + [_make_cluster()], + cw_responses=[_nonzero_connections_response()], + ) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert len(findings) == 0 + + def test_skip_no_datapoints(self, mock_boto3_session): + _setup( + 
mock_boto3_session, + [_make_cluster()], + cw_responses=[_no_datapoints_response()], + ) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert len(findings) == 0 + + def test_skip_missing_identifier(self, mock_boto3_session): + _setup( + mock_boto3_session, + [_make_cluster(ClusterIdentifier=None)], + ) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert len(findings) == 0 + + def test_skip_missing_status(self, mock_boto3_session): + _setup( + mock_boto3_session, + [_make_cluster(ClusterStatus=None)], + ) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert len(findings) == 0 + + def test_skip_missing_create_time(self, mock_boto3_session): + _setup( + mock_boto3_session, + [_make_cluster(ClusterCreateTime=None)], + ) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert len(findings) == 0 + + def test_skip_naive_create_time(self, mock_boto3_session): + _setup( + mock_boto3_session, + [_make_cluster(ClusterCreateTime=datetime(2024, 1, 1))], + ) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert len(findings) == 0 + + def test_skip_future_create_time(self, mock_boto3_session): + future = datetime.now(timezone.utc) + timedelta(hours=1) + _setup( + mock_boto3_session, + [_make_cluster(ClusterCreateTime=future)], + ) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert len(findings) == 0 + + def test_null_availability_status_does_not_skip(self, mock_boto3_session): + """When ClusterAvailabilityStatus is absent, the cluster is not skipped.""" + _setup( + mock_boto3_session, + [_make_cluster(ClusterAvailabilityStatus=None)], + ) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert len(findings) == 1 + + +# --------------------------------------------------------------------------- +# TestMustFailRule +# --------------------------------------------------------------------------- + + +class 
TestMustFailRule: + def test_describe_clusters_permission_error(self, mock_boto3_session): + redshift = MagicMock() + paginator = MagicMock() + paginator.paginate.side_effect = _client_error("AccessDenied") + redshift.get_paginator.return_value = paginator + mock_boto3_session.client.side_effect = lambda s, **kw: redshift + + with pytest.raises(PermissionError, match="redshift:DescribeClusters"): + find_idle_redshift_clusters(mock_boto3_session, _REGION) + + def test_cloudwatch_permission_error(self, mock_boto3_session): + _setup( + mock_boto3_session, + [_make_cluster()], + cw_side_effect=_client_error("AccessDenied"), + ) + with pytest.raises(PermissionError, match="cloudwatch:GetMetricStatistics"): + find_idle_redshift_clusters(mock_boto3_session, _REGION) + + def test_cloudwatch_request_failure_raises(self, mock_boto3_session): + _setup( + mock_boto3_session, + [_make_cluster()], + cw_side_effect=BotoCoreError(), + ) + with pytest.raises(BotoCoreError): + find_idle_redshift_clusters(mock_boto3_session, _REGION) + + +# --------------------------------------------------------------------------- +# TestNormalization +# --------------------------------------------------------------------------- + + +class TestNormalization: + def test_valid_cluster_normalizes(self): + n = _normalize_cluster(_make_cluster(), _now()) + assert n is not None + assert n["cluster_identifier"] == "test-cluster" + assert n["cluster_status"] == "available" + assert n["node_type"] == "dc2.large" + assert n["number_of_nodes"] == 2 + assert n["cluster_endpoint_address"] == "test.us-east-1.redshift.amazonaws.com" + assert n["cluster_endpoint_port"] == 5439 + + def test_non_dict_returns_none(self): + assert _normalize_cluster("not a dict", _now()) is None + + def test_missing_identifier_returns_none(self): + assert _normalize_cluster(_make_cluster(ClusterIdentifier=None), _now()) is None + + def test_missing_status_returns_none(self): + assert 
_normalize_cluster(_make_cluster(ClusterStatus=None), _now()) is None + + def test_naive_create_time_returns_none(self): + assert ( + _normalize_cluster(_make_cluster(ClusterCreateTime=datetime(2024, 1, 1)), _now()) + is None + ) + + def test_bool_number_of_nodes_treated_as_none(self): + n = _normalize_cluster(_make_cluster(NumberOfNodes=True), _now()) + assert n is not None + assert n["number_of_nodes"] is None + + def test_missing_endpoint_degrades_to_null(self): + n = _normalize_cluster(_make_cluster(Endpoint=None), _now()) + assert n is not None + assert n["cluster_endpoint_address"] is None + assert n["cluster_endpoint_port"] is None + + def test_cluster_age_days_not_negative(self): + slightly_future = datetime.now(timezone.utc) + timedelta(seconds=100) + n = _normalize_cluster(_make_cluster(ClusterCreateTime=slightly_future), _now()) + assert n is not None + assert n["cluster_age_days"] >= 0 + + +# --------------------------------------------------------------------------- +# TestConfidenceModel +# --------------------------------------------------------------------------- + + +class TestConfidenceModel: + def test_high_confidence_with_zero_iops(self, mock_boto3_session): + _setup( + mock_boto3_session, + [_make_cluster()], + cw_responses=[ + _zero_connections_response(), # DatabaseConnections + _zero_connections_response(), # ReadIOPS + _zero_connections_response(), # WriteIOPS + ], + ) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert findings[0].confidence.value == "high" + + def test_medium_confidence_when_secondary_missing(self, mock_boto3_session): + _setup( + mock_boto3_session, + [_make_cluster()], + cw_responses=[ + _zero_connections_response(), # DatabaseConnections + _no_datapoints_response(), # ReadIOPS (missing) + _no_datapoints_response(), # WriteIOPS (missing) + ], + ) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert findings[0].confidence.value == "medium" + + def 
test_medium_confidence_when_secondary_nonzero(self, mock_boto3_session): + _setup( + mock_boto3_session, + [_make_cluster()], + cw_responses=[ + _zero_connections_response(), # DatabaseConnections + _nonzero_connections_response(10), # ReadIOPS (nonzero) + _zero_connections_response(), # WriteIOPS + ], + ) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert findings[0].confidence.value == "medium" + + +# --------------------------------------------------------------------------- +# TestRiskModel +# --------------------------------------------------------------------------- + + +class TestRiskModel: + def test_high_risk_large_cluster(self, mock_boto3_session): + _setup(mock_boto3_session, [_make_cluster(NumberOfNodes=4)]) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert findings[0].risk.value == "high" + + def test_medium_risk_small_cluster(self, mock_boto3_session): + _setup(mock_boto3_session, [_make_cluster(NumberOfNodes=2)]) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert findings[0].risk.value == "medium" + + def test_medium_risk_when_nodes_missing(self, mock_boto3_session): + _setup(mock_boto3_session, [_make_cluster(NumberOfNodes=None)]) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert findings[0].risk.value == "medium" + + +# --------------------------------------------------------------------------- +# TestEvidenceContract +# --------------------------------------------------------------------------- + + +class TestEvidenceContract: + def test_evaluation_path(self, mock_boto3_session): + _setup(mock_boto3_session, [_make_cluster()]) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert findings[0].details["evaluation_path"] == ("idle-redshift-cluster-review-candidate") + + def test_signals_not_checked(self, mock_boto3_session): + _setup(mock_boto3_session, [_make_cluster()]) + findings = 
find_idle_redshift_clusters(mock_boto3_session, _REGION) + snc = findings[0].evidence.signals_not_checked + assert any("business value" in s.lower() for s in snc) + assert any("pausing or deleting" in s.lower() for s in snc) + + def test_is_idle_always_true_in_emitted_finding(self, mock_boto3_session): + _setup(mock_boto3_session, [_make_cluster()]) + findings = find_idle_redshift_clusters(mock_boto3_session, _REGION) + assert findings[0].details["is_idle"] is True + + +# --------------------------------------------------------------------------- +# TestRuleMetadata +# --------------------------------------------------------------------------- + + +class TestRuleMetadata: + def test_rule_id(self): + from cleancloud.providers.aws.rules.redshift_idle import RULE_METADATA + + assert RULE_METADATA["id"] == "aws.redshift.cluster.idle" + + def test_category(self): + from cleancloud.providers.aws.rules.redshift_idle import RULE_METADATA + + assert RULE_METADATA["category"] == "hygiene" + + def test_service(self): + from cleancloud.providers.aws.rules.redshift_idle import RULE_METADATA + + assert RULE_METADATA["service"] == "redshift" + + def test_cost_impact(self): + from cleancloud.providers.aws.rules.redshift_idle import RULE_METADATA + + assert RULE_METADATA["cost_impact"] == "high" diff --git a/tests/cleancloud/safety/aws/test_aws_iam_policy_parity.py b/tests/cleancloud/safety/aws/test_aws_iam_policy_parity.py index 477764d..3c7e6d3 100644 --- a/tests/cleancloud/safety/aws/test_aws_iam_policy_parity.py +++ b/tests/cleancloud/safety/aws/test_aws_iam_policy_parity.py @@ -36,6 +36,8 @@ # aws.ec2.instance.stopped, aws.ec2.security_group.unused "ec2:DescribeInstances", "ec2:DescribeSecurityGroups", + # aws.redshift.cluster.idle + "redshift:DescribeClusters", # aws.elbv2.alb.idle / aws.elbv2.nlb.idle / aws.elb.clb.idle "elasticloadbalancing:DescribeLoadBalancers", "elasticloadbalancing:DescribeTargetGroups", diff --git a/tests/e2e/aws/test_aws_rules_smoke.py 
b/tests/e2e/aws/test_aws_rules_smoke.py index 65607c2..f9624ed 100644 --- a/tests/e2e/aws/test_aws_rules_smoke.py +++ b/tests/e2e/aws/test_aws_rules_smoke.py @@ -20,6 +20,7 @@ from cleancloud.providers.aws.rules.nat_gateway_idle import find_idle_nat_gateways from cleancloud.providers.aws.rules.rds_idle import find_idle_rds_instances from cleancloud.providers.aws.rules.rds_snapshot_old import find_old_rds_snapshots +from cleancloud.providers.aws.rules.redshift_idle import find_idle_redshift_clusters from cleancloud.providers.aws.rules.untagged_resources import find_untagged_resources @@ -43,6 +44,7 @@ def test_aws_rules_run_without_error(): find_stopped_ec2_instances, find_unused_security_groups, find_old_rds_snapshots, + find_idle_redshift_clusters, ] all_results = []