diff --git a/cleancloud/providers/azure/rules/lb_no_backends.py b/cleancloud/providers/azure/rules/lb_no_backends.py index 0ad311d..9d596ea 100644 --- a/cleancloud/providers/azure/rules/lb_no_backends.py +++ b/cleancloud/providers/azure/rules/lb_no_backends.py @@ -1,5 +1,41 @@ +""" +Rule: azure.load_balancer.no_backends + +Intent: + Detect Standard Azure Load Balancers whose billable load-balancing + configuration points to backend pools with no members. + + This is a conservative review-candidate rule only. It does not prove the + load balancer is unused, safe to delete, or guaranteed to save cost. + +Exclusions: + - id absent or empty + - name absent or empty + - outside optional region filter (exact lowercase match) + - provisioning state does not resolve to "Succeeded" (SDK then nested fallback) + - SKU does not resolve to lowercase "standard" + - no billable rules (no load-balancing rules and no outbound rules) + - relevant backend-pool set cannot be resolved reliably + - billable rules exist but resolved relevant backend-pool set is empty + - any relevant backend pool has one or more members + +Detection: + - SKU is Standard + - provisioning state is Succeeded + - at least one billable rule exists + - all relevant backend pools resolve and are empty + +Cost model (spec 11): + estimated_monthly_cost_usd = None (always) + Standard Load Balancer pricing depends on configured billable rules and + processed data; no flat monthly estimate is appropriate. 
+ +APIs: + - Microsoft.Network/loadBalancers/read (load_balancers.list_all) +""" + from datetime import datetime, timezone -from typing import List, Optional +from typing import List, Optional, Set from azure.mgmt.network import NetworkManagementClient @@ -8,14 +44,217 @@ from cleancloud.core.finding import Finding from cleancloud.core.risk import RiskLevel +_RULE_ID = "azure.load_balancer.no_backends" +_RESOURCE_TYPE = "azure.load_balancer" + + +def _norm_location(s: str) -> str: + """Lowercase only — exact lowercase match per spec section 7.""" + return s.lower() if s else "" + + +def _norm_pool_id(s: str) -> str: + """Lowercase and strip trailing slash — ARM id normalization per spec section 7.""" + return s.lower().rstrip("/") if s else "" + + +def _safe_list(v) -> list: + """ + Coerce v to a list safely. + Returns [] for None or any non-iterable shape, preventing TypeError on + malformed ARM response fields that are truthy but not iterable. + """ + if v is None: + return [] + try: + return list(v) + except TypeError: + return [] + + +# --------------------------------------------------------------------------- +# SDK-first / nested-fallback resolvers (spec 9.1–9.4) +# --------------------------------------------------------------------------- + + +def _resolve_provisioning_state(lb) -> Optional[str]: + """ + Resolve provisioning state per spec 9.1: + 1. SDK projection (lb.provisioning_state) + 2. Nested snake_case (lb.properties.provisioning_state) + 3. Nested ARM camelCase (lb.properties.provisioningState) + 4. 
Otherwise None (unknown → caller must skip) + """ + state = getattr(lb, "provisioning_state", None) + if state is not None: + return state + props = getattr(lb, "properties", None) + if props is not None: + state = getattr(props, "provisioning_state", None) + if state is not None: + return state + return getattr(props, "provisioningState", None) + return None + + +def _get_lb_rules(lb) -> list: + """ + Get load-balancing rules per spec 9.2: + SDK lb.load_balancing_rules → nested snake_case → nested ARM camelCase → [] + """ + rules = getattr(lb, "load_balancing_rules", None) + if rules is None: + props = getattr(lb, "properties", None) + if props is not None: + rules = getattr(props, "load_balancing_rules", None) + if rules is None: + rules = getattr(props, "loadBalancingRules", None) + return _safe_list(rules) + + +def _get_outbound_rules(lb) -> list: + """ + Get outbound rules per spec 9.2: + SDK lb.outbound_rules → nested snake_case → nested ARM camelCase → [] + """ + rules = getattr(lb, "outbound_rules", None) + if rules is None: + props = getattr(lb, "properties", None) + if props is not None: + rules = getattr(props, "outbound_rules", None) + if rules is None: + rules = getattr(props, "outboundRules", None) + return _safe_list(rules) + + +def _get_backend_pools(lb) -> list: + """ + Get backend address pools per spec 9.3 / spec 6: + SDK lb.backend_address_pools → nested snake_case → nested ARM camelCase → [] + """ + pools = getattr(lb, "backend_address_pools", None) + if pools is None: + props = getattr(lb, "properties", None) + if props is not None: + pools = getattr(props, "backend_address_pools", None) + if pools is None: + pools = getattr(props, "backendAddressPools", None) + return _safe_list(pools) + + +def _get_frontend_ip_configs(lb) -> list: + """ + Get frontend IP configurations (detail-only, not used for detection): + SDK lb.frontend_ip_configurations → nested snake_case → nested ARM camelCase → [] + """ + cfgs = getattr(lb, 
"frontend_ip_configurations", None) + if cfgs is None: + props = getattr(lb, "properties", None) + if props is not None: + cfgs = getattr(props, "frontend_ip_configurations", None) + if cfgs is None: + cfgs = getattr(props, "frontendIPConfigurations", None) + return _safe_list(cfgs) + + +def _rule_single_pool_ref(rule): + """ + Get a rule's single backend_address_pool reference per spec 9.3: + SDK rule.backend_address_pool → nested snake_case → nested ARM camelCase → None + """ + ref = getattr(rule, "backend_address_pool", None) + if ref is None: + props = getattr(rule, "properties", None) + if props is not None: + ref = getattr(props, "backend_address_pool", None) + if ref is None: + ref = getattr(props, "backendAddressPool", None) + return ref + + +def _rule_multi_pool_refs(rule) -> list: + """ + Get a rule's backend_address_pools list per spec 9.3: + SDK rule.backend_address_pools → nested snake_case → nested ARM camelCase → [] + """ + refs = getattr(rule, "backend_address_pools", None) + if refs is None: + props = getattr(rule, "properties", None) + if props is not None: + refs = getattr(props, "backend_address_pools", None) + if refs is None: + refs = getattr(props, "backendAddressPools", None) + return _safe_list(refs) + def _pool_has_members(pool) -> bool: - """Check if a backend pool has any members via NIC-based or IP-based backends.""" - if getattr(pool, "backend_ip_configurations", None): - return True - if getattr(pool, "load_balancer_backend_addresses", None): - return True - return False + """ + A pool has members when either NIC-based or IP-based membership contains + at least one entry, per spec 9.4. + + SDK projections are checked first; nested snake_case and then ARM camelCase + are used as fallback if the SDK attribute is absent (None). 
+ """ + props = getattr(pool, "properties", None) + + # NIC-based: SDK first, nested snake_case, nested camelCase + nic = getattr(pool, "backend_ip_configurations", None) + if nic is None and props is not None: + nic = getattr(props, "backend_ip_configurations", None) + if nic is None: + nic = getattr(props, "backendIpConfigurations", None) + + # IP-based: SDK first, nested snake_case, nested camelCase + ip_based = getattr(pool, "load_balancer_backend_addresses", None) + if ip_based is None and props is not None: + ip_based = getattr(props, "load_balancer_backend_addresses", None) + if ip_based is None: + ip_based = getattr(props, "loadBalancerBackendAddresses", None) + + return bool(nic or []) or bool(ip_based or []) + + +def _collect_referenced_pool_ids(lb) -> Optional[Set[str]]: + """ + Collect normalized ARM ids of backend pools referenced by billable rules + (load-balancing rules and outbound rules), per spec 9.3. + + Uses SDK-first with nested/raw fallback for both rule collections and + individual pool references within each rule. + + Returns None if any reference cannot be resolved — a pool reference object + that lacks an id, or a billable rule with no pool reference at all. + Callers must skip the load balancer when None is returned. + + Returns a set of normalized pool ids (possibly empty) otherwise. 
+ """ + referenced: Set[str] = set() + + for rule in list(_get_lb_rules(lb)) + list(_get_outbound_rules(lb)): + rule_pool_ids: Set[str] = set() + + # Single pool reference: SDK first, nested fallback + single = _rule_single_pool_ref(rule) + if single is not None: + pid = getattr(single, "id", None) + if not pid: + return None # reference object present but no id → unresolvable + rule_pool_ids.add(_norm_pool_id(pid)) + + # Multi pool references: SDK first, nested fallback + for ref in _rule_multi_pool_refs(rule): + pid = getattr(ref, "id", None) + if not pid: + return None # reference object present but no id → unresolvable + rule_pool_ids.add(_norm_pool_id(pid)) + + if not rule_pool_ids: + # Billable rule has no pool reference at all — incomplete config → skip + return None + + referenced |= rule_pool_ids + + return referenced def find_lb_no_backends( @@ -26,13 +265,14 @@ def find_lb_no_backends( client: Optional[NetworkManagementClient] = None, ) -> List[Finding]: """ - Find Standard Azure Load Balancers with no backend pool members. + Find Standard Azure Load Balancers with billable rules pointing to + backend pools that have no members. 
- Conservative rule (review-only): - - Only flags Standard SKU (Basic has no cost signal post-retirement) - - Checks both NIC-based and IP-based backend representations - - Flags only if ALL pools have zero members, or LB has no pools at all - - Skips non-Succeeded provisioning state + Detection requires: + - SKU resolves to "Standard" + - provisioning state resolves to exactly "Succeeded" (SDK then nested fallback) + - at least one billable rule (load-balancing rule or outbound rule) + - all relevant backend pools resolve and are empty IAM permissions: - Microsoft.Network/loadBalancers/read @@ -44,82 +284,127 @@ def find_lb_no_backends( subscription_id=subscription_id, ) + now = datetime.now(timezone.utc) + for lb in net_client.load_balancers.list_all(): - if region_filter and lb.location != region_filter: + # spec 8.1: id must be present and non-empty + lb_id = getattr(lb, "id", None) + if not lb_id: continue - # Skip non-Succeeded provisioning state - if getattr(lb, "provisioning_state", None) not in (None, "Succeeded"): + # spec 8.2: name must be present and non-empty + lb_name = getattr(lb, "name", None) + if not lb_name: continue - # Only flag Standard SKU (Basic is retired, no cost signal) - sku_name = lb.sku.name if lb.sku else None - if sku_name != "Standard": + # spec 8.3: region filter — exact lowercase match + location = _norm_location(getattr(lb, "location", "") or "") + if region_filter and location != _norm_location(region_filter): continue - # Check all backend pools for members - pools = lb.backend_address_pools or [] - has_any_members = any(_pool_has_members(pool) for pool in pools) + # spec 8.4 / 9.1: provisioning state must resolve to exactly "Succeeded" + if _resolve_provisioning_state(lb) != "Succeeded": + continue - if has_any_members: + # spec 8.5: SKU must resolve to lowercase "standard" + sku = getattr(lb, "sku", None) + sku_name = getattr(sku, "name", None) if sku else None + if not sku_name or sku_name.lower() != "standard": continue - 
pool_count = len(pools) - signals = [] - if pool_count == 0: - signals.append("Load Balancer has no backend address pools") - else: - signals.append( - f"All {pool_count} backend pool(s) have no members " - f"(checked backend_ip_configurations and load_balancer_backend_addresses)" - ) - signals.append("SKU is Standard (incurs base charges regardless of backends)") - - evidence = Evidence( - signals_used=signals, - signals_not_checked=[ - "Planned backend attachment", - "IaC-managed intent", - "Migration or teardown in progress", - "Disaster recovery or failover standby", - ], - time_window=None, - ) + # spec 8.6 / 9.2: at least one billable rule must exist (SDK + nested fallback) + lb_rules = _get_lb_rules(lb) + outbound_rules = _get_outbound_rules(lb) + lb_rule_count = len(lb_rules) + outbound_rule_count = len(outbound_rules) + billable_rule_count = lb_rule_count + outbound_rule_count + if billable_rule_count == 0: + continue - sku_tier = lb.sku.tier if lb.sku else None + # spec 9.3: collect pool ids referenced by billable rules (SDK + nested fallback) + referenced_ids = _collect_referenced_pool_ids(lb) + if referenced_ids is None: + continue # unresolvable reference → skip + + # spec 8.8: resolved relevant pool set is empty → skip + if not referenced_ids: + continue + + # spec 9.3: build normalized pool inventory (SDK + nested fallback) + pool_inventory = {} + for pool in _get_backend_pools(lb): + pool_id = getattr(pool, "id", None) + if pool_id: + pool_inventory[_norm_pool_id(pool_id)] = pool + + # spec 9.3: resolve referenced ids against inventory; skip if any unresolvable + relevant_pools = [] + skip_lb = False + for norm_id in referenced_ids: + pool = pool_inventory.get(norm_id) + if pool is None: + skip_lb = True + break + relevant_pools.append(pool) + if skip_lb: + continue + + # spec 8.9 / 9.4: any relevant pool with members → skip (SDK + nested fallback) + if any(_pool_has_members(pool) for pool in relevant_pools): + continue + + # --- EMIT --- + sku_tier 
= getattr(sku, "tier", None) if sku else None + tags = getattr(lb, "tags", None) or {} + relevant_pool_count = len(relevant_pools) + all_pool_count = len(_get_backend_pools(lb)) + frontend_count = len(_get_frontend_ip_configs(lb)) findings.append( Finding( provider="azure", - rule_id="azure.load_balancer.no_backends", - resource_type="azure.load_balancer", - resource_id=lb.id, - region=lb.location, - estimated_monthly_cost_usd=18.0, + rule_id=_RULE_ID, + resource_type=_RESOURCE_TYPE, + resource_id=lb_id, + region=location, + estimated_monthly_cost_usd=None, # spec 11: always None title="Standard Load Balancer Has No Backend Members", summary=( - f"This Standard Load Balancer '{lb.name}' currently has no backend " - f"pool members. Review whether it is still required or was left " - f"behind after a teardown or migration." + f"Standard Load Balancer '{lb_name}' has {billable_rule_count} billable " + f"rule(s) but all {relevant_pool_count} relevant backend pool(s) are empty" ), reason=( - "All backend pools empty on Standard SKU load balancer" - if pool_count > 0 - else "No backend pools configured on Standard SKU load balancer" + f"All {relevant_pool_count} relevant backend pool(s) referenced by " + f"{billable_rule_count} billable rule(s) have zero members" ), risk=RiskLevel.LOW, confidence=ConfidenceLevel.HIGH, - detected_at=datetime.now(timezone.utc), - evidence=evidence, + detected_at=now, + evidence=Evidence( + signals_used=[ + "Load Balancer SKU is Standard", + f"Billable rule count is {billable_rule_count}", + "All relevant backend pools evaluated to empty using NIC-based and IP-based membership checks", + ], + signals_not_checked=[ + "Planned backend attachment or cutover intent", + "IaC-managed placeholder or staged deployment intent", + "Traffic history or future activation plans", + "Frontend public IP cost or attachment evaluated by other rules", + ], + time_window=None, + ), details={ - "resource_name": lb.name, + "resource_name": lb_name, 
"subscription_id": subscription_id, "sku_name": sku_name, "sku_tier": sku_tier, - "backend_pool_count": pool_count, - "frontend_ip_count": len(lb.frontend_ip_configurations or []), - "rule_count": len(lb.load_balancing_rules or []), - "tags": lb.tags, + "backend_pool_count": all_pool_count, + "relevant_backend_pool_count": relevant_pool_count, + "frontend_ip_count": frontend_count, + "load_balancing_rule_count": lb_rule_count, + "outbound_rule_count": outbound_rule_count, + "tags": tags, }, ) ) diff --git a/cleancloud/providers/azure/rules/public_ip_unused.py b/cleancloud/providers/azure/rules/public_ip_unused.py index ae0154f..6efadd3 100644 --- a/cleancloud/providers/azure/rules/public_ip_unused.py +++ b/cleancloud/providers/azure/rules/public_ip_unused.py @@ -1,3 +1,37 @@ +""" +Rule: azure.network.public_ip.unused + +Intent: + Detect Azure Public IP Address resources that are fully unattached across + known Azure control-plane linkage surfaces and therefore represent + conservative cleanup review candidates. + + This is a conservative review-candidate rule only. It does not prove the + Public IP is delete-safe, unused at the DNS/firewall layer, or guaranteed + to produce a specific monthly saving. + +Exclusions: + - id absent or empty + - name absent or empty + - outside optional region filter (exact lowercase match) + - provisioning state does not resolve to "Succeeded" + - any attachment linkage resolves to a non-empty reference + - unattached dynamic placeholder with no assigned ip_address + +Detection: + - provisioning state is Succeeded + - all four attachment linkages resolve to absent: + ip_configuration, nat_gateway, service_public_ip_address, linked_public_ip_address + - dynamic-placeholder contract is not triggered + +Cost model (spec 10): + estimated_monthly_cost_usd = None (always) + Azure Public IP pricing varies by SKU/type; no flat estimate is appropriate. 
+ +APIs: + - Microsoft.Network/publicIPAddresses/read (public_ip_addresses.list_all) +""" + from datetime import datetime, timezone from typing import List, Optional @@ -8,6 +42,120 @@ from cleancloud.core.finding import Finding from cleancloud.core.risk import RiskLevel +_RULE_ID = "azure.network.public_ip.unused" +_RESOURCE_TYPE = "azure.network.public_ip" + + +def _norm_location(s: str) -> str: + """Lowercase only — exact lowercase match per spec section 7.""" + return s.lower() if s else "" + + +# --------------------------------------------------------------------------- +# SDK-first / nested-fallback resolvers (spec 9.1–9.2) +# --------------------------------------------------------------------------- + + +def _resolve_provisioning_state(pip) -> Optional[str]: + """ + Resolve provisioning state per spec 9.1: + 1. SDK projection (pip.provisioning_state) + 2. Nested snake_case (pip.properties.provisioning_state) + 3. Nested ARM camelCase (pip.properties.provisioningState) + 4. Otherwise None (unknown → caller must skip) + """ + state = getattr(pip, "provisioning_state", None) + if state is not None: + return state + props = getattr(pip, "properties", None) + if props is not None: + state = getattr(props, "provisioning_state", None) + if state is not None: + return state + return getattr(props, "provisioningState", None) + return None + + +def _resolve_linkage(pip, sdk_attr: str, arm_attr: str): + """ + Resolve a single attachment linkage field per spec 9.2: + 1. SDK projection (pip.) + 2. Nested ARM camelCase (pip.properties.) + Returns the reference object if present, or None. + """ + ref = getattr(pip, sdk_attr, None) + if ref is None: + props = getattr(pip, "properties", None) + if props is not None: + ref = getattr(props, arm_attr, None) + return ref + + +def _is_attached(pip) -> Optional[bool]: + """ + Resolve attachment state across all known control-plane linkage fields. 
+ + Returns: + True — at least one linkage has a non-empty id (attached) + False — all linkages are cleanly absent (not attached) + None — at least one linkage object is present but has no resolvable id + (unresolvable → caller must skip rather than emit) + + Canonical linkage map (SDK field → ARM camelCase fallback): + ip_configuration → ipConfiguration + nat_gateway → natGateway + service_public_ip_address → servicePublicIPAddress + linked_public_ip_address → linkedPublicIPAddress + """ + for sdk_attr, arm_attr in ( + ("ip_configuration", "ipConfiguration"), + ("nat_gateway", "natGateway"), + ("service_public_ip_address", "servicePublicIPAddress"), + ("linked_public_ip_address", "linkedPublicIPAddress"), + ): + ref = _resolve_linkage(pip, sdk_attr, arm_attr) + if ref is None: + continue + if getattr(ref, "id", None): + return True + # ref is present but id is absent/empty — cannot resolve reliably + return None + return False + + +def _resolve_allocation_method(pip) -> Optional[str]: + """ + Resolve allocation method: + SDK pip.public_ip_allocation_method + → pip.properties.public_ip_allocation_method + → pip.properties.publicIPAllocationMethod + """ + v = getattr(pip, "public_ip_allocation_method", None) + if v is None: + props = getattr(pip, "properties", None) + if props is not None: + v = getattr(props, "public_ip_allocation_method", None) + if v is None: + v = getattr(props, "publicIPAllocationMethod", None) + return v + + +def _resolve_ip_address(pip) -> Optional[str]: + """ + Resolve assigned ip_address: + SDK pip.ip_address + → pip.properties.ip_address + → pip.properties.ipAddress + """ + v = getattr(pip, "ip_address", None) + if v is None: + props = getattr(pip, "properties", None) + if props is not None: + v = getattr(props, "ip_address", None) + if v is None: + v = getattr(props, "ipAddress", None) + return v + def find_unused_public_ips( *, @@ -17,11 +165,14 @@ def find_unused_public_ips( client: Optional[NetworkManagementClient] = None, ) -> 
List[Finding]: """ - Find unattached or unused Azure Public IPs. + Find Azure Public IP addresses that are fully unattached across all known + Azure control-plane linkage surfaces. - Conservative rule (review-only): - - IP configuration checked - - Does NOT infer future use or planned attachment + Detection requires: + - provisioning state resolves to "Succeeded" + - ip_configuration, nat_gateway, service_public_ip_address, and + linked_public_ip_address all resolve to absent + - not an unattached dynamic placeholder with no assigned ip_address IAM permissions: - Microsoft.Network/publicIPAddresses/read @@ -33,46 +184,95 @@ def find_unused_public_ips( subscription_id=subscription_id, ) + now = datetime.now(timezone.utc) + for pip in net_client.public_ip_addresses.list_all(): - if region_filter and pip.location != region_filter: + # spec 8.1: id must be present and non-empty + pip_id = getattr(pip, "id", None) + if not pip_id: continue - # Skip attached IPs - if pip.ip_configuration is not None: + # spec 8.2: name must be present and non-empty + pip_name = getattr(pip, "name", None) + if not pip_name: continue - evidence = Evidence( - signals_used=["IP configuration is None (not attached to any resource)"], - signals_not_checked=[ - "Planned future association", - "IaC-managed intent", - "Application-level usage", - "Disaster recovery or backup planning", - ], - time_window=None, - ) + # spec 8.3: region filter — exact lowercase match + location = _norm_location(getattr(pip, "location", "") or "") + if region_filter and location != _norm_location(region_filter): + continue + + # spec 8.4 / 9.1: provisioning state must resolve to exactly "Succeeded" + if _resolve_provisioning_state(pip) != "Succeeded": + continue + + # spec 8.5 / 9.2: any attachment linkage present → skip; + # unresolvable linkage (object present, id absent) → also skip + attached = _is_attached(pip) + if attached is None or attached: + continue + + # spec 8.6 / 9.3: dynamic-placeholder contract — + 
# unattached Dynamic IP with no assigned address is low-signal noise + allocation = _resolve_allocation_method(pip) + ip_address = _resolve_ip_address(pip) + if allocation == "Dynamic" and not ip_address: + continue + + # --- context-only details (spec 9.4) --- + sku = getattr(pip, "sku", None) + sku_name = getattr(sku, "name", None) if sku else None + ip_version = getattr(pip, "public_ip_address_version", None) + if ip_version is None: + props = getattr(pip, "properties", None) + if props is not None: + ip_version = getattr(props, "public_ip_address_version", None) + if ip_version is None: + ip_version = getattr(props, "publicIPAddressVersion", None) + ip_tags = getattr(pip, "ip_tags", None) + tags = getattr(pip, "tags", None) or {} findings.append( Finding( provider="azure", - rule_id="azure.network.public_ip.unused", - resource_type="azure.network.public_ip", - resource_id=pip.id, - region=pip.location, - estimated_monthly_cost_usd=3.60, - title="Unused Azure Public IP", - summary="Public IP is not attached to any resource", - reason="IP configuration is None (not attached)", + rule_id=_RULE_ID, + resource_type=_RESOURCE_TYPE, + resource_id=pip_id, + region=location, + estimated_monthly_cost_usd=None, # spec 10: always None + title="Unused Azure Public IP Address", + summary=f"Public IP '{pip_name}' is not attached to any Azure resource", + reason=( + "No attachment found via ip_configuration, nat_gateway, " + "service_public_ip_address, or linked_public_ip_address" + ), risk=RiskLevel.LOW, - confidence=ConfidenceLevel.MEDIUM, - detected_at=datetime.now(timezone.utc), - evidence=evidence, + confidence=ConfidenceLevel.HIGH, + detected_at=now, + evidence=Evidence( + signals_used=[ + "Provisioning state is Succeeded", + "Public IP has no resolved attachment via ip_configuration, nat_gateway, service_public_ip_address, or linked_public_ip_address", + "Dynamic-placeholder contract not triggered", + ], + signals_not_checked=[ + "Planned future association or reserved 
intent", + "DNS records or firewall allowlist references", + "Application-level reachability or traffic history", + "Exact Azure billing amount for this Public IP", + ], + time_window=None, + ), details={ - "resource_name": pip.name, + "resource_name": pip_name, "subscription_id": subscription_id, + "allocation_method": allocation, + "ip_address": ip_address, + "sku": sku_name, + "ip_version": ip_version, + "ip_tags": ip_tags, "attached": False, - "ip_address": pip.ip_address, - "tags": pip.tags, + "tags": tags, }, ) ) diff --git a/cleancloud/providers/azure/rules/sql_database_idle.py b/cleancloud/providers/azure/rules/sql_database_idle.py index 7c57374..b9c2772 100644 --- a/cleancloud/providers/azure/rules/sql_database_idle.py +++ b/cleancloud/providers/azure/rules/sql_database_idle.py @@ -1,3 +1,49 @@ +""" +Rule: azure.sql.database.idle + +Intent: + Detect dedicated Azure SQL Database single-database resources that show + no observable user workload activity over the configured idle window and + therefore represent conservative cleanup or rightsizing review candidates. + + This is a conservative review-candidate rule only. It is not proof that a + database is delete-safe, not proof that no business continuity purpose + exists, and not proof of a specific monthly saving. 
+ +Exclusions: + - id absent or empty + - name absent or empty + - outside optional region filter (exact lowercase match) + - status does not resolve to "Online" + - name == "master" (system database) + - database age unknown or less than idle_days + - database is in an elastic pool (no per-database billing) + - database is replica / secondary-shaped + - database is currently paused (serverless paused — compute cost is zero) + - any required metric cannot be resolved reliably (series absent or empty) + - any required metric is non-zero over the idle window + +Detection: + - status is Online + - database age >= idle_days + - not pooled, not replica / secondary-shaped, not paused + - all five required metrics zero over the idle window: + connection_successful (Total), sessions_count (Maximum), + cpu_percent (Maximum), physical_data_read_percent (Maximum), + log_write_percent (Maximum) + +Cost model (spec 10): + estimated_monthly_cost_usd = None (always) + Azure SQL pricing varies by purchasing model, tier, compute shape, + storage, backup, and serverless behavior; no flat estimate is appropriate. 
+ +APIs: + - Microsoft.Sql/servers/read (servers.list) + - Microsoft.Sql/servers/databases/read (databases.list_by_server) + - Microsoft.Insights/metrics/read (monitor metrics for connection_successful, + sessions_count, cpu_percent, physical_data_read_percent, log_write_percent) +""" + from datetime import datetime, timedelta, timezone from typing import List, Optional @@ -9,73 +55,252 @@ from cleancloud.core.finding import Finding from cleancloud.core.risk import RiskLevel -# Approximate monthly costs for Azure SQL Database Standard/Premium tiers (DTU model) -_SKU_COST_MAP = { - # Standard tier - "S0": 15, - "S1": 30, - "S2": 75, - "S3": 150, - "S4": 300, - "S6": 600, - "S7": 1200, - "S9": 2400, - "S12": 4800, - # Premium tier - "P1": 465, - "P2": 930, - "P4": 1860, - "P6": 3720, - "P11": 5521, - "P15": 7446, -} +_RULE_ID = "azure.sql.database.idle" +_RESOURCE_TYPE = "azure.sql.database" +# Required activity metrics: (REST metric name, aggregation type, datapoint attribute) +_REQUIRED_METRICS = [ + ("connection_successful", "Total", "total"), + ("sessions_count", "Maximum", "maximum"), + ("cpu_percent", "Maximum", "maximum"), + ("physical_data_read_percent", "Maximum", "maximum"), + ("log_write_percent", "Maximum", "maximum"), +] + + +def _norm_location(s: str) -> str: + """Lowercase only — exact lowercase match per spec section 7.""" + return s.lower() if s else "" -def _extract_resource_group(resource_id: str) -> str: - """Extract resource group name from an Azure resource ID.""" - parts = resource_id.split("/") - for i, part in enumerate(parts): - if part.lower() == "resourcegroups" and i + 1 < len(parts): - return parts[i + 1] - raise ValueError(f"Cannot extract resource group from resource ID: {resource_id}") +# --------------------------------------------------------------------------- +# SDK-first / nested-fallback resolvers (spec 9.1–9.5) +# --------------------------------------------------------------------------- -def _get_metric_sum( + +def 
_resolve_status(db) -> Optional[str]: + """ + Resolve database status per spec 9.1: + 1. SDK projection (db.status) + 2. Nested snake_case (db.properties.status) + Otherwise None (unknown → caller must skip). + """ + v = getattr(db, "status", None) + if v is not None: + return str(v) + props = getattr(db, "properties", None) + if props is not None: + v = getattr(props, "status", None) + if v is not None: + return str(v) + return None + + +def _resolve_str_field(db, sdk_attr: str, arm_attr: str) -> Optional[str]: + """ + Resolve a string field with SDK-first / nested fallback: + 1. SDK projection (db.) + 2. Nested snake_case (db.properties.) + 3. Nested ARM camelCase (db.properties.) + Returns the first non-empty value found, or None. + """ + v = getattr(db, sdk_attr, None) + if v: + return str(v) + props = getattr(db, "properties", None) + if props is not None: + v = getattr(props, sdk_attr, None) + if v: + return str(v) + v = getattr(props, arm_attr, None) + if v: + return str(v) + return None + + +def _resolve_creation_date(db) -> Optional[datetime]: + """ + Resolve creation_date per spec 9.2: + 1. SDK projection (db.creation_date) + 2. Nested snake_case (db.properties.creation_date) + 3. Nested ARM camelCase (db.properties.creationDate) + Returns a UTC-aware datetime or None. + """ + v = getattr(db, "creation_date", None) + if v is None: + props = getattr(db, "properties", None) + if props is not None: + v = getattr(props, "creation_date", None) + if v is None: + v = getattr(props, "creationDate", None) + return _coerce_datetime(v) + + +def _resolve_date_field(db, sdk_attr: str, arm_attr: str) -> Optional[datetime]: + """ + Resolve a date field with SDK-first / nested fallback. + Returns a UTC-aware datetime or None. 
+ """ + v = getattr(db, sdk_attr, None) + if v is None: + props = getattr(db, "properties", None) + if props is not None: + v = getattr(props, sdk_attr, None) + if v is None: + v = getattr(props, arm_attr, None) + return _coerce_datetime(v) + + +def _coerce_datetime(v) -> Optional[datetime]: + """Convert datetime / ISO string to UTC-aware datetime, or return None.""" + if v is None: + return None + if isinstance(v, datetime): + return v if v.tzinfo else v.replace(tzinfo=timezone.utc) + if isinstance(v, str): + try: + dt = datetime.fromisoformat(v.replace("Z", "+00:00")) + return dt if dt.tzinfo else dt.replace(tzinfo=timezone.utc) + except (ValueError, AttributeError): + return None + return None + + +def _is_replica_secondary(db) -> bool: + """ + Replica / secondary exclusion contract per spec 9.4. + Returns True when the database is replica / secondary-shaped. + + Signals checked (SDK-first / nested fallback): + - secondary_type (non-empty → explicit replica indicator; standalone skip) + - source_database_id alone is NOT sufficient; it must be paired with + secondary/replica-shaped control-plane context (spec 9.4). Since + secondary_type is the canonical pairing signal and is already caught + above, source_database_id is only relevant when combined with it. + """ + if _resolve_str_field(db, "secondary_type", "secondaryType"): + return True + return False + + +def _is_paused(db) -> bool: + """ + Current paused-state contract per spec 9.5. + Returns True when the database is currently paused. + + Signals checked: + 1. status == "Paused" + 2. 
paused_date present with no evidence of a later resumed_date + """ + if _resolve_status(db) == "Paused": + return True + paused_date = _resolve_date_field(db, "paused_date", "pausedDate") + if paused_date is None: + return False + resumed_date = _resolve_date_field(db, "resumed_date", "resumedDate") + if resumed_date is None: + return True # paused with no resume evidence + return paused_date > resumed_date + + +# --------------------------------------------------------------------------- +# Metric query (spec 9.6) +# --------------------------------------------------------------------------- + + +def _query_metric( monitor_client: MonitorManagementClient, resource_uri: str, metric_name: str, - start_time: datetime, - end_time: datetime, -) -> int: + aggregation: str, + dp_attr: str, + window_start: datetime, + window_end: datetime, +) -> Optional[float]: """ - Query Azure Monitor for the sum of a metric over the time period. - - Returns 1 (non-zero) on any failure to avoid false positives. + Query a single Azure Monitor metric for the given timespan. + + Returns: + float >= 0 — metric resolved; 0.0 means confirmed zero for the window + None — metric unknown / query failed / series absent or empty + → caller must skip the database (spec 9.6, rules 3 & 4) + + Per spec 9.6: + - If the metric query raises → None (unknown) + - If the metric is absent from response → None (unknown) + - If the series is empty or unusable → None (unknown) + - If all datapoints are 0 or None and series is usable → 0.0 (confirmed zero) + - If any datapoint > 0 → that positive value (database is active) """ try: - # Use strftime to produce clean UTC timestamps without '+00:00' suffix, - # which Azure Monitor rejects (the '+' gets mangled in the REST URL). 
fmt = "%Y-%m-%dT%H:%M:%SZ" - timespan = f"{start_time.strftime(fmt)}/{end_time.strftime(fmt)}" + timespan = f"{window_start.strftime(fmt)}/{window_end.strftime(fmt)}" response = monitor_client.metrics.list( resource_uri, metricnames=metric_name, timespan=timespan, interval="P1D", - aggregation="Total", + aggregation=aggregation, ) - for metric in response.value: - for ts in metric.timeseries: - for data in ts.data: - if data.total is not None and data.total > 0: - return 1 - return 0 + # Locate the metric in the response (name may be LocalizableString or str) + matched = None + for m in response.value or []: + m_name = m.name + if hasattr(m_name, "value"): + name_val = m_name.value + else: + name_val = str(m_name) if m_name is not None else None + if name_val and name_val.lower() == metric_name.lower(): + matched = m + break + + if matched is None: + return None # metric absent from response → unknown + + # Collect aggregated datapoints. + # Distinguish "no data items at all" (series unusable → unknown) from + # "data items present but all None" (series usable, all zero → 0.0). + # Spec 9.6 rule 2: usable series where all datapoints are 0 or None + # counts as zero for the window, not unknown. 
+ has_data_items = False + values = [] + for ts in matched.timeseries or []: + for dp in ts.data or []: + has_data_items = True + val = getattr(dp, dp_attr, None) + if val is not None: + values.append(val) + + if not has_data_items: + return None # series has no data items → unusable → unknown + + if not values: + return 0.0 # all datapoints None → usable series, confirmed zero + + return max(values) except Exception: - # If we can't get metrics, assume there might be connections - # to avoid false positives - return 1 + return None # query failure → unknown + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _extract_resource_group(resource_id: str) -> str: + """Extract resource group name from an Azure resource ID.""" + parts = resource_id.split("/") + for i, part in enumerate(parts): + if part.lower() == "resourcegroups" and i + 1 < len(parts): + return parts[i + 1] + raise ValueError(f"Cannot extract resource group from resource ID: {resource_id}") + + +# --------------------------------------------------------------------------- +# Main scan function +# --------------------------------------------------------------------------- def find_idle_sql_databases( @@ -88,12 +313,11 @@ def find_idle_sql_databases( idle_days: int = 14, ) -> List[Finding]: """ - Find Azure SQL databases with zero connections for `idle_days` days. + Find Azure SQL databases with no observable user workload activity over idle_days. - Azure SQL databases in Standard/Premium tiers cost $15-$7,500+/month. - Databases with zero connections over 14+ days are a strong idle signal. - Excludes Basic tier (<$5/month) to reduce noise. - Excludes system databases (master). + Detection requires all five required activity metrics (connection_successful, + sessions_count, cpu_percent, physical_data_read_percent, log_write_percent) + to be zero for the full observation window. 
Single-metric silence is not enough. IAM permissions: - Microsoft.Sql/servers/read @@ -112,125 +336,178 @@ def find_idle_sql_databases( ) now = datetime.now(timezone.utc) + window_start = now - timedelta(days=idle_days) for server in sql_client.servers.list(): - server_location = (server.location or "").lower() - if region_filter and server_location != region_filter.lower(): + # Server-level region pre-filter (optimization — database location == server location in Azure SQL) + server_location = _norm_location(getattr(server, "location", "") or "") + if region_filter and server_location != _norm_location(region_filter): + continue + + server_id = getattr(server, "id", None) + if not server_id: continue try: - resource_group = _extract_resource_group(server.id) + resource_group = _extract_resource_group(server_id) except ValueError: continue + server_name = getattr(server, "name", None) or "" + try: - db_list = list(sql_client.databases.list_by_server(resource_group, server.name)) + db_list = list(sql_client.databases.list_by_server(resource_group, server_name)) except Exception: - continue + continue # spec 12: skip server on listing failure for db in db_list: - # Skip system databases - if db.name == "master": + # spec 8.1: id must be present and non-empty + db_id = getattr(db, "id", None) + if not db_id: continue - # Skip Basic tier (< $5/month, not worth flagging) - sku_tier = getattr(db.sku, "tier", "") if db.sku else "" - if sku_tier.lower() == "basic": + # spec 8.2: name must be present and non-empty + db_name = getattr(db, "name", None) + if not db_name: continue - # Azure returns the service objective (S0, P1, etc.) in - # current_service_objective_name, not sku.name (which is the tier). 
- sku_name = getattr(db, "current_service_objective_name", None) or ( - getattr(db.sku, "name", "") if db.sku else "" - ) + # spec 8.3: region filter — exact lowercase match on database location + db_location = _norm_location(getattr(db, "location", "") or "") + if region_filter and db_location != _norm_location(region_filter): + continue - # Build resource URI for Azure Monitor - resource_uri = db.id + # spec 8.4 / 9.1: status must resolve to exactly "Online" + if _resolve_status(db) != "Online": + continue - # Query 14-day connection metrics - total_connections = _get_metric_sum( - mon_client, - resource_uri, - "connection_successful", - now - timedelta(days=idle_days), - now, - ) + # spec 8.5: skip master system database + if db_name == "master": + continue - if total_connections > 0: + # spec 8.6 / 9.2: age must be known and >= idle_days + creation_date = _resolve_creation_date(db) + if creation_date is None: + continue # age unknown → skip + age_days = (now - creation_date).days + if age_days < idle_days: continue - # Estimate monthly cost from SKU - estimated_monthly_cost = _estimate_monthly_cost(sku_name) - cost_usd = _estimate_monthly_cost_usd(sku_name) - - signals = [ - f"Zero successful connections for {idle_days} days (Azure Monitor metrics)", - f"Connections ({idle_days}d sum): {total_connections}", - f"SKU: {sku_name} ({sku_tier})", - f"Server: {server.name}", - ] - - signals_not_checked = [ - "Planned future usage", - "Disaster recovery intent", - "Seasonal traffic patterns", - "Application deployment cycles", - ] - - evidence = Evidence( - signals_used=signals, - signals_not_checked=signals_not_checked, - time_window=f"{idle_days} days", - ) + # spec 8.7 / 9.3: skip elastic pool databases (billing is at pool level) + elastic_pool_id = _resolve_str_field(db, "elastic_pool_id", "elasticPoolId") + if elastic_pool_id: + continue + + # spec 8.8 / 9.4: skip replica / secondary-shaped databases + if _is_replica_secondary(db): + continue + + # spec 8.9 / 
9.5: skip currently paused databases (compute cost is already zero) + if _is_paused(db): + continue + + # spec 8.10–8.11 / 9.6: query all five required metrics + metric_values: dict = {} + skip_db = False + for metric_name, aggregation, dp_attr in _REQUIRED_METRICS: + val = _query_metric( + mon_client, + db_id, + metric_name, + aggregation, + dp_attr, + window_start, + now, + ) + if val is None: + skip_db = True # metric unknown → skip (spec 9.6 rule 3/4) + break + metric_values[metric_name] = val + + if skip_db: + continue + + if any(v > 0 for v in metric_values.values()): + continue # at least one metric non-zero → database is active + + # --- context-only details (spec 9.8) --- + sku = getattr(db, "sku", None) + sku_tier = getattr(sku, "tier", None) if sku else None + current_slo = getattr(db, "current_service_objective_name", None) + auto_pause_delay = getattr(db, "auto_pause_delay", None) + if auto_pause_delay is None: + props = getattr(db, "properties", None) + if props is not None: + auto_pause_delay = getattr(props, "auto_pause_delay", None) + if auto_pause_delay is None: + auto_pause_delay = getattr(props, "autoPauseDelay", None) + paused_date_raw = getattr(db, "paused_date", None) + tags = getattr(db, "tags", None) or {} findings.append( Finding( provider="azure", - rule_id="azure.sql.database.idle", - resource_type="azure.sql.database", - resource_id=db.id, - region=db.location, - estimated_monthly_cost_usd=cost_usd, - title=f"Idle Azure SQL Database (No Connections for {idle_days}+ Days)", + rule_id=_RULE_ID, + resource_type=_RESOURCE_TYPE, + resource_id=db_id, + region=db_location, + estimated_monthly_cost_usd=None, # spec 10: always None + title="Idle Azure SQL Database", summary=( - f"Azure SQL database '{db.name}' on server '{server.name}' " - f"({sku_name}, {sku_tier}) has had zero connections for {idle_days}+ days." 
+ f"SQL database '{db_name}' on server '{server_name}' " + f"shows no observable activity for {idle_days}+ days" + ), + reason=( + f"All five required activity metrics are zero over {idle_days} days: " + "connection_successful, sessions_count, cpu_percent, " + "physical_data_read_percent, log_write_percent" ), - reason=f"Azure SQL database has zero connections for {idle_days}+ days", risk=RiskLevel.HIGH, confidence=ConfidenceLevel.HIGH, detected_at=now, - evidence=evidence, + evidence=Evidence( + signals_used=[ + "Database status is Online", + f"Database age is at least {idle_days} days", + "Database is not in an elastic pool", + "Replica / secondary exclusion contract is not triggered", + "Paused-state contract is not triggered", + f"Zero connection_successful over {idle_days}-day window", + f"Zero sessions_count over {idle_days}-day window", + f"Zero cpu_percent over {idle_days}-day window", + f"Zero physical_data_read_percent over {idle_days}-day window", + f"Zero log_write_percent over {idle_days}-day window", + ], + signals_not_checked=[ + "Planned future cutover or deployment intent", + "Undeclared business continuity requirements", + "Workload activity outside documented rule signals", + "Exact Azure billing amount for this database", + ], + time_window=f"{idle_days} days", + ), details={ - "db_name": db.name, - "server_name": server.name, - "sku_name": sku_name, + "database_name": db_name, + "server_name": server_name, + "status": _resolve_status(db), + "current_service_objective_name": current_slo, "sku_tier": sku_tier, - "max_size_bytes": getattr(db, "max_size_bytes", None), - "location": db.location, - f"connections_{idle_days}d": total_connections, - "estimated_monthly_cost": estimated_monthly_cost, - "tags": db.tags, + "elastic_pool_id": elastic_pool_id, + "auto_pause_delay": auto_pause_delay, + "paused_date": ( + str(paused_date_raw) if paused_date_raw is not None else None + ), + "creation_date": str(creation_date), + "idle_days": idle_days, + 
"connection_successful": metric_values.get("connection_successful"), + "sessions_count": metric_values.get("sessions_count"), + "cpu_percent": metric_values.get("cpu_percent"), + "physical_data_read_percent": metric_values.get( + "physical_data_read_percent" + ), + "log_write_percent": metric_values.get("log_write_percent"), + "tags": tags, }, ) ) return findings - - -def _estimate_monthly_cost(sku_name: str) -> str: - """Rough monthly cost estimate based on SKU name.""" - if not sku_name: - return "Cost varies by SKU (region dependent)" - cost = _SKU_COST_MAP.get(sku_name.upper()) - if cost: - return f"~${cost}/month (region dependent)" - return "Cost varies by SKU (region dependent)" - - -def _estimate_monthly_cost_usd(sku_name: str) -> Optional[float]: - """Numeric monthly cost estimate for aggregation.""" - if not sku_name: - return None - cost = _SKU_COST_MAP.get(sku_name.upper()) - return float(cost) if cost else None diff --git a/docs/rules.md b/docs/rules.md index 24da59d..430294d 100644 --- a/docs/rules.md +++ b/docs/rules.md @@ -1,2310 +1,23 @@ # CleanCloud Rules -Complete reference for all 45 rules implemented by CleanCloud (30 hygiene + 15 AI/ML). +45 rules across three providers (30 hygiene + 15 AI/ML). ---- - -## Design Principles - -All CleanCloud rules follow these principles: - -### 1. Read-Only Always -- Uses read-only cloud APIs exclusively -- No `Delete*`, `Modify*`, `Tag*`, or `Update*` operations -- Safe for production environments - -### 2. Conservative by Default -- Multiple signals preferred over single indicators -- Age-based thresholds prevent false positives on temporary resources -- Prefer false negatives over false positives - -### 3. Explicit Confidence Levels -Every finding includes a confidence level: -- **HIGH** - Multiple strong signals, very likely orphaned -- **MEDIUM** - Moderate signals, worth reviewing -- **LOW** - Weak signals, informational only - -### 4. 
Review-Only Recommendations -- Findings are candidates for human review, not automated action -- Clear reasoning provided for each finding -- No rule should justify deletion on its own - ---- - -## Quick Reference - -**AWS:** - -| Rule ID | Cost Surface | What It Detects | -|---|---|---| -| `aws.ec2.instance.stopped` | Compute | EC2 instances stopped 30+ days (EBS charges continue) | -| `aws.ec2.security_group.unused` | Governance | Security groups with no ENI associations | -| `aws.ebs.unattached` | Storage | EBS volumes not attached to any instance | -| `aws.ebs.snapshot.old` | Storage | Snapshots ≥ 90 days old | -| `aws.ec2.ami.old` | Storage | AMIs older than 180 days | -| `aws.ec2.elastic_ip.unattached` | Network | Elastic IPs not currently associated with any instance or network interface | -| `aws.ec2.eni.detached` | Network | Detached ENIs not currently attached | -| `aws.ec2.nat_gateway.idle` | Network | NAT Gateways with zero traffic 14+ days | -| `aws.elbv2.alb.idle` / `aws.elbv2.nlb.idle` / `aws.elb.clb.idle` | Network | Load balancers with zero traffic 14+ days | -| `aws.rds.instance.idle` | Platform | RDS instances with zero connections 14+ days | -| `aws.rds.snapshot.old` | Storage | Manual RDS snapshots older than 90 days | -| `aws.cloudwatch.logs.infinite_retention` | Observability | Log groups with no retention policy | -| `aws.resource.untagged` | Governance | EC2/S3/CloudWatch resources with zero tags | -| `aws.sagemaker.endpoint.idle` | AI/ML | Real-time SageMaker endpoints `InService` with no observed `InvokeEndpoint` traffic across billable production variants for 14+ days *(opt-in: `--category ai`)* | -| `aws.sagemaker.notebook.idle` | AI/ML | SageMaker Notebook Instances `InService` with stale control-plane timestamps for 14+ days *(opt-in: `--category ai`)* | -| `aws.ec2.gpu.idle` | AI/ML | EC2 GPU/accelerator instances (p/g/trn/inf/dl families) running with <5% GPU or <10% CPU utilisation over 7 days *(opt-in: `--category ai`)* | -| 
`aws.bedrock.provisioned_throughput.idle` | AI/ML | Bedrock Provisioned Throughput (Model Units) with zero invocations 7+ days — bills per MU per hour regardless of traffic *(opt-in: `--category ai`)* | -| `aws.sagemaker.studio_app.idle` | AI/ML | SageMaker Studio `KernelGateway`/`JupyterLab`/`CodeEditor` apps `InService` with no usable recent activity signal for 7+ days *(opt-in: `--category ai`)* | -| `aws.sagemaker.training_job.long_running` | AI/ML | SageMaker training jobs still `InProgress` beyond the configured threshold (default 24h), using `TrainingStartTime` when present else `CreationTime` *(opt-in: `--category ai`)* | - -**Azure:** - -| Rule ID | Cost Surface | What It Detects | -|---|---|---| -| `azure.vm.stopped_not_deallocated` | Compute | Stopped but not deallocated VMs (full charges) | -| `azure.compute.disk.unattached` | Storage | Managed disks not attached to any VM | -| `azure.compute.snapshot.old` | Storage | Old managed snapshots as conservative review candidates | -| `azure.network.public_ip.unused` | Network | Public IPs not attached to any interface | -| `azure.load_balancer.no_backends` | Network | Standard LBs with zero backend members | -| `azure.application_gateway.no_backends` | Network | App Gateways with zero backend targets | -| `azure.virtual_network_gateway.idle` | Network | VPN/ExpressRoute Gateways with no connections | -| `azure.app_service_plan.empty` | Platform | Paid App Service Plans with zero apps | -| `azure.app_service.idle` | Platform | App Services with zero HTTP requests 14+ days | -| `azure.sql.database.idle` | Platform | Azure SQL databases with zero connections 14+ days | -| `azure.container_registry.unused` | Platform | Container registries with zero successful pulls and pushes 90+ days | -| `azure.resource.untagged` | Governance | Disks and snapshots with zero tags | -| `azure.aml.compute.idle` | AI/ML | AML compute clusters with min_node_count > 0 and no active nodes 14+ days *(opt-in: `--category ai`)* | -| 
`azure.ml.compute_instance.idle` | AI/ML | Azure ML Compute Instances Running with no control-plane activity 14+ days *(opt-in: `--category ai`)* | -| `azure.ml.online_endpoint.idle` | AI/ML | Azure ML managed online endpoints in Succeeded provisioning state with zero scoring requests for 7+ days *(opt-in: `--category ai`)* | -| `azure.ai_search.idle` | AI/ML | Azure AI Search services (Standard tier+) with zero search queries for 30+ days *(opt-in: `--category ai`)* | -| `azure.openai.provisioned_deployment.idle` | AI/ML | Azure OpenAI provisioned deployments (PTUs) with zero API requests for 7+ days *(opt-in: `--category ai`)* (default, configurable) | - -**GCP:** - -| Rule ID | Cost Surface | What It Detects | -|---|---|---| -| `gcp.compute.vm.stopped` | Compute | TERMINATED VM instances stopped 30+ days (disk charges continue) | -| `gcp.compute.disk.unattached` | Storage | Persistent Disks in READY state with no attached VM | -| `gcp.compute.snapshot.old` | Storage | Disk snapshots older than 90 days | -| `gcp.compute.ip.unused` | Network | Reserved static IPs (regional and global) in RESERVED state | -| `gcp.sql.instance.idle` | Platform | Cloud SQL instances with zero connections for 14+ days | -| `gcp.vertex.endpoint.idle` | AI/ML | Vertex AI Online Prediction endpoints with dedicated capacity and zero predictions for 14+ days (`--category ai`) | -| `gcp.vertex.workbench.idle` | AI/ML | Vertex AI Workbench instances ACTIVE with no control-plane activity for 14+ days (`--category ai`) | -| `gcp.vertex.training_job.long_running` | AI/ML | Vertex AI CustomJobs and TrainingPipelines in RUNNING state beyond 24h threshold; GPU/TPU/expensive-CPU early warning at 90% of threshold — hung or runaway jobs on GPU-backed machines cost $4–$80+/hr per node *(opt-in: `--category ai`)* | -| `gcp.tpu.idle` | AI/ML | Cloud TPU nodes in READY state with near-zero utilization (`duty_cycle ≤ 2%`) for 7+ days — idle TPU v4 costs ~$12.88/hr, v5p can exceed $33/hr *(opt-in: 
`--category ai`)* | -| `gcp.vertex.featurestore.idle` | AI/ML | Vertex AI Feature Store online stores (legacy and new-gen) with zero ReadFeatureValues requests for 30+ days — Bigtable-backed stores bill ~$197/node/month regardless of utilization *(opt-in: `--category ai`)* | - ---- - -## AWS Rules - -### Compute Waste - -#### Stopped EC2 Instances - -**Rule ID:** `aws.ec2.instance.stopped` - -**What it detects:** EC2 instances in 'stopped' state for 30+ days - -**Confidence:** - -Confidence thresholds and signal weighting are documented in [confidence.md](confidence.md). - -- **HIGH:** Stop time from CloudTrail `LookupEvents` ≥ 30 days ago (deterministic timestamp) -- Not flagged: no CloudTrail stop event found or stopped < 30 days ago - -**Risk:** MEDIUM - -**Why this matters:** -- Stopped EC2 instances do not charge for compute — but every attached EBS volume accrues storage costs at ~$0.10/GB-month, every hour, regardless of instance state -- A 500 GB root + data volume on a forgotten stopped instance costs ~$50/month indefinitely -- Any associated Elastic IPs continue to charge ~$0.005/hour while unattached -- Stopped instances are the most common form of "I meant to clean that up" infrastructure debt - -**Detection logic:** -```python -for instance in describe_instances(state=stopped): - stop_event = cloudtrail_lookup_events(EventName="StopInstances", instance_id=instance.id) - # Uses latest StopInstances event after most recent StartInstances (restart-cycle aware) - if stop_event and (now - stop_event.eventTime).days >= 30: - confidence = "HIGH" # Deterministic CloudTrail timestamp, not a heuristic -``` - -**Cost estimates:** -- Based on total attached EBS storage × $0.10/GB-month -- Example: 2 × 100 GB volumes = ~$20/month in ongoing storage charges -- Additional Elastic IP charges are tracked separately by the `aws.ec2.elastic_ip.unattached` rule - -**Common causes:** -- Test or dev instances left stopped after a project ended -- Migration source instances 
never terminated after cutover -- Incident response boxes started and never cleaned up -- Autoscaling warm pools drained but not terminated - -**Required permissions:** -- `ec2:DescribeInstances` -- `ec2:DescribeVolumes` -- `cloudtrail:LookupEvents` - ---- - -#### Unused Security Groups - -**Rule ID:** `aws.ec2.security_group.unused` - -**What it detects:** Security groups not associated with any network interface - -**Confidence:** - -Confidence thresholds and signal weighting are documented in [confidence.md](confidence.md). - -- **MEDIUM:** No ENI associations found (service-managed groups may appear unused between deployments) - -**Risk:** LOW - -**Why this matters:** -- Security groups with no ENI associations are pure governance debt -- Each unused group widens the blast radius if a misconfiguration is later introduced -- Compliance audits (SOC 2, ISO 27001, PCI DSS) flag unused security groups as a control failure -- In accounts with hundreds of groups, unused ones obscure the real security posture and add friction to every access review -- Cost is indirect but real: engineer time spent auditing and explaining phantom groups in compliance reviews - -**Detection logic:** -```python -in_use_sg_ids = { - group["GroupId"] - for eni in describe_network_interfaces() - for group in eni["Groups"] -} -for sg in describe_security_groups(): - if sg.name != "default" and sg.id not in in_use_sg_ids: - confidence = "MEDIUM" -``` - -**Exclusions:** -- `default` security groups — AWS prevents deletion of the default group; flagging it is noise - -**Caveats:** -- A security group referenced only in another group's inbound rules (not attached to any ENI) will be flagged. This is intentional. -- Service-managed groups (RDS, ELB, Lambda) may appear unused briefly between deployments. Review before deleting. 
- -**Common causes:** -- Leftover groups from deleted EC2 instances, RDS databases, or ELB deployments -- Test stacks torn down without full cleanup -- Groups created manually but never attached -- CloudFormation stacks deleted leaving orphaned groups - -**Required permissions:** -- `ec2:DescribeSecurityGroups` -- `ec2:DescribeNetworkInterfaces` - ---- - -### Storage Waste - -#### Unattached EBS Volumes - -**Rule ID:** `aws.ebs.unattached` - -**What it detects:** EBS volumes not attached to any EC2 instance - -**Confidence:** - -Confidence thresholds and signal weighting are documented in [confidence.md](confidence.md). - -- **MEDIUM:** Volume in `available` state for ≥7 days (not attached to any instance) -- Not flagged: < 7 days - -**Why this threshold:** -- Allows time for deployment cycles -- Accounts for rollback windows -- Reduces false positives from autoscaling - -**Common causes:** -- Volumes from terminated EC2 instances -- Failed deployments or rollbacks -- Autoscaling cleanup gaps - -**Required permission:** `ec2:DescribeVolumes` - ---- - -#### Old EBS Snapshots - -**Rule ID:** `aws.ebs.snapshot.old` - -**What it detects:** Snapshots ≥ 90 days old (default, configurable) - -**Confidence:** - -Confidence thresholds and signal weighting are documented in [confidence.md](confidence.md). 
- -- **LOW:** Age ≥ 90 days (conservative — age alone is a weak signal) - -**Detection logic:** -```python -for snapshot in describe_snapshots(OwnerIds=["self"]): - age_days = (now - snapshot.StartTime).days - if age_days >= days_old: # default 90 - confidence = "LOW" # age alone is a weak signal - risk = "LOW" -``` - -**Limitations:** -- Snapshots linked to registered AMIs are excluded (avoids false positives) -- Does NOT verify snapshot is unused (conservative approach) - -**Common causes:** -- Backup retention policies without lifecycle rules -- Snapshots from deleted volumes -- Over-retention without cleanup - -**Required permissions:** -- `ec2:DescribeSnapshots` -- `ec2:DescribeSnapshotAttribute` - ---- - -#### Old AMIs - -**Rule ID:** `aws.ec2.ami.old` - -**What it detects:** AMIs (Amazon Machine Images) older than 180 days (default threshold) - -**Confidence:** - -Confidence thresholds and signal weighting are documented in [confidence.md](confidence.md). - -- **MEDIUM:** Age ≥ 180 days (AMI may still be actively used as template) - -**Why MEDIUM confidence:** -- Age alone is a moderate signal -- AMI may be a golden image still used for launches -- Cannot check if AMI is referenced by launch templates or Auto Scaling groups - -**Why this matters:** -- AMIs have associated EBS snapshots that incur storage costs -- Old unused AMIs accumulate over time -- Storage costs are ~$0.05/GB-month - -**Detection logic:** -```python -for ami in describe_images(Owners=["self"]): - age_days = (now - ami.creation_date).days - if age_days >= 180 (default) and ami.state == "available": - # Flag as old AMI -``` - -**What gets checked:** -- AMI creation date -- AMI state (only "available" AMIs are flagged) -- Associated snapshot sizes for cost estimation - -**Common causes:** -- AMIs from old deployments -- Test/dev AMIs no longer needed -- Superseded golden images -- AMIs from terminated projects - -**Cost estimates:** -- Based on total EBS snapshot storage -- ~$0.05/GB-month 
for snapshot storage -- Example: 100 GB AMI = ~$5/month - -**Required permission:** `ec2:DescribeImages` - ---- - -### Network Waste - -#### Unattached Elastic IPs - -**Rule ID:** `aws.ec2.elastic_ip.unattached` - -**What it detects:** Elastic IPs currently not associated with any instance or network interface - -**Confidence:** - -Confidence thresholds and signal weighting are documented in [confidence.md](confidence.md). - -- **HIGH:** Currently not associated (all four AWS association fields absent per DescribeAddresses) - -**Why this matters:** -- Unattached Elastic IPs incur small hourly charges -- State is deterministic (no `AssociationId`, `InstanceId`, `NetworkInterfaceId`, or `PrivateIpAddress` means not attached) -- Clear cost optimization signal with zero ambiguity - -**Detection logic:** -```python -if not any([eip.get("AssociationId"), eip.get("InstanceId"), - eip.get("NetworkInterfaceId"), eip.get("PrivateIpAddress")]): - confidence = "HIGH" # Deterministic state: not associated -``` - -**Common causes:** -- Elastic IPs from terminated EC2 instances -- Reserved IPs for DR that are no longer needed -- Failed deployments leaving orphaned IPs -- Manual allocation without attachment - -**Edge cases handled:** -- Classic EIPs without `AllocationTime` are annotated as `is_classic: true` in details -- Detection is purely state-based — no age threshold is applied - -**Required permission:** `ec2:DescribeAddresses` - ---- - -#### Detached Network Interfaces (ENIs) - -**Rule ID:** `aws.ec2.eni.detached` - -**What it detects:** Elastic Network Interfaces (ENIs) currently not attached (Status=available) - -**Confidence:** - -Confidence thresholds and signal weighting are documented in [confidence.md](confidence.md). 
- -- **HIGH:** Currently not attached — no temporal threshold; `Status=available` is the sole eligibility signal - -**Why this matters:** -- Detached ENIs incur small hourly charges -- Often forgotten after failed deployments or incomplete teardowns -- Clear signal with minimal ambiguity - -**Detection logic:** -```python -if eni['Status'] == 'available': # Currently detached - confidence = "HIGH" # Deterministic state: not attached -``` - -**What gets flagged:** -- User-created ENIs (InterfaceType='interface') -- **Lambda/ECS/RDS ENIs** (RequesterManaged=true but YOUR resources!) - explicitly annotated in evidence and details -- Detached ENIs from deleted services - -**Key insight:** `RequesterManaged=true` means "AWS created this in YOUR VPC for YOUR resource" — these ARE your responsibility and often waste. RequesterManaged ENIs are included in findings with an explicit evidence signal and `requester_managed: true` in details for downstream filtering. - -**Common causes:** -- Failed EC2 instance launches -- Incomplete infrastructure teardown -- Terminated instances with retained ENIs -- Forgotten manual ENI creations - -**Required permission:** `ec2:DescribeNetworkInterfaces` - ---- - -#### Idle NAT Gateways - -**Rule ID:** `aws.ec2.nat_gateway.idle` - -**What it detects:** NAT Gateways with zero traffic for 14+ days (default, configurable) - -**Confidence:** - -Confidence thresholds and signal weighting are documented in [confidence.md](confidence.md). 
- -- **MEDIUM:** No traffic detected for 14+ days (CloudWatch metrics checked, but seasonal patterns not verified) - -**Why MEDIUM confidence:** -- Zero traffic is a strong signal, but gateway may be for DR/standby -- Cannot verify planned future usage or blue/green deployments -- Seasonal traffic patterns not checked - -**Why this matters:** -- NAT Gateways cost ~$0.045/hour + $0.045/GB data processing (~$32/month base) -- Idle gateways are a clear cost optimization signal -- Common after VPC restructuring or service migrations - -**Detection logic:** -```python -for gw in describe_nat_gateways(): - if gw.state == "available" and age >= idle_threshold_days: - # All 5 metrics must return datapoints and all must be zero - # If any metric has no datapoints, the item is skipped - for metric in required_metrics: - value = get_metric(metric, period=idle_threshold_days) - if value is None: - skip # Missing data is NOT treated as zero traffic - if value > 0: - skip # Active traffic detected - confidence = "HIGH" if no_route_table_refs else "MEDIUM" -``` - -**CloudWatch metrics checked:** -- `AWS/NATGateway` → `BytesOutToDestination` (daily sum) -- `AWS/NATGateway` → `BytesInFromSource` (daily sum) -- `AWS/NATGateway` → `BytesInFromDestination` (daily sum) -- `AWS/NATGateway` → `BytesOutToSource` (daily sum) -- `AWS/NATGateway` → `ActiveConnectionCount` (daily sum) - -> **Note:** If any metric has no data for the period (e.g. newly created gateway), the item is skipped — missing data is NOT treated as zero traffic. 
- -**Common causes:** -- VPC restructuring leaving orphaned NAT Gateways -- Service migrations to different subnets/VPCs -- Dev/staging environments with no active workloads -- DR standby gateways (intentional, but worth reviewing) - -**Cost estimates:** -- ~$32/month base cost per idle NAT Gateway -- Additional $0.045/GB data processing when active - -**Required permissions:** -- `ec2:DescribeNatGateways` -- `cloudwatch:GetMetricStatistics` - ---- - -#### Idle Elastic Load Balancers (ALB/CLB/NLB) - -**Rule IDs:** -- `aws.elbv2.alb.idle` — Application Load Balancer -- `aws.elbv2.nlb.idle` — Network Load Balancer -- `aws.elb.clb.idle` — Classic Load Balancer - -**What it detects:** Load balancers with zero traffic for 14+ days (default, configurable) - -**Confidence:** - -Confidence thresholds and signal weighting are documented in [confidence.md](confidence.md). - -- **HIGH:** Zero traffic AND no registered targets/instances -- **MEDIUM:** Zero traffic only (targets/instances may still be registered) - -**Risk:** MEDIUM - -**Why this matters:** -- ELBs incur base hourly charges regardless of traffic (~$16-22/month) -- Idle load balancers are a clear cost optimization signal -- Common after service migrations or decommissions - -**Detection logic:** -```python -# ALB/NLB (elbv2) -for lb in describe_load_balancers(): - if age >= idle_threshold_days: - traffic = get_metric(RequestCount or NewFlowCount, period=idle_threshold_days) - has_targets = check_target_groups(lb) - if traffic == 0: - confidence = "HIGH" if not has_targets else "MEDIUM" - -# CLB (elb) -for lb in describe_load_balancers(): - if age >= idle_threshold_days: - traffic = get_metric(RequestCount, period=idle_threshold_days) - has_instances = len(lb.instances) > 0 - if traffic == 0: - confidence = "HIGH" if not has_instances else "MEDIUM" -``` - -**CloudWatch metrics checked:** -- `AWS/ApplicationELB` → `RequestCount` (ALB, daily sum) -- `AWS/NetworkELB` → `NewFlowCount` (NLB, daily sum) -- `AWS/ELB` → 
`RequestCount` (CLB, daily sum) - -**Exclusions:** -- LBs younger than the idle threshold - -**Common causes:** -- Service migrations leaving orphaned load balancers -- Dev/staging environments with no active workloads -- Decommissioned applications with retained infrastructure -- Blue/green deployments with stale LBs - -**Cost estimates:** -- ~$16-22/month base cost per idle load balancer (region dependent) - -**Required permissions:** -- `elasticloadbalancing:DescribeLoadBalancers` -- `elasticloadbalancing:DescribeTargetGroups` -- `elasticloadbalancing:DescribeTargetHealth` -- `cloudwatch:GetMetricStatistics` - ---- - -### Platform Waste - -#### Idle RDS Instances - -**Rule ID:** `aws.rds.instance.idle` - -**What it detects:** RDS instances with zero database connections for 14+ days (default, configurable) - -**Confidence:** - -Confidence thresholds and signal weighting are documented in [confidence.md](confidence.md). - -- **MEDIUM:** Zero connections for 14+ days (CloudWatch metrics checked, strong but not conclusive signal) - -**Why MEDIUM confidence:** -- Zero database connections is a strong signal of non-use, but cannot rule out Aurora-style architectures or scheduled workloads that connect infrequently -- Connection pools and proxies (RDS Proxy, PgBouncer) can hide real usage while keeping observed client connection counts low or zero - -**Risk:** MEDIUM - -**Why MEDIUM risk:** -- RDS instances are among the more expensive AWS resources, but zero connections alone does not confirm the instance is safe to delete - -**Why this matters:** -- RDS instances incur hourly charges regardless of usage -- Idle instances with no connections are a clear cost optimization signal -- Common after application migrations or decommissions - -**Detection logic:** -```python -for instance in describe_db_instances(): - if instance.status == "available" and age >= idle_threshold_days: - if not instance.db_cluster_identifier and not instance.read_replica_source: # Skip Aurora cluster members and read replicas - connections_max =
get_metric(DatabaseConnections, statistic="Maximum", period=idle_threshold_days) - if connections_max == 0: - confidence = "MEDIUM" - risk = "MEDIUM" -``` - -**CloudWatch metrics checked:** -- `AWS/RDS` -> `DatabaseConnections` (Maximum statistic) - -**Exclusions:** -- Aurora cluster members (`DBClusterIdentifier` set) — Aurora instances are managed at cluster level and may show zero connections individually even when the cluster is active -- Read replicas (`ReadReplicaSourceDBInstanceIdentifier` set) -- Instances younger than the idle threshold - -**Common causes:** -- Applications migrated to different databases -- Dev/staging instances left running -- Decommissioned services with retained databases -- Test databases no longer needed - -**Required permissions:** -- `rds:DescribeDBInstances` -- `cloudwatch:GetMetricStatistics` - ---- - -#### Old Manual RDS Snapshots - -**Rule ID:** `aws.rds.snapshot.old` - -**What it detects:** Manual RDS snapshots older than 90 days (default, configurable) - -**Confidence:** - -Confidence thresholds and signal weighting are documented in [confidence.md](confidence.md). 
- -- **LOW:** Snapshot age is known and exceeds threshold (age alone is a weak signal) - -**Risk:** LOW - -**Why this matters:** -- Manual RDS snapshots are retained indefinitely until explicitly deleted -- Storage charges accrue at ~$0.095/GB-month regardless of whether the source DB still exists -- Snapshots older than 90 days are rarely needed for active recovery - -**Detection logic:** -```python -for snapshot in describe_db_snapshots(SnapshotType="manual"): - if snapshot.status == "available": - age_days = (now - snapshot.create_time).days - if age_days >= days_old: - confidence = "LOW" - risk = "LOW" -``` - -**Exclusions:** -- Automated snapshots (`SnapshotType=automated`) — managed by RDS retention policy, auto-deleted -- Snapshots in non-`available` states - -**Common causes:** -- Pre-migration snapshots never cleaned up -- Manual backups taken before schema changes and forgotten -- Snapshots of deleted databases retained for compliance but past their useful life - -**Cost estimate:** ~$0.095/GB-month based on `AllocatedStorage` (the provisioned DB size). RDS snapshots are incremental so actual storage used may be lower — treat this as a ceiling estimate, not an exact figure. - -**Required permissions:** -- `rds:DescribeDBSnapshots` -- `rds:DescribeDBSnapshotAttributes` - ---- - -### Observability Waste - -#### CloudWatch Log Groups (Infinite Retention) - -**Rule ID:** `aws.cloudwatch.logs.infinite_retention` - -**What it detects:** Log groups with no retention policy (never expires) - -**Confidence:** - -Confidence thresholds and signal weighting are documented in [confidence.md](confidence.md). 
- -- **HIGH:** No retention policy configured (directly observable configuration fact) - -**Risk tiers:** -- **HIGH:** Log group has ≥1 GB stored bytes (significant ongoing cost) -- **MEDIUM:** Log group has >0 but <1 GB stored bytes -- **LOW:** Log group has 0 stored bytes (still flagged — retention should be set regardless) - -**Why this matters:** -- Logs grow indefinitely without retention -- Can reach GBs/TBs over months -- Often forgotten after service decommission - -**Common causes:** -- Default CloudFormation behavior (no retention) -- Manual log group creation -- Missing lifecycle policies - -**Required permission:** `logs:DescribeLogGroups` - ---- - -### Governance - -#### Untagged Resources - -**Rule ID:** `aws.resource.untagged` - -**What it detects:** Resources with zero tags - -**Resources checked:** -- EBS volumes -- S3 buckets -- CloudWatch log groups - -**Confidence:** - -Confidence thresholds and signal weighting are documented in [confidence.md](confidence.md). - -- **HIGH:** Zero tags (directly observable fact from authoritative tag source) - -**Why this matters:** -- Ownership ambiguity -- Compliance violations (SOC2, ISO27001) -- Cleanup decision paralysis - -**Required permissions:** -- `ec2:DescribeVolumes` -- `s3:ListAllMyBuckets` -- `s3:GetBucketTagging` -- `logs:DescribeLogGroups` -- `logs:ListTagsForResource` - -### AI/ML Waste - -#### Idle SageMaker Endpoints - -**Rule ID:** `aws.sagemaker.endpoint.idle` - -**Category:** `ai` - -**What it detects:** Real-time SageMaker endpoints in `InService` state with no observed `InvokeEndpoint` traffic across billable production variants for 14+ days (default, configurable). Async endpoints are excluded. Serverless variants without current provisioned concurrency are not treated as continuous idle-cost candidates. 
- -**Confidence:** -- **HIGH:** All evaluated billable variants returned datapoints and zero summed invocations over the observation window -- **MEDIUM:** At least one evaluated billable variant returned no CloudWatch datapoints, but no billable variant showed positive invocation traffic - -**Risk:** -- **HIGH:** Any billable variant is accelerator-backed (`ml.g*`, `ml.p*`, `ml.inf*`, `ml.trn*`) -- **MEDIUM:** All billable variants are CPU-backed - -**Why this matters:** -- SageMaker endpoints accrue charges continuously while `InService`, regardless of traffic -- Endpoints deployed for experiments or demos are frequently abandoned after initial testing -- Multi-variant endpoints multiply the cost per variant - -**Detection signal:** -- Inventory comes from `ListEndpoints(StatusEquals="InService")` -- Runtime variants come from `DescribeEndpoint.ProductionVariants` -- Async inference is excluded via `DescribeEndpointConfig.AsyncInferenceConfig` -- Activity is evaluated from `AWS/SageMaker` `Invocations` using `EndpointName + VariantName` -- `estimated_monthly_cost_usd` is intentionally left unset by this rule - -**Required permissions:** -- `sagemaker:ListEndpoints` -- `sagemaker:DescribeEndpoint` -- `sagemaker:DescribeEndpointConfig` -- `cloudwatch:GetMetricStatistics` - -> **Not run by default.** AI/ML rules are opt-in to avoid surprising users who don't use these services. Run with `cleancloud scan --provider aws --category ai` (or `--category all` to combine with hygiene rules). Validate access first with `cleancloud doctor --provider aws --category ai`. Attach [`security/aws/ai-readonly.json`](../security/aws/ai-readonly.json) to your IAM role to enable this rule. - ---- - -#### Idle SageMaker Notebook Instances - -**Rule ID:** `aws.sagemaker.notebook.idle` - -**Category:** `ai` - -**What it detects:** SageMaker Notebook Instances in `InService` state whose `CreationTime` and `LastModifiedTime` are both at least 14 days old (default, configurable). 
This is a conservative stale control-plane heuristic, not a direct notebook-usage signal. - -**Detection signal — why `LastModifiedTime`:** -SageMaker Notebook Instances do not publish a native notebook-session activity metric for this rule. `LastModifiedTime` is the only canonical control-plane timestamp available, but it is a **weak signal**: it is **not** a direct indicator of Jupyter usage, kernel execution, or user access. The rule therefore emits only MEDIUM-confidence review candidates. - -**Confidence:** -- **MEDIUM:** notebook age and stale control-plane age both meet or exceed the configured threshold - -**Risk:** -- **HIGH:** GPU/accelerator-backed instance (`ml.g4dn.*`, `ml.g5.*`, `ml.p3.*`, `ml.p4d.*`, `ml.p4de.*`, `ml.p5.*`, Inferentia, Trainium) -- **MEDIUM:** CPU-backed instance - -**Why this matters:** -- Notebook Instances bill continuously while `InService`, regardless of whether any kernels are running -- Notebooks are commonly left running after a sprint ends, a project is deprioritised, or a team member leaves -- Unlike endpoints, notebooks have no auto-scaling — they remain billable until explicitly stopped - -**Important scope note:** -- `Stopped` notebook instances are intentionally out of scope for this rule -- Their retained storage cost should be handled by a separate storage / cost-waste rule -- `estimated_monthly_cost_usd` is intentionally left unset by this rule - -**Required permissions:** -- `sagemaker:ListNotebookInstances` - -> **Not run by default.** AI/ML rules are opt-in to avoid surprising users who don't use these services. Run with `cleancloud scan --provider aws --category ai` (or `--category all` to combine with hygiene rules). Validate access first with `cleancloud doctor --provider aws --category ai`. Attach [`security/aws/ai-readonly.json`](../security/aws/ai-readonly.json) to your IAM role to enable this rule. 
- ---- - -#### Idle EC2 GPU Instances - -**Rule ID:** `aws.ec2.gpu.idle` - -**Category:** `ai` - -**What it detects:** EC2 GPU and accelerator instances (p2/p3/p4/p5, g4dn/g4ad/g5/g5g/g6/g6e/gr6, trn1/trn2, inf1/inf2, dl1/dl2q families) in `running` state with low utilisation over 7+ days (default, configurable). Unlike SageMaker rules which target managed services, this rule catches raw GPU instances spun up directly for training, inference, or experimentation and left running after the job completes. - -Detection uses two tiers based on metric availability: -- **GPU utilisation (HIGH confidence):** When the NVIDIA CloudWatch agent is installed, `nvidia_smi_utilization_gpu` is read from the `CWAgent` namespace. MAX statistic across all GPU indices is used — a single active GPU on a multi-GPU instance (e.g., p4d.24xlarge with 8 A100s) will not be masked by averaging. -- **CPU utilisation fallback (MEDIUM confidence):** When the NVIDIA agent is not installed, `CPUUtilization` from `AWS/EC2` is used as a proxy signal. Neuron instances (Trainium/Inferentia) always use this path by design — they use the AWS Neuron SDK, not NVIDIA CUDA. - -**Confidence levels:** -- **HIGH:** GPU metric available AND max GPU utilisation < 5% over 7 days -- **MEDIUM:** GPU metric unavailable; avg CPU utilisation < 10% over 7 days - -**Risk levels:** -- **CRITICAL:** `idle_ratio ≥ 2.0` (e.g. running for 14+ days at the 7-day threshold) -- **HIGH:** GPU/accelerator instance with low utilisation (all other cases) - -**Cost estimates (us-east-1 on-demand):** - -| Instance | Est. 
monthly cost | -|---|---| -| g4dn.xlarge (T4) | $379 | -| g5.xlarge (A10G) | $604 | -| p3.2xlarge (V100) | $2,234 | -| p4d.24xlarge (8× A100 40GB) | $23,374 | -| p4de.24xlarge (8× A100 80GB) | $32,074 | -| g6e.48xlarge (8× L40S) | $18,000 | -| p5.48xlarge (8× H100) | $98,318 | -| trn2.48xlarge (Trainium2) | $110,000 | - -**Configurable parameters:** - -| Parameter | Default | Description | -|---|---|---| -| `idle_days` | `7` | Days of low utilisation before flagging | -| `gpu_threshold` | `5.0` | Max GPU utilisation % (HIGH confidence path) | -| `cpu_threshold` | `10.0` | Max CPU utilisation % (MEDIUM confidence fallback) | - -**Required permissions:** -- `ec2:DescribeInstances` -- `cloudwatch:GetMetricStatistics` -- `cloudwatch:ListMetrics` - -> **Not run by default.** Run with `cleancloud scan --provider aws --category ai`. Attach [`security/aws/ai-readonly.json`](../security/aws/ai-readonly.json) to your IAM role to enable this rule. The NVIDIA CloudWatch agent is not required — instances without it fall back to CPU utilisation at MEDIUM confidence. - ---- - -#### Idle Bedrock Provisioned Throughput - -**Rule ID:** `aws.bedrock.provisioned_throughput.idle` - -**Category:** `ai` - -**What it detects:** AWS Bedrock Provisioned Throughput reservations (Model Units) in `InService` state with zero invocations over 7+ days (default, configurable). Provisioned Throughput reserves dedicated model capacity and bills per Model Unit per hour regardless of whether any inference requests are made — up to ~$7,300/MU/month for Claude 3 Opus on no-commitment pricing. A zero-invocation reservation is paying for capacity delivering zero value. 
- -**Confidence:** -- **HIGH:** Zero invocations confirmed for the full idle window (deployment age ≥ `idle_days`) - -**Risk:** -- **HIGH:** All provisioned throughput reservations (significant always-on spend) - -**Why this matters:** -- Provisioned Throughput bills per Model Unit per hour while `InService`, regardless of invocation count -- Claude 3 Opus: ~$7,300/MU/month; Claude 3 Sonnet / 3.5 Sonnet: ~$2,600/MU/month; Claude 3 Haiku: ~$600/MU/month (no-commitment pricing — reserved terms are 25–60% lower but still significant) -- Abandoned proof-of-concept and experiment reservations are common — teams switch to on-demand after initial testing but forget to delete the provisioned throughput - -**Cost estimates (per Model Unit, us-east-1, no-commitment):** - -| Model family | Monthly cost per MU | -|---|---| -| Claude 3 Opus | ~$7,300 | -| Claude 3 Sonnet / 3.5 Sonnet | ~$2,600 | -| Claude 3 Haiku / 3.5 Haiku | ~$600 | -| Meta Llama 3 | ~$1,000 | - -Multiply by `desiredModelUnits` for total monthly idle cost. - -**Configurable parameters:** - -| Parameter | Default | Description | -|---|---|---| -| `idle_days` | `7` | Days of zero invocations before flagging | - -**Required permissions:** -- `bedrock:ListProvisionedModelThroughputs` -- `cloudwatch:GetMetricStatistics` - -> **Not run by default.** Run with `cleancloud scan --provider aws --category ai`. Attach [`security/aws/ai-readonly.json`](../security/aws/ai-readonly.json) alongside `base-readonly.json` to your IAM role to enable this rule. - ---- - -#### Idle SageMaker Studio Apps - -**Rule ID:** `aws.sagemaker.studio_app.idle` - -**Category:** `ai` - -**What it detects:** SageMaker Studio apps of type `KernelGateway`, `JupyterLab`, or `CodeEditor` in `InService` state with no usable recent activity signal for 7+ days (default, configurable). Other app types, including `JupyterServer`, are excluded from evaluation. 
- -**Detection signal:** `LastUserActivityTimestamp` from `sagemaker:DescribeApp`, but only when it is usable. AWS documents that health checks can also update `LastUserActivityTimestamp`; if it exactly matches `LastHealthCheckTimestamp`, the app is skipped and not treated as idle. - -**Confidence:** -- **HIGH:** `usable_activity_signal = true` and the last usable activity timestamp is at least the configured threshold old - -**Risk:** -- **HIGH:** GPU/accelerator instance (`ml.g*`, `ml.p*`, `ml.inf*`, `ml.trn*`) -- **MEDIUM:** CPU instance - -GPU families: `ml.g4dn`, `ml.g5`, `ml.p2`, `ml.p3`, `ml.p4d`, `ml.p4de`, `ml.p5`, `ml.trn1`, `ml.inf1`, `ml.inf2` - -**Why this matters:** -- Studio apps remain `InService` (and billing) until explicitly deleted — there is no auto-stop by default -- KernelGateway, JupyterLab, and CodeEditor apps each launch a separate compute instance per user session or space -- Teams frequently leave apps running after finishing a sprint, switching to a new space, or abandoning a project -- `estimated_monthly_cost_usd` is intentionally left unset by this rule - -**Configurable parameters:** - -| Parameter | Default | Description | -|---|---|---| -| `idle_days_threshold` | `7` | Days since the last usable activity timestamp before flagging | - -**Required permissions:** -- `sagemaker:ListApps` -- `sagemaker:DescribeApp` - -> **Not run by default.** Run with `cleancloud scan --provider aws --category ai`. Validate access first with `cleancloud doctor --provider aws --category ai`. Attach [`security/aws/ai-readonly.json`](../security/aws/ai-readonly.json) alongside `base-readonly.json` to your IAM role to enable this rule. - ---- - -#### Long-Running SageMaker Training Jobs - -**Rule ID:** `aws.sagemaker.training_job.long_running` - -**Category:** `ai` - -**What it detects:** SageMaker training jobs still in `InProgress` beyond the configured threshold (default 24 hours). 
Runtime is measured from `TrainingStartTime` when present, otherwise from `CreationTime`. - -**Detection signal:** Inventory is built by fully paginating `ListTrainingJobs` **without** relying on `StatusEquals` for completeness, then filtering `TrainingJobStatus` client-side. `DescribeTrainingJob` is used to confirm the current status, resolve the runtime anchor, and read `StoppingCondition`, `EnableManagedSpotTraining`, `ResourceConfig`, and optional heterogeneous `InstanceGroups`. - -**Confidence:** -- **HIGH:** elapsed runtime exceeds the applicable SageMaker stopping-condition limit (`MaxWaitTimeInSeconds` for managed Spot when present, otherwise `MaxRuntimeInSeconds` when `TrainingStartTime` is present) -- **MEDIUM:** elapsed runtime meets the threshold but no applicable stopping-condition limit was exceeded (or no such limit is configured) - -**Risk:** -- **HIGH:** GPU/accelerator instance (`ml.g*`, `ml.p*`, `ml.inf*`, `ml.trn*`) -- **MEDIUM:** Non-GPU/accelerator instance - -GPU/accelerator families: `ml.g4dn`, `ml.g5`, `ml.g6`, `ml.g6e`, `ml.g7`, `ml.p2`, `ml.p3`, `ml.p4d`, `ml.p4de`, `ml.p5`, `ml.p5en`, `ml.p6`, `ml.trn1`, `ml.trn2`, `ml.inf1`, `ml.inf2` - -**Managed spot training:** `EnableManagedSpotTraining=true` changes the effective wall-clock stopping limit. `MaxRuntimeInSeconds` counts only active compute time (not spot wait time) and is not a reliable wall-clock signal. For spot jobs the rule uses `MaxWaitTimeInSeconds` as the stopping limit; the summary and signals explicitly label which limit was exceeded. - -**Heterogeneous clusters:** When `ResourceConfig.InstanceGroups` is present, accelerator detection is evaluated across the groups rather than inferred from a single primary instance type. 
- -**Why this matters:** -- Long-running distributed training can keep all workers running and billing while producing limited or no useful progress -- Training jobs are not automatically stopped just because they are unusually long -- `estimated_monthly_cost_usd` is intentionally omitted — this is a transient runtime review rule, not a monthly-cost rule - -**Configurable parameters:** - -| Parameter | Default | Description | -|---|---|---| -| `long_running_hours_threshold` | `24` | Hours before a training job is considered long-running | - -**Required permissions:** -- `sagemaker:ListTrainingJobs` -- `sagemaker:DescribeTrainingJob` - -> **Not run by default.** Run with `cleancloud scan --provider aws --category ai`. Validate access first with `cleancloud doctor --provider aws --category ai`. Attach [`security/aws/ai-readonly.json`](../security/aws/ai-readonly.json) alongside `base-readonly.json` to your IAM role to enable this rule. - ---- - -#### Idle Azure ML Compute Clusters - -**Rule ID:** `azure.aml.compute.idle` - -**Category:** `ai` - -**What it detects:** Azure Machine Learning compute clusters (`AmlCompute`) with `min_node_count > 0` and zero active nodes over 14+ days. Clusters configured with a non-zero minimum keep instances running continuously regardless of job activity — identical billing model to SageMaker InService endpoints. GPU clusters (NC/ND/NV series) cost $600–$15K/month at minimum node count. 
- -**Confidence:** -- **HIGH:** Zero active nodes for the full 14-day window (cluster age ≥ 14 days) -- **MEDIUM:** Zero active nodes, cluster age is 7–13 days, or cluster creation time unavailable - -**Risk:** -- **HIGH:** GPU-backed VM size (`Standard_NC*`, `Standard_ND*`, `Standard_NV*`) -- **MEDIUM:** CPU-backed VM size - -**Why this matters:** -- `min_node_count > 0` means instances are always running, always billed — even with no jobs submitted -- GPU clusters cost $600–$15K/month per node at minimum capacity -- Clusters are frequently created for experiments or training runs and left with non-zero minimums for "warm-start convenience" - -**Metric strategy:** Queries Azure Monitor `Active Nodes` metric (with `ComputeName` dimension filter). Falls back to `NodeCount` and `CurrentNodeCount` if the primary metric is unavailable. Only dimension-filtered metrics are used to confirm idle — workspace-level unfiltered queries cannot safely confirm individual cluster state. - -**Estimated monthly cost (per node at `min_node_count`):** -- `Standard_NC6` — ~$648/month -- `Standard_NC12` — ~$1,296/month -- `Standard_NC6s_v3` — ~$2,203/month -- `Standard_ND40rs_v2` — ~$15,862/month -- `Standard_D4_v2` — ~$259/month - -**Required permissions:** -- `Microsoft.MachineLearningServices/workspaces/read` -- `Microsoft.MachineLearningServices/workspaces/computes/read` -- `Microsoft.Insights/metrics/read` - -> **Not run by default.** Run with `cleancloud scan --provider azure --category ai` (or `--category all`). Add `Microsoft.MachineLearningServices/workspaces/read` and `Microsoft.MachineLearningServices/workspaces/computes/read` to your custom role or use the built-in `AzureML Data Scientist` role in read-only mode. 
- ---- - -#### Idle Azure ML Compute Instances - -**Rule ID:** `azure.ml.compute_instance.idle` - -**Category:** `ai` - -**What it detects:** Azure ML Compute Instances in `Running` state with no control-plane activity for 14+ days, detected via `last_operation.operation_time`. Compute Instances are single-VM interactive development environments (Jupyter, VS Code, RStudio) that bill continuously while Running — regardless of kernel activity. GPU instances (NC/ND/NV series) idle for 2× the threshold are escalated to CRITICAL. - -**Detection signal — why `last_operation`:** -Azure ML Compute Instances do not publish per-instance utilisation metrics to Azure Monitor by default. `last_operation.operation_time` is updated by the Azure ML control plane on Start, Stop, Restart, and Create operations. An instance with no recent operation has had no control-plane activity — the same approach used for SageMaker Notebook `LastModifiedTime`. Falls back to `system_data.last_modified_at` if `last_operation` is unavailable. - -**Confidence:** -- **HIGH:** `last_operation.operation_time` or `last_modified_at` signal ≥ 14 days ago AND instance age ≥ 14 days -- **MEDIUM:** ≥ 75% of threshold on both signals, OR age-only fallback (when neither `last_operation` nor `last_modified_at` is available — age alone is not evidence of idleness) - -**Risk:** -- **CRITICAL:** GPU instance AND `idle_ratio ≥ 2.0` (e.g. 
28+ days at the default 14-day window) -- **HIGH:** GPU instance (`Standard_NC*`, `Standard_ND*`, `Standard_NV*`) -- **MEDIUM:** CPU instance - -**Why this matters:** -- Compute Instances bill at the full VM rate while Running — a stopped instance costs nothing -- GPU instances cost $600–$15K+/month running continuously -- Data scientists frequently leave instances Running after finishing a sprint, switching to a new instance, or during holidays - -**Estimated monthly cost:** -- `Standard_DS3_v2` — ~$260/month -- `Standard_NC6s_v3` — ~$2,203/month -- `Standard_NC24s_v3` — ~$8,812/month -- `Standard_ND40rs_v2` — ~$15,862/month - -**Required permissions:** -- `Microsoft.MachineLearningServices/workspaces/read` -- `Microsoft.MachineLearningServices/workspaces/computes/read` - -> **Not run by default.** Run with `cleancloud scan --provider azure --category ai`. Attach `security/azure/ai-readonly-role.json` to your service principal to enable this rule. - ---- - -#### Idle Azure OpenAI Provisioned Deployment - -**Rule ID:** `azure.openai.provisioned_deployment.idle` - -**Category:** `ai` - -**What it detects:** Azure OpenAI provisioned deployments (PTUs) with zero API requests for 7+ days (default, configurable). Provisioned Throughput Units reserve dedicated model capacity and bill continuously at ~$1,460/PTU/month on-demand regardless of traffic — a single idle 100-PTU GPT-4o deployment wastes ~$146,000/month. - -**Configurable parameters:** - -| Parameter | Default | Description | -|---|---|---| -| `idle_days` | `7` | Days of zero requests before flagging | - -**Detection signal:** - -Queries Azure Monitor `AzureOpenAIRequests` (falling back to `ProcessedPromptTokens`) with a `ModelDeploymentName` dimension filter to isolate per-deployment traffic. If the per-deployment dimension is unsupported in a region, falls back to account-level totals. Conservative: returns no finding on any API error. 
- -**Provisioned SKUs detected:** -- `ProvisionedManaged` — single-region reserved capacity -- `GlobalProvisionedManaged` — multi-region reserved capacity -- `DataZoneProvisionedManaged` — data-zone-scoped reserved capacity - -**Confidence:** -- **HIGH:** Per-deployment metric confirms zero requests AND deployment age ≥ `idle_days` -- **MEDIUM:** Per-deployment zero confirmed but age < `idle_days`; OR account-level zero (per-deployment dimension unavailable in region) - -**Risk:** -- **HIGH:** ≥ 7 PTUs (~$10K+/month estimated) -- **MEDIUM:** < 7 PTUs (still significant — PTU deployments have no cost-free tier) - -**Why this matters:** -- PTU deployments have no free tier — every hour of idle time is pure waste -- Common abandonment pattern: PoC deployments left running after evaluation, dev/test deployments forgotten when team moves to production, traffic migrated to a new deployment without decommissioning the old one -- Nobody else detects idle PTU deployments in CI — first-mover advantage - -**Estimated monthly cost:** -- 1 PTU — ~$1,460/month (on-demand) -- 10 PTUs — ~$14,600/month -- 100 PTUs — ~$146,000/month -- *Note: Monthly/annual reserved pricing is 30–50% lower; estimated cost shown is on-demand ceiling* - -**Required permissions:** -- `Microsoft.CognitiveServices/accounts/read` -- `Microsoft.CognitiveServices/accounts/deployments/read` -- `Microsoft.Insights/metrics/read` - -> **Not run by default.** Run with `cleancloud scan --provider azure --category ai` (or `--category all`). Add the permissions above to your custom read-only role. - ---- - -#### Idle Azure ML Online Endpoints - -**Rule ID:** `azure.ml.online_endpoint.idle` - -**Category:** `ai` - -**What it detects:** Azure ML managed online endpoints in `Succeeded` provisioning state with zero scoring requests for 7+ days (default, configurable). 
These endpoints bill per-instance based on minimum replica count regardless of traffic — a GPU-backed endpoint with no scoring requests is paying for capacity delivering zero value. - -**Detection signal:** Queries Azure Monitor `RequestCount` (falling back to `ModelEndpointRequests`) with an `EndpointName` dimension filter to isolate per-endpoint traffic. If the dimension is unsupported, falls back to workspace-level totals. Age-only fallback applies when metric data is unavailable and endpoint age ≥ 2× idle window (MEDIUM confidence). - -**Configurable parameters:** - -| Parameter | Default | Description | -|---|---|---| -| `idle_days` | `7` | Days of zero scoring requests before flagging | - -**Confidence:** -- **HIGH:** Per-endpoint metric confirms zero requests AND endpoint age ≥ `idle_days` -- **MEDIUM:** Zero requests confirmed but age < `idle_days`; OR metric data unavailable and age ≥ 2× `idle_days` - -**Risk:** -- **CRITICAL:** GPU/accelerator instance AND `idle_ratio ≥ 2.0` (idle for 2× the threshold) -- **HIGH:** GPU/accelerator instance (`Standard_NC*`, `Standard_ND*`, `Standard_NV*`, T4/A100 families) -- **MEDIUM:** CPU-backed instance - -**Why this matters:** -- Managed online endpoints bill per minimum replica continuously while in Succeeded state — even with zero traffic -- GPU-backed endpoints cost $200–$2,600+/month at single minimum replica -- Experiment and PoC endpoints are commonly abandoned after demos without being deleted or scaled to zero -- Unlike batch endpoints, managed online endpoints have no auto-scale-to-zero by default - -**Estimated monthly cost:** -- `Standard_NC6` (K80 GPU) — ~$657/month per replica -- `Standard_NC6s_v2` — ~$900/month per replica -- `Standard_NC12` — ~$1,300/month per replica -- CPU-backed (fallback) — ~$200/month per replica - -**Required permissions:** -- `Microsoft.MachineLearningServices/workspaces/read` -- `Microsoft.MachineLearningServices/workspaces/onlineEndpoints/read` -- 
`Microsoft.MachineLearningServices/workspaces/onlineEndpoints/deployments/read` -- `Microsoft.Insights/metrics/read` - -> **Not run by default.** Run with `cleancloud scan --provider azure --category ai`. Attach `security/azure/ai-readonly-role.json` to your service principal to enable this rule. - ---- - -#### Idle Azure AI Search Services - -**Rule ID:** `azure.ai_search.idle` - -**Category:** `ai` - -**What it detects:** Azure AI Search services on Standard tier or above with zero search queries over a 30-day window (default, configurable). Cost is computed per SKU × replica count × partition count — a Standard3 service with 3 replicas and 2 partitions idles at ~$6,282/month. - -**Detection signal:** Queries Azure Monitor `SearchQueriesPerSecond` (Average), falling back to `TotalSearchRequestCount` (Sum). Service-level metrics only — no per-index dimension filtering needed. Age-only fallback applies when metric data is unavailable and service age ≥ 2× idle window (MEDIUM confidence). - -**Watched SKUs:** `standard`, `standard2`, `standard3`, `storage_optimized_l1`, `storage_optimized_l2` — Basic tier is excluded (low cost, no signal). - -**Configurable parameters:** - -| Parameter | Default | Description | -|---|---|---| -| `idle_days` | `30` | Days of zero queries before flagging | - -**Confidence:** -- **HIGH:** Zero average `SearchQueriesPerSecond` for the full idle window AND service age ≥ `idle_days` -- **MEDIUM:** Zero confirmed but age < `idle_days`; OR metric data unavailable and age ≥ 2× `idle_days` - -**Risk:** -- **HIGH:** Estimated monthly cost ≥ $1,000 (e.g. 
Standard2+ or multi-replica/partition Standard) -- **MEDIUM:** All other cases - -**Why this matters:** -- AI Search services bill continuously by SKU × replicas × partitions regardless of query volume -- A Standard service with 1 replica and 1 partition costs ~$261/month idle — scale up to 2 replicas and the bill doubles -- Services are commonly left running after a project ends, a search index is replaced, or a PoC is abandoned -- Standard3 High-Density (HD) with 12 partitions can idle at ~$12,564/month - -**Estimated monthly cost per replica per partition:** - -| SKU | Monthly cost | -|---|---| -| Standard | $261 | -| Standard2 | $523 | -| Standard3 | $1,047 | -| Storage Optimized L1 | $2,014 | -| Storage Optimized L2 | $4,028 | - -Multiply by `replica_count × partition_count` for total monthly idle cost. - -**Required permissions:** -- `Microsoft.Search/searchServices/read` -- `Microsoft.Insights/metrics/read` - -> **Not run by default.** Run with `cleancloud scan --provider azure --category ai`. Attach `security/azure/ai-readonly-role.json` to your service principal to enable this rule. - ---- - -## Azure Rules - -### Compute Waste - -#### Stopped (Not Deallocated) VMs - -**Rule ID:** `azure.vm.stopped_not_deallocated` - -**What it detects:** VMs in 'Stopped' state (OS-level shutdown) that are not deallocated, still incurring full compute charges - -**Confidence:** - -Confidence thresholds and signal weighting are documented in [confidence.md](confidence.md). 
- -- **HIGH:** Power state is 'Stopped' (deterministic state check, zero false positives) - -**Risk:** HIGH - -**Why HIGH risk:** -- Stopped-but-not-deallocated VMs incur full compute charges ($30-500+/month depending on SKU) -- Users often believe their VM is "off" but are paying full price -- Classic Azure cost trap with significant financial impact - -**Why this matters:** -- Azure distinguishes between 'Stopped' (OS shutdown) and 'Deallocated' (compute released) -- Only deallocated VMs stop incurring compute charges -- 100% deterministic state check with zero false positives - -**Detection logic:** -```python -for vm in virtual_machines.list_all(): - instance_view = virtual_machines.instance_view(resource_group, vm.name) - power_state = get_power_state(instance_view.statuses) # PowerState/* code - if power_state == "PowerState/stopped": - confidence = "HIGH" # Deterministic: stopped but not deallocated - risk = "HIGH" # Full compute charges still applied -``` - -**Power states:** -- `PowerState/running` — active, skip -- `PowerState/deallocated` — properly stopped, skip -- `PowerState/stopped` — **FLAGGED** (still incurring compute charges) -- `PowerState/starting`, `PowerState/stopping`, `PowerState/deallocating` — transitional, skip - -**Common causes:** -- Shutting down the VM from inside the OS (instead of Azure portal/CLI) -- Using `Stop-AzVM` without `-StayProvisioned false` -- RDP/SSH shutdown commands -- Automated scripts that stop but don't deallocate - -**Required permission:** `Microsoft.Compute/virtualMachines/read` - ---- - -### Storage Waste - -#### Unattached Managed Disks - -**Rule ID:** `azure.compute.disk.unattached` - -**What it detects:** Managed disks not attached to any VM - -**Confidence:** - -Confidence thresholds and signal weighting are documented in [confidence.md](confidence.md). 
- -- **MEDIUM:** Unattached ≥ 7 days (conservative for all ages — unattached state is deterministic but attachment intent is not) -- Not flagged: < 7 days - -**Detection logic:** -```python -for disk in disks.list(): - if disk.managed_by is not None: - continue # attached to a VM - age_days = (now - disk.time_created).days - if age_days >= 7: - confidence = "MEDIUM" # conservative regardless of age - else: - continue # too new to flag -``` - -**Common causes:** -- Disks from deleted VMs -- Failed deployments -- Autoscaling cleanup gaps - -**Required permission:** `Microsoft.Compute/disks/read` - ---- - -#### Old Managed Disk Snapshots - -**Rule ID:** `azure.compute.snapshot.old` - -**What it detects:** Old managed snapshots that meet the conservative review threshold (default: 30 days) and are surfaced as review candidates only - -**Confidence:** - -Confidence thresholds and signal weighting are documented in [confidence.md](confidence.md). - -- **LOW:** Age ≥ 30 days and < `max_age_days` (default 90) — conservative review candidate -- **MEDIUM:** Age ≥ `max_age_days` (default 90) — very old snapshot, higher review priority -- Not flagged: < 30 days -- `HIGH` is never used — age alone cannot establish HIGH confidence - -**Configurable params:** -- `max_age_days` (default: `90`) — age threshold for the MEDIUM confidence band - -**Detection logic:** -```python -for snapshot in snapshots.list(): - if not snapshot.id or snapshot.provisioning_state != "Succeeded": - continue - if snapshot.completion_percent is not None and snapshot.completion_percent < 100: - continue - age_days = (now - snapshot.time_created).days - if age_days < 30: - continue - confidence = "MEDIUM" if age_days >= max_age_days else "LOW" -``` - -**Cost model:** `estimated_monthly_cost_usd` is always `None`. Azure bills snapshots on **used size**, not `diskSizeGB`, so no per-snapshot cost estimate is possible from the API response alone. 
- -**Limitations:** -- Age alone does not prove a snapshot is unused, orphaned, or safe to delete -- Does not check backup ownership, DR retention intent, or application restore references -- If Azure surfaces `completionPercent`, incomplete background-copy snapshots are skipped conservatively -- Conservative by design — flags review candidates only - -**Common causes:** -- Snapshots from backup jobs retained beyond their useful life -- Over-retention without lifecycle policies -- Snapshots from deleted or migrated disks - -**Required permission:** `Microsoft.Compute/snapshots/read` - ---- - -### Network Waste - -#### Unused Public IP Addresses - -**Rule ID:** `azure.network.public_ip.unused` - -**What it detects:** Public IPs not attached to any network interface - -**Confidence:** - -Confidence thresholds and signal weighting are documented in [confidence.md](confidence.md). - -- **MEDIUM:** Not attached (deterministic state, but may be reserved intentionally) - -**Why this matters:** -- Public IPs incur charges even when unused -- State is deterministic (no heuristics needed) - -**Detection logic:** -```python -if public_ip.ip_configuration is None: - confidence = "MEDIUM" -``` - -**Required permission:** `Microsoft.Network/publicIPAddresses/read` - ---- - -#### Standard Load Balancer with No Backend Members - -**Rule ID:** `azure.load_balancer.no_backends` - -**What it detects:** Standard Load Balancers where all backend pools have zero members - -**Confidence:** - -Confidence thresholds and signal weighting are documented in [confidence.md](confidence.md). 
- -- **HIGH:** Standard SKU with zero backend members across all pools (deterministic state) - -**Excluded:** -- Basic SKU load balancers are skipped (retired, no cost signal) - -**Why this matters:** -- Standard Load Balancers incur base charges (~$18/month) regardless of backends -- Empty LBs are a clear cost optimization signal -- Common after VM/VMSS teardowns or migrations - -**Detection logic:** -```python -if lb.sku.name == "Standard": - pools = lb.backend_address_pools or [] - # Check both NIC-based and IP-based backend representations - has_members = any( - pool.backend_ip_configurations or pool.load_balancer_backend_addresses - for pool in pools - ) - if not has_members: - confidence = "HIGH" # Deterministic: zero members across all pools -``` - -**Backend representations checked:** -- `backend_ip_configurations` — NIC-based backends (standard VMs) -- `load_balancer_backend_addresses` — IP-based backends (Private Link, hybrid) - -**Common causes:** -- VMs or VMSS deleted but LB retained -- Migration from Basic to Standard leaving empty LBs -- Failed deployments or incomplete teardowns -- Hub-spoke architecture cleanup gaps - -**Required permission:** `Microsoft.Network/loadBalancers/read` - ---- - -#### Application Gateway with No Backend Targets - -**Rule ID:** `azure.application_gateway.no_backends` - -**What it detects:** Application Gateways where all backend pools have zero targets (no IP addresses or FQDNs) - -**Confidence:** - -Confidence thresholds and signal weighting are documented in [confidence.md](confidence.md). 
- -- **HIGH:** All backend pools have zero targets (deterministic state) - -**Excluded:** -- Gateways with `provisioning_state != "Succeeded"` are skipped (in-progress) - -**Why this matters:** -- Application Gateways incur significant charges regardless of backends -- Standard_v2 and WAF_v2 SKUs cost $150-300+/month -- Empty gateways are a clear cost optimization signal - -**Detection logic:** -```python -for gw in application_gateways: - pools = gw.backend_address_pools or [] - has_any_targets = any( - (pool.backend_addresses and len(pool.backend_addresses) > 0) or - (pool.backend_ip_configurations and len(pool.backend_ip_configurations) > 0) - for pool in pools - ) - if not has_any_targets: - confidence = "HIGH" # Deterministic: zero targets across all pools - risk = "MEDIUM" # Significant cost impact ($150-300+/month) -``` - -**Backend targets checked:** -- `backend_addresses` array (IP addresses or FQDNs) -- `backend_ip_configurations` array (NIC-based backend references) - -**Common causes:** -- Backend VMs or services deleted but gateway retained -- Migration or transition leaving empty gateways -- Failed deployments or incomplete teardowns -- WAF-only setup without actual backends (rare) - -**Cost estimates by SKU:** -- Standard_v2, WAF_v2: $150-300+/month -- Standard, WAF (v1): $20-50/month - -**Required permission:** `Microsoft.Network/applicationGateways/read` - ---- - -#### Idle VNet Gateways (VPN/ExpressRoute) - -**Rule ID:** `azure.virtual_network_gateway.idle` - -**What it detects:** VPN Gateways and ExpressRoute Gateways with no active connections +| Provider | Hygiene | AI/ML | Total | Catalog | +|---|---|---|---|---| +| AWS | 13 | 6 | 19 | [rules/aws.md](rules/aws.md) | +| Azure | 12 | 5 | 17 | [rules/azure.md](rules/azure.md) | +| GCP | 5 | 5 | 10 | [rules/gcp.md](rules/gcp.md) | -**Confidence:** - -Confidence thresholds and signal weighting are documented in [confidence.md](confidence.md). 
- -- **MEDIUM:** No active connections (connection state checked, but P2S clients not verified) - -**Why MEDIUM confidence:** -- We can verify Site-to-Site and ExpressRoute connections -- Point-to-Site VPN client count requires additional API calls -- Gateway may have P2S config but no way to check active clients without deeper inspection - -**Risk:** HIGH - -**Why HIGH risk:** -- VNet Gateways are among the most expensive idle resources ($500-3,500+/month) -- Cost impact is material even for a single idle gateway -- Significantly higher than Load Balancers (~$18/month) or App Gateways (~$150-300/month) - -**Why this matters:** -- VNet Gateways incur significant charges regardless of connections -- VPN Gateway SKUs: $27-3,500+/month depending on SKU -- ExpressRoute Gateway SKUs: $125-1,100+/month -- Idle gateways are a major cost optimization signal - -**Detection logic:** -```python -for gw in virtual_network_gateways: - connections = list_connections(gw) - active_connections = [c for c in connections if c.connection_status == "Connected"] - - if gw.gateway_type == "Vpn": - if len(active_connections) == 0 and not has_p2s_config: - # Flag as idle - elif gw.gateway_type == "ExpressRoute": - if len(active_connections) == 0: - # Flag as idle -``` - -**Connection states checked:** -- Site-to-Site VPN connections (connection_status == "Connected") -- ExpressRoute circuit connections -- Point-to-Site VPN configuration (presence only, not active client count) - -**Common causes:** -- VPN tunnels torn down but gateway retained -- ExpressRoute circuits decommissioned -- Test/dev gateways left running -- Migration or transition leaving orphaned gateways -- DR standby gateways (intentional, but worth reviewing) - -**Cost estimates by SKU:** -- Basic: $27/month -- VpnGw1/ErGw1AZ: $140-195/month -- VpnGw2/ErGw2AZ: $360-505/month -- VpnGw3/ErGw3AZ: $930-1,115/month -- HighPerformance/UltraPerformance: $335-670/month - -**Required permissions:** -- 
`Microsoft.Network/virtualNetworkGateways/read` -- `Microsoft.Network/connections/read` - ---- - -### Platform Waste - -#### Empty App Service Plans - -**Rule ID:** `azure.app_service_plan.empty` - -**What it detects:** Paid App Service Plans with zero hosted apps (`number_of_sites == 0`) - -**Confidence:** - -Confidence thresholds and signal weighting are documented in [confidence.md](confidence.md). - -- **HIGH:** Paid tier plan with 0 apps (deterministic state) - -**Excluded tiers:** -- Free and Shared tiers are skipped (no cost signal) - -**Why this matters:** -- Paid App Service Plans incur charges regardless of hosted apps -- Empty plans are a clear cost optimization signal -- Common after app deletions or failed deployments - -**Detection logic:** -```python -if plan.number_of_sites == 0: - if plan.sku.tier not in ("Free", "Shared"): - confidence = "HIGH" # Deterministic: zero apps on paid plan -``` - -**Common causes:** -- Apps deleted but plan retained -- Failed deployments leaving empty plans -- Scaling plans created but never used -- Migration leaving old plans behind - -**Required permissions:** `Microsoft.Web/serverfarms/read`, `Microsoft.Web/serverfarms/sites/read` - ---- - -#### Idle Azure SQL Databases - -**Rule ID:** `azure.sql.database.idle` - -**What it detects:** Azure SQL databases with zero connections for 14+ days (default, configurable) - -**Confidence:** - -Confidence thresholds and signal weighting are documented in [confidence.md](confidence.md). 
- -- **HIGH:** Zero connections for 14+ days (Azure Monitor metrics checked, strong idle signal) - -**Risk:** HIGH - -**Why HIGH risk:** -- Azure SQL databases in Standard/Premium tiers cost $15-$7,500+/month -- Idle databases with no connections are a clear cost optimization signal - -**Why this matters:** -- Azure SQL databases incur charges regardless of usage -- Standard and Premium tiers have significant hourly costs -- Idle databases are a major cost optimization opportunity - -**Detection logic:** -```python -for server in sql_servers: - for db in databases.list_by_server(rg, server.name): - if db.name == "master": # Skip system databases - continue - if db.sku.tier == "Basic": # Skip Basic tier (< $5/month) - continue - connections = get_metric(connection_successful, period=14_days) - if connections == 0: - confidence = "HIGH" - risk = "HIGH" -``` - -**Azure Monitor metrics checked:** -- `connection_successful` (daily total over 14-day window) - -**Exclusions:** -- System databases (`master`) -- Basic tier databases (< $5/month, not worth flagging) - -**Common causes:** -- Applications migrated to different databases -- Dev/staging databases left running -- Decommissioned services with retained databases -- Test databases no longer needed - -**Cost estimates by SKU:** -- Standard S0: ~$15/month -- Standard S3: ~$150/month -- Premium P1: ~$465/month -- Premium P6: ~$3,720/month -- Premium P15: ~$7,446/month - -**Required permissions:** -- `Microsoft.Sql/servers/read` -- `Microsoft.Sql/servers/databases/read` -- `Microsoft.Insights/metrics/read` - ---- - -#### Idle App Services - -**Rule ID:** `azure.app_service.idle` - -**What it detects:** Running App Service web apps with zero HTTP requests for 14+ days on paid plans - -**Confidence:** - -Confidence thresholds and signal weighting are documented in [confidence.md](confidence.md). 
- -- **HIGH:** Zero requests for 14+ days (Azure Monitor `Requests` metric, strong idle signal) - -**Risk:** MEDIUM - -**Why this matters:** -- App Service Plans on paid tiers bill compute charges continuously regardless of traffic -- An app with zero requests for 14+ days is a strong signal of abandonment -- Common for dev/staging apps that were never decommissioned - -**Detection logic:** -```python -for app in web_apps.list(): - if app.state == "Running" and app.sku.tier not in ("Free", "Shared", "Dynamic"): - requests = monitor.metrics("Requests", period=days_idle) - if requests == 0: - confidence = "HIGH" - risk = "MEDIUM" -``` - -**Excluded tiers:** -- Free, Shared, Dynamic (Consumption/serverless) — no meaningful idle cost - -**Common causes:** -- Dev or staging apps left running after project end -- Feature branches deployed and never torn down -- Apps migrated to containers but old App Service not removed - -**Cost estimates by tier (single instance):** -- Basic: ~$55/month -- Standard: ~$73/month -- Premium/PremiumV2/V3: ~$146/month -- Isolated/IsolatedV2: ~$298/month - -Cost assumes one instance. Scaled-out plans (multiple instances) will cost proportionally more — treat these as minimum estimates. - -**Not detected:** -- Non-HTTP workloads such as WebJobs or background services with no inbound HTTP traffic — these produce zero `Requests` metric data even when active. Review before deleting. - -**Required permissions:** -- `Microsoft.Web/sites/read` -- `Microsoft.Web/serverfarms/read` -- `Microsoft.Insights/metrics/read` - ---- - -#### Unused Container Registries - -**Rule ID:** `azure.container_registry.unused` - -**What it detects:** Container registries with zero **successful** pulls and pushes for 90+ days (default, configurable), after the registry is old enough to cover the full inactivity window - -**Confidence:** - -Confidence thresholds and signal weighting are documented in [confidence.md](confidence.md). 
- -- **HIGH:** Zero successful pulls AND zero successful pushes for 90+ days (Azure Monitor `SuccessfulPullCount` and `SuccessfulPushCount` metrics) - -**Risk:** LOW - -**Why this matters:** -- Container registries accrue storage and per-operation charges regardless of usage -- A registry with no pulls and no pushes for 90+ days signals complete abandonment -- Common after workload migrations to other registries or container platforms - -**Detection logic:** -```python -for registry in registries.list(): - if registry.provisioning_state != "Succeeded": - continue - if registry.creation_date is None or registry.creation_date > window_start: - continue - - pulls = evaluate_metric("SuccessfulPullCount", interval="PT1H") - pushes = evaluate_metric("SuccessfulPushCount", interval="PT1H") - if pulls == "ZERO" and pushes == "ZERO": - confidence = "HIGH" - risk = "LOW" -``` - -Registries with active push activity (for example CI pipelines writing images) but zero pulls are **not** flagged. Registries with sparse, failed, or low-coverage metrics are skipped rather than emitted. - -**Common causes:** -- Workloads migrated to another registry (e.g., Docker Hub → ACR → GHCR) -- Projects retired without cleaning up the registry -- Old build artifacts never consumed by any deployment - -**Cost estimates by SKU (base fee only):** -- Basic: ~$5/month + storage -- Standard: ~$20/month + storage -- Premium: ~$50/month + storage - -Unknown or future SKU labels are still evaluated for inactivity, but `estimated_monthly_cost_usd` is left unset when the SKU is not one of `Basic`, `Standard`, or `Premium`. - -These are base monthly registry fees only. Storage charges and related Azure costs are not included. 
- -**Required permissions:** -- `Microsoft.ContainerRegistry/registries/read` -- `Microsoft.Insights/metrics/read` - ---- - -### Governance - -#### Untagged Resources - -**Rule ID:** `azure.resource.untagged` - -**What it detects:** Resources with zero tags - -**Resources checked:** -- Managed disks (7+ days old) -- Snapshots - -**Confidence:** - -Confidence thresholds and signal weighting are documented in [confidence.md](confidence.md). - -- **MEDIUM:** Untagged disk that's also unattached -- **LOW:** Untagged snapshot or attached disk - -**Required permissions:** -- `Microsoft.Compute/disks/read` -- `Microsoft.Compute/snapshots/read` - ---- - -## GCP Rules - -### Compute Waste - -#### Stopped VM Instances - -**Rule ID:** `gcp.compute.vm.stopped` - -**What it detects:** VM instances in `TERMINATED` state for 30+ days - -**Confidence:** - -- **HIGH:** `lastStopTimestamp` present and ≥ 30 days ago (deterministic timestamp) -- **MEDIUM:** `lastStopTimestamp` absent — instance is TERMINATED but stop time is unavailable -- Not flagged: stopped < 30 days, or instance in any other state (RUNNING, STAGING, etc.) 
- -**Risk:** LOW - -**Why this matters:** -- A TERMINATED GCP VM does not charge for vCPU or memory — but every attached Persistent Disk accrues storage charges at ~$0.04/GB-month (standard) or ~$0.17/GB-month (SSD), regardless of instance state -- A 500 GB root disk on a forgotten stopped instance costs ~$20/month indefinitely -- This is the GCP equivalent of a stopped EC2 instance — the compute is free, the storage is not - -**Detection logic:** -```python -for instance in instances_client.aggregated_list(project=project_id): - if instance.status == "TERMINATED": - if _parse_gcp_timestamp(instance.last_stop_timestamp) > cutoff: - flag(instance) -``` - -**Cost estimate:** Sum of attached PERSISTENT disk sizes × $0.04/GB/month (SCRATCH disks excluded — they are ephemeral) - -**Required permissions:** -- `compute.instances.list` (included in `roles/compute.viewer`) - ---- - -### Storage Waste - -#### Unattached Persistent Disks - -**Rule ID:** `gcp.compute.disk.unattached` - -**What it detects:** Persistent Disks in `READY` state with no attached VM (`users == []`) - -**Confidence:** - -- **HIGH:** Disk is READY and has no users — unambiguous detachment - -**Risk:** LOW - -**Why this matters:** -- GCP charges for Persistent Disks regardless of whether they are attached to a VM -- pd-standard: ~$0.04/GB/month, pd-ssd: ~$0.17/GB/month, pd-balanced: ~$0.10/GB/month, pd-extreme: ~$0.12/GB/month -- Unattached disks accumulate when VMs are deleted without deleting their disks — the most common source of GCP storage waste -- A 500 GB pd-ssd left unattached costs ~$85/month - -**Detection logic:** -```python -for disk in disks_client.aggregated_list(project=project_id): - if disk.status == "READY" and not disk.users: - flag(disk) -``` - -**Cost estimate by disk type:** - -| Type | Rate | -|---|---| -| `pd-standard` | $0.04/GB/month | -| `pd-balanced` | $0.10/GB/month | -| `pd-ssd` | $0.17/GB/month | -| `pd-extreme` | $0.12/GB/month | - -**Required permissions:** -- 
`compute.disks.list` (included in `roles/compute.viewer`) - ---- - -#### Old Disk Snapshots - -**Rule ID:** `gcp.compute.snapshot.old` - -**What it detects:** Disk snapshots older than 90 days - -**Confidence:** - -- **HIGH:** Source disk no longer exists (snapshot is orphaned — the source was deleted) -- **MEDIUM:** Source disk still exists (might be intentional long-term backup or DR snapshot) - -**Risk:** LOW - -**Why this matters:** -- GCP snapshots are billed at ~$0.026/GB/month compressed storage in Cloud Storage -- Automated snapshot policies are frequently removed while their snapshots are left behind -- One-off manual snapshots are rarely cleaned up — they persist indefinitely until explicitly deleted -- Snapshots are global resources — they accumulate across all zones and appear in no specific region - -**Detection logic:** -```python -for snapshot in snapshots_client.list(project=project_id): - if snapshot.status == "READY": - if _parse_gcp_timestamp(snapshot.creation_timestamp) < cutoff: - confidence = HIGH if not snapshot.source_disk else MEDIUM - flag(snapshot) -``` - -**Cost estimate:** Uses `storage_bytes` (actual compressed size) when available; falls back to `disk_size_gb × $0.026/GB/month` - -Note: `region_filter` is ignored for snapshots — GCP snapshots are global resources with no region attribute. 
- -**Required permissions:** -- `compute.snapshots.list` (included in `roles/compute.viewer`) - ---- - -### Network Waste - -#### Unused Reserved Static IPs - -**Rule ID:** `gcp.compute.ip.unused` - -**What it detects:** Reserved static IP addresses (regional and global) in `RESERVED` status (not `IN_USE`) - -**Confidence:** - -- **HIGH:** IP status is `RESERVED` — unambiguous, GCP itself confirms it is not attached - -**Risk:** LOW - -**Why this matters:** -- GCP bills ~$0.01/hour (~$7.20/month) for each static IP in RESERVED status under the PREMIUM network tier -- Reserved IPs accumulate when VMs, load balancers, or NAT gateways are deleted without releasing their IPs -- Unlike ephemeral IPs, reserved IPs persist independently — they must be explicitly released to stop billing - -**Detection logic:** -```python -# Regional IPs -for address in addresses_client.aggregated_list(project=project_id): - if address.status == "RESERVED": - flag(address, scope="regional") - -# Global IPs (skipped if region_filter is set) -for address in global_addresses_client.list(project=project_id): - if address.status == "RESERVED": - flag(address, scope="global") -``` - -**Graceful degradation:** If `compute.globalAddresses.list` is denied but regional IPs succeed, the rule returns regional findings rather than failing entirely. 
- -**Cost estimate:** $7.20/month per unused IP (PREMIUM network tier default) - -**Required permissions:** -- `compute.addresses.list` (included in `roles/compute.viewer`) -- `compute.globalAddresses.list` (included in `roles/compute.viewer`) - ---- - -### Platform Waste - -#### Idle Cloud SQL Instances - -**Rule ID:** `gcp.sql.instance.idle` - -**What it detects:** Cloud SQL instances in `RUNNABLE` state with zero database connections for 14+ days - -**Confidence:** - -- **HIGH:** Monitoring confirms zero connections for the full 14-day window - -**Risk:** HIGH - -**Why this matters:** -- Cloud SQL bills continuously for vCPU and memory regardless of query load -- A `db-n1-standard-2` costs ~$93/month with zero queries -- Dev and staging databases are frequently left running after feature branches merge or projects wind down -- Cloud SQL is the highest-cost idle resource type in most GCP environments - -**Detection logic:** -```python -for instance in sql_admin_api.list(project_id): - if instance.state == "RUNNABLE" and not is_read_replica(instance): - if not has_connections(monitoring_client, project_id, instance.name, days=14): - flag(instance) -``` - -**Conservative monitoring fallback:** If Cloud Monitoring is unavailable or permission-denied, the instance is assumed active — it is not flagged. This avoids false positives when monitoring data is temporarily unavailable. - -**Read replicas excluded:** Read replicas have no independent billing basis — the primary instance cost is what matters. - -**Cost estimates by tier:** - -| Tier | ~Monthly cost | -|---|---| -| `db-f1-micro` | $7.67 | -| `db-g1-small` | $25.22 | -| `db-n1-standard-1` | $46.55 | -| `db-n1-standard-2` | $93.10 | -| `db-n1-standard-4` | $186.19 | -| `db-n1-highmem-2` | $113.45 | -| `db-n1-highmem-4` | $226.90 | - -Costs are approximate for us-central1 with HA disabled. 
- -**Required permissions:** -- `cloudsql.instances.list` (included in `roles/cloudsql.viewer`) -- `monitoring.timeSeries.list` (included in `roles/monitoring.viewer`) - ---- - -### AI/ML Waste (opt-in — `--category ai`) - -#### Idle Vertex AI Online Prediction Endpoints - -**Rule ID:** `gcp.vertex.endpoint.idle` - -**What it detects:** Vertex AI Online Prediction endpoints with `dedicatedResources.minReplicaCount > 0` and zero prediction requests for 14+ days - -**Confidence:** - -- **HIGH:** Zero predictions for the full 14-day window (endpoint age ≥ 14 days) -- **MEDIUM:** Zero predictions, endpoint age ≥ 75% of threshold (≥ 10 days), or age unknown - -**Risk:** HIGH (GPU-backed endpoints: T4, V100, A100, L4, H100, TPU), MEDIUM (CPU-only) - -**Why this matters:** -- Vertex AI endpoints with `minReplicaCount > 0` keep dedicated compute running 24/7 regardless of traffic -- GPU endpoints (T4: $311/month per GPU, A100: $2,933/month, H100: $8,000/month) are especially costly when idle -- Experiment and prototype endpoints are commonly abandoned after demos without being deleted or scaled to zero -- Endpoints using `automaticResources` (which scale to zero) are excluded — only `dedicatedResources` incur idle cost - -**Detection logic:** -```python -for endpoint in vertex_ai_api.list(project_id, location="-"): # all locations - total_min_replicas = sum( - m.dedicatedResources.minReplicaCount - for m in endpoint.deployedModels - if m.dedicatedResources # skip automaticResources - ) - if total_min_replicas > 0: - if not has_predictions(monitoring_client, endpoint_id, days=14): - flag(endpoint) -``` - -**Conservative monitoring fallback:** If Cloud Monitoring is unavailable or permission-denied, the endpoint is assumed active — it is not flagged. 
- -**Cost estimates by machine type (per node, us-central1):** - -| Machine Type | ~Monthly cost/node | -|---|---| -| `n1-standard-4` | $138 | -| `n1-standard-8` | $277 | -| `n1-standard-4` + T4 GPU | $449 | -| `n1-standard-4` + V100 GPU | $1,523 | -| `a2-highgpu-1g` (A100 40GB) | $2,933 | -| `a2-highgpu-2g` (2× A100) | $5,866 | -| `a2-ultragpu-1g` (A100 80GB) | $5,103 | -| `g2-standard-8` (L4 GPU) | $1,060 | - -Costs are approximate for us-central1, on-demand. Multiply by `minReplicaCount` for total monthly idle cost. - -**Required permissions:** -- `aiplatform.endpoints.list` (included in `roles/aiplatform.viewer`) -- `monitoring.timeSeries.list` (included in `roles/monitoring.viewer`) - ---- - -#### Idle Vertex AI Workbench Instances - -**Rule ID:** `gcp.vertex.workbench.idle` - -**What it detects:** Vertex AI Workbench instances in ACTIVE state with no control-plane activity for 14+ days - -**Confidence:** - -- **HIGH:** `updateTime` ≥ 14 days ago AND instance age ≥ 14 days -- **MEDIUM:** `updateTime` ≥ 75% of threshold (≥ 10 days) **and** instance age ≥ 10 days, or `updateTime` unavailable (age-fallback, capped at MEDIUM) - -**Risk:** CRITICAL (GPU-backed, idle ≥ 2× threshold), HIGH (GPU-backed), MEDIUM (CPU-only) - -**Why this matters:** -- Workbench instances incur continuous compute charges while ACTIVE, even with no open notebooks or active kernels -- GPU instances (T4: $311/month, A100: $2,933/month, H100: $8,000/month) are extremely costly when left idle -- Data scientists commonly leave instances running after a sprint ends, a project is deprioritised, or when switching to a newer instance - -**Detection logic:** -```python -for instance in notebooks_api.list(project_id, location="-"): # all locations - if instance.state == "ACTIVE": - idle_days = (now - instance.updateTime).days - if idle_days >= 14: - flag(instance) -``` - -**updateTime** is updated by the Notebooks API when the instance is started, stopped, restarted, or reconfigured. 
Instances with stale `updateTime` have had no control-plane activity. This mirrors `LastModifiedTime` (SageMaker) and `last_modified_at` (Azure ML). - -**Cost estimates (per instance, us-central1, on-demand):** - -| Machine Type | ~Monthly cost | -|---|---| -| `n1-standard-4` | $138 | -| `n1-standard-4` + T4 GPU | $449 | -| `n1-standard-4` + V100 GPU | $1,523 | -| `a2-highgpu-1g` (A100 40GB) | $2,933 | -| `g2-standard-8` (L4 GPU) | $1,060 | - -**Required permissions:** -- `notebooks.instances.list` (included in `roles/notebooks.viewer`) +**Information hierarchy:** +- `docs/rules/<provider>.md` — operator catalog: permissions, params, exclusions, spec links +- `docs/specs/<provider>/<rule_id>.md` — canonical decision contracts, evidence shape, cost model, failure behavior +- Rule `<rule_id>.py` header — implementation notes for engineers --- -#### Long-Running Vertex AI Training Jobs - -**Rule ID:** `gcp.vertex.training_job.long_running` - -**What it detects:** Vertex AI CustomJobs (state=`JOB_STATE_RUNNING`) and TrainingPipelines (state=`PIPELINE_STATE_RUNNING`) that have been running longer than expected. The default threshold is 24 hours. GPU/TPU accelerator jobs and expensive CPU clusters raise an early warning at 90% of the threshold (21.6h at defaults) because high burn rates make runaway detection time-sensitive. - -Most training jobs complete in minutes to a few hours. A job still running well past the threshold is likely hung, stalled, or runaway — waiting on data, deadlocked in distributed training, caught in an OOM loop, or simply forgotten after a project was cancelled. - -GPU-backed training is especially costly: an A100 40GB node (`a2-highgpu-1g`) runs at ~$4/hour; an H100 node (`a3-highgpu-8g`) with 8 GPUs runs at ~$80/hour. Distributed multi-worker jobs multiply cost linearly. 
- -**Confidence:** - -- **HIGH:** `duration ≥ long_running_hours × 3` — clearly runaway for almost any single training run -- **MEDIUM:** `duration ≥ long_running_hours` — worth reviewing; could be legitimate large-scale training -- **MEDIUM (early warning):** GPU/TPU accelerator job, or CPU cluster with burn rate above `expensive_hourly_threshold` (default $20/hr), at 90–100% of threshold — not emitted for cheap CPU-only jobs below threshold - -**Risk:** - -| Confidence | GPU/Accelerator | Risk | -|---|---|---| -| HIGH | Yes | CRITICAL | -| HIGH | No or unknown | HIGH | -| MEDIUM | Any | MEDIUM | - -**Why this matters:** -- Vertex AI CustomJobs with GPU workers continue billing as long as they are in `JOB_STATE_RUNNING` -- There is no automatic stop unless `timeout` is set in the job spec — jobs can run indefinitely if hung or if the stopping condition is never met -- TrainingPipelines wrap CustomJobs and can also run indefinitely if the underlying job does not terminate - -**Detection logic:** -```python -# Queries both resource types across all locations via REST API -for job in vertex_ai.customJobs(project, locations="-", filter='state="JOB_STATE_RUNNING"'): - duration = now - job.startTime # fallback to createTime if absent - is_accelerator = has_gpu_or_tpu(job.workerPoolSpecs) - burn_rate = total_hourly_cost(job.workerPoolSpecs) - if duration < threshold * 0.9: - continue # too young - if duration < threshold and not (is_accelerator or burn_rate > 20): - continue # early-warning zone: skip cheap CPU-only jobs - -for pipeline in vertex_ai.trainingPipelines(project, locations="-", filter='state="PIPELINE_STATE_RUNNING"'): - ... 
# same logic; hardware parsed from trainingTaskInputs when available -``` - -**Hardware detection:** -- Accelerator classification uses `workerPoolSpecs[].machineSpec.acceleratorType` against a frozenset of known accelerator types (GPU families and TPU pod types), plus machine type prefixes that bundle accelerator cost (`a2-*`, `a3-*`, `a4-*`, `a4x-*`, `g2-*`, `g4-*`, `ct4-*`, `ct5*`, `ct6*`, `tpu*`) -- TPU machines use `tpuTopology` (e.g. `"2x4"`) to derive the physical host count — `replicaCount` is always 1 in the Vertex AI API regardless of pod size -- TrainingPipelines embed hardware in opaque `trainingTaskInputs` — when specs cannot be parsed, cost uses a duration-tiered placeholder (`>24h → $20/hr`, `6–24h → $5/hr`, `<6h → $1/hr`) and `is_accelerator` is `False` (unknown hardware does not imply GPU workload) -- For bundled accelerator machines, co-scheduling is modeled: when `acceleratorCount` divides `machine_gpu_count` evenly, the machine cost is divided by `machine_gpu_count ÷ acceleratorCount` replicas per VM - -**Cost reported:** -- Accrued cost so far: `duration_hours × hourly_burn_rate` (sum across all worker pools); stored raw in `details["accrued_cost_usd"]` and capped at $1M in display text -- `estimated_monthly_cost_usd` is intentionally `None` — training jobs are transient, not recurring monthly expenses; populating that field would corrupt monthly savings totals -- Pricing is a static estimate (us-central1, on-demand); `details["pricing_scope"] = "us-central1_reference"` and `details["pricing_note"]` indicate the reference region and whether the job's actual region may differ significantly -- `details["pricing_confidence"]` is `"published"` when all prices come from GCP pricing pages, or `"partial_estimate"` for newer machine families (a3-megagpu, a4-*, g4-*, ct5p-*, ct6e-*, tpu7x-*) where rates are estimated - -**Cost estimates (per node, us-central1, on-demand):** - -| Machine Type | ~Hourly cost | Notes | -|---|---|---| -| `n1-standard-8` + 
T4 | ~$0.80/hr | GPU cost additive | -| `n1-standard-8` + V100 | ~$2.27/hr | GPU cost additive | -| `a2-highgpu-1g` (A100 40GB) | ~$4.02/hr | GPU bundled | -| `a2-highgpu-8g` (8× A100 40GB) | ~$32.14/hr | GPU bundled | -| `a3-highgpu-8g` (8× H100 80GB) | ~$80.00/hr | GPU bundled [est] | -| `g2-standard-8` (L4) | ~$1.45/hr | GPU bundled | -| `ct5lp-hightpu-8t` (8× TPU v5e) | ~$9.60/hr | TPU bundled | - -**What it does not check:** -- Intentional long-running distributed training (LLM pre-training, large fine-tunes) -- Checkpoint saving — job may be making progress without visible status updates -- Committed use discounts — actual cost may be significantly lower than on-demand estimate -- Preemptible/Spot workers — cost and interruption semantics differ -- Co-scheduling for g2-standard-32 — GPU count is ambiguous in GCP docs; that machine type uses full-price-per-replica as a conservative fallback - -**Required permissions:** -- `aiplatform.customJobs.list` (included in `roles/aiplatform.viewer`) -- `aiplatform.trainingPipelines.list` (included in `roles/aiplatform.viewer`) - ---- - -#### Idle Cloud TPU Nodes - -**Rule ID:** `gcp.tpu.idle` - -**What it detects:** Cloud TPU nodes in `READY` state with near-zero utilization for 7+ days. A READY TPU node incurs compute charges continuously, regardless of whether any workload is running. Forgotten TPU nodes left running after a training job completes are a common source of runaway cost. 
- -**Confidence:** - -- **HIGH:** Cloud Monitoring reports max `tpu.googleapis.com/node/accelerator/duty_cycle ≤ 2%` across all workers over the idle window (7 days by default) — the TPU was genuinely not executing any workload -- **LOW:** Monitoring data unavailable; node exists for ≥ idle_days with no observed activity — existence duration is not a reliable idle proxy (node may still be in active use) - -**Risk:** - -| Confidence | Hourly cost | Risk | -|---|---|---| -| HIGH | ≥ $10/hr | CRITICAL | -| HIGH | < $10/hr | HIGH | -| LOW | Any | MEDIUM | - -**Why this matters:** -- TPU nodes bill from the moment they reach READY state, regardless of utilization -- Unlike GPU instances, Cloud TPU nodes have no automatic stop after a job completes — they must be explicitly deleted -- An idle v4 node (4 chips, `2x2x1` topology) costs ~$12.88/hr; a v5p-8 costs ~$33.60/hr; a forgotten large pod runs up thousands per day - -**Detection logic:** -```python -# List all READY TPU nodes via Cloud TPU v2 REST API (locations/- wildcard) -for node in tpu.projects.locations.nodes.list(project, location="-"): - if node.state != "READY": - continue - age = age_days(node.createTime) - if age < idle_days: - continue # too young — enforce minimum observation window - # Check Cloud Monitoring for near-zero duty_cycle - duty_cycle = max_duty_cycle(node.id, window=idle_days) - if duty_cycle is not None: - idle = duty_cycle <= 0.02 # HIGH confidence - else: - idle = True # LOW confidence — age-based heuristic, utilization unknown -``` - -**Cost estimates (us-central1, on-demand):** - -| TPU Type | Chips | ~Hourly cost | Notes | -|---|---|---|---| -| `v2-8` | 8 | $12.00/hr | $1.50/chip-hr, published | -| `v3-8` | 8 | $17.60/hr | $2.20/chip-hr (device); v3 pod is $2.00/chip-hr | -| `v4` (2x2x1) | 4 | $12.88/hr | $3.22/chip-hr, published | -| `v4` (2x2x2) | 8 | $25.76/hr | $3.22/chip-hr, published | -| `v5e` (litepod-4) | 4 | $4.80/hr | $1.20/chip-hr, published | -| `v5e` (litepod-8) | 8 | 
$9.60/hr | $1.20/chip-hr, published | -| `v5p-4` | 4 | $16.80/hr | $4.20/chip-hr, published | -| `v5p-8` | 8 | $33.60/hr | $4.20/chip-hr, published | - -**What it does not check:** -- Batch or scheduled jobs that run intermittently (the 7-day window may miss a recent burst) -- Preemptible TPU nodes — may have been interrupted and not yet restarted intentionally -- Committed use discounts — actual cost may be significantly lower -- Nodes shared across teams where utilization is tracked externally - -**Required permissions:** -- `tpu.nodes.list` (included in `roles/tpu.viewer`) -- `monitoring.timeSeries.list` (included in `roles/monitoring.viewer`) — optional; falls back to age-based detection if absent - ---- - -#### Idle Vertex AI Feature Store Online Stores - -**Rule ID:** `gcp.vertex.featurestore.idle` - -**What it detects:** Vertex AI Feature Store online stores — both legacy `featurestores` (with `fixedNodeCount > 0` or autoscaled via `scaling.minNodeCount`) and new-generation `featureOnlineStores` (Bigtable-backed or Optimized) — that have received zero online serving requests for 30+ days while remaining in STABLE state. Legacy featurestores and Bigtable-backed online stores incur continuous Bigtable compute charges; Optimized stores incur storage and query compute charges. Feature stores are frequently left running after a model or recommendation system is retired. 
- -**Confidence:** - -- **HIGH:** Cloud Monitoring confirms zero `online_serving/request_count` over the 30-day window — the store had no `ReadFeatureValues` (or equivalent) requests at all -- **LOW:** Monitoring data unavailable; store has been in STABLE state for ≥ 30 days — heuristic: age only, request activity unknown - -**Risk:** - -| Confidence | Risk | -|---|---| -| HIGH | HIGH | -| LOW | MEDIUM | - -**Why this matters:** -- Legacy featurestores with `fixedNodeCount > 0` bill ~$0.27/node-hour (us-central1, SSD-backed Bigtable) continuously — a 1-node store costs ~$197/month, a 3-node HA store costs ~$591/month -- New-generation featureOnlineStores (Bigtable-backed) have similar per-node costs via `autoScaling.minNodeCount` -- Optimized (BigQuery-backed) featureOnlineStores have lower base cost but still incur storage and query charges -- These stores are often provisioned during model development and forgotten after the serving layer is replaced - -**Detection logic:** -```python -# Legacy featurestores with online serving configured (fixed or autoscaled) -for store in vertex_ai.featurestores(project, locations="-"): - config = store.onlineServingConfig - if config.fixedNodeCount == 0 and config.scaling.minNodeCount == 0: - continue # no online serving cost - requests = monitoring.sum("featurestore/online_serving/request_count", window=30d) - if requests is not None: - if requests == 0: - flag() # HIGH confidence - elif age_days >= 30: - flag() # LOW confidence — age heuristic, request activity unknown - -# New featureOnlineStores (Bigtable or Optimized) -for store in vertex_ai.featureOnlineStores(project, locations="-"): - requests = monitoring.sum("featureonlinestore/online_serving/request_count", window=30d) - if requests is not None: - if requests == 0: - flag() # HIGH confidence - elif age_days >= 30: - flag() # LOW confidence — age heuristic, request activity unknown -``` - -**Cost estimates (us-central1, on-demand):** - -| Store type | Config | 
~Monthly cost | -|---|---|---| -| Legacy featurestore | 1 Bigtable node | ~$197/mo | -| Legacy featurestore | 3 Bigtable nodes (HA) | ~$591/mo | -| Feature Online Store | 1 Bigtable node (min) | ~$197/mo | -| Feature Online Store | 3 Bigtable nodes (min) | ~$591/mo | -| Feature Online Store | Optimized (BigQuery) | ~$100+/mo [est] | - -**What it does not check:** -- Periodic or low-frequency batch workflows querying less often than the 30-day window -- Feature stores used by scheduled pipelines (e.g. weekly batch inference) -- Committed use discounts — actual cost may be lower -- Stores intentionally kept warm for latency-sensitive cold-start mitigation - -**Required permissions:** -- `aiplatform.featurestores.list` (included in `roles/aiplatform.viewer`) -- `aiplatform.featureOnlineStores.list` (included in `roles/aiplatform.viewer`) -- `monitoring.timeSeries.list` (included in `roles/monitoring.viewer`) — optional; falls back to age-based detection if absent - ---- - -## Rule Stability Guarantee - -Once a rule reaches production status: -- Rule ID remains stable -- Confidence semantics unchanged -- Backwards compatibility preserved -- Schema additions only (no breaking changes) - -This guarantees trust for long-running CI/CD integrations. 
- ---- - -## Coming Soon - -**AI/ML (all providers):** -- Orphaned SageMaker training artifacts in S3 (AWS) - -**AWS:** -- S3 lifecycle gaps, Redshift idle, NAT Gateway routing waste - -**Azure:** -- Azure Firewall idle, AKS node pool idle, Azure Batch unused pools - -**GCP:** -- GKE node pool idle, BigQuery slot waste, GCS cold storage, Cloud Run idle revisions - -**Multi-Cloud:** -- Rule filtering (`--rules` flag) -- Policy-as-code (`cleancloud.yaml`) - ---- +## Design Principles -**Next:** [AWS Setup →](aws.md) | [Azure Setup →](azure.md) | [GCP Setup →](gcp.md) | [CI/CD Integration →](ci.md) +- **Read-only always** — no Delete, Modify, Tag, or Update operations; safe for production +- **Conservative by default** — multiple signals preferred; false negatives over false positives +- **Explicit confidence** — every finding carries HIGH / MEDIUM / LOW confidence +- **Review-only** — findings are candidates for human review, not triggers for automated action diff --git a/docs/rules/aws.md b/docs/rules/aws.md new file mode 100644 index 0000000..82adc6e --- /dev/null +++ b/docs/rules/aws.md @@ -0,0 +1,302 @@ +# AWS Rules + +19 rules (13 hygiene + 6 AI/ML). AI/ML rules require `--category ai`. 
+ +← [Back to index](../rules.md) + +| Rule ID | Cost Surface | What It Detects | +|---|---|---| +| `aws.ec2.instance.stopped` | Compute | EC2 instances stopped 30+ days (EBS charges continue) | +| `aws.ec2.security_group.unused` | Governance | Security groups with no ENI associations | +| `aws.ebs.unattached` | Storage | EBS volumes not attached to any instance | +| `aws.ebs.snapshot.old` | Storage | Snapshots ≥ 90 days old | +| `aws.ec2.ami.old` | Storage | AMIs older than 180 days | +| `aws.ec2.elastic_ip.unattached` | Network | Elastic IPs not associated with any instance or network interface | +| `aws.ec2.eni.detached` | Network | Detached ENIs not currently attached | +| `aws.ec2.nat_gateway.idle` | Network | NAT Gateways with zero traffic 14+ days | +| `aws.elbv2.alb.idle` / `aws.elbv2.nlb.idle` / `aws.elb.clb.idle` | Network | Load balancers with zero traffic 14+ days | +| `aws.rds.instance.idle` | Platform | RDS instances with zero connections 14+ days | +| `aws.rds.snapshot.old` | Storage | Manual RDS snapshots older than 90 days | +| `aws.cloudwatch.logs.infinite_retention` | Observability | Log groups with no retention policy | +| `aws.resource.untagged` | Governance | EC2/S3/CloudWatch resources with zero tags | +| `aws.sagemaker.endpoint.idle` | AI/ML | Real-time SageMaker endpoints with no traffic 14+ days | +| `aws.sagemaker.notebook.idle` | AI/ML | SageMaker Notebook Instances with stale activity 14+ days | +| `aws.ec2.gpu.idle` | AI/ML | EC2 GPU/accelerator instances with <5% GPU or <10% CPU over 7 days | +| `aws.bedrock.provisioned_throughput.idle` | AI/ML | Bedrock Provisioned Throughput with zero invocations 7+ days | +| `aws.sagemaker.studio_app.idle` | AI/ML | SageMaker Studio apps with no usable activity 7+ days | +| `aws.sagemaker.training_job.long_running` | AI/ML | SageMaker training jobs still running beyond threshold | + +--- + +## Compute + +#### `aws.ec2.instance.stopped` +**Detects:** EC2 instances in `stopped` state for 30+ days; 
EBS volumes continue accruing charges + +**Confidence / Risk:** HIGH (CloudTrail stop event ≥ 30 days, restart-cycle aware) / MEDIUM + +**Permissions:** `ec2:DescribeInstances`, `ec2:DescribeVolumes`, `cloudtrail:LookupEvents` + +**Params:** none + +**Exclusions:** none + +**Spec:** [specs/aws/ec2_stopped.md](../specs/aws/ec2_stopped.md) + +--- + +## Governance + +#### `aws.ec2.security_group.unused` +**Detects:** Security groups with no ENI associations + +**Confidence / Risk:** MEDIUM (no ENI associations found) / LOW + +**Permissions:** `ec2:DescribeSecurityGroups`, `ec2:DescribeNetworkInterfaces` + +**Params:** none + +**Exclusions:** `default` security group (AWS prevents deletion) + +**Spec:** [specs/aws/ec2_sg_unused.md](../specs/aws/ec2_sg_unused.md) + +#### `aws.resource.untagged` +**Detects:** EC2 volumes, S3 buckets, and CloudWatch Log Groups with zero tags + +**Confidence / Risk:** HIGH (deterministic from authoritative tag source) / MEDIUM + +**Permissions:** `ec2:DescribeVolumes`, `s3:ListAllMyBuckets`, `s3:GetBucketTagging`, `logs:DescribeLogGroups`, `logs:ListTagsForResource` + +**Params:** none + +**Exclusions:** none + +**Spec:** [specs/aws/untagged_resources.md](../specs/aws/untagged_resources.md) + +--- + +## Storage + +#### `aws.ebs.unattached` +**Detects:** EBS volumes in `available` state for 7+ days + +**Confidence / Risk:** MEDIUM (`available` state ≥ 7 days) / LOW + +**Permissions:** `ec2:DescribeVolumes` + +**Params:** none + +**Exclusions:** volumes younger than 7 days + +**Spec:** [specs/aws/ebs_unattached.md](../specs/aws/ebs_unattached.md) + +#### `aws.ebs.snapshot.old` +**Detects:** EBS snapshots older than `days_old` + +**Confidence / Risk:** LOW (age alone is a weak signal) / LOW + +**Permissions:** `ec2:DescribeSnapshots`, `ec2:DescribeSnapshotAttribute` + +**Params:** `days_old` (default: 90) + +**Exclusions:** snapshots linked to registered AMIs + +**Spec:** [specs/aws/ebs_snapshot_old.md](../specs/aws/ebs_snapshot_old.md) + 
+#### `aws.ec2.ami.old` +**Detects:** AMIs older than `days_old` in `available` state + +**Confidence / Risk:** MEDIUM (age + state) / HIGH–LOW (varies by usage signal) + +**Permissions:** `ec2:DescribeImages` + +**Params:** `days_old` (default: 180) + +**Exclusions:** AMIs not in `available` state + +**Spec:** [specs/aws/ami_old.md](../specs/aws/ami_old.md) + +#### `aws.rds.snapshot.old` +**Detects:** Manual RDS snapshots older than `days_old` + +**Confidence / Risk:** LOW (age alone is a weak signal) / LOW + +**Permissions:** `rds:DescribeDBSnapshots`, `rds:DescribeDBSnapshotAttributes` + +**Params:** `days_old` (default: 90) + +**Exclusions:** automated snapshots (`SnapshotType=automated`), snapshots not in `available` state + +**Spec:** [specs/aws/rds_snapshot_old.md](../specs/aws/rds_snapshot_old.md) + +--- + +## Network + +#### `aws.ec2.elastic_ip.unattached` +**Detects:** Elastic IPs with all four association fields absent + +**Confidence / Risk:** HIGH (deterministic state, no age threshold) / LOW + +**Permissions:** `ec2:DescribeAddresses` + +**Params:** none + +**Exclusions:** Classic EIPs without `AllocationTime` are annotated but not excluded + +**Spec:** [specs/aws/elastic_ip_unattached.md](../specs/aws/elastic_ip_unattached.md) + +#### `aws.ec2.eni.detached` +**Detects:** ENIs in `available` (detached) state + +**Confidence / Risk:** HIGH (`Status=available`, no temporal threshold) / LOW + +**Permissions:** `ec2:DescribeNetworkInterfaces` + +**Params:** none + +**Exclusions:** none (Lambda/ECS/RDS managed ENIs included and annotated) + +**Spec:** [specs/aws/eni_detached.md](../specs/aws/eni_detached.md) + +#### `aws.ec2.nat_gateway.idle` +**Detects:** NAT Gateways with zero traffic across all 5 CloudWatch metrics for `idle_threshold_days` + +**Confidence / Risk:** HIGH (zero traffic + no route table refs); MEDIUM (zero traffic, route refs exist) / MEDIUM + +**Permissions:** `ec2:DescribeNatGateways`, `cloudwatch:GetMetricStatistics` + +**Params:** 
`idle_threshold_days` (default: 14) + +**Exclusions:** gateways younger than threshold; any metric with no datapoints → skip + +**Spec:** [specs/aws/nat_gateway_idle.md](../specs/aws/nat_gateway_idle.md) + +#### `aws.elbv2.alb.idle` / `aws.elbv2.nlb.idle` / `aws.elb.clb.idle` +**Detects:** Load balancers with zero traffic for `idle_threshold_days` + +**Confidence / Risk:** HIGH (zero traffic + no registered targets); MEDIUM (zero traffic only) / MEDIUM + +**Permissions:** `elasticloadbalancing:DescribeLoadBalancers`, `elasticloadbalancing:DescribeTargetGroups`, `elasticloadbalancing:DescribeTargetHealth`, `cloudwatch:GetMetricStatistics` + +**Params:** `idle_threshold_days` (default: 14) + +**Exclusions:** LBs younger than threshold + +**Spec:** [specs/aws/elb_idle.md](../specs/aws/elb_idle.md) + +--- + +## Platform + +#### `aws.rds.instance.idle` +**Detects:** RDS instances with zero `DatabaseConnections` for `idle_threshold_days` + +**Confidence / Risk:** MEDIUM (zero connections; proxies may obscure usage) / MEDIUM + +**Permissions:** `rds:DescribeDBInstances`, `cloudwatch:GetMetricStatistics` + +**Params:** `idle_threshold_days` (default: 14) + +**Exclusions:** Aurora cluster members, read replicas, instances younger than threshold + +**Spec:** [specs/aws/rds_idle.md](../specs/aws/rds_idle.md) + +--- + +## Observability + +#### `aws.cloudwatch.logs.infinite_retention` +**Detects:** CloudWatch Log Groups with no retention policy set + +**Confidence / Risk:** HIGH (directly observable config fact) / HIGH (≥ 1 GB stored), MEDIUM (> 0 bytes), LOW (empty) + +**Permissions:** `logs:DescribeLogGroups` + +**Params:** none + +**Exclusions:** none + +**Spec:** [specs/aws/cloudwatch_logs_no_retention.md](../specs/aws/cloudwatch_logs_no_retention.md) + +--- + +## AI/ML *(opt-in: `--category ai`)* + +#### `aws.sagemaker.endpoint.idle` +**Detects:** Real-time SageMaker endpoints `InService` with zero invocations across all billable production variants for `idle_days` + 
+**Confidence / Risk:** HIGH (all variants confirmed zero traffic); MEDIUM (at least one variant missing datapoints) / HIGH (accelerator-backed variants: ml.g*, ml.p*, ml.inf*, ml.trn*); MEDIUM (CPU-only) + +**Permissions:** `sagemaker:ListEndpoints`, `sagemaker:DescribeEndpoint`, `sagemaker:DescribeEndpointConfig`, `cloudwatch:GetMetricStatistics` + +**Params:** `idle_days` (default: 14) + +**Exclusions:** async inference endpoints (`AsyncInferenceConfig` set), serverless variants without current provisioned concurrency + +**Spec:** [specs/aws/ai/sagemaker_endpoint_idle.md](../specs/aws/ai/sagemaker_endpoint_idle.md) + +#### `aws.sagemaker.notebook.idle` +**Detects:** SageMaker Notebook Instances `InService` with stale `LastModifiedTime` for `idle_days` (control-plane heuristic, not direct Jupyter activity) + +**Confidence / Risk:** MEDIUM (weak heuristic) / HIGH (GPU/accelerator instances: ml.g4dn, ml.g5, ml.p3, ml.p4d, ml.p4de, ml.p5, Inferentia, Trainium); MEDIUM (CPU) + +**Permissions:** `sagemaker:ListNotebookInstances` + +**Params:** `idle_days` (default: 14) + +**Exclusions:** `Stopped` instances (out of scope) + +**Spec:** [specs/aws/ai/sagemaker_notebook_idle.md](../specs/aws/ai/sagemaker_notebook_idle.md) + +#### `aws.ec2.gpu.idle` +**Detects:** EC2 GPU/accelerator instances (p/g/trn/inf/dl families) with <5% GPU utilization or <10% CPU over `idle_days` + +**Confidence / Risk:** HIGH (NVIDIA CW agent present, max GPU < 5%); MEDIUM (no NVIDIA agent, avg CPU < 10%) / CRITICAL (`idle_ratio ≥ 2.0`); HIGH (all other GPU/accelerator instances) + +**Permissions:** `ec2:DescribeInstances`, `cloudwatch:GetMetricStatistics`, `cloudwatch:ListMetrics` + +**Params:** `idle_days` (default: 7), `gpu_threshold` (default: 5.0%), `cpu_threshold` (default: 10.0%) + +**Exclusions:** non-GPU instance families; instances younger than threshold + +**Spec:** — + +#### `aws.bedrock.provisioned_throughput.idle` +**Detects:** Bedrock Provisioned Throughput (Model Units) with zero 
invocations for `idle_days`; bills per MU per hour regardless of traffic + +**Confidence / Risk:** HIGH (zero invocations confirmed + age ≥ `idle_days`) / HIGH + +**Permissions:** `bedrock:ListProvisionedModelThroughputs`, `cloudwatch:GetMetricStatistics` + +**Params:** `idle_days` (default: 7) + +**Exclusions:** none + +**Spec:** [specs/aws/ai/bedrock_provisioned_idle.md](../specs/aws/ai/bedrock_provisioned_idle.md) + +#### `aws.sagemaker.studio_app.idle` +**Detects:** SageMaker Studio `KernelGateway`/`JupyterLab`/`CodeEditor` apps `InService` with no usable recent activity for `idle_days_threshold` + +**Confidence / Risk:** HIGH (usable activity signal present and ≥ threshold); skipped if `LastUserActivityTimestamp == LastHealthCheckTimestamp` / HIGH (GPU/accelerator: ml.g4dn, ml.g5, ml.p2–p5, ml.trn1, ml.inf1/2); MEDIUM (CPU) + +**Permissions:** `sagemaker:ListApps`, `sagemaker:DescribeApp` + +**Params:** `idle_days_threshold` (default: 7) + +**Exclusions:** `JupyterServer` app type; apps where `LastUserActivityTimestamp == LastHealthCheckTimestamp` (health check artifact) + +**Spec:** [specs/aws/ai/sagemaker_studio_app_idle.md](../specs/aws/ai/sagemaker_studio_app_idle.md) + +#### `aws.sagemaker.training_job.long_running` +**Detects:** SageMaker training jobs still `InProgress` beyond `long_running_hours_threshold` + +**Confidence / Risk:** HIGH (elapsed time exceeds configured stopping-condition limit); MEDIUM (threshold exceeded, no stopping-condition limit) / CRITICAL (HIGH confidence + GPU/accelerator); HIGH (HIGH confidence + non-GPU); MEDIUM (all MEDIUM confidence) + +**Permissions:** `sagemaker:ListTrainingJobs`, `sagemaker:DescribeTrainingJob` + +**Params:** `long_running_hours_threshold` (default: 24) + +**Exclusions:** completed/stopped jobs; spot jobs use `MaxWaitTimeInSeconds`, not `MaxRuntimeInSeconds` + +**Spec:** [specs/aws/ai/sagemaker_training_job_long_running.md](../specs/aws/ai/sagemaker_training_job_long_running.md) diff --git 
a/docs/rules/azure.md b/docs/rules/azure.md new file mode 100644 index 0000000..c63f044 --- /dev/null +++ b/docs/rules/azure.md @@ -0,0 +1,270 @@ +# Azure Rules + +17 rules (12 hygiene + 5 AI/ML). AI/ML rules require `--category ai`. + +← [Back to index](../rules.md) + +| Rule ID | Cost Surface | What It Detects | +|---|---|---| +| `azure.vm.stopped_not_deallocated` | Compute | Stopped but not deallocated VMs (full charges) | +| `azure.compute.disk.unattached` | Storage | Managed disks not attached to any VM | +| `azure.compute.snapshot.old` | Storage | Old managed snapshots as conservative review candidates | +| `azure.network.public_ip.unused` | Network | Public IPs unattached across all four control-plane linkage surfaces | +| `azure.load_balancer.no_backends` | Network | Standard LBs with billable rules but no backend members | +| `azure.application_gateway.no_backends` | Network | App Gateways with zero backend targets | +| `azure.virtual_network_gateway.idle` | Network | VPN/ExpressRoute Gateways with no connections | +| `azure.app_service_plan.empty` | Platform | Paid App Service Plans with zero apps | +| `azure.app_service.idle` | Platform | App Services with zero HTTP requests 14+ days | +| `azure.sql.database.idle` | Platform | Dedicated single databases with zero activity across all five required metrics over idle window | +| `azure.container_registry.unused` | Platform | Container registries with zero pulls and pushes 90+ days | +| `azure.resource.untagged` | Governance | Disks and snapshots with zero tags | +| `azure.aml.compute.idle` | AI/ML | AML compute clusters with min_node_count > 0 and no active nodes 14+ days | +| `azure.ml.compute_instance.idle` | AI/ML | Azure ML Compute Instances Running with no activity 14+ days | +| `azure.ml.online_endpoint.idle` | AI/ML | Azure ML managed online endpoints with zero scoring requests 7+ days | +| `azure.ai_search.idle` | AI/ML | Azure AI Search services (Standard+) with zero queries 30+ days | +| 
`azure.openai.provisioned_deployment.idle` | AI/ML | Azure OpenAI provisioned deployments (PTUs) with zero requests 7+ days | + +--- + +## Compute + +#### `azure.vm.stopped_not_deallocated` +**Detects:** VMs in `PowerState/stopped` state (full compute charges continue; only `deallocated` stops billing) + +**Confidence / Risk:** HIGH (deterministic power state) / HIGH + +**Permissions:** `Microsoft.Compute/virtualMachines/read` + +**Params:** none + +**Exclusions:** `PowerState/deallocated`, transitional states (starting, stopping, deallocating) + +**Spec:** — + +--- + +## Storage + +#### `azure.compute.disk.unattached` +**Detects:** Managed disks with `managed_by is None` for 7+ days + +**Confidence / Risk:** MEDIUM (deterministic state; attachment intent unknown) / LOW + +**Permissions:** `Microsoft.Compute/disks/read` + +**Params:** none + +**Exclusions:** disks younger than 7 days + +**Spec:** — + +#### `azure.compute.snapshot.old` +**Detects:** Managed snapshots older than 30 days as conservative review candidates; confidence escalates with age relative to `max_age_days` + +**Confidence / Risk:** LOW (age ≥ 30 days and < `max_age_days`); MEDIUM (age ≥ `max_age_days`) / LOW + +**Permissions:** `Microsoft.Compute/snapshots/read` + +**Params:** `max_age_days` (default: 90) + +**Exclusions:** `provisioning_state != "Succeeded"`, incomplete snapshots (`completion_percent < 100`), snapshots younger than 30 days + +**Spec:** [specs/azure/disk_snapshot_old.md](../specs/azure/disk_snapshot_old.md) + +--- + +## Network + +#### `azure.network.public_ip.unused` +**Detects:** Public IP addresses fully unattached across all four known Azure control-plane linkage surfaces: `ip_configuration`, `nat_gateway`, `service_public_ip_address`, `linked_public_ip_address` + +**Confidence / Risk:** HIGH (all four linkages cleanly absent — deterministic) / LOW + +**Permissions:** `Microsoft.Network/publicIPAddresses/read` + +**Params:** none + +**Exclusions:** `provisioning_state != 
"Succeeded"`; any linkage present with a non-empty `id`; linkage object present but `id` unresolvable (malformed reference — skipped conservatively); unattached Dynamic Public IP with no assigned `ip_address` (low-signal placeholder) + +**Spec:** [azure/public_ip_unused.md](../specs/azure/public_ip_unused.md) + +#### `azure.load_balancer.no_backends` +**Detects:** Standard SKU Load Balancers with load-balancing or outbound rules whose referenced backend pools all have zero members + +**Confidence / Risk:** HIGH (all relevant pools resolved and empty — deterministic) / LOW + +**Permissions:** `Microsoft.Network/loadBalancers/read` + +**Params:** none + +**Exclusions:** Basic and Gateway SKU; LBs with no load-balancing rules and no outbound rules (no billable signal); any LB where a referenced pool cannot be resolved + +**Spec:** [specs/azure/lb_no_backends.md](../specs/azure/lb_no_backends.md) + +#### `azure.application_gateway.no_backends` +**Detects:** Application Gateways where all backend pools have zero targets + +**Confidence / Risk:** HIGH (deterministic control-plane state) / MEDIUM + +**Permissions:** `Microsoft.Network/applicationGateways/read` + +**Params:** none + +**Exclusions:** gateways with `provisioning_state != "Succeeded"` + +**Spec:** [specs/azure/app_gateway_no_backends.md](../specs/azure/app_gateway_no_backends.md) + +#### `azure.virtual_network_gateway.idle` +**Detects:** VPN or ExpressRoute Gateways with no active S2S/ExpressRoute connections + +**Confidence / Risk:** MEDIUM (no active connections; P2S client count not checked) / HIGH + +**Permissions:** `Microsoft.Network/virtualNetworkGateways/read`, `Microsoft.Network/connections/read` + +**Params:** none + +**Exclusions:** gateways with P2S configuration present and no active connections are still flagged if no other connections exist + +**Spec:** — + +--- + +## Platform + +#### `azure.app_service_plan.empty` +**Detects:** Paid-tier App Service Plans with zero hosted apps 
(`number_of_sites == 0`) + +**Confidence / Risk:** HIGH (deterministic) / LOW + +**Permissions:** `Microsoft.Web/serverfarms/read`, `Microsoft.Web/serverfarms/sites/read` + +**Params:** none + +**Exclusions:** Free and Shared tier plans + +**Spec:** [specs/azure/app_service_plan_empty.md](../specs/azure/app_service_plan_empty.md) + +#### `azure.app_service.idle` +**Detects:** App Services on paid plans with zero HTTP `Requests` metric for `days_idle` + +**Confidence / Risk:** HIGH (zero HTTP traffic confirmed) / MEDIUM + +**Permissions:** `Microsoft.Web/sites/read`, `Microsoft.Web/serverfarms/read`, `Microsoft.Insights/metrics/read` + +**Params:** `days_idle` (default: 14) + +**Exclusions:** Free, Shared, Dynamic (Consumption/serverless) tiers; non-HTTP workloads (WebJobs, background services) may produce false positives + +**Spec:** [specs/azure/app_service_idle.md](../specs/azure/app_service_idle.md) + +#### `azure.sql.database.idle` +**Detects:** Dedicated single databases with zero activity across all five required Azure Monitor metrics (`connection_successful`, `sessions_count`, `cpu_percent`, `physical_data_read_percent`, `log_write_percent`) over the idle window; single-metric silence is not sufficient + +**Confidence / Risk:** HIGH (all five metrics confirmed zero for full window) / HIGH + +**Permissions:** `Microsoft.Sql/servers/read`, `Microsoft.Sql/servers/databases/read`, `Microsoft.Insights/metrics/read` + +**Params:** `idle_days` (default: 14) + +**Exclusions:** `master` system database; elastic pool databases (billing is at pool level); replica / secondary-shaped databases (`secondary_type` non-empty); currently paused serverless databases (`status == "Paused"` or `paused_date > resumed_date`); databases younger than `idle_days`; any required metric absent, series empty, or query failing (conservative skip) + +**Spec:** [azure/sql_database_idle.md](../specs/azure/sql_database_idle.md) + +#### `azure.container_registry.unused` +**Detects:** Container 
registries with zero successful pulls AND zero successful pushes for `days_unused`; registries with sparse or missing metrics are skipped + +**Confidence / Risk:** HIGH (both `SuccessfulPullCount` and `SuccessfulPushCount` metrics confirmed zero) / LOW + +**Permissions:** `Microsoft.ContainerRegistry/registries/read`, `Microsoft.Insights/metrics/read` + +**Params:** `days_unused` (default: 90) + +**Exclusions:** `provisioning_state != "Succeeded"`; registries younger than observation window + +**Spec:** [specs/azure/container_registry_unused.md](../specs/azure/container_registry_unused.md) + +--- + +## Governance + +#### `azure.resource.untagged` +**Detects:** Managed disks and snapshots with zero tags + +**Confidence / Risk:** MEDIUM (untagged + unattached disk); LOW (untagged snapshot or attached disk) / LOW + +**Permissions:** `Microsoft.Compute/disks/read`, `Microsoft.Compute/snapshots/read` + +**Params:** none + +**Exclusions:** disks younger than 7 days + +**Spec:** — + +--- + +## AI/ML *(opt-in: `--category ai`)* + +#### `azure.aml.compute.idle` +**Detects:** AML compute clusters with `min_node_count > 0` and zero active nodes for 14+ days + +**Confidence / Risk:** HIGH (zero nodes, cluster age ≥ 14 days); MEDIUM (zero nodes, age 7–13 days or creation time unavailable) / HIGH (GPU VM sizes: Standard_NC*, Standard_ND*, Standard_NV*); MEDIUM (CPU) + +**Permissions:** `Microsoft.MachineLearningServices/workspaces/read`, `Microsoft.MachineLearningServices/workspaces/computes/read`, `Microsoft.Insights/metrics/read` + +**Params:** none (14-day threshold is fixed) + +**Exclusions:** clusters with `min_node_count == 0` (scale-to-zero; no idle cost) + +**Spec:** — + +#### `azure.ml.compute_instance.idle` +**Detects:** Azure ML Compute Instances in `Running` state with no control-plane activity for `idle_days` + +**Confidence / Risk:** HIGH (`last_operation.operation_time` or `last_modified_at` ≥ threshold, age ≥ threshold); MEDIUM (≥ 75% of threshold on both 
signals, or age-only fallback) / CRITICAL (GPU + `idle_ratio ≥ 2.0`); HIGH (GPU: Standard_NC*, Standard_ND*, Standard_NV*); MEDIUM (CPU) + +**Permissions:** `Microsoft.MachineLearningServices/workspaces/read`, `Microsoft.MachineLearningServices/workspaces/computes/read` + +**Params:** `idle_days` (default: 14) + +**Exclusions:** stopped instances (only `Running` state evaluated) + +**Spec:** — + +#### `azure.ml.online_endpoint.idle` +**Detects:** Azure ML managed online endpoints in `Succeeded` provisioning state with zero scoring requests for `idle_days` + +**Confidence / Risk:** HIGH (per-endpoint `RequestCount` metric confirms zero + age ≥ `idle_days`); MEDIUM (zero confirmed but age < `idle_days`, or metric unavailable + age ≥ 2× `idle_days`) / CRITICAL (GPU + `idle_ratio ≥ 2.0`); HIGH (GPU/accelerator); MEDIUM (CPU) + +**Permissions:** `Microsoft.MachineLearningServices/workspaces/read`, `Microsoft.MachineLearningServices/workspaces/onlineEndpoints/read`, `Microsoft.MachineLearningServices/workspaces/onlineEndpoints/deployments/read`, `Microsoft.Insights/metrics/read` + +**Params:** `idle_days` (default: 7) + +**Exclusions:** `provisioning_state != "Succeeded"`; batch endpoints + +**Spec:** — + +#### `azure.ai_search.idle` +**Detects:** Azure AI Search services (Standard tier and above) with zero `SearchQueriesPerSecond` for `idle_days` + +**Confidence / Risk:** HIGH (zero queries confirmed + age ≥ `idle_days`); MEDIUM (zero confirmed but age < `idle_days`, or metric unavailable + age ≥ 2× `idle_days`) / HIGH (estimated cost ≥ $1,000/month); MEDIUM (otherwise) + +**Permissions:** `Microsoft.Search/searchServices/read`, `Microsoft.Insights/metrics/read` + +**Params:** `idle_days` (default: 30) + +**Exclusions:** Basic tier and below; only `standard`, `standard2`, `standard3`, `storage_optimized_l1`, `storage_optimized_l2` evaluated + +**Spec:** — + +#### `azure.openai.provisioned_deployment.idle` +**Detects:** Azure OpenAI provisioned deployments (PTUs) with 
zero API requests for `idle_days`; bills per PTU per hour regardless of traffic + +**Confidence / Risk:** HIGH (per-deployment `AzureOpenAIRequests` metric confirms zero + age ≥ `idle_days`); MEDIUM (per-deployment zero but age < `idle_days`, or account-level zero only) / HIGH (≥ 7 PTUs, ~$10K+/month); MEDIUM (< 7 PTUs) + +**Permissions:** `Microsoft.CognitiveServices/accounts/read`, `Microsoft.CognitiveServices/accounts/deployments/read`, `Microsoft.Insights/metrics/read` + +**Params:** `idle_days` (default: 7) + +**Exclusions:** non-provisioned SKUs; only `ProvisionedManaged`, `GlobalProvisionedManaged`, `DataZoneProvisionedManaged` evaluated + +**Spec:** — diff --git a/docs/rules/gcp.md b/docs/rules/gcp.md new file mode 100644 index 0000000..c7b5015 --- /dev/null +++ b/docs/rules/gcp.md @@ -0,0 +1,168 @@ +# GCP Rules + +10 rules (5 hygiene + 5 AI/ML). AI/ML rules require `--category ai`. + +← [Back to index](../rules.md) + +| Rule ID | Cost Surface | What It Detects | +|---|---|---| +| `gcp.compute.vm.stopped` | Compute | TERMINATED VMs stopped 30+ days (disk charges continue) | +| `gcp.compute.disk.unattached` | Storage | Persistent Disks in READY state with no attached VM | +| `gcp.compute.snapshot.old` | Storage | Disk snapshots older than 90 days | +| `gcp.compute.ip.unused` | Network | Reserved static IPs in RESERVED state | +| `gcp.sql.instance.idle` | Platform | Cloud SQL instances with zero connections 14+ days | +| `gcp.vertex.endpoint.idle` | AI/ML | Vertex AI endpoints with dedicated capacity and zero predictions 14+ days | +| `gcp.vertex.workbench.idle` | AI/ML | Vertex AI Workbench instances with no activity 14+ days | +| `gcp.vertex.training_job.long_running` | AI/ML | Vertex AI jobs running beyond threshold | +| `gcp.tpu.idle` | AI/ML | Cloud TPU nodes with near-zero utilization 7+ days | +| `gcp.vertex.featurestore.idle` | AI/ML | Vertex AI Feature Stores with zero serving requests 30+ days | + +--- + +## Compute + +#### `gcp.compute.vm.stopped` 
+**Detects:** TERMINATED VM instances stopped 30+ days; persistent disk charges continue + +**Confidence / Risk:** HIGH (`lastStopTimestamp` ≥ 30 days ago); MEDIUM (TERMINATED but timestamp absent) / MEDIUM + +**Permissions:** `compute.instances.list` (roles/compute.viewer) + +**Params:** none (30-day threshold is fixed) + +**Exclusions:** instances not in TERMINATED state; stopped < 30 days + +**Spec:** — + +--- + +## Storage + +#### `gcp.compute.disk.unattached` +**Detects:** Persistent Disks in `READY` state with `users == []` + +**Confidence / Risk:** HIGH (unambiguous detachment) / LOW + +**Permissions:** `compute.disks.list` (roles/compute.viewer) + +**Params:** none + +**Exclusions:** none + +**Spec:** — + +#### `gcp.compute.snapshot.old` +**Detects:** Disk snapshots older than `days_old`; confidence reflects whether source disk still exists + +**Confidence / Risk:** HIGH (source disk no longer exists — orphaned); MEDIUM (source disk still exists) / LOW + +**Permissions:** `compute.snapshots.list` (roles/compute.viewer) + +**Params:** `days_old` (default: 90) + +**Exclusions:** snapshots not in `READY` status; younger than threshold; `region_filter` is ignored (snapshots are global) + +**Spec:** — + +--- + +## Network + +#### `gcp.compute.ip.unused` +**Detects:** Reserved static IPs (regional and global) in `RESERVED` state (GCP confirms not attached) + +**Confidence / Risk:** HIGH (GCP confirms RESERVED state) / LOW + +**Permissions:** `compute.addresses.list`, `compute.globalAddresses.list` (roles/compute.viewer); gracefully degrades if globalAddresses permission denied + +**Params:** none + +**Exclusions:** IPs in `IN_USE` status; global IPs skipped if `region_filter` is set + +**Spec:** — + +--- + +## Platform + +#### `gcp.sql.instance.idle` +**Detects:** Cloud SQL instances with zero connections for `idle_days`; if Monitoring unavailable, instance is assumed active (conservative fallback — not flagged) + +**Confidence / Risk:** HIGH (Cloud Monitoring 
confirms zero connections for full window) / HIGH + +**Permissions:** `cloudsql.instances.list` (roles/cloudsql.viewer), `monitoring.timeSeries.list` (roles/monitoring.viewer) + +**Params:** `idle_days` (default: 14) + +**Exclusions:** read replicas; instances not in `RUNNABLE` state + +**Spec:** — + +--- + +## AI/ML *(opt-in: `--category ai`)* + +#### `gcp.vertex.endpoint.idle` +**Detects:** Vertex AI Online Prediction endpoints with `dedicatedResources` and zero predictions for `idle_days` + +**Confidence / Risk:** HIGH (zero predictions confirmed + age ≥ `idle_days`); MEDIUM (zero predictions, age ≥ 75% of threshold or age unknown) / HIGH (GPU-backed: T4, V100, A100, L4, H100, TPU); MEDIUM (CPU-only) + +**Permissions:** `aiplatform.endpoints.list` (roles/aiplatform.viewer), `monitoring.timeSeries.list` (roles/monitoring.viewer) + +**Params:** `idle_days` (default: 14) + +**Exclusions:** endpoints using `automaticResources` (scale-to-zero); only `dedicatedResources` with `minReplicaCount > 0` + +**Spec:** — + +#### `gcp.vertex.workbench.idle` +**Detects:** Vertex AI Workbench instances `ACTIVE` with no control-plane activity (`updateTime`) for `idle_days` + +**Confidence / Risk:** HIGH (`updateTime` ≥ `idle_days` + age ≥ `idle_days`); MEDIUM (`updateTime` ≥ 75% of threshold or unavailable) / CRITICAL (GPU + `idle_ratio ≥ 2.0`); HIGH (GPU-backed); MEDIUM (CPU-only) + +**Permissions:** `notebooks.instances.list` (roles/notebooks.viewer) + +**Params:** `idle_days` (default: 14) + +**Exclusions:** instances not in `ACTIVE` state + +**Spec:** — + +#### `gcp.vertex.training_job.long_running` +**Detects:** Vertex AI CustomJobs and TrainingPipelines in `RUNNING` state beyond `long_running_hours_threshold`; GPU/TPU jobs near threshold also trigger early-warning findings + +**Confidence / Risk:** HIGH (duration ≥ 3× threshold — clearly runaway); MEDIUM (duration ≥ threshold) / CRITICAL (HIGH confidence + GPU/accelerator); HIGH (HIGH confidence + non-GPU); MEDIUM (all 
MEDIUM confidence) + +**Permissions:** `aiplatform.customJobs.list`, `aiplatform.trainingPipelines.list` (roles/aiplatform.viewer) + +**Params:** `long_running_hours_threshold` (default: 24); `expensive_hourly_threshold` (default: $20/hr, for early-warning CPU jobs) + +**Exclusions:** jobs < 90% of threshold; cheap CPU-only jobs in the 90–100% early-warning zone + +**Spec:** — + +#### `gcp.tpu.idle` +**Detects:** Cloud TPU nodes in `READY` state with max `duty_cycle ≤ 2%` across all workers for `idle_days` + +**Confidence / Risk:** HIGH (Cloud Monitoring confirms near-zero duty cycle); LOW (Monitoring unavailable — age-only heuristic) / CRITICAL (HIGH confidence + hourly cost ≥ $10/hr); HIGH (HIGH confidence + < $10/hr); MEDIUM (LOW confidence) + +**Permissions:** `tpu.nodes.list` (roles/tpu.viewer), `monitoring.timeSeries.list` (roles/monitoring.viewer, optional — falls back to age-based) + +**Params:** `idle_days` (default: 7) + +**Exclusions:** nodes not in `READY` state; nodes younger than `idle_days` + +**Spec:** — + +#### `gcp.vertex.featurestore.idle` +**Detects:** Vertex AI Feature Stores (legacy and new-gen) with zero `online_serving/request_count` for `idle_days`; Bigtable-backed stores bill ~$197/node/month regardless of utilization + +**Confidence / Risk:** HIGH (Cloud Monitoring confirms zero requests); LOW (Monitoring unavailable — age-only) / HIGH (HIGH confidence); MEDIUM (LOW confidence) + +**Permissions:** `aiplatform.featurestores.list`, `aiplatform.featureOnlineStores.list` (roles/aiplatform.viewer), `monitoring.timeSeries.list` (roles/monitoring.viewer, optional) + +**Params:** `idle_days` (default: 30) + +**Exclusions:** legacy featurestores with `fixedNodeCount == 0` and `scaling.minNodeCount == 0`; stores not in `STABLE` state + +**Spec:** — diff --git a/docs/specs/azure/lb_no_backends.md b/docs/specs/azure/lb_no_backends.md new file mode 100644 index 0000000..b7bd8b1 --- /dev/null +++ b/docs/specs/azure/lb_no_backends.md @@ -0,0 +1,393 @@ 
+# Azure Rule Spec — `azure.load_balancer.no_backends` + +## 1. Rule Identity + +- **Rule ID:** `azure.load_balancer.no_backends` +- **Provider:** Azure +- **ARM resource type:** `Microsoft.Network/loadBalancers` +- **Finding resource_type:** `azure.load_balancer` + +--- + +## 2. Intent + +Detect **Standard Azure Load Balancers whose billable load-balancing configuration points to backend pools with no members**. + +This rule is deliberately **conservative**. It is a **review-candidate** rule, not proof that +the load balancer is unused, safe to delete, or guaranteed to create savings if removed. + +--- + +## 3. Azure Documentation Grounding + +### 3.1 SKU and platform scope + +Microsoft documents three Azure Load Balancer SKUs: + +| SKU | Relevant behavior | +|---|---| +| Standard | Main production SKU; supports both NIC-based and IP-based backends | +| Basic (retired) | Retired legacy SKU | +| Gateway | Separate SKU for third-party NVAs / service chaining | + +Source: *Azure Load Balancer SKUs* +URL: https://learn.microsoft.com/en-us/azure/load-balancer/skus + +Rule consequence: + +1. This rule should evaluate **Standard** SKU only. +2. Basic is out of scope. +3. Gateway is out of scope; its service-chaining semantics are different from ordinary + frontend-to-backend load balancing. + +### 3.2 Backend-pool membership models + +Microsoft documents two ways of configuring Azure Load Balancer backend pools: + +1. **NIC-based** +2. **IP address-based** + +Microsoft also documents: + +- IP-based backend pools are supported only for **Standard** Load Balancer +- the same load balancer can have NIC-based and IP-based backend pools +- a single backend pool must not mix NIC-targeted members and direct IP-address members + +Source: *Backend pool management* +URL: https://learn.microsoft.com/en-us/azure/load-balancer/backend-pool-management + +Rule consequence: + +1. Membership checks must consider **both** backend models. +2. 
A pool is populated when it has either NIC-based members or IP-based backend addresses. +3. Checking only one backend representation is incomplete. + +### 3.3 Load balancer resource shape + +Microsoft's ARM/Bicep reference for `Microsoft.Network/loadBalancers` documents the control-plane +fields relevant to this rule, including: + +- `id` +- `name` +- `location` +- `sku.name` +- `backendAddressPools` +- `frontendIPConfigurations` +- `loadBalancingRules` +- `outboundRules` +- `probes` +- `tags` + +The same reference documents: + +- `backendAddressPools[].properties.loadBalancerBackendAddresses` +- `loadBalancingRules[].properties.backendAddressPool` +- `loadBalancingRules[].properties.backendAddressPools` +- `outboundRules[].properties.backendAddressPool` + +Source: *Microsoft.Network/loadBalancers template reference* +URL: https://learn.microsoft.com/en-us/azure/templates/microsoft.network/2024-10-01/loadbalancers + +Rule consequence: + +1. Load-balancing and outbound rules can reference backend pools explicitly. +2. Detection should be based on the pools referenced by billable rule surfaces, not on arbitrary + unrelated pools attached to the resource. + +### 3.4 Pricing meaning + +Microsoft pricing documentation states: + +- Azure Standard Load Balancer pricing depends on the number of configured + **load-balancing rules** and **outbound rules** +- **Inbound NAT rules are free** +- there is **no hourly charge** for Standard Load Balancer when **no rules are configured** +- data processing charges are separate and usage-based + +Source: *Azure Load Balancer pricing* +URL: https://azure.microsoft.com/en-us/pricing/details/load-balancer/ + +Rule consequence: + +1. A Standard Load Balancer with **no load-balancing rules and no outbound rules** is not a strong + direct cost signal for this rule and should be skipped. +2. This rule must **not** use a fixed flat monthly estimate such as `~$18/month`. +3. 
`estimated_monthly_cost_usd` should remain `None` unless a future implementation has a + documented and region-aware pricing source. + +### 3.5 Diagnostics are out of scope + +Microsoft documents Azure Monitor diagnostics for Standard Load Balancer, including health-probe, +data-path, byte-count, packet-count, and SNAT-related metrics. + +Source: *Standard Load Balancer diagnostics* +URL: https://learn.microsoft.com/en-us/azure/load-balancer/load-balancer-standard-diagnostics + +Rule consequence: + +This rule does **not** require metrics. Backend emptiness is a deterministic control-plane +condition and should not depend on Azure Monitor setup. + +--- + +## 4. Detection Goal + +Emit a finding when **all** of the following are true: + +1. `lb.id` is present and non-empty +2. `lb.name` is present and non-empty +3. optional region filter matches the normalized location +4. provisioning state resolves to exactly `"Succeeded"` +5. SKU resolves to exactly `"Standard"` +6. at least one **billable rule** exists (`loadBalancingRules` or `outboundRules`) +7. all **relevant backend pools** referenced by those billable rules resolve successfully +8. every relevant backend pool has **zero members** under the backend-membership contract + +If any required signal cannot be established reliably, skip rather than emit. + +--- + +## 5. Non-Goals + +This rule does **not** attempt to prove: + +- that the load balancer is safe to delete +- that no future backend attachment is intended +- that a frontend public IP is unused +- that the load balancer has no traffic history +- that removing the load balancer will produce a specific monthly saving + +--- + +## 6. 
Canonical Inputs + +| API / signal | SDK method / source | Required permission | +|---|---|---| +| Load balancer inventory | `network_client.load_balancers.list_all()` | `Microsoft.Network/loadBalancers/read` | +| Load balancer fields | SDK projections for `id`, `name`, `location`, `sku`, `backend_address_pools`, `load_balancing_rules`, `outbound_rules`, `tags`, and provisioning state; raw/nested ARM-style fields only as fallback when needed | `Microsoft.Network/loadBalancers/read` | + +No Azure Monitor metrics are required by this rule. + +--- + +## 7. Normalization Contract + +| Field | Normalization | +|---|---| +| `location` | Lowercase ARM location string; compare by exact lowercase string equality only. Do not remove spaces, hyphens, or digits. | +| `sku_name` | Lowercase only for comparison. Only exact `standard` is eligible. | +| `tags` | `lb.tags or {}` — never `None` in output | +| `backend pool id` | Compare backend pool ARM ids after lowercasing and trimming any trailing slash; apply the same normalization to both referenced ids and backend-pool inventory ids before matching | +| rule and membership collections | Normalize `None` to empty collection before evaluation | + +--- + +## 8. 
Unified Decision Rule + +| # | Condition | Action | +|---|---|---| +| 8.1 | `lb.id` absent, `None`, or empty | Skip | +| 8.2 | `lb.name` absent, `None`, or empty | Skip | +| 8.3 | Region filter set and normalized location does not match | Skip | +| 8.4 | Provisioning state does not resolve to `"Succeeded"` under the provisioning-state contract | Skip | +| 8.5 | SKU does not resolve to exact lowercase `standard` | Skip | +| 8.6 | No billable rules exist | Skip | +| 8.7 | Relevant backend-pool set cannot be resolved reliably | Skip | +| 8.8 | Billable rules exist but the resolved relevant backend-pool set is empty | Skip | +| 8.9 | Any relevant backend pool has one or more members | Skip | +| 8.10 | All relevant backend pools resolve and all are empty | **EMIT** | + +--- + +## 9. Canonical Evaluation Contracts + +### 9.1 Provisioning-state contract + +Resolve provisioning state in this order: + +1. SDK projection such as `lb.provisioning_state` +2. nested/raw properties projection if present +3. otherwise unknown + +Only `"Succeeded"` is eligible for evaluation. Unknown or any other value must skip. + +### 9.2 Billable-rule contract + +For this rule: + +- **billable load-balancing rule** = entry in `loadBalancingRules` +- **billable outbound rule** = entry in `outboundRules` +- **non-billable for this rule** = `inboundNatRules` + +Required behavior: + +1. Prefer SDK projections for `loadBalancingRules` and `outboundRules`; use raw/nested ARM-style fields only if needed. +2. Normalize missing rule collections (`None`) to empty before evaluation. +3. Count `loadBalancingRules` and `outboundRules` only. +4. If both sets are empty, skip. +5. Do not treat inbound NAT rules as evidence of billable load-balancing cost for this rule. + +### 9.3 Relevant-backend-pool contract + +The set of **relevant backend pools** is the union of pool ids referenced by: + +1. `loadBalancingRules[].properties.backendAddressPool.id` +2. 
`loadBalancingRules[].properties.backendAddressPools[].id` +3. `outboundRules[].properties.backendAddressPool.id` + +Required behavior: + +1. Prefer SDK projections for rule-to-backend references; use raw/nested ARM-style fields only if needed. +2. Normalize missing reference collections (`None`) to empty before evaluation. +3. Normalize backend pool ids before comparison by lowercasing and trimming any trailing slash. +4. Normalize backend-pool inventory ids using the same lowercase + trailing-slash-trim contract before matching. +5. Match referenced ids against the normalized load balancer backend-pool inventory. +6. If billable rules exist but the resolved relevant backend-pool set is empty after normalization and resolution, skip the load balancer rather than emitting. +7. If a billable rule exists but its backend-pool reference is missing, unclear, or cannot be resolved, skip the load + balancer rather than emitting. +8. Treat partially configured or transitional billable rules with incomplete backend linkage as incomplete configuration and skip. +9. Unreferenced backend pools are **context only** and must not suppress or trigger findings. + +### 9.4 Backend-membership contract + +A relevant backend pool is considered to have members when **either** of the following contains +at least one entry: + +1. NIC-based members (SDK projection such as `backend_ip_configurations`, with raw/nested fallback only if needed) +2. IP-based members (SDK projection such as `load_balancer_backend_addresses`, with raw/nested fallback only if needed) + +A relevant backend pool is empty only when **both** backend representations are absent or empty +after normalizing `None` to empty collections. + +Rationale: + +- Standard Load Balancer supports both membership models +- checking only NIC-based or only IP-based membership creates false positives + +--- + +## 10. 
Contextual Signals + +These may appear in evidence/details but must not create or suppress findings directly: + +- `frontend_ip_configurations` +- `probes` +- `tags` +- unreferenced backend pools +- `inboundNatRules` + +--- + +## 11. Cost Model + +`estimated_monthly_cost_usd = None` + +Mandatory rules: + +1. Do **not** use a fixed monthly estimate such as `$18` +2. Do **not** infer cost from SKU alone +3. Do **not** infer cost when no billable rules are configured +4. Document that Standard Load Balancer pricing depends on configured billable rules and processed data + +--- + +## 12. Finding Shape + +### 12.1 Required fields + +| Field | Value | +|---|---| +| `provider` | `"azure"` | +| `rule_id` | `"azure.load_balancer.no_backends"` | +| `resource_type` | `"azure.load_balancer"` | +| `resource_id` | Original ARM id from `lb.id` | +| `region` | Normalized location | +| `risk` | `LOW` | +| `confidence` | `HIGH` | +| `estimated_monthly_cost_usd` | `None` | + +### 12.2 Required evidence + +`signals_used` must include: + +1. `"Load Balancer SKU is Standard"` +2. `"Billable rule count is {billable_rule_count}"` +3. `"All relevant backend pools evaluated to empty using NIC-based and IP-based membership checks"` + +`signals_not_checked` should include: + +1. `"Planned backend attachment or cutover intent"` +2. `"IaC-managed placeholder or staged deployment intent"` +3. `"Traffic history or future activation plans"` +4. `"Frontend public IP cost or attachment evaluated by other rules"` + +### 12.3 Required details + +| Key | Nullable | +|---|---| +| `resource_name` | No | +| `subscription_id` | No | +| `sku_name` | No | +| `sku_tier` | Yes | +| `backend_pool_count` | No | +| `relevant_backend_pool_count` | No | +| `frontend_ip_count` | No | +| `load_balancing_rule_count` | No | +| `outbound_rule_count` | No | +| `tags` | No (`{}` when absent) | + +--- + +## 13. 
Failure Behavior + +- If the load balancer list call raises, let the exception propagate +- If an individual load balancer record is malformed or missing required fields, skip that load balancer +- If billable-rule references cannot be resolved to backend pools reliably, skip that load balancer +- Do not silently emit on incomplete provisioning or backend-reference data + +--- + +## 14. Acceptance Examples + +### 14.1 Must emit + +1. Standard SKU, provisioning state `"Succeeded"`, one load-balancing rule referencing one backend pool, and that pool has no NIC-based or IP-based members -> **EMIT** +2. Standard SKU, provisioning state `"Succeeded"`, two billable rules referencing two backend pools, and both referenced pools are empty -> **EMIT** +3. Standard SKU, provisioning state `"Succeeded"`, one outbound rule referencing one backend pool, and that pool is empty -> **EMIT** + +### 14.2 Must skip + +1. Basic or Gateway SKU -> **SKIP** +2. Provisioning state not `"Succeeded"` -> **SKIP** +3. No load-balancing rules and no outbound rules configured -> **SKIP** +4. A referenced backend pool has NIC-based members -> **SKIP** +5. A referenced backend pool has IP-based members -> **SKIP** +6. A billable rule references a backend pool that cannot be resolved -> **SKIP** +7. Region filter is set and location does not match -> **SKIP** +8. `lb.id == None` or `lb.name == None` -> **SKIP** + +--- + +## 15. Anti-Goals + +Implementations must **not**: + +1. emit on Basic or Gateway SKU load balancers +2. emit solely because `backendAddressPools` is empty when no billable rules exist +3. use inbound NAT rules as billable-rule evidence for this rule +4. use a fixed monthly load balancer cost estimate +5. use unreferenced backend pools to suppress or trigger findings +6. require Azure Monitor metrics for evaluation + +--- + +## 16. 
Rule Summary + +Rule: `azure.load_balancer.no_backends` + +- **Signal:** Standard Load Balancer with billable rule-backed backend pools that are empty +- **Type:** conservative review candidate +- **Scope:** Standard SKU only +- **Confidence:** `HIGH` +- **Risk:** `LOW` +- **Cost:** `None` (pricing depends on rule count and data processed; no hourly charge when no rules are configured) diff --git a/docs/specs/azure/public_ip_unused.md b/docs/specs/azure/public_ip_unused.md new file mode 100644 index 0000000..2df39d1 --- /dev/null +++ b/docs/specs/azure/public_ip_unused.md @@ -0,0 +1,353 @@ +# Azure Rule Spec — `azure.network.public_ip.unused` + +## 1. Rule Identity + +- **Rule ID:** `azure.network.public_ip.unused` +- **Provider:** Azure +- **ARM resource type:** `Microsoft.Network/publicIPAddresses` +- **Finding resource_type:** `azure.network.public_ip` + +--- + +## 2. Intent + +Detect **Azure Public IP Address resources that are fully unattached across known Azure control-plane +linkage surfaces** and therefore represent conservative cleanup review candidates. + +This rule is deliberately **low-noise**. It is a **review-candidate** rule only, not proof that the +Public IP is delete-safe, not proof that no future attachment is intended, and not proof of a specific +monthly saving. + +--- + +## 3. Azure Documentation Grounding + +### 3.1 Public IP association meaning + +Microsoft documents that Azure Public IP resources can be associated with many Azure resource types, +including: + +- virtual machine network interfaces +- virtual machine scale sets +- public load balancers +- virtual network gateways +- NAT gateways +- application gateways +- Azure Firewalls +- Bastion Hosts +- Route Servers +- API Management + +Source: *Public IP addresses* +URL: https://learn.microsoft.com/en-us/azure/virtual-network/ip-services/public-ip-addresses + +Rule consequence: + +1. Checking only VM/NIC-style linkage is incomplete. +2. 
This rule must treat Public IP attachment as a broader Azure platform concept, not just a NIC concept. + +### 3.2 SKU and allocation meaning + +Microsoft documents: + +- Public IP SKU can be **Standard (v1 or v2)** or **Basic** +- Standard Public IPs are **static only** +- Basic Public IPs may be dynamic or static depending on IP version +- dynamic Public IPs receive an address when associated with a resource + +Source: *Public IP addresses* +URL: https://learn.microsoft.com/en-us/azure/virtual-network/ip-services/public-ip-addresses#at-a-glance + +Rule consequence: + +1. An unattached **dynamic** Public IP without an assigned `ipAddress` is a weak placeholder-style signal. +2. To reduce noise, the rule should skip unattached dynamic Public IP resources that do not currently hold + an assigned IP address. + +### 3.3 Pricing meaning + +Microsoft pricing documentation states that Public IP pricing varies by SKU and type, including: + +- Basic (ARM) +- Standard (ARM) +- Standard v2 (ARM) +- Global (ARM) + +The same pricing page also explains that billing behavior differs between static and other Public IP types. + +Source: *IP Addresses pricing* +URL: https://azure.microsoft.com/en-us/pricing/details/ip-addresses/ + +Rule consequence: + +1. This rule must **not** use a single flat estimate such as `$3.60/month`. +2. `estimated_monthly_cost_usd` should remain `None` unless a future implementation has a documented, + region-aware pricing source with SKU/allocation specificity. + +### 3.4 SDK control-plane shape used by implementations + +The Azure Python SDK `PublicIPAddress` model exposes control-plane linkage fields including: + +- `ip_configuration` +- `nat_gateway` +- `service_public_ip_address` +- `linked_public_ip_address` +- `provisioning_state` +- `public_ip_allocation_method` +- `ip_address` + +This model is generated from the Azure Network control-plane schema and is the implementation surface +used by this rule. 
+ +Rule consequence: + +The rule should resolve attachment by evaluating these linkage fields, using SDK projections first and +nested/raw fallback only if needed, including ARM-style nested properties fields when present. + +--- + +## 4. Detection Goal + +Emit a finding only when **all** of the following are true: + +1. `public_ip.id` is present and non-empty +2. `public_ip.name` is present and non-empty +3. optional region filter matches the normalized location +4. provisioning state resolves to exactly `"Succeeded"` +5. the Public IP is **not attached** under the canonical attachment contract +6. the Public IP is **not** an unattached dynamic placeholder with no assigned `ipAddress` + +If any required signal cannot be established reliably, skip rather than emit. + +--- + +## 5. Non-Goals + +This rule does **not** attempt to prove: + +- that the Public IP is safe to delete +- that DNS references no longer exist +- that firewall allowlists will not break +- that no future attachment is planned +- that the resource is currently billed at a specific monthly amount + +--- + +## 6. Canonical Inputs + +| API / signal | SDK method / source | Required permission | +|---|---|---| +| Public IP inventory | `network_client.public_ip_addresses.list_all()` | `Microsoft.Network/publicIPAddresses/read` | +| Public IP fields | SDK projections for `id`, `name`, `location`, `sku`, `public_ip_allocation_method`, `ip_address`, `ip_configuration`, `nat_gateway`, `service_public_ip_address`, `linked_public_ip_address`, `provisioning_state`, `ip_tags`, `zones`, `tags`; raw/nested fallback only if needed | `Microsoft.Network/publicIPAddresses/read` | + +No Azure Monitor metrics are required by this rule. + +--- + +## 7. Normalization Contract + +| Field | Normalization | +|---|---| +| `location` | Lowercase ARM location string; compare by exact lowercase string equality only. Do not remove spaces, hyphens, or digits. 
| +| `public_ip_allocation_method` | Compare case-sensitively to canonical Azure values such as `"Static"` or `"Dynamic"` after SDK/raw resolution. | +| `provisioning_state` | Compare case-sensitively to canonical Azure value `"Succeeded"` after SDK/raw resolution. | +| attachment references | Treat `None` as absent. A reference with a non-empty `id` is attached. | +| `tags` | `public_ip.tags or {}` — never `None` in output | + +--- + +## 8. Unified Decision Rule + +| # | Condition | Action | +|---|---|---| +| 8.1 | `id` absent, `None`, or empty | Skip | +| 8.2 | `name` absent, `None`, or empty | Skip | +| 8.3 | Region filter set and normalized location does not match | Skip | +| 8.4 | Provisioning state does not resolve to `"Succeeded"` | Skip | +| 8.5 | Any attachment linkage is present under the attachment contract | Skip | +| 8.6 | Dynamic-placeholder contract is triggered | Skip | +| 8.7 | All required signals resolve, all known attachment linkages are absent, and the dynamic-placeholder contract is not triggered | **EMIT** | + +--- + +## 9. Canonical Evaluation Contracts + +### 9.1 Provisioning-state contract + +Resolve provisioning state in this order: + +1. SDK projection such as `public_ip.provisioning_state` +2. nested/raw properties projection if present +3. otherwise unknown + +Only `"Succeeded"` is eligible for evaluation. Unknown or any other value must skip. + +### 9.2 Attachment contract + +Treat the Public IP as **attached** when any one of the following resolves to a non-empty reference: + +1. `ip_configuration` +2. `nat_gateway` +3. `service_public_ip_address` +4. `linked_public_ip_address` + +Required behavior: + +1. Prefer SDK projections first. +2. Fall back to the matching ARM-style `properties.*` field only if needed. +3. When a reference object is present, a non-empty `id` counts as attached. +4. If attachment linkage cannot be resolved reliably, skip rather than emit. 
+ +Canonical SDK-to-ARM linkage mapping: + +| SDK field | Raw ARM fallback | +|---|---| +| `ip_configuration` | `properties.ipConfiguration` | +| `nat_gateway` | `properties.natGateway` | +| `service_public_ip_address` | `properties.servicePublicIPAddress` | +| `linked_public_ip_address` | `properties.linkedPublicIPAddress` | + +Rationale: + +- Azure documents many valid Public IP association targets beyond NICs. +- Azure SDK exposes multiple control-plane linkage fields beyond `ip_configuration`. +- A Public IP with any known control-plane attachment should not be emitted as unused. + +### 9.3 Dynamic-placeholder contract + +If the Public IP is unattached under the attachment contract: + +- and `public_ip_allocation_method == "Dynamic"` +- and `ip_address` is absent or empty + +then skip rather than emit. + +Rationale: + +Microsoft documents that dynamic Public IPs receive an address when associated. An unattached dynamic +resource without an assigned IP is a weaker placeholder/provisioning signal and would create more noise +than value in this rule. + +### 9.4 Context-only fields + +The following may appear in details/evidence but must not create or suppress findings directly: + +- `sku` +- `public_ip_address_version` +- `public_ip_prefix` +- `dns_settings` +- `delete_option` +- `ip_tags` +- `zones` + +--- + +## 10. Cost Model + +`estimated_monthly_cost_usd = None` + +Mandatory rules: + +1. Do **not** use a fixed monthly estimate such as `$3.60` +2. Do **not** infer cost from attachment absence alone +3. Do **not** infer cost from SKU without a region-aware pricing source +4. Document that Azure Public IP pricing varies by SKU/type and billing semantics + +--- + +## 11. 
Finding Shape + +### 11.1 Required fields + +| Field | Value | +|---|---| +| `provider` | `"azure"` | +| `rule_id` | `"azure.network.public_ip.unused"` | +| `resource_type` | `"azure.network.public_ip"` | +| `resource_id` | Original ARM id from `public_ip.id` | +| `region` | Normalized location | +| `risk` | `LOW` | +| `confidence` | `HIGH` | +| `estimated_monthly_cost_usd` | `None` | + +### 11.2 Required evidence + +`signals_used` must include: + +1. `"Provisioning state is Succeeded"` +2. `"Public IP has no resolved attachment via ip_configuration, nat_gateway, service_public_ip_address, or linked_public_ip_address"` +3. `"Dynamic-placeholder contract not triggered"` + +`signals_not_checked` should include: + +1. `"Planned future association or reserved intent"` +2. `"DNS records or firewall allowlist references"` +3. `"Application-level reachability or traffic history"` +4. `"Exact Azure billing amount for this Public IP"` + +### 11.3 Required details + +| Key | Nullable | +|---|---| +| `resource_name` | No | +| `subscription_id` | No | +| `allocation_method` | Yes | +| `ip_address` | Yes | +| `sku` | Yes | +| `ip_version` | Yes | +| `ip_tags` | Yes | +| `attached` | No | +| `tags` | No (`{}` when absent) | + +--- + +## 12. Failure Behavior + +- If the Public IP list call raises, let the exception propagate +- If an individual Public IP record is malformed or missing required fields, skip that record +- If attachment linkage or provisioning state cannot be resolved reliably, skip that record +- Do not silently emit on incomplete control-plane attachment data + +--- + +## 13. Acceptance Examples + +### 13.1 Must emit + +1. `provisioning_state == "Succeeded"`, all four attachment linkages absent, allocation `"Static"` -> **EMIT** +2. `provisioning_state == "Succeeded"`, all four attachment linkages absent, allocation `"Dynamic"`, `ip_address` present -> **EMIT** + +### 13.2 Must skip + +1. `ip_configuration` present -> **SKIP** +2. 
`nat_gateway` present -> **SKIP** +3. `service_public_ip_address` present -> **SKIP** +4. `linked_public_ip_address` present -> **SKIP** +5. allocation `"Dynamic"` and `ip_address == None` -> **SKIP** +6. `provisioning_state != "Succeeded"` -> **SKIP** +7. region filter mismatch -> **SKIP** +8. `id == None` or `name == None` -> **SKIP** + +--- + +## 14. Anti-Goals + +Implementations must **not**: + +1. treat `ip_configuration is None` as sufficient proof of unused by itself +2. use a fixed Public IP monthly cost estimate +3. emit on unattached dynamic placeholder-style Public IPs with no assigned address +4. infer delete safety from attachment absence +5. require Azure Monitor metrics for this rule + +--- + +## 15. Rule Summary + +Rule: `azure.network.public_ip.unused` + +- **Signal:** fully unattached Public IP across known Azure control-plane linkage fields +- **Type:** conservative review candidate +- **Confidence:** `HIGH` +- **Risk:** `LOW` +- **Cost:** `None` (pricing varies by SKU/type and is not estimated by this rule) diff --git a/docs/specs/azure/sql_database_idle.md b/docs/specs/azure/sql_database_idle.md new file mode 100644 index 0000000..f24e887 --- /dev/null +++ b/docs/specs/azure/sql_database_idle.md @@ -0,0 +1,455 @@ +# Azure Rule Spec — `azure.sql.database.idle` + +## 1. Rule Identity + +- **Rule ID:** `azure.sql.database.idle` +- **Provider:** Azure +- **ARM resource type:** `Microsoft.Sql/servers/databases` +- **Finding resource_type:** `azure.sql.database` + +--- + +## 2. Intent + +Detect **dedicated Azure SQL Database single-database resources** that show **no observable user workload activity** over the configured idle window and therefore represent conservative cleanup or rightsizing review candidates. + +This rule is deliberately **low-noise**. It is a **review-candidate** rule only, not proof that a database is delete-safe, not proof that no business continuity purpose exists, and not proof of a specific monthly saving. + +--- + +## 3. 
Azure Documentation Grounding + +### 3.1 Azure SQL Database metrics + +Microsoft documents Azure Monitor metrics for `Microsoft.Sql/servers/databases`, including: + +- `connection_successful` +- `sessions_count` +- `cpu_percent` +- `physical_data_read_percent` +- `log_write_percent` + +Source: + +- *Monitoring data reference for Azure SQL Database* +- *Monitor resource utilization and query activity for Azure SQL Database* + +Rule consequence: + +1. `connection_successful` alone is not a sufficient idle signal. +2. A conservative idle rule should require **multiple zero-activity metrics** for the same window. +3. If any required metric cannot be resolved reliably, the database must be skipped rather than emitted. +4. These are **Azure Monitor platform metrics** for `Microsoft.Sql/servers/databases`, not DMV-derived or Log Analytics-only signals. + +### 3.2 Elastic pool billing semantics + +Microsoft documents that databases in an elastic pool share pool resources and that **there is no per-database charge for elastic pools**; billing is at the pool level. + +Source: *What are SQL elastic pools?* + +Rule consequence: + +Pooled databases must be **excluded** from this rule because per-database idleness does not directly imply per-database savings. + +### 3.3 Purchasing models and serverless compute + +Microsoft documents that Azure SQL Database supports DTU and vCore purchasing models, and that the vCore model includes a **serverless compute tier** that can automatically pause during inactive periods. + +Sources: + +- *Purchasing models in Azure SQL Database* +- *Serverless compute tier for Azure SQL Database* + +Rule consequence: + +1. Exact monthly savings cannot be inferred from SKU alone. +2. The rule must not use a flat monthly estimate. +3. Serverless requires special handling because inactivity can already be partially optimized by the platform. 
+ +### 3.4 Serverless auto-pause + +Microsoft documents that, for serverless databases: + +- databases can automatically pause after an inactivity delay +- when a database is **paused**, **compute cost is zero** and only storage is billed +- auto-pause is currently supported only in the General Purpose tier + +Source: *Serverless compute tier for Azure SQL Database* + +Rule consequence: + +1. A database in a paused state must be **skipped**. +2. The rule should avoid treating already-paused serverless databases as high-confidence waste findings. + +### 3.5 Control-plane resource shape + +Microsoft REST/ARM documentation for `Microsoft.Sql/servers/databases` exposes database fields including: + +- `status` +- `creationDate` +- `currentServiceObjectiveName` +- `elasticPoolId` +- `autoPauseDelay` +- `pausedDate` +- `resumedDate` +- `secondaryType` +- `sourceDatabaseId` +- `sku` +- `tags` + +Source: + +- *Databases - Get (REST API)* +- *Microsoft.Sql/servers/databases ARM / Bicep reference* + +Rule consequence: + +These fields provide the canonical control-plane inputs for state, pool membership, serverless pause context, and replica/secondary-shaped exclusions. + +### 3.6 Geo-replication and failover groups + +Microsoft documents that: + +- active geo-replication creates **readable secondary databases** +- failover groups manage **replication and failover** for business continuity +- secondary databases can be used to **offload read-only workloads** + +Sources: + +- *Active geo-replication overview* +- *Failover groups overview* + +Rule consequence: + +Replica / secondary-shaped databases are valid operational resources and must be **skipped** to avoid false findings. + +--- + +## 4. Detection Goal + +Emit a finding only when **all** of the following are true: + +1. `database.id` is present and non-empty +2. `database.name` is present and non-empty +3. the optional region filter matches the normalized location +4. `database.status` resolves to exactly `"Online"` +5. 
the database is **not** the `master` system database +6. the database is old enough to cover the full observation window +7. the database is **not** in an elastic pool +8. the database is **not** replica / secondary-shaped under the replica exclusion contract +9. the database is **not** currently paused +10. all required activity metrics resolve reliably for the same window +11. all required activity metrics are zero for the same window + +If any required signal cannot be established reliably, skip rather than emit. + +--- + +## 5. Non-Goals + +This rule does **not** attempt to prove: + +- that deleting the database is safe +- that the parent logical server is removable +- that the database is not required for DR, read scale-out, failback, or migration workflows +- that no future application rollout depends on the database +- that the database produces a specific monthly saving + +--- + +## 6. Canonical Inputs + +### 6.1 Required control-plane surfaces + +The implementation may use: + +- `sql_client.servers.list()` +- `sql_client.databases.list_by_server(resource_group, server_name)` +- Azure Monitor platform metrics for the database ARM id + +It must **not** require DMV queries, in-database SQL access, or Log Analytics workspace data for rule correctness. + +Optional per-database `get(...)` reads are allowed if needed for reliable normalization, but any lookup failure must remain conservative. + +### 6.2 Idle window + +- Configurable parameter: `idle_days` +- Default: `14` +- Evaluation window: + - `window_end = now` + - `window_start = now - idle_days` + +--- + +## 7. Normalization Contract + +| Field | Normalization | +|---|---| +| `location` | Lowercase ARM location string; compare by exact lowercase string equality only. Do not remove spaces, hyphens, or digits. | +| `status` | Compare case-sensitively to canonical Azure value `"Online"` after SDK/raw resolution. | +| `elastic_pool_id` | Treat non-empty value as pooled. 
Lowercase and trim trailing `/` when used for comparisons or diagnostics. | +| `secondary_type` | Treat non-empty value as replica / secondary context. | +| `source_database_id` | Treat non-empty value as lineage context; use only as a conservative secondary-shaped exclusion signal when paired with replica indicators or equivalent control-plane context. | +| `creation_date` | Parse as UTC instant. If absent or invalid, age is unknown. | +| `tags` | `database.tags or {}` — never `None` in output. | + +--- + +## 8. Unified Decision Rule + +| # | Condition | Action | +|---|---|---| +| 8.1 | `id` absent, `None`, or empty | Skip | +| 8.2 | `name` absent, `None`, or empty | Skip | +| 8.3 | Region filter set and normalized location does not match | Skip | +| 8.4 | `status` does not resolve to `"Online"` | Skip | +| 8.5 | `name == "master"` | Skip | +| 8.6 | Database age is unknown or less than `idle_days` | Skip | +| 8.7 | Database is in an elastic pool | Skip | +| 8.8 | Replica / secondary exclusion contract is triggered | Skip | +| 8.9 | Current paused-state contract is triggered | Skip | +| 8.10 | One or more required metrics cannot be resolved reliably | Skip | +| 8.11 | Any required metric is non-zero over the idle window | Skip | +| 8.12 | All required signals resolve and all required metrics are zero over the idle window | **EMIT** | + +--- + +## 9. Canonical Evaluation Contracts + +### 9.1 Online-state contract + +Resolve database state in this order: + +1. SDK projection such as `database.status` +2. nested/raw properties projection if present +3. otherwise unknown + +Only `"Online"` is eligible for evaluation. Unknown or any other value must skip. + +### 9.2 Age contract + +Resolve `creation_date` in this order: + +1. SDK projection such as `database.creation_date` +2. nested/raw properties projection such as `database.properties.creationDate` +3. otherwise unknown + +Required behavior: + +1. If `creation_date` is absent, invalid, or unparseable -> skip +2. 
If database age is less than `idle_days` -> skip +3. Only databases old enough for the full observation window are eligible + +### 9.3 Elastic-pool exclusion contract + +Treat the database as pooled when `elastic_pool_id` resolves to a non-empty value. + +Pooled databases must skip because Microsoft documents that billing is at the **pool** level, not the database level. + +### 9.4 Replica / secondary exclusion contract + +Skip when reliable control-plane signals indicate the database is replica / secondary-shaped, including: + +1. `secondary_type` resolves to a non-empty value +2. equivalent secondary/replica-shaped control-plane context is present + +Canonical replica-signal mapping: + +| Signal | Meaning | +|---|---| +| `secondary_type` | explicit replica / secondary indicator | +| `source_database_id` plus secondary/replica-shaped control-plane context | lineage tied to replica / restore / failover context | +| nested/raw replica fields such as `properties.secondaryType` or `properties.sourceDatabaseId` | conservative replica / secondary indicator when clearly present | + +Conservative guidance: + +1. Prefer SDK projections first. +2. Fall back to nested/raw properties fields only if needed. +3. If replica / secondary context cannot be resolved reliably, skip rather than emit. + +This exclusion is required because Microsoft documents readable secondary databases and failover-group replicas as valid DR and read-scale resources. + +### 9.5 Current paused-state contract + +Skip when reliable control-plane signals indicate the database is currently paused, including: + +1. `status == "Paused"` +2. equivalent paused-state control-plane context such as a current `paused_date` without evidence of a later resume + +Required behavior: + +1. Prefer SDK projections first. +2. Fall back to nested/raw properties fields only if needed. +3. If current paused state cannot be resolved reliably for a serverless-shaped database, skip rather than emit. 
+ +### 9.6 Activity-metrics contract + +The following Azure Monitor **platform metrics** must be queried for the same `timespan`, where: + +- `timespan = window_start / window_end` +- `window_start = now - idle_days` +- `window_end = now` + +| Metric | REST name | Aggregation | +|---|---|---| +| Successful connections | `connection_successful` | `Total` | +| Sessions count | `sessions_count` | `Maximum` | +| CPU percentage | `cpu_percent` | `Maximum` | +| Data IO percentage | `physical_data_read_percent` | `Maximum` | +| Log IO percentage | `log_write_percent` | `Maximum` | + +Interpretation: + +1. If any aggregated datapoint returned for a required metric is `> 0`, treat the database as **active** +2. If Azure Monitor returns a usable metric series for the requested `timespan`, and all datapoints in that returned series are `0` or absent/`None`, that metric is **zero for the window** +3. If a metric query succeeds but the metric itself is absent from the response, treat that metric as **unknown** and skip +4. If a metric query succeeds and the metric is present but the series is empty, partial, or otherwise unusable for the requested `timespan`, treat that metric as **unknown** and skip +5. Implementations do **not** need bucket-by-bucket timestamp alignment across metrics; the contract is shared `timespan`, not identical datapoint timestamps +6. Missing data is **not** equivalent to proven zero activity unless Azure Monitor returned a usable metric series for that metric over the requested `timespan` + +### 9.7 Emission threshold + +Emit only when **all five required metrics** are zero for the full observation window. 
+ +This stronger threshold is required because: + +- `connection_successful == 0` alone is not enough +- an existing session can produce work without a new connection +- read / write / CPU activity can reveal user workload even when connection counts are sparse + +### 9.8 Context-only fields + +The following may appear in details/evidence but must not create or suppress findings directly: + +- `sku` +- `current_service_objective_name` +- `min_capacity` +- `max_size_bytes` +- `license_type` +- `zone_redundant` +- `auto_pause_delay` + +--- + +## 10. Cost Model + +`estimated_monthly_cost_usd = None` + +Mandatory rules: + +1. Do **not** use a flat estimate derived from DTU-era SKU tables +2. Do **not** infer cost from `current_service_objective_name` alone +3. Do **not** infer per-database savings for pooled databases +4. Document that Azure SQL pricing varies by purchasing model, tier, compute shape, storage, backup, and serverless behavior + +--- + +## 11. Finding Shape + +### 11.1 Required fields + +| Field | Value | +|---|---| +| `provider` | `"azure"` | +| `rule_id` | `"azure.sql.database.idle"` | +| `resource_type` | `"azure.sql.database"` | +| `resource_id` | Original ARM id from `database.id` | +| `region` | Normalized location | +| `risk` | `HIGH` | +| `confidence` | `HIGH` | +| `estimated_monthly_cost_usd` | `None` | + +### 11.2 Required evidence + +`signals_used` must clearly disclose: + +1. database state is `"Online"` +2. database age is at least `idle_days` +3. database is not pooled +4. replica / secondary exclusion contract is not triggered +5. paused-state contract is not triggered +6. zero `connection_successful` over the idle window +7. zero `sessions_count` over the idle window +8. zero `cpu_percent` over the idle window +9. zero `physical_data_read_percent` over the idle window +10. zero `log_write_percent` over the idle window + +`signals_not_checked` should include remaining blind spots such as: + +1. 
planned future cutover or deployment intent +2. undeclared business continuity requirements +3. workload activity outside documented rule signals +4. exact Azure billing amount for this database + +### 11.3 Required details + +Details should include at least: + +- `database_name` +- `server_name` +- `status` +- `current_service_objective_name` +- `sku_tier` +- `elastic_pool_id` +- `auto_pause_delay` +- `paused_date` +- `creation_date` +- `idle_days` +- `connection_successful` +- `sessions_count` +- `cpu_percent` +- `physical_data_read_percent` +- `log_write_percent` +- `tags` + +--- + +## 12. Failure Behavior + +- If the server list call raises, let the exception propagate +- If per-server database listing fails, skip that server rather than emit partial guesses +- If any individual database record is malformed or missing required fields, skip that database +- If any required metric query fails or returns unusable data, skip that database +- Do not silently emit on partial control-plane or metric data + +--- + +## 13. Acceptance Examples + +### 13.1 Must emit + +1. A dedicated single database with `status == "Online"`, age >= 14 days, no elastic pool, no secondary/replica signals, not paused, and all five required metrics zero for 14 days -> **EMIT** + +### 13.2 Must skip + +1. `master` database -> **SKIP** +2. database in an elastic pool (`elastic_pool_id` present) -> **SKIP** +3. database status is not `"Online"` -> **SKIP** +4. serverless database currently paused -> **SKIP** +5. readable secondary / geo-secondary / failover secondary-shaped database -> **SKIP** +6. database younger than `idle_days` -> **SKIP** +7. `connection_successful == 0` but `sessions_count > 0` -> **SKIP** +8. `connection_successful == 0` but `cpu_percent > 0` -> **SKIP** +9. any required metric query fails or is unavailable -> **SKIP** + +--- + +## 14. Anti-Goals + +Implementations must **not**: + +1. treat `connection_successful == 0` as sufficient proof of idleness by itself +2. 
emit for pooled databases +3. emit for currently paused serverless databases +4. emit for replica / DR-shaped databases +5. use fixed monthly price tables for findings + +--- + +## 15. Rule Summary + +Rule: `azure.sql.database.idle` + +- **Signal:** dedicated single database with zero successful connections, zero sessions, zero CPU, zero data IO, and zero log IO over `idle_days` +- **Primary exclusions:** pooled databases, replicas / secondaries, paused databases, young databases, non-Online databases +- **Cost model:** no flat estimate; `estimated_monthly_cost_usd = None` diff --git a/pyproject.toml b/pyproject.toml index d32303e..0d10470 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "cleancloud" -version = "1.23.0" +version = "1.24.0" description = "Read-only cloud hygiene for AWS, Azure, and GCP. Multi-account org scanning, CI/CD enforcement, and deterministic cost modeling. No agents, no telemetry." readme = "README.md" requires-python = ">=3.10" diff --git a/tests/cleancloud/providers/azure/test_azure_lb_no_backends.py b/tests/cleancloud/providers/azure/test_azure_lb_no_backends.py index c43702c..476ffa5 100644 --- a/tests/cleancloud/providers/azure/test_azure_lb_no_backends.py +++ b/tests/cleancloud/providers/azure/test_azure_lb_no_backends.py @@ -1,159 +1,1029 @@ +""" +Tests for azure.load_balancer.no_backends — spec-aligned. + +Covers: must-emit, must-skip, billable-rule contract, relevant-pool contract, + membership contract, finding shape, evidence contract, region filter, + failure behavior. 
+""" + from types import SimpleNamespace import pytest from cleancloud.providers.azure.rules.lb_no_backends import find_lb_no_backends +# --------------------------------------------------------------------------- +# Test helpers +# --------------------------------------------------------------------------- + +_SUB = "sub-123" +_RG = "rg" + + +def _pool_arm_id(pool_name: str, lb_name: str = "lb") -> str: + return ( + f"/subscriptions/{_SUB}/resourceGroups/{_RG}/providers/" + f"Microsoft.Network/loadBalancers/{lb_name}/backendAddressPools/{pool_name}" + ) -def _make_pool(nic_backends=None, ip_backends=None): + +def _make_pool(name: str, lb_name: str = "lb", nic_backends=None, ip_backends=None): + """Backend address pool with an ARM id and optional members.""" return SimpleNamespace( + id=_pool_arm_id(name, lb_name), backend_ip_configurations=nic_backends, load_balancer_backend_addresses=ip_backends, ) +def _pool_ref(pool_name: str, lb_name: str = "lb"): + """SubResource-style pool reference (as returned from a rule's backendAddressPool).""" + return SimpleNamespace(id=_pool_arm_id(pool_name, lb_name)) + + +def _make_lb_rule(pool_name: str = None, pool_names=None, lb_name: str = "lb"): + """Load-balancing rule referencing one pool (single) or multiple pools (multi).""" + return SimpleNamespace( + backend_address_pool=_pool_ref(pool_name, lb_name) if pool_name else None, + backend_address_pools=([_pool_ref(n, lb_name) for n in pool_names] if pool_names else []), + ) + + +def _make_outbound_rule(pool_name: str, lb_name: str = "lb"): + """Outbound rule with a single backend pool reference.""" + return SimpleNamespace( + backend_address_pool=_pool_ref(pool_name, lb_name), + backend_address_pools=[], + ) + + def _make_lb( - name, - sku_name="Standard", + name: str = "lb", + sku_name: str = "Standard", pools=None, - location="eastus", + location: str = "eastus", tags=None, - provisioning_state="Succeeded", + provisioning_state: str = "Succeeded", frontend_ips=None, - 
rules=None, + lb_rules=None, + outbound_rules=None, + lb_id: str = None, ): return SimpleNamespace( - id=f"/subscriptions/sub-123/resourceGroups/rg/providers/Microsoft.Network/loadBalancers/{name}", + id=( + lb_id + or f"/subscriptions/{_SUB}/resourceGroups/{_RG}/providers/" + f"Microsoft.Network/loadBalancers/{name}" + ), name=name, location=location, sku=SimpleNamespace(name=sku_name, tier="Regional"), - backend_address_pools=pools, + backend_address_pools=pools if pools is not None else [], frontend_ip_configurations=frontend_ips or [], - load_balancing_rules=rules or [], + load_balancing_rules=lb_rules if lb_rules is not None else [], + outbound_rules=outbound_rules if outbound_rules is not None else [], provisioning_state=provisioning_state, tags=tags, ) -@pytest.fixture -def mock_network_client(mocker): - lbs = [ - # Standard + all pools empty -> should be flagged - _make_lb("lb-empty", pools=[_make_pool()]), - # Standard + has NIC-based backend -> skip - _make_lb("lb-with-nic-backend", pools=[_make_pool(nic_backends=[{"id": "nic-1"}])]), - # Standard + has IP-based backend -> skip (Private Link / hybrid) - _make_lb("lb-with-ip-backend", pools=[_make_pool(ip_backends=[{"ip": "10.0.0.1"}])]), - # Standard + no pools at all -> should be flagged - _make_lb("lb-no-pools", pools=[]), - # Basic SKU + empty -> skip (no cost signal) - _make_lb("lb-basic-empty", sku_name="Basic", pools=[_make_pool()]), - # Standard + still provisioning -> skip - _make_lb("lb-provisioning", pools=[_make_pool()], provisioning_state="Creating"), - ] - client = mocker.MagicMock() - client.load_balancers.list_all.return_value = lbs - return client - - -def test_find_lb_no_backends(mock_network_client): - findings = find_lb_no_backends( - subscription_id="sub-123", - credential=None, - region_filter="eastus", - client=mock_network_client, - ) - names = [f.details["resource_name"] for f in findings] - - # Only the two empty Standard LBs should be flagged - assert len(findings) == 2 - assert 
"lb-empty" in names - assert "lb-no-pools" in names - - # Verify finding fields - for f in findings: - assert f.provider == "azure" - assert f.rule_id == "azure.load_balancer.no_backends" - assert f.confidence.value == "high" - assert f.risk.value == "low" - assert f.title == "Standard Load Balancer Has No Backend Members" - assert f.details["sku_name"] == "Standard" - assert f.estimated_monthly_cost_usd == 18.0 - - # Not flagged - assert "lb-with-nic-backend" not in names - assert "lb-with-ip-backend" not in names - assert "lb-basic-empty" not in names - assert "lb-provisioning" not in names - - -def test_find_lb_no_backends_empty_subscription(mocker): - client = mocker.MagicMock() - client.load_balancers.list_all.return_value = [] - - findings = find_lb_no_backends( - subscription_id="sub-123", +def _run(lbs, region_filter=None): + client = SimpleNamespace(load_balancers=SimpleNamespace(list_all=lambda: lbs)) + return find_lb_no_backends( + subscription_id=_SUB, credential=None, + region_filter=region_filter, client=client, ) - assert findings == [] -def test_find_lb_no_backends_region_filter(mocker): - lbs = [ - _make_lb("lb-east", location="eastus", pools=[_make_pool()]), - _make_lb("lb-west", location="westus", pools=[_make_pool()]), - ] - client = mocker.MagicMock() - client.load_balancers.list_all.return_value = lbs +# --------------------------------------------------------------------------- +# TestMustEmit — spec 14.1 +# --------------------------------------------------------------------------- - findings = find_lb_no_backends( - subscription_id="sub-123", - credential=None, - region_filter="eastus", - client=client, - ) - assert len(findings) == 1 - assert "lb-east" in findings[0].resource_id - - -def test_find_lb_no_backends_mixed_pools(mocker): - """LB with multiple pools where one has members — should NOT be flagged.""" - lbs = [ - _make_lb( - "lb-mixed", - pools=[ - _make_pool(), # empty pool - _make_pool(nic_backends=[{"id": "nic-1"}]), # pool with 
member - ], - ), - ] - client = mocker.MagicMock() - client.load_balancers.list_all.return_value = lbs - findings = find_lb_no_backends( - subscription_id="sub-123", - credential=None, - client=client, - ) - assert len(findings) == 0 +class TestMustEmit: + def test_one_lb_rule_one_empty_pool(self): + """Standard, Succeeded, 1 LB rule → 1 empty pool → EMIT.""" + pool = _make_pool("p1") + lb = _make_lb( + pools=[pool], + lb_rules=[_make_lb_rule("p1")], + ) + findings = _run([lb]) + assert len(findings) == 1 + def test_one_outbound_rule_one_empty_pool(self): + """Standard, Succeeded, 1 outbound rule → 1 empty pool → EMIT.""" + pool = _make_pool("p1") + lb = _make_lb( + pools=[pool], + outbound_rules=[_make_outbound_rule("p1")], + ) + findings = _run([lb]) + assert len(findings) == 1 -def test_find_lb_no_backends_multiple_empty_pools(mocker): - """LB with multiple pools all empty — should be flagged.""" - lbs = [ - _make_lb( - "lb-all-empty", - pools=[_make_pool(), _make_pool(), _make_pool()], - ), - ] - client = mocker.MagicMock() - client.load_balancers.list_all.return_value = lbs + def test_two_lb_rules_two_distinct_empty_pools(self): + """Two LB rules referencing two different empty pools → EMIT.""" + pool_a = _make_pool("pa") + pool_b = _make_pool("pb") + lb = _make_lb( + pools=[pool_a, pool_b], + lb_rules=[_make_lb_rule("pa"), _make_lb_rule("pb")], + ) + findings = _run([lb]) + assert len(findings) == 1 + assert findings[0].details["relevant_backend_pool_count"] == 2 - findings = find_lb_no_backends( - subscription_id="sub-123", - credential=None, - client=client, - ) - assert len(findings) == 1 - assert findings[0].details["backend_pool_count"] == 3 + def test_lb_rule_using_multi_pool_ref(self): + """LB rule using backend_address_pools (list form) → EMIT.""" + pool_a = _make_pool("pa") + pool_b = _make_pool("pb") + lb = _make_lb( + pools=[pool_a, pool_b], + lb_rules=[_make_lb_rule(pool_names=["pa", "pb"])], + ) + findings = _run([lb]) + assert len(findings) == 1 
+ + def test_mixed_lb_and_outbound_rules_both_empty(self): + """One LB rule + one outbound rule both referencing the same empty pool → EMIT.""" + pool = _make_pool("p1") + lb = _make_lb( + pools=[pool], + lb_rules=[_make_lb_rule("p1")], + outbound_rules=[_make_outbound_rule("p1")], + ) + findings = _run([lb]) + assert len(findings) == 1 + assert findings[0].details["load_balancing_rule_count"] == 1 + assert findings[0].details["outbound_rule_count"] == 1 + + +# --------------------------------------------------------------------------- +# TestMustSkip — spec 14.2 +# --------------------------------------------------------------------------- + + +class TestMustSkip: + def test_basic_sku_skipped(self): + pool = _make_pool("p1") + lb = _make_lb(sku_name="Basic", pools=[pool], lb_rules=[_make_lb_rule("p1")]) + assert _run([lb]) == [] + + def test_gateway_sku_skipped(self): + pool = _make_pool("p1") + lb = _make_lb(sku_name="Gateway", pools=[pool], lb_rules=[_make_lb_rule("p1")]) + assert _run([lb]) == [] + + def test_provisioning_state_creating_skipped(self): + pool = _make_pool("p1") + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")], provisioning_state="Creating") + assert _run([lb]) == [] + + def test_provisioning_state_none_skipped(self): + """None provisioning state must skip — not treated as Succeeded.""" + pool = _make_pool("p1") + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")], provisioning_state=None) + assert _run([lb]) == [] + + def test_no_billable_rules_skipped(self): + """No load-balancing rules and no outbound rules → skip even if pools are empty.""" + pool = _make_pool("p1") + lb = _make_lb(pools=[pool]) # no lb_rules, no outbound_rules + assert _run([lb]) == [] + + def test_only_inbound_nat_rules_not_billable(self): + """Inbound NAT rules are not billable — LB with only NAT rules must skip.""" + pool = _make_pool("p1") + inbound_nat_rule = SimpleNamespace( + backend_address_pool=_pool_ref("p1"), backend_address_pools=[] + ) + lb = 
SimpleNamespace( + id="/subscriptions/sub-123/resourceGroups/rg/providers/Microsoft.Network/loadBalancers/lb", + name="lb", + location="eastus", + sku=SimpleNamespace(name="Standard", tier="Regional"), + backend_address_pools=[pool], + frontend_ip_configurations=[], + load_balancing_rules=[], + outbound_rules=[], + inbound_nat_rules=[inbound_nat_rule], + provisioning_state="Succeeded", + tags=None, + ) + assert _run([lb]) == [] + + def test_referenced_pool_has_nic_members_skipped(self): + pool = _make_pool("p1", nic_backends=[{"id": "nic-1"}]) + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")]) + assert _run([lb]) == [] + + def test_referenced_pool_has_ip_members_skipped(self): + pool = _make_pool("p1", ip_backends=[SimpleNamespace(ip_address="10.0.0.1")]) + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")]) + assert _run([lb]) == [] + + def test_pool_ref_with_no_id_skipped(self): + """LB rule referencing a pool object whose id is None → unresolvable → skip.""" + pool = _make_pool("p1") + bad_rule = SimpleNamespace( + backend_address_pool=SimpleNamespace(id=None), + backend_address_pools=[], + ) + lb = _make_lb(pools=[pool], lb_rules=[bad_rule]) + assert _run([lb]) == [] + + def test_referenced_pool_not_in_inventory_skipped(self): + """Rule references a pool id not present in LB's backend_address_pools → skip.""" + pool_in_inventory = _make_pool("p1") + rule_refs_unknown = _make_lb_rule("p-unknown") # references "p-unknown", not "p1" + lb = _make_lb(pools=[pool_in_inventory], lb_rules=[rule_refs_unknown]) + assert _run([lb]) == [] + + def test_lb_rule_with_no_pool_reference_skipped(self): + """Billable rule with neither backend_address_pool nor backend_address_pools → skip.""" + pool = _make_pool("p1") + empty_rule = SimpleNamespace( + backend_address_pool=None, + backend_address_pools=[], + ) + lb = _make_lb(pools=[pool], lb_rules=[empty_rule]) + assert _run([lb]) == [] + + def test_absent_id_skipped(self): + pool = _make_pool("p1") + lb = 
_make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")], lb_id=None) + # Override id to None + lb.id = None + assert _run([lb]) == [] + + def test_absent_name_skipped(self): + pool = _make_pool("p1") + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")]) + lb.name = None + assert _run([lb]) == [] + + def test_region_filter_mismatch_skipped(self): + pool = _make_pool("p1") + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")], location="westus") + assert _run([lb], region_filter="eastus") == [] + + +# --------------------------------------------------------------------------- +# TestBillableRuleContract — spec 9.2 +# --------------------------------------------------------------------------- + + +class TestBillableRuleContract: + def test_lb_rule_counts_as_billable(self): + pool = _make_pool("p1") + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")]) + assert len(_run([lb])) == 1 + + def test_outbound_rule_counts_as_billable(self): + pool = _make_pool("p1") + lb = _make_lb(pools=[pool], outbound_rules=[_make_outbound_rule("p1")]) + assert len(_run([lb])) == 1 + + def test_billable_rule_count_in_signal(self): + """signals_used must include exact billable rule count.""" + pool_a = _make_pool("pa") + pool_b = _make_pool("pb") + lb = _make_lb( + pools=[pool_a, pool_b], + lb_rules=[_make_lb_rule("pa")], + outbound_rules=[_make_outbound_rule("pb")], + ) + findings = _run([lb]) + assert len(findings) == 1 + signals = findings[0].evidence.signals_used + assert any("Billable rule count is 2" in s for s in signals) + + +# --------------------------------------------------------------------------- +# TestRelevantPoolContract — spec 9.3 +# --------------------------------------------------------------------------- + + +class TestRelevantPoolContract: + def test_unreferenced_pool_with_members_does_not_suppress(self): + """Pool not referenced by any billable rule must not affect the finding.""" + empty_pool = _make_pool("empty") + unreferenced = 
_make_pool("unreferenced", nic_backends=[{"id": "nic-1"}]) + lb = _make_lb( + pools=[empty_pool, unreferenced], + lb_rules=[_make_lb_rule("empty")], # only references "empty" + ) + findings = _run([lb]) + assert len(findings) == 1 + assert findings[0].details["relevant_backend_pool_count"] == 1 + + def test_two_rules_same_pool_deduped(self): + """Two billable rules referencing the same pool → pool evaluated once.""" + pool = _make_pool("p1") + lb = _make_lb( + pools=[pool], + lb_rules=[_make_lb_rule("p1"), _make_lb_rule("p1")], + ) + findings = _run([lb]) + assert len(findings) == 1 + # pool should appear exactly once in relevant set + assert findings[0].details["relevant_backend_pool_count"] == 1 + + def test_partial_pool_missing_from_inventory_skips_lb(self): + """Two rules: one referenced pool is in inventory, one is not → skip.""" + pool_a = _make_pool("pa") + # "pb" referenced but not in inventory + lb = _make_lb( + pools=[pool_a], + lb_rules=[_make_lb_rule("pa"), _make_lb_rule("pb")], + ) + assert _run([lb]) == [] + + def test_pool_id_trailing_slash_normalized(self): + """Pool id with trailing slash in inventory still resolves correctly.""" + # Manually build a pool whose id has a trailing slash + pool = SimpleNamespace( + id=_pool_arm_id("p1") + "/", # trailing slash + backend_ip_configurations=None, + load_balancer_backend_addresses=None, + ) + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")]) + findings = _run([lb]) + assert len(findings) == 1 + + def test_pool_id_uppercase_normalized(self): + """Pool id with uppercase letters in inventory matches lowercase reference.""" + pool = SimpleNamespace( + id=_pool_arm_id("p1").upper(), # uppercase + backend_ip_configurations=None, + load_balancer_backend_addresses=None, + ) + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")]) + findings = _run([lb]) + assert len(findings) == 1 + + +# --------------------------------------------------------------------------- +# TestMembershipContract — spec 9.4 
+# --------------------------------------------------------------------------- + + +class TestMembershipContract: + def test_pool_with_nic_members_not_empty(self): + pool = _make_pool("p1", nic_backends=[{"id": "nic-1"}]) + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")]) + assert _run([lb]) == [] + + def test_pool_with_ip_members_not_empty(self): + pool = _make_pool("p1", ip_backends=[SimpleNamespace(ip_address="10.0.0.1")]) + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")]) + assert _run([lb]) == [] + + def test_pool_with_empty_nic_list_is_empty(self): + pool = _make_pool("p1", nic_backends=[]) + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")]) + assert len(_run([lb])) == 1 + + def test_pool_with_none_members_is_empty(self): + pool = _make_pool("p1", nic_backends=None, ip_backends=None) + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")]) + assert len(_run([lb])) == 1 + + def test_any_relevant_pool_with_members_skips(self): + """Two relevant pools — one empty, one has members → skip.""" + empty_pool = _make_pool("pa") + full_pool = _make_pool("pb", nic_backends=[{"id": "nic-1"}]) + lb = _make_lb( + pools=[empty_pool, full_pool], + lb_rules=[_make_lb_rule("pa"), _make_lb_rule("pb")], + ) + assert _run([lb]) == [] + + +# --------------------------------------------------------------------------- +# TestFindingShape — spec 12.1-12.3 +# --------------------------------------------------------------------------- + + +class TestFindingShape: + def _emit_one(self): + pool = _make_pool("p1") + lb = _make_lb( + name="lb-test", + pools=[pool], + lb_rules=[_make_lb_rule("p1")], + tags={"env": "test"}, + ) + findings = _run([lb]) + assert len(findings) == 1 + return findings[0] + + def test_provider(self): + assert self._emit_one().provider == "azure" + + def test_rule_id(self): + assert self._emit_one().rule_id == "azure.load_balancer.no_backends" + + def test_resource_type(self): + assert self._emit_one().resource_type == 
"azure.load_balancer" + + def test_resource_id_is_arm_id(self): + f = self._emit_one() + assert "Microsoft.Network/loadBalancers/lb-test" in f.resource_id + + def test_region_is_normalized(self): + pool = _make_pool("p1") + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")], location="EastUS") + findings = _run([lb]) + assert len(findings) == 1 + assert findings[0].region == "eastus" + + def test_estimated_cost_is_none(self): + """Spec 11 mandates None — must not be $18 or any hardcoded value.""" + assert self._emit_one().estimated_monthly_cost_usd is None + + def test_confidence_high(self): + assert self._emit_one().confidence == pytest.approx(self._emit_one().confidence) + assert self._emit_one().confidence.value == "high" + + def test_risk_low(self): + assert self._emit_one().risk.value == "low" + + def test_details_resource_name(self): + assert self._emit_one().details["resource_name"] == "lb-test" + + def test_details_subscription_id(self): + assert self._emit_one().details["subscription_id"] == _SUB + + def test_details_sku_name(self): + assert self._emit_one().details["sku_name"] == "Standard" + + def test_details_sku_tier(self): + assert self._emit_one().details["sku_tier"] == "Regional" + + def test_details_backend_pool_count(self): + assert self._emit_one().details["backend_pool_count"] == 1 + + def test_details_relevant_backend_pool_count(self): + assert self._emit_one().details["relevant_backend_pool_count"] == 1 + + def test_details_load_balancing_rule_count(self): + assert self._emit_one().details["load_balancing_rule_count"] == 1 + + def test_details_outbound_rule_count(self): + assert self._emit_one().details["outbound_rule_count"] == 0 + + def test_details_frontend_ip_count(self): + assert self._emit_one().details["frontend_ip_count"] == 0 + + def test_details_tags_normalized_to_dict(self): + assert self._emit_one().details["tags"] == {"env": "test"} + + def test_details_tags_none_becomes_empty_dict(self): + pool = _make_pool("p1") + lb = 
_make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")], tags=None) + findings = _run([lb]) + assert findings[0].details["tags"] == {} + + +# --------------------------------------------------------------------------- +# TestEvidenceContract — spec 12.2 +# --------------------------------------------------------------------------- + + +class TestEvidenceContract: + def _emit_one(self, billable_count=1): + pool = _make_pool("p1") + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")]) + findings = _run([lb]) + assert len(findings) == 1 + return findings[0].evidence + + def test_signals_used_sku(self): + assert "Load Balancer SKU is Standard" in self._emit_one().signals_used + + def test_signals_used_billable_count(self): + assert "Billable rule count is 1" in self._emit_one().signals_used + + def test_signals_used_membership_check(self): + assert ( + "All relevant backend pools evaluated to empty using NIC-based and IP-based membership checks" + in self._emit_one().signals_used + ) + + def test_signals_not_checked_planned_backend(self): + assert ( + "Planned backend attachment or cutover intent" in self._emit_one().signals_not_checked + ) + + def test_signals_not_checked_iac(self): + assert ( + "IaC-managed placeholder or staged deployment intent" + in self._emit_one().signals_not_checked + ) + + def test_signals_not_checked_traffic(self): + assert "Traffic history or future activation plans" in self._emit_one().signals_not_checked + + def test_signals_not_checked_frontend(self): + assert ( + "Frontend public IP cost or attachment evaluated by other rules" + in self._emit_one().signals_not_checked + ) + + def test_billable_count_reflects_two_rules(self): + pool_a = _make_pool("pa") + pool_b = _make_pool("pb") + lb = _make_lb( + pools=[pool_a, pool_b], + lb_rules=[_make_lb_rule("pa")], + outbound_rules=[_make_outbound_rule("pb")], + ) + findings = _run([lb]) + evidence = findings[0].evidence + assert "Billable rule count is 2" in evidence.signals_used + + +# 
--------------------------------------------------------------------------- +# TestRegionFilter — spec 8.3 / 7 +# --------------------------------------------------------------------------- + + +class TestRegionFilter: + def test_exact_match_emits(self): + pool = _make_pool("p1") + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")], location="eastus") + assert len(_run([lb], region_filter="eastus")) == 1 + + def test_case_insensitive_region_filter(self): + """Region filter is lowercased before comparison.""" + pool = _make_pool("p1") + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")], location="eastus") + assert len(_run([lb], region_filter="EastUS")) == 1 + + def test_location_stored_lowercase(self): + pool = _make_pool("p1") + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")], location="WestEurope") + findings = _run([lb]) + assert findings[0].region == "westeurope" + + def test_region_mismatch_skips(self): + pool = _make_pool("p1") + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")], location="westus") + assert _run([lb], region_filter="eastus") == [] + + def test_no_region_filter_includes_all(self): + pool = _make_pool("p1") + lb_east = _make_lb("east", pools=[pool], lb_rules=[_make_lb_rule("p1")], location="eastus") + lb_west = _make_lb("west", pools=[pool], lb_rules=[_make_lb_rule("p1")], location="westus") + assert len(_run([lb_east, lb_west])) == 2 + + +# --------------------------------------------------------------------------- +# TestFailureBehavior — spec 13 +# --------------------------------------------------------------------------- + + +class TestFailureBehavior: + def test_list_exception_propagates(self): + """If list_all() raises, the exception must propagate.""" + client = SimpleNamespace( + load_balancers=SimpleNamespace( + list_all=lambda: (_ for _ in ()).throw(RuntimeError("API down")) + ) + ) + with pytest.raises(RuntimeError, match="API down"): + find_lb_no_backends(subscription_id=_SUB, credential=None, 
client=client) + + def test_malformed_lb_skipped(self): + """An LB record with no id is skipped; scan continues for the next one.""" + good_pool = _make_pool("p1") + good_lb = _make_lb("good", pools=[good_pool], lb_rules=[_make_lb_rule("p1")]) + bad_lb = SimpleNamespace( + id=None, # missing id + name="bad", + location="eastus", + sku=SimpleNamespace(name="Standard", tier="Regional"), + backend_address_pools=[good_pool], + frontend_ip_configurations=[], + load_balancing_rules=[_make_lb_rule("p1")], + outbound_rules=[], + provisioning_state="Succeeded", + tags=None, + ) + findings = _run([bad_lb, good_lb]) + assert len(findings) == 1 + assert findings[0].details["resource_name"] == "good" + + +# --------------------------------------------------------------------------- +# TestSDKFallbacks — spec 9.1-9.4 nested/raw ARM-style fallback paths +# --------------------------------------------------------------------------- + + +class TestSDKFallbacks: + """ + Each test removes the SDK-level attribute (sets to None) and asserts that + the implementation falls back to lb.properties.* / rule.properties.* / + pool.properties.* as required by spec 9.1-9.4. 
+ """ + + # -- Gap 1: Provisioning state (spec 9.1) -- + + def test_provisioning_state_from_nested_properties_emits(self): + """lb.provisioning_state absent → lb.properties.provisioning_state='Succeeded' used.""" + pool = _make_pool("p1") + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")]) + lb.provisioning_state = None + lb.properties = SimpleNamespace(provisioning_state="Succeeded") + assert len(_run([lb])) == 1 + + def test_provisioning_state_nested_not_succeeded_skips(self): + """lb.provisioning_state absent, nested is 'Creating' → skip.""" + pool = _make_pool("p1") + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")]) + lb.provisioning_state = None + lb.properties = SimpleNamespace(provisioning_state="Creating") + assert _run([lb]) == [] + + def test_provisioning_state_both_absent_skips(self): + """Neither lb.provisioning_state nor lb.properties.provisioning_state → skip.""" + pool = _make_pool("p1") + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")]) + lb.provisioning_state = None + lb.properties = SimpleNamespace() # no provisioning_state attribute + assert _run([lb]) == [] + + # -- Gap 2: Billable rule collections (spec 9.2) -- + + def test_lb_rules_from_nested_properties(self): + """lb.load_balancing_rules absent → lb.properties.load_balancing_rules used.""" + pool = _make_pool("p1") + rule = _make_lb_rule("p1") + lb = _make_lb(pools=[pool]) + lb.load_balancing_rules = None + lb.properties = SimpleNamespace( + load_balancing_rules=[rule], + outbound_rules=[], + ) + assert len(_run([lb])) == 1 + + def test_outbound_rules_from_nested_properties(self): + """lb.outbound_rules absent → lb.properties.outbound_rules used.""" + pool = _make_pool("p1") + rule = _make_outbound_rule("p1") + lb = _make_lb(pools=[pool]) + lb.outbound_rules = None + lb.properties = SimpleNamespace( + load_balancing_rules=[], + outbound_rules=[rule], + ) + assert len(_run([lb])) == 1 + + def test_no_billable_rules_in_nested_skips(self): + """Both rule 
collections absent at SDK and nested level → skip (count = 0).""" + pool = _make_pool("p1") + lb = _make_lb(pools=[pool]) + lb.load_balancing_rules = None + lb.outbound_rules = None + lb.properties = SimpleNamespace(load_balancing_rules=[], outbound_rules=[]) + assert _run([lb]) == [] + + # -- Gap 3: Rule-to-backend reference (spec 9.3) -- + + def test_single_pool_ref_from_rule_properties(self): + """rule.backend_address_pool absent → rule.properties.backend_address_pool used.""" + pool = _make_pool("p1") + rule = SimpleNamespace( + backend_address_pool=None, + backend_address_pools=[], + properties=SimpleNamespace( + backend_address_pool=_pool_ref("p1"), + backend_address_pools=[], + ), + ) + lb = _make_lb(pools=[pool], lb_rules=[rule]) + assert len(_run([lb])) == 1 + + def test_multi_pool_refs_from_rule_properties(self): + """rule.backend_address_pools absent → rule.properties.backend_address_pools used.""" + pool = _make_pool("p1") + rule = SimpleNamespace( + backend_address_pool=None, + backend_address_pools=None, + properties=SimpleNamespace( + backend_address_pool=None, + backend_address_pools=[_pool_ref("p1")], + ), + ) + lb = _make_lb(pools=[pool], lb_rules=[rule]) + assert len(_run([lb])) == 1 + + def test_rule_nested_ref_with_no_id_skips(self): + """Nested pool reference object present but id is None → unresolvable → skip.""" + pool = _make_pool("p1") + rule = SimpleNamespace( + backend_address_pool=None, + backend_address_pools=[], + properties=SimpleNamespace( + backend_address_pool=SimpleNamespace(id=None), + backend_address_pools=[], + ), + ) + lb = _make_lb(pools=[pool], lb_rules=[rule]) + assert _run([lb]) == [] + + # -- Gap 4a: Pool membership (spec 9.4) -- + + def test_nic_members_from_pool_properties(self): + """pool.backend_ip_configurations absent → pool.properties.backend_ip_configurations used.""" + pool = SimpleNamespace( + id=_pool_arm_id("p1"), + backend_ip_configurations=None, + load_balancer_backend_addresses=None, + 
properties=SimpleNamespace( + backend_ip_configurations=[{"id": "nic-1"}], + load_balancer_backend_addresses=None, + ), + ) + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")]) + assert _run([lb]) == [] # members detected via fallback → skip + + def test_ip_members_from_pool_properties(self): + """pool.load_balancer_backend_addresses absent → pool.properties.* used.""" + pool = SimpleNamespace( + id=_pool_arm_id("p1"), + backend_ip_configurations=None, + load_balancer_backend_addresses=None, + properties=SimpleNamespace( + backend_ip_configurations=None, + load_balancer_backend_addresses=[SimpleNamespace(ip_address="10.0.0.1")], + ), + ) + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")]) + assert _run([lb]) == [] # members detected via fallback → skip + + def test_pool_nested_both_empty_still_emits(self): + """pool.properties has both membership attrs, both empty → pool is empty → emit.""" + pool = SimpleNamespace( + id=_pool_arm_id("p1"), + backend_ip_configurations=None, + load_balancer_backend_addresses=None, + properties=SimpleNamespace( + backend_ip_configurations=[], + load_balancer_backend_addresses=[], + ), + ) + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")]) + assert len(_run([lb])) == 1 + + # -- Gap 4b: Pool inventory (spec 9.3 / spec 6) -- + + def test_pool_inventory_from_lb_nested_properties(self): + """lb.backend_address_pools absent → lb.properties.backend_address_pools used.""" + pool = _make_pool("p1") + rule = _make_lb_rule("p1") + lb = _make_lb(lb_rules=[rule]) + lb.backend_address_pools = None # make SDK attribute absent + lb.properties = SimpleNamespace(backend_address_pools=[pool]) + assert len(_run([lb])) == 1 + + def test_pool_inventory_from_nested_has_members_skips(self): + """Pool found via nested fallback and it has members → skip.""" + pool = SimpleNamespace( + id=_pool_arm_id("p1"), + backend_ip_configurations=[{"id": "nic-1"}], + load_balancer_backend_addresses=None, + ) + rule = _make_lb_rule("p1") + lb 
= _make_lb(lb_rules=[rule]) + lb.backend_address_pools = None + lb.properties = SimpleNamespace(backend_address_pools=[pool]) + assert _run([lb]) == [] + + +class TestArmCamelCaseFallbacks: + """ + Verify that each resolver falls back to ARM camelCase field names when + both the SDK projection and the nested snake_case field are absent. + """ + + # -- provisioningState -- + + def test_provisioning_state_camel_case_succeeded_emits(self): + """lb.provisioningState and lb.properties.provisioning_state absent → lb.properties.provisioningState used.""" + pool = _make_pool("p1") + rule = _make_lb_rule("p1") + lb = _make_lb(pools=[pool], lb_rules=[rule]) + lb.provisioning_state = None + lb.properties = SimpleNamespace(provisioning_state=None, provisioningState="Succeeded") + assert len(_run([lb])) == 1 + + def test_provisioning_state_camel_case_not_succeeded_skips(self): + """lb.properties.provisioningState='Updating' → skip.""" + pool = _make_pool("p1") + rule = _make_lb_rule("p1") + lb = _make_lb(pools=[pool], lb_rules=[rule]) + lb.provisioning_state = None + lb.properties = SimpleNamespace(provisioning_state=None, provisioningState="Updating") + assert _run([lb]) == [] + + # -- loadBalancingRules -- + + def test_lb_rules_from_camel_case_nested(self): + """lb.load_balancing_rules absent and lb.properties.load_balancing_rules absent → lb.properties.loadBalancingRules used.""" + pool = _make_pool("p1") + rule = _make_lb_rule("p1") + lb = _make_lb(pools=[pool], lb_rules=[rule]) + lb.load_balancing_rules = None + lb.properties = SimpleNamespace(load_balancing_rules=None, loadBalancingRules=[rule]) + assert len(_run([lb])) == 1 + + # -- outboundRules -- + + def test_outbound_rules_from_camel_case_nested(self): + """lb.outbound_rules absent and lb.properties.outbound_rules absent → lb.properties.outboundRules used.""" + pool = _make_pool("p1") + rule = _make_outbound_rule("p1") + lb = _make_lb(pools=[pool], outbound_rules=[rule]) + lb.outbound_rules = None + lb.properties = 
SimpleNamespace(outbound_rules=None, outboundRules=[rule]) + assert len(_run([lb])) == 1 + + # -- backendAddressPools (inventory) -- + + def test_pool_inventory_from_camel_case_nested(self): + """lb.backend_address_pools absent and lb.properties.backend_address_pools absent → lb.properties.backendAddressPools used.""" + pool = _make_pool("p1") + rule = _make_lb_rule("p1") + lb = _make_lb(lb_rules=[rule]) + lb.backend_address_pools = None + lb.properties = SimpleNamespace(backend_address_pools=None, backendAddressPools=[pool]) + assert len(_run([lb])) == 1 + + # -- backendAddressPool (single rule ref) -- + + def test_single_pool_ref_from_camel_case_rule_properties(self): + """rule.backend_address_pool absent and rule.properties.backend_address_pool absent → rule.properties.backendAddressPool used.""" + pool = _make_pool("p1") + ref = _pool_ref("p1") + rule = SimpleNamespace( + backend_address_pool=None, + backend_address_pools=[], + properties=SimpleNamespace( + backend_address_pool=None, + backendAddressPool=ref, + backend_address_pools=None, + backendAddressPools=None, + ), + ) + lb = _make_lb(pools=[pool], lb_rules=[rule]) + assert len(_run([lb])) == 1 + + # -- backendAddressPools (multi rule refs) -- + + def test_multi_pool_refs_from_camel_case_rule_properties(self): + """rule.backend_address_pools absent and rule.properties.backend_address_pools absent → rule.properties.backendAddressPools used.""" + pool = _make_pool("p1") + ref = _pool_ref("p1") + rule = SimpleNamespace( + backend_address_pool=None, + backend_address_pools=None, + properties=SimpleNamespace( + backend_address_pool=None, + backendAddressPool=None, + backend_address_pools=None, + backendAddressPools=[ref], + ), + ) + lb = _make_lb(pools=[pool], lb_rules=[rule]) + assert len(_run([lb])) == 1 + + # -- backendIpConfigurations (NIC membership) -- + + def test_nic_members_from_camel_case_pool_properties(self): + """pool.backend_ip_configurations absent and pool.properties.backend_ip_configurations 
absent → pool.properties.backendIpConfigurations used.""" + pool = SimpleNamespace( + id=_pool_arm_id("p1"), + backend_ip_configurations=None, + load_balancer_backend_addresses=None, + properties=SimpleNamespace( + backend_ip_configurations=None, + backendIpConfigurations=[{"id": "nic-1"}], + load_balancer_backend_addresses=None, + loadBalancerBackendAddresses=None, + ), + ) + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")]) + assert _run([lb]) == [] # members found via camelCase → skip + + # -- loadBalancerBackendAddresses (IP membership) -- + + def test_ip_members_from_camel_case_pool_properties(self): + """pool.load_balancer_backend_addresses absent and pool.properties.load_balancer_backend_addresses absent → pool.properties.loadBalancerBackendAddresses used.""" + pool = SimpleNamespace( + id=_pool_arm_id("p1"), + backend_ip_configurations=None, + load_balancer_backend_addresses=None, + properties=SimpleNamespace( + backend_ip_configurations=None, + backendIpConfigurations=None, + load_balancer_backend_addresses=None, + loadBalancerBackendAddresses=[SimpleNamespace(ip_address="10.0.0.2")], + ), + ) + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")]) + assert _run([lb]) == [] # members found via camelCase → skip + + def test_camel_case_pool_both_empty_still_emits(self): + """pool.properties has camelCase membership attrs, both empty → pool is empty → emit.""" + pool = SimpleNamespace( + id=_pool_arm_id("p1"), + backend_ip_configurations=None, + load_balancer_backend_addresses=None, + properties=SimpleNamespace( + backend_ip_configurations=None, + backendIpConfigurations=[], + load_balancer_backend_addresses=None, + loadBalancerBackendAddresses=[], + ), + ) + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")]) + assert len(_run([lb])) == 1 + + +class TestRobustnessAndFrontendDetails: + """ + Covers the two minor concerns: + 1. frontend_ip_count uses nested/raw fallback (detail field consistency). + 2. 
Non-iterable rule collections are skipped, not raised. + """ + + # -- Concern 1: frontend_ip_count fallback -- + + def test_frontend_count_from_nested_snake_case(self): + """lb.frontend_ip_configurations absent → lb.properties.frontend_ip_configurations used.""" + pool = _make_pool("p1") + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")]) + lb.frontend_ip_configurations = None + lb.properties = SimpleNamespace( + frontend_ip_configurations=[SimpleNamespace(id="fe-1"), SimpleNamespace(id="fe-2")], + frontendIPConfigurations=None, + ) + findings = _run([lb]) + assert len(findings) == 1 + assert findings[0].details["frontend_ip_count"] == 2 + + def test_frontend_count_from_nested_camel_case(self): + """Both SDK and snake_case nested absent → lb.properties.frontendIPConfigurations used.""" + pool = _make_pool("p1") + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")]) + lb.frontend_ip_configurations = None + lb.properties = SimpleNamespace( + frontend_ip_configurations=None, + frontendIPConfigurations=[SimpleNamespace(id="fe-1")], + ) + findings = _run([lb]) + assert len(findings) == 1 + assert findings[0].details["frontend_ip_count"] == 1 + + def test_frontend_count_zero_when_all_absent(self): + """All three sources absent → frontend_ip_count is 0, not an error.""" + pool = _make_pool("p1") + lb = _make_lb(pools=[pool], lb_rules=[_make_lb_rule("p1")]) + lb.frontend_ip_configurations = None + lb.properties = SimpleNamespace( + frontend_ip_configurations=None, + frontendIPConfigurations=None, + ) + findings = _run([lb]) + assert len(findings) == 1 + assert findings[0].details["frontend_ip_count"] == 0 + + # -- Concern 2: non-iterable rule collections don't raise -- + + def test_non_iterable_lb_rules_treated_as_no_rules(self): + """lb.load_balancing_rules is a truthy non-iterable object → treated as empty → skip (no billable rules).""" + pool = _make_pool("p1") + lb = _make_lb(pools=[pool]) + lb.load_balancing_rules = object() # truthy, non-iterable + 
lb.outbound_rules = [] + # No billable rules resolved → must skip, not raise + assert _run([lb]) == [] + + def test_non_iterable_outbound_rules_treated_as_no_rules(self): + """lb.outbound_rules is a truthy non-iterable object → treated as empty → skip.""" + pool = _make_pool("p1") + lb = _make_lb(pools=[pool]) + lb.load_balancing_rules = [] + lb.outbound_rules = object() # truthy, non-iterable + assert _run([lb]) == [] + + def test_non_iterable_backend_pools_treated_as_empty_inventory(self): + """lb.backend_address_pools is a truthy non-iterable → pool inventory is empty → unresolvable reference → skip.""" + rule = _make_lb_rule("p1") + lb = _make_lb(lb_rules=[rule]) + lb.backend_address_pools = object() # truthy, non-iterable + assert _run([lb]) == [] diff --git a/tests/cleancloud/providers/azure/test_azure_public_ip_unused.py b/tests/cleancloud/providers/azure/test_azure_public_ip_unused.py index 5493af4..85df51c 100644 --- a/tests/cleancloud/providers/azure/test_azure_public_ip_unused.py +++ b/tests/cleancloud/providers/azure/test_azure_public_ip_unused.py @@ -1,45 +1,511 @@ +""" +Tests for azure.network.public_ip.unused — spec-aligned. + +Covers: must-emit, must-skip, attachment contract (all 4 linkages), + dynamic-placeholder contract, provisioning-state contract, + finding shape, evidence contract, region filter, failure behavior, + SDK-first / nested-fallback / ARM camelCase fallbacks. 
+""" + from types import SimpleNamespace import pytest from cleancloud.providers.azure.rules.public_ip_unused import find_unused_public_ips +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- -@pytest.fixture -def mock_network_client(mocker): - pip_old = SimpleNamespace( - id="pip-1", - name="pip-unused", - ip_address="1.2.3.4", - ip_configuration=None, - location="eastus", - tags=None, - ) - pip_in_use = SimpleNamespace( - id="pip-2", - name="pip-used", - ip_address="5.6.7.8", - ip_configuration={"id": "some-config"}, - location="eastus", - tags={"env": "prod"}, - ) +_SUB = "sub-123" +_ARM_ID = ( + "/subscriptions/sub-123/resourceGroups/rg/providers/Microsoft.Network/publicIPAddresses/pip1" +) - client = mocker.MagicMock() - client.public_ip_addresses.list_all.return_value = [pip_old, pip_in_use] - return client +def _ref(arm_id: str = "/some/resource/id"): + """A SubResource-style reference with an id.""" + return SimpleNamespace(id=arm_id) -def test_find_unused_public_ips(mock_network_client): - findings = find_unused_public_ips( - subscription_id="sub-123", - credential=None, - region_filter="eastus", - client=mock_network_client, + +def _make_pip( + name: str = "pip1", + location: str = "eastus", + provisioning_state: str = "Succeeded", + ip_configuration=None, + nat_gateway=None, + service_public_ip_address=None, + linked_public_ip_address=None, + allocation_method: str = "Static", + ip_address: str = "1.2.3.4", + pip_id: str = _ARM_ID, + tags=None, + sku_name: str = "Standard", + ip_version: str = "IPv4", + ip_tags=None, +): + return SimpleNamespace( + id=pip_id, + name=name, + location=location, + provisioning_state=provisioning_state, + ip_configuration=ip_configuration, + nat_gateway=nat_gateway, + service_public_ip_address=service_public_ip_address, + linked_public_ip_address=linked_public_ip_address, + 
public_ip_allocation_method=allocation_method, + ip_address=ip_address, + tags=tags, + sku=SimpleNamespace(name=sku_name), + public_ip_address_version=ip_version, + ip_tags=ip_tags, ) - resource_ids = [f.resource_id for f in findings] - assert "pip-1" in resource_ids - assert "pip-2" not in resource_ids - # Verify cost estimate ($3.60/month for Standard SKU) - finding = [f for f in findings if f.resource_id == "pip-1"][0] - assert finding.estimated_monthly_cost_usd == 3.60 + +def _run(pips): + from unittest.mock import MagicMock + + client = MagicMock() + client.public_ip_addresses.list_all.return_value = pips + return find_unused_public_ips(subscription_id=_SUB, credential=None, client=client) + + +# --------------------------------------------------------------------------- +# Must Emit +# --------------------------------------------------------------------------- + + +class TestMustEmit: + def test_static_with_ip_address_emits(self): + """Static allocation, all linkages absent, ip_address present → emit.""" + assert len(_run([_make_pip()])) == 1 + + def test_dynamic_with_ip_address_emits(self): + """Dynamic allocation with assigned ip_address, all linkages absent → emit.""" + pip = _make_pip(allocation_method="Dynamic", ip_address="2.3.4.5") + assert len(_run([pip])) == 1 + + def test_only_unused_pip_in_mixed_list(self): + """Only the unattached PIP emits from a list containing an attached one.""" + unused = _make_pip(pip_id="/id/pip-unused", name="pip-unused") + used = _make_pip(pip_id="/id/pip-used", name="pip-used", ip_configuration=_ref()) + findings = _run([unused, used]) + assert len(findings) == 1 + assert findings[0].resource_id == "/id/pip-unused" + + +# --------------------------------------------------------------------------- +# Must Skip +# --------------------------------------------------------------------------- + + +class TestMustSkip: + def test_skip_if_id_absent(self): + pip = _make_pip() + pip.id = None + assert _run([pip]) == [] + + def 
test_skip_if_id_empty(self): + pip = _make_pip() + pip.id = "" + assert _run([pip]) == [] + + def test_skip_if_name_absent(self): + pip = _make_pip() + pip.name = None + assert _run([pip]) == [] + + def test_skip_if_name_empty(self): + pip = _make_pip() + pip.name = "" + assert _run([pip]) == [] + + def test_skip_if_provisioning_not_succeeded(self): + assert _run([_make_pip(provisioning_state="Updating")]) == [] + + def test_skip_if_provisioning_failed(self): + assert _run([_make_pip(provisioning_state="Failed")]) == [] + + def test_skip_dynamic_no_ip_address(self): + """Unattached Dynamic PIP with no ip_address → dynamic-placeholder → skip.""" + pip = _make_pip(allocation_method="Dynamic", ip_address=None) + assert _run([pip]) == [] + + def test_skip_dynamic_empty_ip_address(self): + pip = _make_pip(allocation_method="Dynamic", ip_address="") + assert _run([pip]) == [] + + +# --------------------------------------------------------------------------- +# Attachment Contract — all four linkage fields +# --------------------------------------------------------------------------- + + +class TestAttachmentContract: + def test_ip_configuration_present_skips(self): + assert _run([_make_pip(ip_configuration=_ref())]) == [] + + def test_nat_gateway_present_skips(self): + assert _run([_make_pip(nat_gateway=_ref())]) == [] + + def test_service_public_ip_address_present_skips(self): + assert _run([_make_pip(service_public_ip_address=_ref())]) == [] + + def test_linked_public_ip_address_present_skips(self): + assert _run([_make_pip(linked_public_ip_address=_ref())]) == [] + + def test_ref_without_id_is_unresolvable_skips(self): + """A reference object present but id absent → unresolvable linkage → skip.""" + pip = _make_pip(ip_configuration=SimpleNamespace(id=None)) + assert _run([pip]) == [] + + def test_ref_with_empty_id_is_unresolvable_skips(self): + """A reference object present but id empty → unresolvable linkage → skip.""" + pip = 
_make_pip(nat_gateway=SimpleNamespace(id="")) + assert _run([pip]) == [] + + def test_all_four_absent_emits(self): + """All four linkages absent → emit.""" + assert len(_run([_make_pip()])) == 1 + + +# --------------------------------------------------------------------------- +# Dynamic-Placeholder Contract +# --------------------------------------------------------------------------- + + +class TestDynamicPlaceholderContract: + def test_static_no_ip_still_emits(self): + """Static allocation with no ip_address is NOT a dynamic placeholder → emit.""" + pip = _make_pip(allocation_method="Static", ip_address=None) + assert len(_run([pip])) == 1 + + def test_dynamic_with_ip_emits(self): + pip = _make_pip(allocation_method="Dynamic", ip_address="10.0.0.1") + assert len(_run([pip])) == 1 + + def test_dynamic_without_ip_skips(self): + pip = _make_pip(allocation_method="Dynamic", ip_address=None) + assert _run([pip]) == [] + + def test_none_allocation_with_no_ip_still_emits(self): + """allocation_method=None does not trigger the Dynamic check → emit.""" + pip = _make_pip(allocation_method=None, ip_address=None) + assert len(_run([pip])) == 1 + + +# --------------------------------------------------------------------------- +# Finding Shape +# --------------------------------------------------------------------------- + + +class TestFindingShape: + def setup_method(self): + self.finding = _run([_make_pip()])[0] + + def test_provider(self): + assert self.finding.provider == "azure" + + def test_rule_id(self): + assert self.finding.rule_id == "azure.network.public_ip.unused" + + def test_resource_type(self): + assert self.finding.resource_type == "azure.network.public_ip" + + def test_resource_id_is_arm_id(self): + assert self.finding.resource_id == _ARM_ID + + def test_region_is_normalized(self): + pip = _make_pip(location="EastUS") + f = _run([pip])[0] + assert f.region == "eastus" + + def test_cost_is_none(self): + assert self.finding.estimated_monthly_cost_usd is None + 
+ def test_confidence_high(self): + from cleancloud.core.confidence import ConfidenceLevel + + assert self.finding.confidence == ConfidenceLevel.HIGH + + def test_risk_low(self): + from cleancloud.core.risk import RiskLevel + + assert self.finding.risk == RiskLevel.LOW + + def test_details_resource_name(self): + assert self.finding.details["resource_name"] == "pip1" + + def test_details_subscription_id(self): + assert self.finding.details["subscription_id"] == _SUB + + def test_details_allocation_method(self): + assert self.finding.details["allocation_method"] == "Static" + + def test_details_ip_address(self): + assert self.finding.details["ip_address"] == "1.2.3.4" + + def test_details_sku(self): + assert self.finding.details["sku"] == "Standard" + + def test_details_ip_version(self): + assert self.finding.details["ip_version"] == "IPv4" + + def test_details_attached_is_false(self): + assert self.finding.details["attached"] is False + + def test_details_tags_none_becomes_empty_dict(self): + pip = _make_pip(tags=None) + f = _run([pip])[0] + assert f.details["tags"] == {} + + def test_details_tags_preserved(self): + pip = _make_pip(tags={"env": "prod"}) + f = _run([pip])[0] + assert f.details["tags"] == {"env": "prod"} + + +# --------------------------------------------------------------------------- +# Evidence Contract +# --------------------------------------------------------------------------- + + +class TestEvidenceContract: + def setup_method(self): + self.ev = _run([_make_pip()])[0].evidence + + def test_signals_used_provisioning(self): + assert "Provisioning state is Succeeded" in self.ev.signals_used + + def test_signals_used_attachment(self): + assert any("ip_configuration" in s and "nat_gateway" in s for s in self.ev.signals_used) + + def test_signals_used_dynamic_placeholder(self): + assert any("Dynamic-placeholder" in s for s in self.ev.signals_used) + + def test_signals_not_checked_planned(self): + assert any("Planned future association" in s for s in 
self.ev.signals_not_checked) + + def test_signals_not_checked_dns(self): + assert any("DNS" in s for s in self.ev.signals_not_checked) + + def test_signals_not_checked_traffic(self): + assert any("traffic" in s for s in self.ev.signals_not_checked) + + def test_signals_not_checked_billing(self): + assert any( + "billing" in s.lower() or "Azure billing" in s for s in self.ev.signals_not_checked + ) + + def test_time_window_is_none(self): + assert self.ev.time_window is None + + +# --------------------------------------------------------------------------- +# Region Filter +# --------------------------------------------------------------------------- + + +class TestRegionFilter: + def test_exact_match_emits(self): + assert len(_run([_make_pip(location="eastus")])) == 1 + + def test_case_insensitive_filter(self): + from unittest.mock import MagicMock + + client = MagicMock() + client.public_ip_addresses.list_all.return_value = [_make_pip(location="eastus")] + findings = find_unused_public_ips( + subscription_id=_SUB, credential=None, region_filter="EastUS", client=client + ) + assert len(findings) == 1 + + def test_region_mismatch_skips(self): + from unittest.mock import MagicMock + + client = MagicMock() + client.public_ip_addresses.list_all.return_value = [_make_pip(location="westeurope")] + findings = find_unused_public_ips( + subscription_id=_SUB, credential=None, region_filter="eastus", client=client + ) + assert findings == [] + + def test_no_filter_includes_all(self): + pips = [ + _make_pip(pip_id="/id/1", name="p1", location="eastus"), + _make_pip(pip_id="/id/2", name="p2", location="westeurope"), + ] + assert len(_run(pips)) == 2 + + def test_location_stored_lowercase(self): + pip = _make_pip(location="WestEurope") + f = _run([pip])[0] + assert f.region == "westeurope" + + +# --------------------------------------------------------------------------- +# Failure Behavior +# --------------------------------------------------------------------------- + + +class 
TestFailureBehavior: + def test_list_exception_propagates(self): + from unittest.mock import MagicMock + + client = MagicMock() + client.public_ip_addresses.list_all.side_effect = RuntimeError("API error") + with pytest.raises(RuntimeError): + find_unused_public_ips(subscription_id=_SUB, credential=None, client=client) + + def test_malformed_pip_skipped(self): + """PIP missing id is skipped; valid PIP still emits.""" + bad = SimpleNamespace(id=None, name="bad") + good = _make_pip(pip_id="/id/good", name="good") + findings = _run([bad, good]) + assert len(findings) == 1 + assert findings[0].resource_id == "/id/good" + + +# --------------------------------------------------------------------------- +# SDK Fallbacks — nested snake_case +# --------------------------------------------------------------------------- + + +class TestSDKFallbacks: + def test_provisioning_state_from_nested_snake_case_emits(self): + """pip.provisioning_state absent → pip.properties.provisioning_state used.""" + pip = _make_pip() + pip.provisioning_state = None + pip.properties = SimpleNamespace(provisioning_state="Succeeded", provisioningState=None) + assert len(_run([pip])) == 1 + + def test_provisioning_state_nested_not_succeeded_skips(self): + pip = _make_pip() + pip.provisioning_state = None + pip.properties = SimpleNamespace(provisioning_state="Updating", provisioningState=None) + assert _run([pip]) == [] + + def test_provisioning_state_both_absent_skips(self): + pip = _make_pip() + pip.provisioning_state = None + pip.properties = None + assert _run([pip]) == [] + + def test_ip_configuration_from_nested_snake_case_skips(self): + """pip.ip_configuration absent → pip.properties.ipConfiguration used.""" + pip = _make_pip() + pip.ip_configuration = None + pip.properties = SimpleNamespace( + ipConfiguration=_ref(), + natGateway=None, + servicePublicIPAddress=None, + linkedPublicIPAddress=None, + ) + assert _run([pip]) == [] + + def test_nat_gateway_from_nested_arm_skips(self): + pip = 
_make_pip() + pip.nat_gateway = None + pip.properties = SimpleNamespace( + ipConfiguration=None, + natGateway=_ref(), + servicePublicIPAddress=None, + linkedPublicIPAddress=None, + ) + assert _run([pip]) == [] + + def test_service_pip_from_nested_arm_skips(self): + pip = _make_pip() + pip.service_public_ip_address = None + pip.properties = SimpleNamespace( + ipConfiguration=None, + natGateway=None, + servicePublicIPAddress=_ref(), + linkedPublicIPAddress=None, + ) + assert _run([pip]) == [] + + def test_linked_pip_from_nested_arm_skips(self): + pip = _make_pip() + pip.linked_public_ip_address = None + pip.properties = SimpleNamespace( + ipConfiguration=None, + natGateway=None, + servicePublicIPAddress=None, + linkedPublicIPAddress=_ref(), + ) + assert _run([pip]) == [] + + def test_allocation_method_from_nested_snake_case(self): + """pip.public_ip_allocation_method absent → nested snake_case used.""" + pip = _make_pip(allocation_method=None, ip_address=None) + pip.public_ip_allocation_method = None + pip.properties = SimpleNamespace( + public_ip_allocation_method="Dynamic", + publicIPAllocationMethod=None, + ip_address=None, + ipAddress=None, + ) + # Dynamic + no ip_address → skip + assert _run([pip]) == [] + + def test_allocation_method_from_nested_camel_case(self): + """Snake_case also absent → camelCase used.""" + pip = _make_pip(allocation_method=None, ip_address=None) + pip.public_ip_allocation_method = None + pip.properties = SimpleNamespace( + public_ip_allocation_method=None, + publicIPAllocationMethod="Dynamic", + ip_address=None, + ipAddress=None, + ) + assert _run([pip]) == [] + + def test_ip_address_from_nested_snake_case(self): + """pip.ip_address absent → nested snake_case used.""" + pip = _make_pip(allocation_method="Dynamic", ip_address=None) + pip.ip_address = None + pip.properties = SimpleNamespace( + ip_address="10.0.0.1", + ipAddress=None, + public_ip_allocation_method=None, + publicIPAllocationMethod=None, + ) + # Dynamic + ip_address from 
nested → emit + assert len(_run([pip])) == 1 + + def test_ip_address_from_nested_camel_case(self): + pip = _make_pip(allocation_method="Dynamic", ip_address=None) + pip.ip_address = None + pip.properties = SimpleNamespace( + ip_address=None, + ipAddress="10.0.0.2", + public_ip_allocation_method=None, + publicIPAllocationMethod=None, + ) + assert len(_run([pip])) == 1 + + def test_provisioning_state_camel_case_emits(self): + """All snake_case absent → pip.properties.provisioningState used.""" + pip = _make_pip() + pip.provisioning_state = None + pip.properties = SimpleNamespace(provisioning_state=None, provisioningState="Succeeded") + assert len(_run([pip])) == 1 + + def test_provisioning_state_camel_case_not_succeeded_skips(self): + pip = _make_pip() + pip.provisioning_state = None + pip.properties = SimpleNamespace(provisioning_state=None, provisioningState="Deleting") + assert _run([pip]) == [] + + def test_nested_linkage_ref_with_no_id_is_unresolvable_skips(self): + """SDK linkage absent but nested ARM attr has object with no id → unresolvable → skip.""" + pip = _make_pip() + pip.ip_configuration = None + pip.properties = SimpleNamespace( + ipConfiguration=SimpleNamespace(id=None), + natGateway=None, + servicePublicIPAddress=None, + linkedPublicIPAddress=None, + ) + assert _run([pip]) == [] diff --git a/tests/cleancloud/providers/azure/test_azure_sql_database_idle.py b/tests/cleancloud/providers/azure/test_azure_sql_database_idle.py new file mode 100644 index 0000000..f877d1b --- /dev/null +++ b/tests/cleancloud/providers/azure/test_azure_sql_database_idle.py @@ -0,0 +1,942 @@ +""" +Tests for azure.sql.database.idle — spec-aligned. + +Covers: must-emit, must-skip, online-state contract, age contract, + elastic-pool contract, replica/secondary contract (incl. source_database_id + NOT a standalone skip), paused-state contract, metrics contract (incl. 
+ all-None datapoints → zero), finding shape, evidence contract, + region filter, failure behavior, SDK/ARM camelCase fallbacks, + _query_metric and helper unit tests. +""" + +from datetime import datetime, timedelta, timezone +from types import SimpleNamespace +from unittest.mock import MagicMock + +from cleancloud.providers.azure.rules.sql_database_idle import ( + _is_paused, + _is_replica_secondary, + _query_metric, + find_idle_sql_databases, +) + +# --------------------------------------------------------------------------- +# Constants / shared fixtures +# --------------------------------------------------------------------------- + +_SUB = "sub-123" +_IDLE_DAYS = 14 + + +# --------------------------------------------------------------------------- +# Monitor mock sentinels +# --------------------------------------------------------------------------- + +_ABSENT = object() # metric absent from response → unknown → skip +_EMPTY_SERIES = object() # metric present, no data items → unknown → skip +_NONE_DPS = object() # metric present, data items all None → 0.0 (spec 9.6 rule 2) + + +# --------------------------------------------------------------------------- +# Object helpers +# --------------------------------------------------------------------------- + + +def _server_id(name: str = "srv") -> str: + return f"/subscriptions/{_SUB}/resourceGroups/rg1" f"/providers/Microsoft.Sql/servers/{name}" + + +def _db_id(name: str = "mydb", server: str = "srv") -> str: + return ( + f"/subscriptions/{_SUB}/resourceGroups/rg1" + f"/providers/Microsoft.Sql/servers/{server}/databases/{name}" + ) + + +def _old_enough(days: int = 30) -> datetime: + return datetime.now(timezone.utc) - timedelta(days=days) + + +def _too_young(days: int = 5) -> datetime: + return datetime.now(timezone.utc) - timedelta(days=days) + + +def _make_server(name: str = "srv", location: str = "eastus") -> SimpleNamespace: + return SimpleNamespace(id=_server_id(name), name=name, location=location) + + +def 
_make_db( + name: str = "mydb", + location: str = "eastus", + status: str = "Online", + creation_date=None, + elastic_pool_id=None, + secondary_type=None, + source_database_id=None, + paused_date=None, + resumed_date=None, + sku=None, + tags=None, + server: str = "srv", + **extra, +) -> SimpleNamespace: + cd = creation_date if creation_date is not None else _old_enough() + ns = SimpleNamespace( + id=_db_id(name, server), + name=name, + location=location, + status=status, + creation_date=cd, + elastic_pool_id=elastic_pool_id, + secondary_type=secondary_type, + source_database_id=source_database_id, + paused_date=paused_date, + resumed_date=resumed_date, + sku=sku, + tags=tags, + current_service_objective_name=None, + auto_pause_delay=None, + properties=None, + ) + for k, v in extra.items(): + setattr(ns, k, v) + return ns + + +# --------------------------------------------------------------------------- +# Monitor client mock +# --------------------------------------------------------------------------- + + +def _make_monitor_client(overrides: dict = None, raise_on: str = None): + """ + Build a mock MonitorManagementClient. + + overrides — per-metric value: + float regular datapoint value (0.0 = zero, > 0 = active) + _ABSENT metric absent from response → unknown → skip + _EMPTY_SERIES metric present, series has no data items → unknown → skip + _NONE_DPS metric present, data items exist but all aggregated values None + → usable series, confirmed zero (spec 9.6 rule 2) → 0.0 + + Any metric not listed in overrides defaults to 0.0 (confirmed zero). + raise_on — metric name that causes the list() call to raise. 
+ """ + defaults = { + "connection_successful": 0.0, + "sessions_count": 0.0, + "cpu_percent": 0.0, + "physical_data_read_percent": 0.0, + "log_write_percent": 0.0, + } + spec = {**defaults, **(overrides or {})} + + def _mock_list(resource_uri, metricnames, aggregation="Total", **kwargs): + if raise_on and metricnames == raise_on: + raise RuntimeError("Monitor unavailable") + + val = spec.get(metricnames, 0.0) + agg_attr = "total" if aggregation == "Total" else "maximum" + + if val is _ABSENT: + return SimpleNamespace(value=[]) + + if val is _EMPTY_SERIES: + ts = SimpleNamespace(data=[]) + metric = SimpleNamespace(name=SimpleNamespace(value=metricnames), timeseries=[ts]) + return SimpleNamespace(value=[metric]) + + if val is _NONE_DPS: + dp = SimpleNamespace(**{agg_attr: None, "total": None, "maximum": None}) + ts = SimpleNamespace(data=[dp]) + metric = SimpleNamespace(name=SimpleNamespace(value=metricnames), timeseries=[ts]) + return SimpleNamespace(value=[metric]) + + # Regular numeric value + dp = SimpleNamespace(**{agg_attr: val, "total": val, "maximum": val}) + ts = SimpleNamespace(data=[dp]) + metric = SimpleNamespace(name=SimpleNamespace(value=metricnames), timeseries=[ts]) + return SimpleNamespace(value=[metric]) + + mon = MagicMock() + mon.metrics.list.side_effect = _mock_list + return mon + + +# --------------------------------------------------------------------------- +# SQL client mock + run helper +# --------------------------------------------------------------------------- + + +def _make_sql_client(server=None, dbs=None, db_list_raises: bool = False): + sql = MagicMock() + sql.servers.list.return_value = [server or _make_server()] + if db_list_raises: + sql.databases.list_by_server.side_effect = Exception("listing failed") + else: + sql.databases.list_by_server.return_value = dbs or [] + return sql + + +def _run( + dbs, + server=None, + region_filter=None, + idle_days: int = _IDLE_DAYS, + monitor=None, + db_list_raises: bool = False, +): + sql = 
_make_sql_client(server=server, dbs=dbs, db_list_raises=db_list_raises) + mon = monitor or _make_monitor_client() + return find_idle_sql_databases( + subscription_id=_SUB, + credential=None, + region_filter=region_filter, + client=sql, + monitor_client=mon, + idle_days=idle_days, + ) + + +# =========================================================================== +# TestMustEmit — spec §13.1 +# =========================================================================== + + +class TestMustEmit: + def test_fully_qualifying_database_emits(self): + db = _make_db() + assert len(_run([db])) == 1 + + def test_all_five_metrics_zero_emits(self): + db = _make_db() + mon = _make_monitor_client() # all zero by default + assert len(_run([db], monitor=mon)) == 1 + + def test_multiple_qualifying_databases_all_emit(self): + dbs = [_make_db(name=f"db{i}") for i in range(3)] + assert len(_run(dbs)) == 3 + + +# =========================================================================== +# TestMustSkip — spec §13.2 +# =========================================================================== + + +class TestMustSkip: + def test_master_skips(self): + assert _run([_make_db(name="master")]) == [] + + def test_elastic_pool_skips(self): + db = _make_db( + elastic_pool_id=( + "/subscriptions/sub/resourceGroups/rg" + "/providers/Microsoft.Sql/servers/srv/elasticPools/pool1" + ) + ) + assert _run([db]) == [] + + def test_status_not_online_skips(self): + assert _run([_make_db(status="Offline")]) == [] + + def test_paused_status_skips(self): + assert _run([_make_db(status="Paused")]) == [] + + def test_secondary_type_skips(self): + assert _run([_make_db(secondary_type="Geo")]) == [] + + def test_younger_than_idle_days_skips(self): + assert _run([_make_db(creation_date=_too_young(5))]) == [] + + def test_connections_zero_sessions_count_nonzero_skips(self): + db = _make_db() + mon = _make_monitor_client(overrides={"sessions_count": 1.0}) + assert _run([db], monitor=mon) == [] + + def 
test_connections_zero_cpu_nonzero_skips(self): + db = _make_db() + mon = _make_monitor_client(overrides={"cpu_percent": 5.0}) + assert _run([db], monitor=mon) == [] + + def test_metric_query_fails_skips(self): + db = _make_db() + mon = _make_monitor_client(overrides={"connection_successful": _ABSENT}) + assert _run([db], monitor=mon) == [] + + +# =========================================================================== +# TestOnlineStateContract — spec §9.1 +# =========================================================================== + + +class TestOnlineStateContract: + def test_online_emits(self): + assert len(_run([_make_db(status="Online")])) == 1 + + def test_offline_skips(self): + assert _run([_make_db(status="Offline")]) == [] + + def test_paused_skips(self): + assert _run([_make_db(status="Paused")]) == [] + + def test_creating_skips(self): + assert _run([_make_db(status="Creating")]) == [] + + def test_status_none_skips(self): + db = _make_db() + db.status = None + assert _run([db]) == [] + + def test_nested_snake_case_status_online_emits(self): + db = _make_db() + db.status = None + db.properties = SimpleNamespace(status="Online") + assert len(_run([db])) == 1 + + def test_nested_snake_case_status_offline_skips(self): + db = _make_db() + db.status = None + db.properties = SimpleNamespace(status="Offline") + assert _run([db]) == [] + + +# =========================================================================== +# TestAgeContract — spec §9.2 +# =========================================================================== + + +class TestAgeContract: + def test_old_enough_emits(self): + assert len(_run([_make_db(creation_date=_old_enough(30))])) == 1 + + def test_too_young_skips(self): + assert _run([_make_db(creation_date=_too_young(5))]) == [] + + def test_exactly_idle_days_old_emits(self): + # age_days == idle_days: 14 < 14 is False → proceeds + db = _make_db(creation_date=_old_enough(_IDLE_DAYS)) + assert len(_run([db])) == 1 + + def 
test_creation_date_absent_skips(self): + db = _make_db() + db.creation_date = None + assert _run([db]) == [] + + def test_creation_date_as_iso_string_emits(self): + cd_str = _old_enough(30).strftime("%Y-%m-%dT%H:%M:%SZ") + db = _make_db() + db.creation_date = cd_str + assert len(_run([db])) == 1 + + def test_creation_date_as_iso_string_too_young_skips(self): + cd_str = _too_young(5).strftime("%Y-%m-%dT%H:%M:%SZ") + db = _make_db() + db.creation_date = cd_str + assert _run([db]) == [] + + def test_nested_camel_case_creation_date_emits(self): + db = _make_db() + db.creation_date = None + db.properties = SimpleNamespace(creationDate=_old_enough(30)) + assert len(_run([db])) == 1 + + def test_nested_snake_case_creation_date_emits(self): + db = _make_db() + db.creation_date = None + db.properties = SimpleNamespace(creation_date=_old_enough(30)) + assert len(_run([db])) == 1 + + +# =========================================================================== +# TestElasticPoolContract — spec §9.3 +# =========================================================================== + + +class TestElasticPoolContract: + def test_elastic_pool_id_present_skips(self): + db = _make_db(elastic_pool_id="/subscriptions/sub/pool1") + assert _run([db]) == [] + + def test_elastic_pool_id_none_emits(self): + assert len(_run([_make_db(elastic_pool_id=None)])) == 1 + + def test_elastic_pool_id_empty_string_emits(self): + assert len(_run([_make_db(elastic_pool_id="")])) == 1 + + def test_nested_camel_case_elastic_pool_id_skips(self): + db = _make_db() + db.elastic_pool_id = None + db.properties = SimpleNamespace(elasticPoolId="/subscriptions/sub/pool1") + assert _run([db]) == [] + + def test_nested_snake_case_elastic_pool_id_skips(self): + db = _make_db() + db.elastic_pool_id = None + db.properties = SimpleNamespace(elastic_pool_id="/subscriptions/sub/pool1") + assert _run([db]) == [] + + +# =========================================================================== +# 
TestReplicaSecondaryContract — spec §9.4 +# =========================================================================== + + +class TestReplicaSecondaryContract: + def test_secondary_type_geo_skips(self): + assert _run([_make_db(secondary_type="Geo")]) == [] + + def test_secondary_type_named_skips(self): + assert _run([_make_db(secondary_type="Named")]) == [] + + def test_source_database_id_alone_does_not_skip(self): + """ + spec 9.4: source_database_id alone is NOT a standalone skip signal. + It must be paired with secondary/replica-shaped control-plane context. + A restore copy has source_database_id but is not a replica. + """ + db = _make_db(source_database_id="/subscriptions/sub/databases/source") + assert len(_run([db])) == 1 + + def test_neither_signal_emits(self): + assert len(_run([_make_db(secondary_type=None, source_database_id=None)])) == 1 + + def test_nested_camel_case_secondary_type_skips(self): + db = _make_db() + db.secondary_type = None + db.properties = SimpleNamespace(secondaryType="Geo") + assert _run([db]) == [] + + def test_nested_snake_case_secondary_type_skips(self): + db = _make_db() + db.secondary_type = None + db.properties = SimpleNamespace(secondary_type="Geo") + assert _run([db]) == [] + + def test_nested_source_database_id_alone_does_not_skip(self): + """Even via nested path, source_database_id alone must not skip.""" + db = _make_db() + db.source_database_id = None + db.properties = SimpleNamespace(sourceDatabaseId="/subscriptions/sub/db/src") + assert len(_run([db])) == 1 + + +# =========================================================================== +# TestPausedStateContract — spec §9.5 +# =========================================================================== + + +class TestPausedStateContract: + def test_status_paused_skips(self): + assert _run([_make_db(status="Paused")]) == [] + + def test_paused_date_without_resumed_date_skips(self): + db = _make_db(paused_date=_old_enough(2)) + assert _run([db]) == [] + + def 
test_paused_date_with_later_resumed_date_does_not_skip(self): + # resumed after pausing → not currently paused + db = _make_db(paused_date=_old_enough(5), resumed_date=_old_enough(1)) + assert len(_run([db])) == 1 + + def test_paused_date_with_earlier_resumed_date_skips(self): + # paused_date > resumed_date → currently paused again + db = _make_db(paused_date=_old_enough(1), resumed_date=_old_enough(5)) + assert _run([db]) == [] + + def test_no_paused_date_does_not_skip(self): + assert len(_run([_make_db(paused_date=None)])) == 1 + + def test_nested_camel_case_paused_date_skips(self): + db = _make_db() + db.paused_date = None + db.resumed_date = None + db.properties = SimpleNamespace(pausedDate=_old_enough(2)) + assert _run([db]) == [] + + def test_nested_camel_case_paused_resumed_pair_does_not_skip(self): + db = _make_db() + db.paused_date = None + db.resumed_date = None + db.properties = SimpleNamespace(pausedDate=_old_enough(5), resumedDate=_old_enough(1)) + assert len(_run([db])) == 1 + + +# =========================================================================== +# TestMetricsContract — spec §9.6 +# =========================================================================== + + +class TestMetricsContract: + def test_all_five_zero_emits(self): + assert len(_run([_make_db()])) == 1 + + def test_connection_successful_nonzero_skips(self): + mon = _make_monitor_client(overrides={"connection_successful": 10.0}) + assert _run([_make_db()], monitor=mon) == [] + + def test_sessions_count_nonzero_skips(self): + mon = _make_monitor_client(overrides={"sessions_count": 2.0}) + assert _run([_make_db()], monitor=mon) == [] + + def test_cpu_percent_nonzero_skips(self): + mon = _make_monitor_client(overrides={"cpu_percent": 0.5}) + assert _run([_make_db()], monitor=mon) == [] + + def test_physical_data_read_nonzero_skips(self): + mon = _make_monitor_client(overrides={"physical_data_read_percent": 10.0}) + assert _run([_make_db()], monitor=mon) == [] + + def 
test_log_write_nonzero_skips(self): + mon = _make_monitor_client(overrides={"log_write_percent": 3.0}) + assert _run([_make_db()], monitor=mon) == [] + + def test_metric_absent_from_response_skips(self): + """Metric absent from response → unknown → skip (spec 9.6 rule 3).""" + mon = _make_monitor_client(overrides={"connection_successful": _ABSENT}) + assert _run([_make_db()], monitor=mon) == [] + + def test_series_with_no_data_items_skips(self): + """Metric present, series has no data items → unusable → skip (spec 9.6 rule 4).""" + mon = _make_monitor_client(overrides={"cpu_percent": _EMPTY_SERIES}) + assert _run([_make_db()], monitor=mon) == [] + + def test_series_with_all_none_datapoints_is_confirmed_zero_emits(self): + """ + Metric present, data items exist but all aggregated values are None → + usable series, all datapoints 0-or-None → confirmed zero (spec 9.6 rule 2) → emit. + """ + mon = _make_monitor_client(overrides={"sessions_count": _NONE_DPS}) + assert len(_run([_make_db()], monitor=mon)) == 1 + + def test_all_none_datapoints_on_all_metrics_emits(self): + """All five metrics returning None datapoints still counts as zero → emit.""" + mon = _make_monitor_client( + overrides={ + m: _NONE_DPS + for m in [ + "connection_successful", + "sessions_count", + "cpu_percent", + "physical_data_read_percent", + "log_write_percent", + ] + } + ) + assert len(_run([_make_db()], monitor=mon)) == 1 + + def test_metric_query_exception_skips(self): + mon = _make_monitor_client(raise_on="log_write_percent") + assert _run([_make_db()], monitor=mon) == [] + + def test_second_metric_fails_skips_db(self): + mon = _make_monitor_client(raise_on="sessions_count") + assert _run([_make_db()], monitor=mon) == [] + + +# =========================================================================== +# TestFindingShape — spec §11 +# =========================================================================== + + +class TestFindingShape: + def _finding(self, **db_kwargs): + db = 
_make_db(**db_kwargs) + findings = _run([db]) + assert len(findings) == 1 + return findings[0] + + def test_provider_is_azure(self): + assert self._finding().provider == "azure" + + def test_rule_id(self): + assert self._finding().rule_id == "azure.sql.database.idle" + + def test_resource_type(self): + assert self._finding().resource_type == "azure.sql.database" + + def test_resource_id_is_database_arm_id(self): + db = _make_db() + findings = _run([db]) + assert findings[0].resource_id == db.id + + def test_region_is_normalized_lowercase(self): + db = _make_db(location="East US") + findings = _run([db]) + assert findings[0].region == "east us" + + def test_estimated_monthly_cost_is_none(self): + assert self._finding().estimated_monthly_cost_usd is None + + def test_risk_is_high(self): + from cleancloud.core.risk import RiskLevel + + assert self._finding().risk == RiskLevel.HIGH + + def test_confidence_is_high(self): + from cleancloud.core.confidence import ConfidenceLevel + + assert self._finding().confidence == ConfidenceLevel.HIGH + + def test_details_has_all_required_keys(self): + required = { + "database_name", + "server_name", + "status", + "current_service_objective_name", + "sku_tier", + "elastic_pool_id", + "auto_pause_delay", + "paused_date", + "creation_date", + "idle_days", + "connection_successful", + "sessions_count", + "cpu_percent", + "physical_data_read_percent", + "log_write_percent", + "tags", + } + assert required <= set(self._finding().details.keys()) + + def test_details_database_name(self): + assert self._finding(name="proddb").details["database_name"] == "proddb" + + def test_details_all_metric_values_zero(self): + d = self._finding().details + assert d["connection_successful"] == 0.0 + assert d["sessions_count"] == 0.0 + assert d["cpu_percent"] == 0.0 + assert d["physical_data_read_percent"] == 0.0 + assert d["log_write_percent"] == 0.0 + + def test_details_idle_days_reflects_param(self): + db = _make_db() + findings = _run([db], 
idle_days=7) + assert findings[0].details["idle_days"] == 7 + + def test_tags_defaults_to_empty_dict_when_absent(self): + assert self._finding(tags=None).details["tags"] == {} + + def test_tags_preserved_when_set(self): + assert self._finding(tags={"env": "prod"}).details["tags"] == {"env": "prod"} + + def test_evidence_signals_used_count_is_ten(self): + assert len(self._finding().evidence.signals_used) == 10 + + def test_evidence_signals_not_checked_count_is_four(self): + assert len(self._finding().evidence.signals_not_checked) == 4 + + def test_evidence_time_window_reflects_idle_days(self): + db = _make_db() + findings = _run([db], idle_days=7) + assert findings[0].evidence.time_window == "7 days" + + def test_evidence_signals_include_all_five_metrics(self): + signals = self._finding().evidence.signals_used + assert any("connection_successful" in s for s in signals) + assert any("sessions_count" in s for s in signals) + assert any("cpu_percent" in s for s in signals) + assert any("physical_data_read_percent" in s for s in signals) + assert any("log_write_percent" in s for s in signals) + + +# =========================================================================== +# TestRegionFilter — spec §8.3 +# =========================================================================== + + +class TestRegionFilter: + def test_matching_region_emits(self): + db = _make_db(location="eastus") + server = _make_server(location="eastus") + assert len(_run([db], server=server, region_filter="eastus")) == 1 + + def test_non_matching_region_skips(self): + db = _make_db(location="westus") + server = _make_server(location="westus") + assert _run([db], server=server, region_filter="eastus") == [] + + def test_no_filter_emits_all(self): + dbs = [_make_db(name=f"db{i}") for i in range(3)] + assert len(_run(dbs)) == 3 + + def test_region_filter_case_insensitive(self): + db = _make_db(location="eastus") + server = _make_server(location="eastus") + assert len(_run([db], server=server, 
region_filter="EastUS")) == 1 + + def test_server_level_prefilter_skips_all_dbs_on_mismatched_server(self): + db = _make_db(location="westus") + server = _make_server(location="westus") + assert _run([db], server=server, region_filter="northeurope") == [] + + +# =========================================================================== +# TestFailureBehavior — spec §12 +# =========================================================================== + + +class TestFailureBehavior: + def test_db_listing_fails_skips_server(self): + """Per spec 12: per-server listing failure → skip server, not propagate.""" + assert _run([_make_db()], db_list_raises=True) == [] + + def test_metric_exception_skips_db(self): + """Per spec 12: metric query failure → skip database.""" + mon = _make_monitor_client(raise_on="cpu_percent") + assert _run([_make_db()], monitor=mon) == [] + + def test_db_with_no_id_skips(self): + db = _make_db() + db.id = None + assert _run([db]) == [] + + def test_db_with_empty_id_skips(self): + db = _make_db() + db.id = "" + assert _run([db]) == [] + + def test_db_with_no_name_skips(self): + db = _make_db() + db.name = None + assert _run([db]) == [] + + def test_db_with_empty_name_skips(self): + db = _make_db() + db.name = "" + assert _run([db]) == [] + + +# =========================================================================== +# TestSDKFallbacks — SDK-first / nested snake_case / ARM camelCase +# =========================================================================== + + +class TestSDKFallbacks: + def test_status_via_nested_snake_case_emits(self): + db = _make_db() + db.status = None + db.properties = SimpleNamespace(status="Online") + assert len(_run([db])) == 1 + + def test_creation_date_via_nested_camel_case_emits(self): + db = _make_db() + db.creation_date = None + db.properties = SimpleNamespace(creationDate=_old_enough(30)) + assert len(_run([db])) == 1 + + def test_creation_date_via_nested_snake_case_emits(self): + db = _make_db() + 
db.creation_date = None + db.properties = SimpleNamespace(creation_date=_old_enough(30)) + assert len(_run([db])) == 1 + + def test_elastic_pool_id_via_nested_camel_case_skips(self): + db = _make_db() + db.elastic_pool_id = None + db.properties = SimpleNamespace(elasticPoolId="/subscriptions/sub/pool1") + assert _run([db]) == [] + + def test_secondary_type_via_nested_camel_case_skips(self): + db = _make_db() + db.secondary_type = None + db.properties = SimpleNamespace(secondaryType="Geo") + assert _run([db]) == [] + + def test_paused_date_via_nested_camel_case_skips(self): + db = _make_db() + db.paused_date = None + db.resumed_date = None + db.properties = SimpleNamespace(pausedDate=_old_enough(2)) + assert _run([db]) == [] + + def test_paused_and_resumed_via_nested_camel_case_does_not_skip(self): + db = _make_db() + db.paused_date = None + db.resumed_date = None + db.properties = SimpleNamespace(pausedDate=_old_enough(5), resumedDate=_old_enough(1)) + assert len(_run([db])) == 1 + + def test_source_database_id_via_nested_camel_case_does_not_skip(self): + """source_database_id alone is not a skip signal even via nested path.""" + db = _make_db() + db.source_database_id = None + db.properties = SimpleNamespace(sourceDatabaseId="/subscriptions/sub/db/src") + assert len(_run([db])) == 1 + + +# =========================================================================== +# Unit tests — _query_metric +# =========================================================================== + + +def _now(): + return datetime.now(timezone.utc) + + +def _window(): + return _now() - timedelta(days=14), _now() + + +def _mon_returning(response): + mon = MagicMock() + mon.metrics.list.return_value = response + return mon + + +class TestQueryMetric: + def test_metric_absent_from_response_returns_none(self): + mon = _mon_returning(SimpleNamespace(value=[])) + w_start, w_end = _window() + result = _query_metric(mon, "rid", "cpu_percent", "Maximum", "maximum", w_start, w_end) + assert result 
is None + + def test_empty_series_no_data_items_returns_none(self): + ts = SimpleNamespace(data=[]) + metric = SimpleNamespace(name=SimpleNamespace(value="cpu_percent"), timeseries=[ts]) + mon = _mon_returning(SimpleNamespace(value=[metric])) + w_start, w_end = _window() + result = _query_metric(mon, "rid", "cpu_percent", "Maximum", "maximum", w_start, w_end) + assert result is None + + def test_no_timeseries_at_all_returns_none(self): + metric = SimpleNamespace(name=SimpleNamespace(value="cpu_percent"), timeseries=[]) + mon = _mon_returning(SimpleNamespace(value=[metric])) + w_start, w_end = _window() + result = _query_metric(mon, "rid", "cpu_percent", "Maximum", "maximum", w_start, w_end) + assert result is None + + def test_all_none_datapoints_returns_zero(self): + """Usable series with data items but all None → confirmed zero (spec 9.6 rule 2).""" + dp = SimpleNamespace(maximum=None) + ts = SimpleNamespace(data=[dp]) + metric = SimpleNamespace(name=SimpleNamespace(value="cpu_percent"), timeseries=[ts]) + mon = _mon_returning(SimpleNamespace(value=[metric])) + w_start, w_end = _window() + result = _query_metric(mon, "rid", "cpu_percent", "Maximum", "maximum", w_start, w_end) + assert result == 0.0 + + def test_zero_datapoint_returns_zero(self): + dp = SimpleNamespace(maximum=0.0) + ts = SimpleNamespace(data=[dp]) + metric = SimpleNamespace(name=SimpleNamespace(value="cpu_percent"), timeseries=[ts]) + mon = _mon_returning(SimpleNamespace(value=[metric])) + w_start, w_end = _window() + result = _query_metric(mon, "rid", "cpu_percent", "Maximum", "maximum", w_start, w_end) + assert result == 0.0 + + def test_nonzero_datapoint_returns_value(self): + dp = SimpleNamespace(maximum=5.0) + ts = SimpleNamespace(data=[dp]) + metric = SimpleNamespace(name=SimpleNamespace(value="cpu_percent"), timeseries=[ts]) + mon = _mon_returning(SimpleNamespace(value=[metric])) + w_start, w_end = _window() + result = _query_metric(mon, "rid", "cpu_percent", "Maximum", "maximum", w_start, 
w_end) + assert result == 5.0 + + def test_max_of_multiple_datapoints(self): + dps = [ + SimpleNamespace(maximum=1.0), + SimpleNamespace(maximum=5.0), + SimpleNamespace(maximum=2.0), + ] + ts = SimpleNamespace(data=dps) + metric = SimpleNamespace(name=SimpleNamespace(value="cpu_percent"), timeseries=[ts]) + mon = _mon_returning(SimpleNamespace(value=[metric])) + w_start, w_end = _window() + result = _query_metric(mon, "rid", "cpu_percent", "Maximum", "maximum", w_start, w_end) + assert result == 5.0 + + def test_total_aggregation_uses_total_attr(self): + dp = SimpleNamespace(total=42.0, maximum=0.0) + ts = SimpleNamespace(data=[dp]) + metric = SimpleNamespace( + name=SimpleNamespace(value="connection_successful"), timeseries=[ts] + ) + mon = _mon_returning(SimpleNamespace(value=[metric])) + w_start, w_end = _window() + result = _query_metric( + mon, "rid", "connection_successful", "Total", "total", w_start, w_end + ) + assert result == 42.0 + + def test_exception_returns_none(self): + mon = MagicMock() + mon.metrics.list.side_effect = RuntimeError("Network error") + w_start, w_end = _window() + result = _query_metric(mon, "rid", "cpu_percent", "Maximum", "maximum", w_start, w_end) + assert result is None + + def test_metric_name_matched_case_insensitively(self): + """Metric name matching is case-insensitive.""" + dp = SimpleNamespace(maximum=3.0) + ts = SimpleNamespace(data=[dp]) + metric = SimpleNamespace(name=SimpleNamespace(value="CPU_PERCENT"), timeseries=[ts]) + mon = _mon_returning(SimpleNamespace(value=[metric])) + w_start, w_end = _window() + result = _query_metric(mon, "rid", "cpu_percent", "Maximum", "maximum", w_start, w_end) + assert result == 3.0 + + def test_plain_string_metric_name_matched(self): + """Metric name as plain string (not LocalizableString) is handled.""" + dp = SimpleNamespace(maximum=1.0) + ts = SimpleNamespace(data=[dp]) + metric = SimpleNamespace(name="cpu_percent", timeseries=[ts]) + mon = 
_mon_returning(SimpleNamespace(value=[metric])) + w_start, w_end = _window() + result = _query_metric(mon, "rid", "cpu_percent", "Maximum", "maximum", w_start, w_end) + assert result == 1.0 + + +# =========================================================================== +# Unit tests — _is_replica_secondary +# =========================================================================== + + +class TestIsReplicaSecondaryUnit: + def test_secondary_type_non_empty_is_replica(self): + assert _is_replica_secondary(_make_db(secondary_type="Geo")) is True + + def test_secondary_type_named_is_replica(self): + assert _is_replica_secondary(_make_db(secondary_type="Named")) is True + + def test_source_database_id_alone_is_not_replica(self): + """Spec 9.4: source_database_id is not a standalone replica indicator.""" + db = _make_db(source_database_id="/subscriptions/sub/databases/source") + assert _is_replica_secondary(db) is False + + def test_neither_field_is_not_replica(self): + assert _is_replica_secondary(_make_db()) is False + + def test_nested_secondary_type_is_replica(self): + db = _make_db() + db.secondary_type = None + db.properties = SimpleNamespace(secondaryType="Geo") + assert _is_replica_secondary(db) is True + + +# =========================================================================== +# Unit tests — _is_paused +# =========================================================================== + + +class TestIsPausedUnit: + def test_status_paused_is_paused(self): + assert _is_paused(_make_db(status="Paused")) is True + + def test_paused_date_no_resumed_is_paused(self): + assert _is_paused(_make_db(paused_date=_old_enough(2))) is True + + def test_paused_date_with_later_resumed_is_not_paused(self): + db = _make_db(paused_date=_old_enough(5), resumed_date=_old_enough(1)) + assert _is_paused(db) is False + + def test_paused_date_with_earlier_resumed_is_paused(self): + db = _make_db(paused_date=_old_enough(1), resumed_date=_old_enough(5)) + assert _is_paused(db) is 
True + + def test_no_paused_date_is_not_paused(self): + assert _is_paused(_make_db()) is False + + def test_online_status_without_paused_date_is_not_paused(self): + assert _is_paused(_make_db(status="Online")) is False diff --git a/tests/cleancloud/providers/azure/test_azure_sql_idle.py b/tests/cleancloud/providers/azure/test_azure_sql_idle.py deleted file mode 100644 index 3d04bf2..0000000 --- a/tests/cleancloud/providers/azure/test_azure_sql_idle.py +++ /dev/null @@ -1,205 +0,0 @@ -from types import SimpleNamespace - -import pytest - -from cleancloud.providers.azure.rules.sql_database_idle import ( - find_idle_sql_databases, -) - - -def _make_server(name, location="eastus"): - return SimpleNamespace( - id=f"/subscriptions/sub-123/resourceGroups/rg-test/providers/Microsoft.Sql/servers/{name}", - name=name, - location=location, - ) - - -def _make_database( - server_name, - db_name, - sku_name="S0", - sku_tier="Standard", - location="eastus", - max_size_bytes=268435456000, - tags=None, -): - return SimpleNamespace( - id=( - f"/subscriptions/sub-123/resourceGroups/rg-test/providers/Microsoft.Sql/servers" - f"/{server_name}/databases/{db_name}" - ), - name=db_name, - location=location, - sku=SimpleNamespace(name=sku_name, tier=sku_tier), - max_size_bytes=max_size_bytes, - tags=tags, - ) - - -def _make_metric_response(total_value=0): - """Create a mock Azure Monitor metrics response.""" - data_point = SimpleNamespace(total=total_value) - timeseries = SimpleNamespace(data=[data_point]) - metric = SimpleNamespace(timeseries=[timeseries]) - return SimpleNamespace(value=[metric]) - - -@pytest.fixture -def mock_sql_client(mocker): - return mocker.MagicMock() - - -@pytest.fixture -def mock_monitor_client(mocker): - return mocker.MagicMock() - - -def test_idle_db_detected(mock_sql_client, mock_monitor_client): - """Standard tier DB with zero connections should be flagged.""" - server = _make_server("sql-server-1") - db = _make_database("sql-server-1", "app-db", sku_name="S0", 
sku_tier="Standard") - - mock_sql_client.servers.list.return_value = [server] - mock_sql_client.databases.list_by_server.return_value = [db] - mock_monitor_client.metrics.list.return_value = _make_metric_response(total_value=0) - - findings = find_idle_sql_databases( - subscription_id="sub-123", - credential=None, - client=mock_sql_client, - monitor_client=mock_monitor_client, - ) - - assert len(findings) == 1 - finding = findings[0] - assert finding.provider == "azure" - assert finding.rule_id == "azure.sql.database.idle" - assert finding.resource_type == "azure.sql.database" - assert finding.confidence.value == "high" - assert finding.risk.value == "high" - assert finding.details["db_name"] == "app-db" - assert finding.details["server_name"] == "sql-server-1" - assert finding.details["sku_name"] == "S0" - assert finding.details["sku_tier"] == "Standard" - assert finding.details["connections_14d"] == 0 - assert "$15/month" in finding.details["estimated_monthly_cost"] - assert finding.estimated_monthly_cost_usd is not None - assert finding.estimated_monthly_cost_usd > 0 - - -def test_active_db_skipped(mock_sql_client, mock_monitor_client): - """DB with non-zero connections should NOT be flagged.""" - server = _make_server("sql-server-1") - db = _make_database("sql-server-1", "active-db", sku_name="S2", sku_tier="Standard") - - mock_sql_client.servers.list.return_value = [server] - mock_sql_client.databases.list_by_server.return_value = [db] - mock_monitor_client.metrics.list.return_value = _make_metric_response(total_value=42) - - findings = find_idle_sql_databases( - subscription_id="sub-123", - credential=None, - client=mock_sql_client, - monitor_client=mock_monitor_client, - ) - - assert len(findings) == 0 - - -def test_basic_tier_skipped(mock_sql_client, mock_monitor_client): - """Basic tier DBs should NOT be flagged (< $5/month).""" - server = _make_server("sql-server-1") - db = _make_database("sql-server-1", "cheap-db", sku_name="Basic", sku_tier="Basic") - - 
mock_sql_client.servers.list.return_value = [server] - mock_sql_client.databases.list_by_server.return_value = [db] - - findings = find_idle_sql_databases( - subscription_id="sub-123", - credential=None, - client=mock_sql_client, - monitor_client=mock_monitor_client, - ) - - assert len(findings) == 0 - # Monitor should not even be queried for Basic tier - mock_monitor_client.metrics.list.assert_not_called() - - -def test_system_db_skipped(mock_sql_client, mock_monitor_client): - """System database 'master' should NOT be flagged.""" - server = _make_server("sql-server-1") - db = _make_database("sql-server-1", "master", sku_name="S0", sku_tier="Standard") - - mock_sql_client.servers.list.return_value = [server] - mock_sql_client.databases.list_by_server.return_value = [db] - - findings = find_idle_sql_databases( - subscription_id="sub-123", - credential=None, - client=mock_sql_client, - monitor_client=mock_monitor_client, - ) - - assert len(findings) == 0 - mock_monitor_client.metrics.list.assert_not_called() - - -def test_region_filter(mock_sql_client, mock_monitor_client): - """Only servers in the filtered region should be checked.""" - server_east = _make_server("sql-east", location="eastus") - server_west = _make_server("sql-west", location="westus") - db_east = _make_database("sql-east", "db-east", location="eastus") - db_west = _make_database("sql-west", "db-west", location="westus") - - mock_sql_client.servers.list.return_value = [server_east, server_west] - mock_sql_client.databases.list_by_server.side_effect = lambda rg, name: { - "sql-east": [db_east], - "sql-west": [db_west], - }[name] - mock_monitor_client.metrics.list.return_value = _make_metric_response(total_value=0) - - findings = find_idle_sql_databases( - subscription_id="sub-123", - credential=None, - region_filter="eastus", - client=mock_sql_client, - monitor_client=mock_monitor_client, - ) - - assert len(findings) == 1 - assert findings[0].details["server_name"] == "sql-east" - - -def 
test_metric_failure_conservative_skip(mock_sql_client, mock_monitor_client): - """If metric query fails, DB should NOT be flagged (conservative).""" - server = _make_server("sql-server-1") - db = _make_database("sql-server-1", "unknown-db", sku_name="P1", sku_tier="Premium") - - mock_sql_client.servers.list.return_value = [server] - mock_sql_client.databases.list_by_server.return_value = [db] - mock_monitor_client.metrics.list.side_effect = Exception("Azure Monitor unavailable") - - findings = find_idle_sql_databases( - subscription_id="sub-123", - credential=None, - client=mock_sql_client, - monitor_client=mock_monitor_client, - ) - - assert len(findings) == 0 - - -def test_empty_subscription(mock_sql_client, mock_monitor_client): - """No servers should return empty findings.""" - mock_sql_client.servers.list.return_value = [] - - findings = find_idle_sql_databases( - subscription_id="sub-123", - credential=None, - client=mock_sql_client, - monitor_client=mock_monitor_client, - ) - - assert findings == []