|
| 1 | +"""L.5 — Cross-Component Intelligence. |
| 2 | +
|
| 3 | +Detect patterns that appear across multiple components — shared suspicious |
| 4 | +dependencies, common network domains, unusual API usage clusters, and |
| 5 | +components that deviate from the norm of their category. |
| 6 | +""" |
| 7 | + |
| 8 | +import json |
| 9 | +import logging |
| 10 | +import sqlite3 |
| 11 | +from collections import Counter |
| 12 | + |
| 13 | +log = logging.getLogger(__name__) |
| 14 | + |
| 15 | + |
# Modules considered unremarkable when shared: a slice of the standard library
# plus the dependencies most Home Assistant integrations pull in.
_COMMON_MODULES = frozenset({
    "os", "sys", "json", "re", "logging", "pathlib", "typing", "datetime",
    "collections", "functools", "itertools", "math", "hashlib", "uuid",
    "asyncio", "time", "io", "copy", "abc", "enum", "dataclasses",
    "contextlib", "unittest", "http", "urllib", "ssl", "socket",
    "threading", "multiprocessing", "subprocess", "shutil", "tempfile",
    "configparser", "argparse", "textwrap", "string", "struct", "base64",
    "homeassistant", "voluptuous", "aiohttp",  # HA common deps
})

# Paste, tunnelling and request-capture services; matched as substrings so
# that sub-domains (e.g. "abc.ngrok.io") are caught as well.
_SUSPICIOUS_DOMAINS = (
    "pastebin.com", "hastebin.com", "transfer.sh", "ngrok.io",
    "webhook.site", "requestbin.com", "pipedream.com",
)


def _load_str_list(raw, column: str) -> list:
    """Decode a JSON column into an order-preserving list of unique strings.

    NULL, empty, malformed or non-collection values yield an empty list so a
    single bad row cannot abort the whole analysis.
    """
    if raw is None or raw == "":
        return []
    try:
        value = json.loads(raw)
    except (TypeError, ValueError):
        logging.getLogger(__name__).warning(
            "Ignoring malformed JSON in component_fingerprints.%s", column
        )
        return []
    if not isinstance(value, (list, dict)):
        return []
    # dict.fromkeys de-duplicates while keeping first-seen order, so one
    # component can never count twice for the same value.
    return list(dict.fromkeys(item for item in value if isinstance(item, str)))


def _users_by_value(fingerprints: list, field: str, skip=None) -> dict:
    """Map each value of *field* to the ids of the components that use it."""
    users: dict = {}
    for fp in fingerprints:
        for value in fp[field]:
            if skip is not None and skip(value):
                continue
            users.setdefault(value, []).append(fp["id"])
    return users


def _shared_entries(users: dict, label: str) -> list:
    """Return values used by 2+ components, most widely used first."""
    counts = Counter({value: len(ids) for value, ids in users.items()})
    return [
        {label: value, "count": count, "components": users[value]}
        for value, count in counts.most_common()
        if count >= 2
    ]


def _size_outliers(fingerprints: list) -> list:
    """Return components more than 3x or less than 0.1x the average size."""
    sized = [fp for fp in fingerprints if fp["total_lines"] > 0]
    # With fewer than three components an "average" is not meaningful.
    if len(fingerprints) < 3 or not sized:
        return []
    avg_lines = sum(fp["total_lines"] for fp in sized) / len(sized)
    outliers = []
    for fp in sized:
        ratio = fp["total_lines"] / avg_lines
        if ratio > 3 or ratio < 0.1:
            outliers.append({
                "component": fp["id"],
                "total_lines": fp["total_lines"],
                "avg_lines": round(avg_lines),
                "ratio": round(ratio, 2),
                "direction": "larger" if ratio > 3 else "smaller",
            })
    return outliers


def analyze_cross_component(conn: sqlite3.Connection) -> dict:
    """Analyze patterns across all scanned components.

    Reads every row of ``component_fingerprints``, keeps the newest row per
    component (identified by ``domain``, falling back to ``repo_url``) and
    compares the components with each other. Rows may be ``sqlite3.Row``
    objects or plain tuples.

    Returns ``{"message": ...}`` when there are no fingerprints or fewer than
    two distinct components. Otherwise returns a dict with:
    - total_components: number of distinct components analyzed
    - shared_domains: network domains used by 2+ components (top 20)
    - shared_imports: uncommon imports used by 2+ components (top 20)
    - outlier_components: components far larger/smaller than the average
    - suspicious_domains: known paste/tunnel/capture domains and their users
    """
    rows = conn.execute(
        "SELECT domain, repo_url, imports, ha_apis, network_domains, "
        "py_files, js_files, total_lines "
        "FROM component_fingerprints "
        "ORDER BY created_at DESC"
    ).fetchall()

    if not rows:
        return {"message": "No fingerprint data available"}

    # Deduplicate: rows arrive newest first, so the first row seen for a
    # component is its latest fingerprint.
    seen = set()
    fingerprints = []
    for (domain, repo_url, imports, _ha_apis, network_domains,
         _py_files, _js_files, total_lines) in rows:
        key = domain or repo_url
        if key in seen:
            continue
        seen.add(key)
        fingerprints.append({
            "id": key,
            "imports": _load_str_list(imports, "imports"),
            "network_domains": _load_str_list(network_domains, "network_domains"),
            # NULL or non-numeric sizes are treated as "unknown" (0).
            "total_lines": total_lines if isinstance(total_lines, (int, float)) else 0,
        })

    if len(fingerprints) < 2:
        return {"message": "Need at least 2 scanned components for cross-analysis"}

    # 1. Shared network domains (domains used by 2+ components)
    domain_users = _users_by_value(fingerprints, "network_domains")
    shared_domains = _shared_entries(domain_users, "domain")

    # 2. Unusual shared imports; the top-level package decides whether an
    #    import is common, so "os.path" is filtered just like "os".
    import_users = _users_by_value(
        fingerprints,
        "imports",
        skip=lambda name: name.split(".", 1)[0] in _COMMON_MODULES,
    )
    shared_imports = _shared_entries(import_users, "import")

    # 3. Outlier components (significantly larger/smaller than average)
    outliers = _size_outliers(fingerprints)

    # 4. Suspicious patterns: any component talking to a known paste/tunnel
    #    service is reported, even if it is the only one doing so.
    suspicious_hits = [
        {"domain": d, "components": ids}
        for d, ids in domain_users.items()
        if any(marker in d.lower() for marker in _SUSPICIOUS_DOMAINS)
    ]

    return {
        "total_components": len(fingerprints),
        "shared_domains": shared_domains[:20],
        "shared_imports": shared_imports[:20],
        "outlier_components": outliers,
        "suspicious_domains": suspicious_hits,
    }
0 commit comments