diff --git a/.gitignore b/.gitignore index beead43..47d9407 100644 --- a/.gitignore +++ b/.gitignore @@ -109,3 +109,9 @@ ENV/ .idea/ cache/ + +# Internal project notes (not for distribution) +docs/internal/ + +# Local workspace / scratch +workspace/ diff --git a/graphfaker/__init__.py b/graphfaker/__init__.py index d6a382d..b938835 100644 --- a/graphfaker/__init__.py +++ b/graphfaker/__init__.py @@ -5,7 +5,8 @@ __version__ = "0.2.0" from .core import GraphFaker +from .fetchers.trust import TrustGraphFetcher from .fetchers.wiki import WikiFetcher from .logger import configure_logging, logger -__all__ = ["GraphFaker", "logger", "configure_logging", "add_file_logging"] +__all__ = ["GraphFaker", "TrustGraphFetcher", "logger", "configure_logging", "add_file_logging"] diff --git a/graphfaker/cli.py b/graphfaker/cli.py index d6966cc..be24c28 100644 --- a/graphfaker/cli.py +++ b/graphfaker/cli.py @@ -2,14 +2,16 @@ Command-line interface for GraphFaker. """ -from venv import logger +import os + import typer + from graphfaker.core import GraphFaker from graphfaker.enums import FetcherType from graphfaker.fetchers.osm import OSMGraphFetcher from graphfaker.fetchers.flights import FlightGraphFetcher +from graphfaker.logger import logger from graphfaker.utils import parse_date_range -import os app = typer.Typer() @@ -52,6 +54,32 @@ def gen( help="Year, Month and day range (YYYY-MM-DD,YYYY-MM-DD) for flight data. e.g. '2024-01-01,2024-01-15'.", ), + # for FetcherType.TRUST source + total_users: int = typer.Option( + 10000, help="Number of user nodes for trust graph." + ), + avg_trust_links: int = typer.Option( + 15, help="Average outgoing trust edges per user." + ), + reciprocity: float = typer.Option( + 0.7, help="Fraction of trust edges that are mutual (0.0 to 1.0)." + ), + num_communities: int = typer.Option( + None, help="Number of community clusters. Defaults to sqrt(total_users)." + ), + community_mixing: float = typer.Option( + 0.15, help="Fraction of edges crossing community boundaries (0.0 to 1.0)." + ), + avg_distrust_links: float = typer.Option( + 2.0, help="Average DISTRUSTS edges per user (Poisson-distributed)." + ), + bot_fraction: float = typer.Option( + 0.10, help="Fraction of nodes that are bots (0.0 to 1.0)." + ), + seed: int = typer.Option( + None, help="Random seed for reproducibility." + ), + # common export: str = typer.Option("graph.graphml", help="File path to export GraphML"), ): @@ -83,7 +111,23 @@ def gen( logger.info( f"Fetched OSM graph with {g.number_of_nodes()} nodes and {g.number_of_edges()} edges." ) - else: + elif fetcher == FetcherType.TRUST: + g = gf.generate_graph( + source="trust", + total_users=total_users, + avg_trust_links=avg_trust_links, + reciprocity=reciprocity, + num_communities=num_communities, + community_mixing=community_mixing, + avg_distrust_links=avg_distrust_links, + bot_fraction=bot_fraction, + seed=seed, + ) + logger.info( + f"Generated trust graph with {g.number_of_nodes()} nodes and {g.number_of_edges()} edges." + ) + + elif fetcher == FetcherType.FLIGHTS: # Flight fetcher parsed_date_range = parse_date_range(date_range) if date_range else None diff --git a/graphfaker/core.py b/graphfaker/core.py index 9aa69a8..6c80a9d 100644 --- a/graphfaker/core.py +++ b/graphfaker/core.py @@ -9,6 +9,7 @@ from faker import Faker from graphfaker.fetchers.osm import OSMGraphFetcher from graphfaker.fetchers.flights import FlightGraphFetcher +from graphfaker.fetchers.trust import TrustGraphFetcher from graphfaker.logger import logger fake = Faker() @@ -289,6 +290,35 @@ def _generate_faker(self, total_nodes=100, total_edges=1000): self.generate_edges(total_edges=total_edges) return self.G + def _generate_trust( + self, + total_users: int = 10000, + avg_trust_links: int = 15, + reciprocity: float = 0.7, + num_communities: Optional[int] = None, + community_mixing: float = 0.15, + avg_distrust_links: float = 2.0, + bot_fraction: float = 0.10, + seed: Optional[int] = None, + ): + """Generate a directed social trust graph via TrustGraphFetcher.""" + try: + G = TrustGraphFetcher.build_graph( + total_users=total_users, + avg_trust_links=avg_trust_links, + reciprocity=reciprocity, + num_communities=num_communities, + community_mixing=community_mixing, + avg_distrust_links=avg_distrust_links, + bot_fraction=bot_fraction, + seed=seed, + ) + self.G = G + return G + except Exception as e: + logger.error(f"Failed to generate trust graph: {e}") + raise + def generate_graph( self, source: str = "faker", @@ -305,6 +335,15 @@ def generate_graph( year: int = 2024, month: int = 1, date_range: Optional[tuple] = None, + # Trust graph parameters + total_users: int = 10000, + avg_trust_links: int = 15, + reciprocity: float = 0.7, + num_communities: Optional[int] = None, + community_mixing: float = 0.15, + avg_distrust_links: float = 2.0, + bot_fraction: float = 0.10, + seed: Optional[int] = None, ) -> nx.DiGraph: """ Unified entrypoint: choose 'random' or 'osm'. @@ -338,8 +377,22 @@ def generate_graph( month=month, date_range=date_range, ) + elif source == "trust": + return self._generate_trust( + total_users=total_users, + avg_trust_links=avg_trust_links, + reciprocity=reciprocity, + num_communities=num_communities, + community_mixing=community_mixing, + avg_distrust_links=avg_distrust_links, + bot_fraction=bot_fraction, + seed=seed, + ) else: - raise ValueError(f"Unknown source '{source}'. Use 'random' or 'osm'.") + raise ValueError( + f"Unknown source '{source}'. " + f"Use 'faker', 'osm', 'flights', or 'trust'." + ) def export_graph(self, G: nx.Graph = None, source: str = None, path: str = "graph.graphml"): """ diff --git a/graphfaker/enums.py b/graphfaker/enums.py index 4b3e845..c193697 100644 --- a/graphfaker/enums.py +++ b/graphfaker/enums.py @@ -7,3 +7,4 @@ class FetcherType(str, Enum): OSM = "osm" FLIGHTS = "flights" FAKER = "faker" + TRUST = "trust" diff --git a/graphfaker/fetchers/trust.py b/graphfaker/fetchers/trust.py new file mode 100644 index 0000000..524b7ee --- /dev/null +++ b/graphfaker/fetchers/trust.py @@ -0,0 +1,683 @@ +""" +Trust graph fetcher for generating realistic directed social trust graphs. + +Produces homogeneous User→User graphs with configurable topology: +- Power-law degree distribution (Barabási-Albert preferential attachment) +- Community structure (Stochastic Block Model) +- Configurable reciprocity (directed trust links) +- Distrust links (for compromised account simulation) +- Small-world shortcuts (Watts-Strogatz rewiring) +""" + +import math +import random +import secrets +from typing import Optional + +import networkx as nx +import numpy as np +from faker import Faker + +from graphfaker.logger import logger + +fake = Faker() + + +class TrustGraphFetcher: + """Generates realistic directed social trust graphs. + + All methods are static — the class acts as a namespace, + consistent with the existing fetcher pattern (OSMGraphFetcher, + FlightGraphFetcher). + """ + + @staticmethod + def generate_users( + G: nx.DiGraph, + total_users: int, + community_labels: dict, + ) -> None: + """Add User nodes with Faker-generated attributes. + + Args: + G: Target directed graph. + total_users: Number of user nodes to create. + community_labels: Mapping of node index -> community ID. + """ + # Direct dict update is faster than G.add_node() for existing nodes + node_data = G.nodes + for i in range(total_users): + node_id = f"user_{i}" + node_data[node_id].update( + type="User", + name=fake.name(), + public_key=secrets.token_hex(32), + created_at=str(fake.date_time_between(start_date="-3y")), + community=community_labels.get(i, 0), + is_bot=False, + is_compromised=False, + ) + + @staticmethod + def _build_community_sizes( + total_users: int, + num_communities: Optional[int], + seed: Optional[int] = None, + min_community_size: int = 50, + ) -> list: + """Compute community sizes for the SBM using a log-normal distribution. + + Draws sizes from a log-normal distribution to produce a realistic + mix of a few large communities and many smaller ones, then rounds + and clamps to guarantee each community has at least + ``min_community_size`` members. + + Args: + total_users: Total number of users. + num_communities: Number of communities. If None, defaults to ~500 users per community. + seed: Random seed for reproducibility. + min_community_size: Floor for every community (default 50). + + Returns: + List of community sizes summing to total_users. + """ + if num_communities is None: + num_communities = max(2, round(total_users / 500)) + num_communities = min(num_communities, total_users) + + # For very small graphs where log-normal doesn't make sense, + # fall back to equal partitioning + if num_communities <= 1 or total_users < num_communities * min_community_size: + base_size = total_users // num_communities + sizes = [base_size] * num_communities + sizes[-1] += total_users - sum(sizes) + return sizes + + rng = np.random.default_rng(seed) + raw = rng.lognormal(mean=0.0, sigma=1.0, size=num_communities) + raw = raw / raw.sum() # normalize to fractions + + # Scale to total_users and round + sizes = [max(min_community_size, int(round(f * total_users))) for f in raw] + + # Fix the total: redistribute difference to the largest community + diff = total_users - sum(sizes) + largest_idx = sizes.index(max(sizes)) + sizes[largest_idx] += diff + + # Safety: if adjustment pushed largest below floor, redistribute + if sizes[largest_idx] < min_community_size: + sizes[largest_idx] = min_community_size + diff = total_users - sum(sizes) + # spread across all communities proportionally + for i in range(abs(diff)): + sizes[i % num_communities] += 1 if diff > 0 else -1 + + return sizes + + @staticmethod + def _build_probability_matrix( + sizes: list, + avg_trust_links: int, + community_mixing: float, + total_users: int, + ) -> list: + """Build the SBM connection probability matrix. + + Partitions the target average degree into within-community and + between-community contributions using community_mixing as the + fraction of total edges that cross community boundaries. + + With variable community sizes, each diagonal block gets its own + within-community probability. Off-diagonal blocks use a single + uniform between-community probability to keep the matrix symmetric + (required by NetworkX's undirected SBM). + + Args: + sizes: List of community sizes (one per community). + avg_trust_links: Target average degree. + community_mixing: Fraction of total edges that are between communities (0.0 to 1.0). + total_users: Total number of users. + + Returns: + len(sizes) x len(sizes) symmetric probability matrix. + """ + num_communities = len(sizes) + within_per_node = (1.0 - community_mixing) * avg_trust_links + between_per_node = community_mixing * avg_trust_links + + # Between-community: use mean community size for a single symmetric value + mean_size = total_users / num_communities + p_between = min(1.0, between_per_node / max(1, total_users - mean_size)) + + p_matrix = [] + for i in range(num_communities): + row = [] + p_within_i = min(1.0, within_per_node / max(1, sizes[i] - 1)) + for j in range(num_communities): + row.append(p_within_i if i == j else p_between) + p_matrix.append(row) + return p_matrix + + @staticmethod + def _apply_preferential_attachment( + G: nx.Graph, + community_labels: dict, + pa_fraction: float, + rng: random.Random, + ) -> None: + """Rewire within-community edges using preferential attachment. + + For each within-community edge, with probability pa_fraction, + replace the target with a degree-proportional random node in the + same community. Creates hub nodes (power-law-ish degree distribution). + + Only within-community edges are rewired to preserve community structure. + + Args: + G: Undirected graph to modify in-place. + community_labels: Mapping of node index -> community ID. + pa_fraction: Probability of rewiring each within-community edge. + rng: Random instance for reproducibility. + """ + comm_nodes: dict = {} + for node, comm in community_labels.items(): + comm_nodes.setdefault(comm, []).append(node) + + # Cache: degree array per community, indexed by position in comm_nodes[comm] + comm_degrees: dict = {} + comm_node_idx: dict = {} + for comm, members in comm_nodes.items(): + idx_map = {node: i for i, node in enumerate(members)} + comm_node_idx[comm] = idx_map + comm_degrees[comm] = np.array( + [G.degree(w) + 1 for w in members], dtype=np.float64 + ) + + within_edges = [ + (u, v) for u, v in G.edges() + if community_labels.get(u) == community_labels.get(v) + ] + + for u, v in within_edges: + if rng.random() >= pa_fraction: + continue + + comm = community_labels[u] + members = comm_nodes[comm] + degrees = comm_degrees[comm] + + # Weighted sample using cached degree array + searchsorted + cumsum = degrees.cumsum() + total = cumsum[-1] + r = rng.random() * total + chosen_idx = int(np.searchsorted(cumsum, r)) + chosen_idx = min(chosen_idx, len(members) - 1) + new_target = members[chosen_idx] + + if new_target != u and not G.has_edge(u, new_target): + G.remove_edge(u, v) + G.add_edge(u, new_target) + + # Incrementally update degree cache + idx_map = comm_node_idx[comm] + if v in idx_map: + degrees[idx_map[v]] = max(1, degrees[idx_map[v]] - 1) + if new_target in idx_map: + degrees[idx_map[new_target]] += 1 + + @staticmethod + def _apply_reciprocity( + G_undirected: nx.Graph, + reciprocity: float, + rng: random.Random, + ) -> nx.DiGraph: + """Convert an undirected graph to directed with configurable reciprocity. + + For each undirected edge: + - With probability `reciprocity`: create both A→B and B→A (mutual trust) + - With probability `1 - reciprocity`: create only one direction (random) + + All edges get relationship="TRUSTS". + + Args: + G_undirected: Source undirected graph. + reciprocity: Fraction of edges that become mutual. + rng: Random instance for reproducibility. + + Returns: + New directed graph with TRUSTS edges. + """ + G_dir = nx.DiGraph() + G_dir.add_nodes_from(G_undirected.nodes(data=True)) + + for u, v in G_undirected.edges(): + if rng.random() < reciprocity: + G_dir.add_edge(u, v, relationship="TRUSTS") + G_dir.add_edge(v, u, relationship="TRUSTS") + else: + if rng.random() < 0.5: + G_dir.add_edge(u, v, relationship="TRUSTS") + else: + G_dir.add_edge(v, u, relationship="TRUSTS") + + return G_dir + + @staticmethod + def _add_organic_distrust( + G: nx.DiGraph, + avg_distrust_links: float, + rng: random.Random, + np_rng: np.random.Generator, + ) -> None: + """Add Poisson-distributed DISTRUSTS edges per user. + + Each user draws k ~ Poisson(avg_distrust_links) distrust targets, + producing a realistic distribution where most users distrust at + least one entity (~86.5% at lambda=2.0) but some naturally have none. + + Args: + G: Directed graph to modify in-place. + avg_distrust_links: Lambda for Poisson draw per user. + rng: Random instance for reproducibility. + np_rng: NumPy random generator for Poisson draws. + """ + if avg_distrust_links <= 0: + return + + nodes = list(G.nodes()) + n = len(nodes) + node_to_idx = {node: i for i, node in enumerate(nodes)} + draws = np_rng.poisson(lam=avg_distrust_links, size=n) + + # Pre-build successor index sets for all nodes: O(E) total + successor_idxs: list = [set() for _ in range(n)] + for u, v in G.edges(): + ui = node_to_idx.get(u) + vi = node_to_idx.get(v) + if ui is not None and vi is not None: + successor_idxs[ui].add(vi) + + for idx in range(n): + k = int(draws[idx]) + if k == 0: + continue + + excluded = successor_idxs[idx] + excluded.add(idx) # no self-loops + + n_candidates = n - len(excluded) + if n_candidates <= 0: + continue + k = min(k, n_candidates) + + # Rejection sampling: with ~15 excluded out of 10k+, + # collision rate is <0.2%, so this is nearly O(k) + selected: set = set() + while len(selected) < k: + r = np_rng.integers(0, n) + if r not in excluded and r not in selected: + selected.add(r) + + src = nodes[idx] + for si in selected: + G.add_edge(src, nodes[si], relationship="DISTRUSTS") + + @staticmethod + def _add_bot_clusters( + G: nx.DiGraph, + num_compromised: int, + bots_per_cluster: list, + rng: random.Random, + np_rng: np.random.Generator, + ) -> dict: + """Add bot cluster substructures around compromised accounts. + + Selects high-degree existing users as compromised accounts, then + creates dense bot clusters connected to them — matching the + attack pattern where compromised accounts bridge to bot-generated + account clusters. + + Args: + G: Directed graph to modify in-place. + num_compromised: Number of existing users to mark as compromised. + bots_per_cluster: List of bot counts, one per compromised account. + rng: Random instance for reproducibility. + np_rng: NumPy random generator. + + Returns: + Dict mapping compromised node IDs to list of their bot node IDs. + """ + if num_compromised <= 0 or not bots_per_cluster: + return {} + + # Select compromised accounts: above-median out-degree users + user_nodes = [n for n, d in G.nodes(data=True) if not d.get("is_bot", False)] + out_degrees = {n: G.out_degree(n) for n in user_nodes} + median_deg = sorted(out_degrees.values())[len(out_degrees) // 2] + high_degree = [n for n, d in out_degrees.items() if d >= median_deg] + num_compromised = min(num_compromised, len(high_degree)) + compromised = rng.sample(high_degree, num_compromised) + + for node in compromised: + G.nodes[node]["is_compromised"] = True + + # Build community map for concentrated distrust later + communities_map: dict = {} + for node, data in G.nodes(data=True): + c = data.get("community", -1) + if c >= 0: + communities_map.setdefault(c, []).append(node) + + cluster_map = {} + if seed_val := np_rng.integers(0, 2**31): + Faker.seed(int(seed_val)) + + for ci, comp_node in enumerate(compromised): + n_bots = bots_per_cluster[ci] if ci < len(bots_per_cluster) else bots_per_cluster[-1] + bot_ids = [] + for bi in range(n_bots): + bot_id = f"bot_{ci}_{bi}" + G.add_node( + bot_id, + type="User", + is_bot=True, + is_compromised=False, + name=fake.name(), + public_key=secrets.token_hex(32), + created_at=str(fake.date_time_between(start_date="-1y")), + community=-1, + ) + bot_ids.append(bot_id) + + # Wire compromised <-> bots + for bot_id in bot_ids: + G.add_edge(comp_node, bot_id, relationship="TRUSTS") + G.add_edge(bot_id, comp_node, relationship="TRUSTS") + + # Dense intra-cluster bot trust (~50% pairwise) + for i, b1 in enumerate(bot_ids): + for b2 in bot_ids[i + 1:]: + if rng.random() < 0.5: + G.add_edge(b1, b2, relationship="TRUSTS") + if rng.random() < 0.5: + G.add_edge(b2, b1, relationship="TRUSTS") + + # Concentrated distrust: nearby users flag the compromised account + comp_comm = G.nodes[comp_node].get("community", -1) + if comp_comm >= 0 and comp_comm in communities_map: + comm_members = [ + n for n in communities_map[comp_comm] + if n != comp_node and not G.nodes[n].get("is_compromised", False) + ] + num_distrusters = max(1, len(comm_members) // 10) + num_distrusters = min(num_distrusters, len(comm_members)) + distrusters = rng.sample(comm_members, num_distrusters) + for d_node in distrusters: + if not G.has_edge(d_node, comp_node) or \ + G.edges[d_node, comp_node].get("relationship") != "DISTRUSTS": + G.add_edge(d_node, comp_node, relationship="DISTRUSTS") + + cluster_map[comp_node] = bot_ids + + return cluster_map + + @staticmethod + def _rewire_small_world( + G: nx.DiGraph, + rewire_prob: float, + rng: random.Random, + ) -> None: + """Rewire edges for small-world shortcuts. + + For a fraction of directed edges, replace the target with a random + node to create long-range connections. This reduces average path + length while preserving clustering structure. + + Args: + G: Directed graph to modify in-place. + rewire_prob: Probability of rewiring each edge. + rng: Random instance for reproducibility. + """ + nodes = list(G.nodes()) + edges_to_rewire = [ + (u, v) for u, v, d in G.edges(data=True) + if d.get("relationship") == "TRUSTS" and rng.random() < rewire_prob + ] + + for u, v in edges_to_rewire: + new_target = rng.choice(nodes) + if new_target != u and not G.has_edge(u, new_target): + G.remove_edge(u, v) + G.add_edge(u, new_target, relationship="TRUSTS") + + @staticmethod + def _fast_sbm( + sizes: list, + p_matrix: list, + seed: Optional[int] = None, + ) -> nx.Graph: + """Generate an undirected SBM graph using vectorized numpy sampling. + + For each block (i, j) in the probability matrix: + 1. Draw edge count from Binomial(n_possible_pairs, p) + 2. Sample that many unique random pairs via np.random.choice + 3. Assemble into scipy COO sparse matrix -> NetworkX Graph + + Complexity: O(E + B^2) where E = edges, B = number of blocks. + """ + from scipy.sparse import coo_matrix + + rng = np.random.default_rng(seed) + n = sum(sizes) + num_blocks = len(sizes) + offsets = [0] + for s in sizes: + offsets.append(offsets[-1] + s) + + all_rows: list = [] + all_cols: list = [] + + for i in range(num_blocks): + for j in range(i, num_blocks): # upper triangle + diagonal + ni, nj = sizes[i], sizes[j] + p = p_matrix[i][j] + if p <= 0: + continue + + if i == j: + # Within-block: sample from upper triangle (no self-loops) + n_possible = ni * (ni - 1) // 2 + if n_possible == 0: + continue + n_edges = int(rng.binomial(n_possible, p)) + if n_edges == 0: + continue + + # Sample flat indices into upper triangle + flat = rng.choice(n_possible, size=n_edges, replace=False) + + # Convert flat upper-triangle index to (row, col) + row = ( + ni - 2 + - np.floor( + np.sqrt(-8.0 * flat + 4.0 * ni * (ni - 1) - 7.0) + / 2.0 + - 0.5 + ).astype(np.intp) + ) + col = ( + flat + + row + + 1 + - ni * (ni - 1) // 2 + + ((ni - row) * (ni - row - 1)) // 2 + ) + + row = row + offsets[i] + col = col + offsets[i] + else: + # Between-block: sample from full ni x nj grid + n_possible = ni * nj + n_edges = int(rng.binomial(n_possible, p)) + if n_edges == 0: + continue + + flat = rng.choice(n_possible, size=n_edges, replace=False) + row = flat // nj + offsets[i] + col = flat % nj + offsets[j] + + all_rows.append(row) + all_cols.append(col) + + if all_rows: + rows = np.concatenate(all_rows) + cols = np.concatenate(all_cols) + else: + rows = np.array([], dtype=np.intp) + cols = np.array([], dtype=np.intp) + + # Build symmetric adjacency (undirected) + data = np.ones(len(rows), dtype=np.int8) + adj = coo_matrix((data, (rows, cols)), shape=(n, n)) + adj = adj + adj.T # symmetrize + + G = nx.from_scipy_sparse_array(adj, create_using=nx.Graph()) + return G + + @staticmethod + def build_graph( + total_users: int = 10000, + avg_trust_links: int = 15, + reciprocity: float = 0.7, + num_communities: Optional[int] = None, + community_mixing: float = 0.15, + avg_distrust_links: float = 2.0, + bot_fraction: float = 0.10, + rewire_prob: float = 0.05, + seed: Optional[int] = None, + ) -> nx.DiGraph: + """Generate a realistic directed social trust graph. + + Pipeline: + 1. Build community structure via Stochastic Block Model + 2. Preferential attachment rewiring for hub emergence + 3. Convert to directed edges with configurable reciprocity + 4. Small-world rewiring for realistic path lengths + 5. Relabel nodes to user_N and attach Faker-generated attributes + 6. Add organic Poisson-distributed distrust links + 7. Add bot clusters around compromised accounts + + Args: + total_users: Number of user nodes. + avg_trust_links: Target average outgoing trust edges per user. + reciprocity: Fraction of edges that are mutual (0.0 to 1.0). + num_communities: Number of community clusters. None = auto (~500 users per community). + community_mixing: Fraction of edges crossing community boundaries (0.0 to 1.0). + avg_distrust_links: Average DISTRUSTS edges per user (Poisson lambda). + bot_fraction: Fraction of nodes that are bots (0.0 to 1.0). Default 0.10 (10%). + rewire_prob: Probability of rewiring each edge for small-world shortcuts. + seed: Random seed for reproducibility. + + Returns: + nx.DiGraph with User nodes and TRUSTS/DISTRUSTS edges. + """ + rng = random.Random(seed) + np_rng = np.random.default_rng(seed) + + logger.info( + f"Generating trust graph: {total_users} users, " + f"~{avg_trust_links} avg links, " + f"reciprocity={reciprocity}, " + f"communities={'auto' if num_communities is None else num_communities}" + ) + + # Step 1: Community structure via SBM + sizes = TrustGraphFetcher._build_community_sizes( + total_users, num_communities, seed=seed + ) + actual_num_communities = len(sizes) + + p_matrix = TrustGraphFetcher._build_probability_matrix( + sizes, avg_trust_links, community_mixing, total_users + ) + + G_undirected = TrustGraphFetcher._fast_sbm(sizes, p_matrix, seed=seed) + + logger.info( + f"SBM generated: {G_undirected.number_of_nodes()} nodes, " + f"{G_undirected.number_of_edges()} undirected edges, " + f"{actual_num_communities} communities" + ) + + # Build community label mapping from SBM partition + community_labels = {} + node_idx = 0 + for comm_id, size in enumerate(sizes): + for _ in range(size): + community_labels[node_idx] = comm_id + node_idx += 1 + + # Step 1b: Preferential attachment for hub emergence + TrustGraphFetcher._apply_preferential_attachment( + G_undirected, community_labels, pa_fraction=0.4, rng=rng + ) + + # Step 2: Convert to directed with reciprocity + G = TrustGraphFetcher._apply_reciprocity(G_undirected, reciprocity, rng) + del G_undirected # Free undirected graph early + + logger.info( + f"Directed graph: {G.number_of_edges()} edges " + f"(reciprocity={reciprocity})" + ) + + # Step 3: Small-world rewiring (TRUSTS only) + if rewire_prob > 0: + TrustGraphFetcher._rewire_small_world(G, rewire_prob, rng) + + # Step 4: Relabel integer nodes to user_N and add attributes + mapping = {i: f"user_{i}" for i in range(total_users)} + G = nx.relabel_nodes(G, mapping, copy=False) + + if seed is not None: + Faker.seed(seed) + TrustGraphFetcher.generate_users(G, total_users, community_labels) + + # Step 5: Add organic distrust + if avg_distrust_links > 0: + TrustGraphFetcher._add_organic_distrust(G, avg_distrust_links, rng, np_rng) + distrust_count = sum( + 1 for _, _, d in G.edges(data=True) + if d.get("relationship") == "DISTRUSTS" + ) + logger.info(f"Added {distrust_count} organic distrust links") + + # Step 6: Add bot clusters (derive structure from bot_fraction) + total_bots = round(total_users * bot_fraction) + if total_bots > 0: + # Target ~15 bots per cluster, scale compromised accounts accordingly + bpc = min(15, total_bots) + num_compromised = max(1, total_bots // bpc) + bpc = total_bots // num_compromised + remainder = total_bots - (num_compromised * bpc) + # Distribute bots across clusters, spreading remainder + bots_per_cluster = [bpc] * num_compromised + for i in range(remainder): + bots_per_cluster[i] += 1 + + cluster_map = TrustGraphFetcher._add_bot_clusters( + G, num_compromised, bots_per_cluster, rng, np_rng + ) + actual_bots = sum(len(v) for v in cluster_map.values()) + logger.info( + f"Added {len(cluster_map)} bot clusters " + f"({actual_bots} bot nodes total, bot_fraction={bot_fraction})" + ) + + logger.info( + f"Trust graph complete: {G.number_of_nodes()} nodes, " + f"{G.number_of_edges()} edges" + ) + + return G diff --git a/pyproject.toml b/pyproject.toml index fb93a8e..ad88a39 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ keywords = ["faker", "graph-data", "flights", "osmnx", "graphs", "graphfaker"] dependencies = [ "faker>=37.1.0", "networkx>=3.4.2", + "numpy>=1.24.0", "osmnx==2.0.2", "pandas>=2.2.2", "requests>=2.32.3", diff --git a/tests/test_fetchers_trust.py b/tests/test_fetchers_trust.py new file mode 100644 index 0000000..a8e604d --- /dev/null +++ b/tests/test_fetchers_trust.py @@ -0,0 +1,355 @@ +# tests/test_fetchers_trust.py +import random + +import pytest +import networkx as nx +import numpy as np + +from graphfaker.fetchers.trust import TrustGraphFetcher + + +class TestBuildCommunities: + def test_default_communities(self): + sizes = TrustGraphFetcher._build_community_sizes(100, None, seed=42) + assert sum(sizes) == 100 + assert len(sizes) == 2 # 100 / 500 rounds to 0, clamped to min 2 + + def test_explicit_communities(self): + sizes = TrustGraphFetcher._build_community_sizes(10000, 5, seed=42) + assert sum(sizes) == 10000 + assert len(sizes) == 5 + assert all(s >= 50 for s in sizes) + + def test_more_communities_than_users(self): + sizes = TrustGraphFetcher._build_community_sizes(3, 10, seed=42) + assert sum(sizes) == 3 + assert len(sizes) == 3 # clamped to total_users + + def test_single_community(self): + sizes = TrustGraphFetcher._build_community_sizes(50, 1, seed=42) + assert sizes == [50] + + def test_variable_community_sizes(self): + """Verify log-normal produces non-uniform sizes with min floor.""" + sizes = TrustGraphFetcher._build_community_sizes(10000, 20, seed=42) + assert sum(sizes) == 10000 + assert len(sizes) == 20 + assert all(s >= 50 for s in sizes) + # Sizes should NOT all be equal (log-normal is skewed) + assert len(set(sizes)) > 1 + + def test_reproducible_with_seed(self): + s1 = TrustGraphFetcher._build_community_sizes(10000, 20, seed=99) + s2 = TrustGraphFetcher._build_community_sizes(10000, 20, seed=99) + assert s1 == s2 + + +class TestProbabilityMatrix: + def test_matrix_shape(self): + sizes = [250, 250, 250, 250] + matrix = TrustGraphFetcher._build_probability_matrix( + sizes=sizes, avg_trust_links=15, + community_mixing=0.15, total_users=1000, + ) + assert len(matrix) == 4 + assert all(len(row) == 4 for row in matrix) + + def test_within_greater_than_between(self): + sizes = [250, 250, 250, 250] + matrix = TrustGraphFetcher._build_probability_matrix( + sizes=sizes, avg_trust_links=15, + community_mixing=0.15, total_users=1000, + ) + p_within = matrix[0][0] + p_between = matrix[0][1] + assert p_within > p_between + + def test_within_greater_than_between_variable_sizes(self): + """Even with the largest community, p_within > p_between.""" + sizes = [2000, 800, 600, 400, 200] + matrix = TrustGraphFetcher._build_probability_matrix( + sizes=sizes, avg_trust_links=15, + community_mixing=0.15, total_users=4000, + ) + # Check for every row (community) + for i in range(len(sizes)): + p_within = matrix[i][i] + p_between = matrix[i][(i + 1) % len(sizes)] + assert p_within > p_between, f"Community {i} (size={sizes[i]}): p_within={p_within} <= p_between={p_between}" + + def test_probabilities_capped_at_one(self): + sizes = [5, 5] + matrix = TrustGraphFetcher._build_probability_matrix( + sizes=sizes, avg_trust_links=500, + community_mixing=0.5, total_users=10, + ) + for row in matrix: + for p in row: + assert 0.0 <= p <= 1.0 + + +class TestApplyReciprocity: + @pytest.fixture + def simple_undirected(self): + G = nx.Graph() + G.add_edges_from([(0, 1), (1, 2), (2, 3), (3, 4)]) + return G + + def test_full_reciprocity(self, simple_undirected): + rng = random.Random(42) + G_dir = TrustGraphFetcher._apply_reciprocity(simple_undirected, 1.0, rng) + assert isinstance(G_dir, nx.DiGraph) + # With reciprocity=1.0, every undirected edge becomes two directed edges + assert G_dir.number_of_edges() == 8 # 4 undirected * 2 + + def test_zero_reciprocity(self, simple_undirected): + rng = random.Random(42) + G_dir = TrustGraphFetcher._apply_reciprocity(simple_undirected, 0.0, rng) + # With reciprocity=0.0, every undirected edge becomes one directed edge + assert G_dir.number_of_edges() == 4 + + def test_edge_relationship_attribute(self, simple_undirected): + rng = random.Random(42) + G_dir = TrustGraphFetcher._apply_reciprocity(simple_undirected, 0.5, rng) + for u, v, data in G_dir.edges(data=True): + assert data["relationship"] == "TRUSTS" + + +class TestOrganicDistrust: + def test_organic_distrust_distribution(self): + """Most nodes should have >=1 DISTRUSTS edge at avg=2.0.""" + rng = random.Random(42) + np_rng = np.random.default_rng(42) + G = nx.DiGraph() + n = 1000 + for i in range(n): + G.add_node(f"user_{i}") + # Add some trust edges so the graph is non-trivial + for i in range(n - 1): + G.add_edge(f"user_{i}", f"user_{i+1}", relationship="TRUSTS") + + TrustGraphFetcher._add_organic_distrust(G, 2.0, rng, np_rng) + + distrust_edges = [ + (u, v) for u, v, d in G.edges(data=True) + if d.get("relationship") == "DISTRUSTS" + ] + distrust_sources = set(u for u, _ in distrust_edges) + + # At lambda=2.0, ~86.5% should have >=1 distrust link + assert len(distrust_sources) / n > 0.75, ( + f"Only {len(distrust_sources)}/{n} users have distrust links" + ) + # Total should be roughly n * avg (within 50% tolerance) + assert n * 1.0 < len(distrust_edges) < n * 3.5, ( + f"Expected ~{n * 2} distrust edges, got {len(distrust_edges)}" + ) + + def test_zero_distrust(self): + """avg_distrust_links=0.0 should produce no DISTRUSTS edges.""" + rng = random.Random(42) + np_rng = np.random.default_rng(42) + G = nx.DiGraph() + for i in range(10): + G.add_node(f"user_{i}") + for i in range(9): + G.add_edge(f"user_{i}", f"user_{i+1}", relationship="TRUSTS") + + TrustGraphFetcher._add_organic_distrust(G, 0.0, rng, np_rng) + + distrust_edges = [ + (u, v) for u, v, d in G.edges(data=True) + if d.get("relationship") == "DISTRUSTS" + ] + assert len(distrust_edges) == 0 + + def test_distrust_no_overlap_with_trust(self): + """DISTRUSTS targets must not have TRUSTS from the same source.""" + rng = random.Random(42) + np_rng = np.random.default_rng(42) + G = nx.DiGraph() + n = 200 + for i in range(n): + G.add_node(f"user_{i}") + for i in range(n - 1): + G.add_edge(f"user_{i}", f"user_{i+1}", relationship="TRUSTS") + + TrustGraphFetcher._add_organic_distrust(G, 2.0, rng, np_rng) + + trust_set = set( + (u, v) for u, v, d in G.edges(data=True) + if d.get("relationship") == "TRUSTS" + ) + distrust_set = set( + (u, v) for u, v, d in G.edges(data=True) + if d.get("relationship") == "DISTRUSTS" + ) + overlap = trust_set & distrust_set + assert len(overlap) == 0, f"Found {len(overlap)} overlapping TRUSTS/DISTRUSTS edges" + + +class TestBotClusters: + @pytest.fixture + def graph_with_bots(self): + """Build a small graph and add bot clusters.""" + G = TrustGraphFetcher.build_graph( + total_users=200, + avg_trust_links=5, + reciprocity=0.7, + num_communities=4, + community_mixing=0.15, + avg_distrust_links=1.0, + bot_fraction=0.05, + seed=42, + ) + return G + + def test_bot_nodes_created(self, graph_with_bots): + """Verify expected number of bot nodes exist.""" + bot_nodes = [n for n, d in graph_with_bots.nodes(data=True) if d.get("is_bot")] + assert len(bot_nodes) == 10 # round(200 * 0.05) = 10 + + def test_compromised_nodes_marked(self, graph_with_bots): + """Verify compromised account count — derived from total bots / ~15 per cluster.""" + comp_nodes = [n for n, d in graph_with_bots.nodes(data=True) if d.get("is_compromised")] + assert len(comp_nodes) >= 1 + + def test_bot_cluster_connectivity(self, graph_with_bots): + """Each bot should be connected to exactly one compromised account in both directions.""" + G = graph_with_bots + comp_nodes = [n for n, d in G.nodes(data=True) if d.get("is_compromised")] + bot_nodes = [n for n, d in G.nodes(data=True) if d.get("is_bot")] + + for bot in bot_nodes: + # Each bot must have bidirectional TRUSTS with exactly one compromised node + linked_comp = [c for c in comp_nodes if G.has_edge(c, bot) and G.has_edge(bot, c)] + assert len(linked_comp) == 1, ( + f"Bot {bot} should connect to exactly 1 compromised node, found {len(linked_comp)}" + ) + + def test_bot_ids_format(self, graph_with_bots): + """Bot node IDs should match bot_* pattern.""" + bot_nodes = [n for n, d in graph_with_bots.nodes(data=True) if d.get("is_bot")] + for bot in bot_nodes: + assert bot.startswith("bot_"), f"Unexpected bot ID format: {bot}" + + def test_no_bots_when_zero(self): + """bot_fraction=0.0 should produce no bots or compromised markers.""" + G = TrustGraphFetcher.build_graph( + total_users=100, + avg_trust_links=5, + bot_fraction=0.0, + seed=42, + ) + bot_nodes = [n for n, d in G.nodes(data=True) if d.get("is_bot")] + comp_nodes = [n for n, d in G.nodes(data=True) if d.get("is_compromised")] + assert len(bot_nodes) == 0 + assert len(comp_nodes) == 0 + + def test_convergent_distrust_on_compromised(self, graph_with_bots): + """Compromised accounts should have multiple incoming DISTRUSTS edges.""" + G = graph_with_bots + comp_nodes = [n for n, d in G.nodes(data=True) if d.get("is_compromised")] + for comp in comp_nodes: + incoming_distrust = [ + u for u in G.predecessors(comp) + if G.edges[u, comp].get("relationship") == "DISTRUSTS" + ] + assert len(incoming_distrust) >= 1, ( + f"Compromised node {comp} has {len(incoming_distrust)} incoming DISTRUSTS" + ) + + def test_bot_community_is_negative_one(self, graph_with_bots): + """Bot nodes should have community=-1.""" + bot_nodes = [n for n, d in graph_with_bots.nodes(data=True) if d.get("is_bot")] + for bot in bot_nodes: + assert graph_with_bots.nodes[bot]["community"] == -1 + + +class TestBuildGraph: + @pytest.fixture + def small_trust_graph(self): + return TrustGraphFetcher.build_graph( + total_users=100, + avg_trust_links=5, + reciprocity=0.7, + num_communities=4, + community_mixing=0.15, + avg_distrust_links=1.0, + bot_fraction=0.10, + seed=42, + ) + + def test_node_count(self, small_trust_graph): + # 100 users + round(100*0.10)=10 bots = 110 + assert small_trust_graph.number_of_nodes() == 110 + + def test_is_directed(self, small_trust_graph): + assert isinstance(small_trust_graph, nx.DiGraph) + + def test_node_attributes(self, small_trust_graph): + node_data = small_trust_graph.nodes["user_0"] + assert node_data["type"] == "User" + assert "name" in node_data + assert "public_key" in node_data + assert len(node_data["public_key"]) == 64 # 32 bytes hex + assert "created_at" in node_data + assert "community" in node_data + assert "is_bot" in node_data + assert "is_compromised" in node_data + + def test_edge_relationships(self, small_trust_graph): + relationships = set() + for u, v, data in small_trust_graph.edges(data=True): + relationships.add(data.get("relationship")) + assert "TRUSTS" in relationships + assert "DISTRUSTS" in relationships + + def test_has_edges(self, small_trust_graph): + assert small_trust_graph.number_of_edges() > 0 + + def test_node_id_format(self, small_trust_graph): + for node in small_trust_graph.nodes(): + assert node.startswith("user_") or node.startswith("bot_") + + def test_community_labels_assigned(self, small_trust_graph): + communities = set() + for _, data in small_trust_graph.nodes(data=True): + communities.add(data.get("community")) + assert 4 in [c for c in communities if c >= 0] or len([c for c in communities if c >= 0]) == 4 + # Bot nodes have community=-1 + assert -1 in communities + + def test_reproducible_with_seed(self): + g1 = TrustGraphFetcher.build_graph(total_users=50, seed=123) + g2 = TrustGraphFetcher.build_graph(total_users=50, seed=123) + assert g1.number_of_nodes() == g2.number_of_nodes() + assert g1.number_of_edges() == g2.number_of_edges() + assert set(g1.nodes()) == set(g2.nodes()) + + def test_default_parameters(self): + # Smoke test with defaults (but small scale) + g = TrustGraphFetcher.build_graph(total_users=50, seed=1) + assert g.number_of_nodes() > 50 # 50 users + bots + assert g.number_of_edges() > 0 + + +class TestCoreIntegration: + def test_generate_graph_trust_source(self): + from graphfaker.core import GraphFaker + gf = GraphFaker() + g = gf.generate_graph( + source="trust", + total_users=50, + avg_trust_links=5, + bot_fraction=0.06, + seed=42, + ) + assert isinstance(g, nx.DiGraph) + assert g.number_of_nodes() == 53 # 50 users + round(50*0.06)=3 bots + + def test_graphfaker_stores_graph(self): + from graphfaker.core import GraphFaker + gf = GraphFaker() + g = gf.generate_graph(source="trust", total_users=30, seed=42) + assert gf.G is g