Snapchat
diff --git a/‎python/gigl/distributed/graph_store/storage_main.py‎
Lines changed: 122 additions & 0 deletions b/‎python/gigl/distributed/graph_store/storage_main.py‎
Lines changed: 122 additions & 0 deletions
diff --git a/‎python/gigl/env/distributed.py‎
Lines changed: 33 additions & 0 deletions b/‎python/gigl/env/distributed.py‎
Lines changed: 33 additions & 0 deletions
diff --git a/‎python/tests/integration/distributed/graph_store/__init__.py‎ b/‎python/tests/integration/distributed/graph_store/__init__.py‎
diff --git a/‎python/tests/integration/distributed/graph_store/graph_store_integration_test.py‎
Lines changed: 181 additions & 0 deletions b/‎python/tests/integration/distributed/graph_store/graph_store_integration_test.py‎
Lines changed: 181 additions & 0 deletions
diff --git a/‎python/tests/unit/env/__init__.py‎ b/‎python/tests/unit/env/__init__.py‎
@@ -0,0 +1,122 @@
+"""Built-in GiGL Graph Store Server.
+
+Derived from https://github.com/alibaba/graphlearn-for-pytorch/blob/main/examples/distributed/server_client_mode/sage_supervised_server.py
+
+"""
+import argparse
+import os
+
+import graphlearn_torch as glt
+import torch
+
+from gigl.common import Uri, UriFactory
+from gigl.common.logger import Logger
+from gigl.distributed import build_dataset_from_task_config_uri
+from gigl.distributed.dist_dataset import DistDataset
+from gigl.distributed.graph_store.remote_dataset import register_dataset
+from gigl.distributed.utils import get_graph_store_info
+from gigl.env.distributed import GraphStoreInfo
+
+logger = Logger()
+
+
+def _run_storage_process(
+    storage_rank: int,
+    cluster_info: GraphStoreInfo,
+    dataset: DistDataset,
+) -> None:
+    """Serve ``dataset`` from one graphlearn_torch server process.
+
+    Registers the dataset, initializes the GLT server for ``storage_rank``
+    and then calls ``wait_and_shutdown_server``; a final log line is emitted
+    once that call returns.
+
+    Args:
+        storage_rank (int): Server rank passed to ``init_server``.
+        cluster_info (GraphStoreInfo): Supplies the server/client counts and
+            the cluster master address and port.
+        dataset (DistDataset): The dataset to register and serve.
+    """
+    startup_message = f"Initializing storage node {storage_rank} / {cluster_info.num_storage_nodes } on {cluster_info.cluster_master_ip}:{cluster_info.cluster_master_port}. Cluster rank: {os.environ.get('RANK')}"
+    logger.info(startup_message)
+
+    # The dataset is registered before the server is initialized.
+    register_dataset(dataset)
+    server_options = {
+        "num_servers": cluster_info.num_storage_nodes,
+        "server_rank": storage_rank,
+        "dataset": dataset,
+        "master_addr": cluster_info.cluster_master_ip,
+        "master_port": cluster_info.cluster_master_port,
+        "num_clients": cluster_info.compute_cluster_world_size,
+    }
+    glt.distributed.init_server(**server_options)
+
+    waiting_message = f"Waiting for storage node {storage_rank} / {cluster_info.num_storage_nodes} to exit"
+    logger.info(waiting_message)
+    glt.distributed.wait_and_shutdown_server()
+    logger.info(f"Storage node {storage_rank} exited")
+
+
+def storage_node_process(
+    storage_rank: int,
+    cluster_info: GraphStoreInfo,
+    task_config_uri: Uri,
+    is_inference: bool,
+    tf_record_uri_pattern: str = r".*-of-.*\.tfrecord(\.gz)?$",
+) -> None:
+    """Run a storage node process.
+
+    Should be called *once* per storage node (machine).
+
+    Initializes a gloo process group spanning the storage nodes, builds the
+    dataset from the task config, then spawns the server process(es) that
+    serve it and waits for them to exit.
+
+    Args:
+        storage_rank (int): The rank of the storage node.
+        cluster_info (GraphStoreInfo): The cluster information.
+        task_config_uri (Uri): The task config URI.
+        is_inference (bool): Whether the process is an inference process.
+        tf_record_uri_pattern (str): The TF Record URI pattern (a regular
+            expression, forwarded to the dataset builder).
+
+    Raises:
+        RuntimeError: If any spawned server process exits with a non-zero
+            exit code.
+    """
+    init_method = f"tcp://{cluster_info.storage_cluster_master_ip}:{cluster_info.storage_cluster_master_port}"
+    logger.info(
+        f"Initializing storage node {storage_rank} / {cluster_info.num_storage_nodes}. OS rank: {os.environ['RANK']}, OS world size: {os.environ['WORLD_SIZE']} init method: {init_method}"
+    )
+    # Process group over the storage nodes only; it is created before the
+    # dataset is built.
+    torch.distributed.init_process_group(
+        backend="gloo",
+        world_size=cluster_info.num_storage_nodes,
+        rank=storage_rank,
+        init_method=init_method,
+        group_name="gigl_server_comms",
+    )
+    logger.info(
+        f"Storage node {storage_rank} / {cluster_info.num_storage_nodes} process group initialized"
+    )
+    dataset = build_dataset_from_task_config_uri(
+        task_config_uri=task_config_uri,
+        is_inference=is_inference,
+        _tfrecord_uri_pattern=tf_record_uri_pattern,
+    )
+    server_processes = []
+    mp_context = torch.multiprocessing.get_context("spawn")
+    # TODO(kmonte): Enable more than one server process per machine
+    for i in range(1):
+        server_process = mp_context.Process(
+            target=_run_storage_process,
+            args=(
+                storage_rank + i,  # storage_rank
+                cluster_info,  # cluster_info
+                dataset,  # dataset
+            ),
+        )
+        server_processes.append(server_process)
+    for server_process in server_processes:
+        server_process.start()
+    for server_process in server_processes:
+        server_process.join()
+
+    # An exception in a child only shows up here as a non-zero exit code, so
+    # check it explicitly instead of reporting success for a crashed server.
+    failed_exit_codes = {
+        server_process.pid: server_process.exitcode
+        for server_process in server_processes
+        if server_process.exitcode != 0
+    }
+    if failed_exit_codes:
+        raise RuntimeError(
+            f"Storage node {storage_rank} server process(es) exited abnormally (pid -> exit code): {failed_exit_codes}"
+        )
+
+
+# Script entry point: parse CLI arguments, discover the cluster layout, then
+# run this machine as a storage node.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--task_config_uri", type=str, required=True)
+    # NOTE(review): --resource_config_uri is required but is not read anywhere
+    # in this block; presumably it is consumed elsewhere - confirm.
+    parser.add_argument("--resource_config_uri", type=str, required=True)
+    parser.add_argument("--is_inference", action="store_true")
+    args = parser.parse_args()
+    logger.info(f"Running storage node with arguments: {args}")
+
+    is_inference = args.is_inference
+    # A process group is initialized with default arguments before
+    # get_graph_store_info() is called.
+    torch.distributed.init_process_group()
+    cluster_info = get_graph_store_info()
+    # Tear down the "global" process group so we can have a server-specific process group.
+    torch.distributed.destroy_process_group()
+    storage_node_process(
+        storage_rank=cluster_info.storage_node_rank,
+        cluster_info=cluster_info,
+        task_config_uri=UriFactory.create_uri(args.task_config_uri),
+        is_inference=is_inference,
+    )
@@ -1,5 +1,6 @@
 """Information about distributed environments."""
 
+import os
 from dataclasses import dataclass
 from typing import Final
 
@@ -61,3 +62,35 @@ def num_cluster_nodes(self) -> int:
     @property
     def compute_cluster_world_size(self) -> int:
+        """Return ``num_compute_nodes`` multiplied by ``num_processes_per_compute``."""
         return self.num_compute_nodes * self.num_processes_per_compute
+
+    @property
+    def storage_node_rank(self) -> int:
+        """Return this node's zero-based rank within the storage cluster.
+
+        The global rank is read from the ``RANK`` environment variable.
+        Storage nodes occupy the global ranks that directly follow the
+        compute nodes, so the storage rank is the global rank minus
+        ``num_compute_nodes``.
+
+        Raises:
+            ValueError: If the node is not in the storage cluster.
+        """
+        global_rank = int(os.environ["RANK"])
+        is_storage_rank = (
+            self.num_compute_nodes
+            <= global_rank
+            < self.num_compute_nodes + self.num_storage_nodes
+        )
+        if is_storage_rank:
+            return global_rank - self.num_compute_nodes
+        raise ValueError(
+            f"Global rank {global_rank} is not a storage rank. Expected storage rank to be in [{self.num_compute_nodes}, {self.num_compute_nodes + self.num_storage_nodes})"
+        )
+
+    @property
+    def compute_node_rank(self) -> int:
+        """Return this node's rank within the compute cluster.
+
+        The global rank is read from the ``RANK`` environment variable and
+        returned unchanged when it lies in ``[0, num_compute_nodes)``.
+
+        Raises:
+            ValueError: If the node is not in the compute cluster.
+        """
+        global_rank = int(os.environ["RANK"])
+        if global_rank < 0 or global_rank >= self.num_compute_nodes:
+            raise ValueError(
+                f"Global rank {global_rank} is not a compute rank. Expected compute rank to be in [0, {self.num_compute_nodes})"
+            )
+        return global_rank
@@ -0,0 +1,181 @@
+import os
+import unittest
+from unittest import mock
+
+import torch
+import torch.multiprocessing as mp
+from graphlearn_torch.distributed import init_client, shutdown_client
+
+from gigl.common import Uri
+from gigl.common.logger import Logger
+from gigl.distributed.graph_store.storage_main import storage_node_process
+from gigl.distributed.utils import get_free_port
+from gigl.env.distributed import (
+    COMPUTE_CLUSTER_LOCAL_WORLD_SIZE_ENV_KEY,
+    GraphStoreInfo,
+)
+from gigl.src.mocking.lib.versioning import get_mocked_dataset_artifact_metadata
+from gigl.src.mocking.mocking_assets.mocked_datasets_for_pipeline_tests import (
+    CORA_USER_DEFINED_NODE_ANCHOR_MOCKED_DATASET_INFO,
+)
+
+logger = Logger()
+
+
+def _run_client_process(
+    client_rank: int,
+    cluster_info: GraphStoreInfo,
+) -> None:
+    """Run one client process: join the client process group, connect, then shut down.
+
+    Args:
+        client_rank (int): Rank of this process among the client processes
+            started on the same compute node.
+        cluster_info (GraphStoreInfo): The cluster information.
+    """
+    # Rank across all client processes: this node's compute rank times the
+    # number of processes per compute node, plus the local rank.
+    client_global_rank = (
+        cluster_info.compute_node_rank * cluster_info.num_processes_per_compute
+        + client_rank
+    )
+    logger.info(
+        f"Initializing client process {client_global_rank} / {cluster_info.compute_cluster_world_size}. on {cluster_info.cluster_master_ip}:{cluster_info.cluster_master_port}. OS rank: {os.environ['RANK']}, local client rank: {client_rank} on port: {cluster_info.cluster_master_port}"
+    )
+    # TODO(kmonte): Add gigl.*.init_client as a helper function to do this.
+    # Process group spanning every client process in the compute cluster.
+    torch.distributed.init_process_group(
+        backend="gloo",
+        world_size=cluster_info.compute_cluster_world_size,
+        rank=client_global_rank,
+        init_method=f"tcp://{cluster_info.compute_cluster_master_ip}:{cluster_info.compute_cluster_master_port}",
+        group_name="gigl_client_comms",
+    )
+    logger.info(
+        f"Client {client_global_rank} / {cluster_info.compute_cluster_world_size} process group initialized"
+    )
+    # GLT client init uses the cluster-wide master address/port (not the
+    # compute-cluster one used for the process group above).
+    init_client(
+        num_servers=cluster_info.num_storage_nodes,
+        num_clients=cluster_info.compute_cluster_world_size,
+        client_rank=client_global_rank,
+        master_addr=cluster_info.cluster_master_ip,
+        master_port=cluster_info.cluster_master_port,
+        client_group_name="gigl_client_rpc",
+    )
+
+    # Barrier on the client process group before shutting the client down.
+    torch.distributed.barrier()
+    logger.info(
+        f"{client_global_rank} / {cluster_info.compute_cluster_world_size} Shutting down client"
+    )
+    shutdown_client()
+
+
+def _client_process(
+    client_rank: int,
+    cluster_info: GraphStoreInfo,
+) -> None:
+    """Simulate one compute node by spawning its client processes.
+
+    Starts ``cluster_info.num_processes_per_compute`` client processes, waits
+    for all of them and fails if any of them did not exit cleanly.
+
+    Args:
+        client_rank (int): Index of the simulated compute node (used for logging).
+        cluster_info (GraphStoreInfo): The cluster information.
+
+    Raises:
+        RuntimeError: If any spawned client process exits with a non-zero
+            exit code.
+    """
+    logger.info(
+        f"Initializing client node {client_rank} / {cluster_info.num_compute_nodes}. OS rank: {os.environ['RANK']}, OS world size: {os.environ['WORLD_SIZE']}, local client rank: {client_rank}"
+    )
+
+    mp_context = torch.multiprocessing.get_context("spawn")
+    client_processes = [
+        mp_context.Process(
+            target=_run_client_process,
+            args=(
+                local_rank,  # client_rank
+                cluster_info,  # cluster_info
+            ),
+        )
+        for local_rank in range(cluster_info.num_processes_per_compute)
+    ]
+    for client_process in client_processes:
+        client_process.start()
+    for client_process in client_processes:
+        client_process.join()
+
+    # An exception in a child only shows up here as a non-zero exit code;
+    # raise so that the failure propagates to whoever started this process.
+    failed_exit_codes = {
+        local_rank: client_process.exitcode
+        for local_rank, client_process in enumerate(client_processes)
+        if client_process.exitcode != 0
+    }
+    if failed_exit_codes:
+        raise RuntimeError(
+            f"Client node {client_rank} process(es) exited abnormally (local rank -> exit code): {failed_exit_codes}"
+        )
+
+
+def _run_server_processes(
+    cluster_info: GraphStoreInfo,
+    task_config_uri: Uri,
+    is_inference: bool,
+) -> None:
+    """Run ``storage_node_process`` for the storage node this process represents.
+
+    Args:
+        cluster_info (GraphStoreInfo): The cluster information; also supplies
+            the storage rank via ``storage_node_rank``.
+        task_config_uri (Uri): The task config URI.
+        is_inference (bool): Forwarded to ``storage_node_process``.
+    """
+    logger.info(
+        f"Initializing server processes. OS rank: {os.environ['RANK']}, OS world size: {os.environ['WORLD_SIZE']}"
+    )
+    storage_node_process(
+        storage_rank=cluster_info.storage_node_rank,
+        cluster_info=cluster_info,
+        task_config_uri=task_config_uri,
+        is_inference=is_inference,
+        # Overrides the default "-of-" shard pattern with a looser one for
+        # this test; presumably the mocked dataset's files are not sharded
+        # that way - confirm.
+        tf_record_uri_pattern=".*tfrecord",
+    )
+
+
+class TestUtils(unittest.TestCase):
+    def test_graph_store_locally(self):
+        """Start a local graph store cluster and check that every process exits cleanly."""
+        # Simulating two storage machines and two compute machines on localhost.
+        # Each compute machine runs `num_processes_per_compute` (two) client processes.
+        cora_supervised_info = get_mocked_dataset_artifact_metadata()[
+            CORA_USER_DEFINED_NODE_ANCHOR_MOCKED_DATASET_INFO.name
+        ]
+        task_config_uri = cora_supervised_info.frozen_gbml_config_uri
+        cluster_info = GraphStoreInfo(
+            num_storage_nodes=2,
+            num_compute_nodes=2,
+            num_processes_per_compute=2,
+            cluster_master_ip="localhost",
+            storage_cluster_master_ip="localhost",
+            compute_cluster_master_ip="localhost",
+            cluster_master_port=get_free_port(),
+            storage_cluster_master_port=get_free_port(),
+            compute_cluster_master_port=get_free_port(),
+        )
+
+        master_port = get_free_port()
+        ctx = mp.get_context("spawn")
+
+        def _node_env(global_rank: int) -> dict:
+            # Environment variables for the simulated machine with this global rank.
+            # NOTE(review): WORLD_SIZE is set to the compute cluster world size;
+            # confirm whether the number of cluster nodes is intended instead.
+            return {
+                "MASTER_ADDR": "localhost",
+                "MASTER_PORT": str(master_port),
+                "RANK": str(global_rank),
+                "WORLD_SIZE": str(cluster_info.compute_cluster_world_size),
+                COMPUTE_CLUSTER_LOCAL_WORLD_SIZE_ENV_KEY: str(
+                    cluster_info.num_processes_per_compute
+                ),
+            }
+
+        # Compute nodes use global ranks [0, num_compute_nodes). Each child is
+        # started while os.environ is patched so it inherits its own RANK.
+        client_processes: list = []
+        for i in range(cluster_info.num_compute_nodes):
+            with mock.patch.dict(os.environ, _node_env(i), clear=False):
+                client_process = ctx.Process(
+                    target=_client_process,
+                    args=[
+                        i,  # client_rank
+                        cluster_info,  # cluster_info
+                    ],
+                )
+                client_process.start()
+                client_processes.append(client_process)
+        # Start server processes; storage nodes use the global ranks that
+        # follow the compute nodes.
+        server_processes = []
+        for i in range(cluster_info.num_storage_nodes):
+            with mock.patch.dict(
+                os.environ,
+                _node_env(i + cluster_info.num_compute_nodes),
+                clear=False,
+            ):
+                server_process = ctx.Process(
+                    target=_run_server_processes,
+                    args=[
+                        cluster_info,  # cluster_info
+                        task_config_uri,  # task_config_uri
+                        True,  # is_inference
+                    ],
+                )
+                server_process.start()
+                server_processes.append(server_process)
+
+        for client_process in client_processes:
+            client_process.join()
+        for server_process in server_processes:
+            server_process.join()
+
+        # join() alone never fails the test: an exception in a child only
+        # surfaces as a non-zero exit code, so assert on it explicitly.
+        for role, processes in (
+            ("client", client_processes),
+            ("server", server_processes),
+        ):
+            for index, process in enumerate(processes):
+                self.assertEqual(
+                    process.exitcode,
+                    0,
+                    f"{role} process {index} exited with code {process.exitcode}",
+                )