Skip to content

Commit 187f291

Browse files
committed
Merge branch 'master' into pr_orjson
2 parents 994c5f7 + ab5dfbf commit 187f291

17 files changed

Lines changed: 296 additions & 67 deletions

File tree

.github/workflows/build.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,10 @@ jobs:
123123
defaults:
124124
run:
125125
working-directory: runner
126-
runs-on: ubuntu-latest
126+
runs-on: ${{ matrix.os }}
127+
strategy:
128+
matrix:
129+
os: [ubuntu-latest, macos-latest]
127130
steps:
128131
- uses: actions/checkout@v4
129132
- name: Set up Go

frontend/src/pages/Runs/List/index.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ export const RunList: React.FC = () => {
4848

4949
const { data, isLoading, refreshList, isLoadingMore } = useInfiniteScroll<IRun, TRunsRequestParams>({
5050
useLazyQuery: useLazyGetRunsQuery,
51-
args: { ...filteringRequestParams, limit: DEFAULT_TABLE_PAGE_SIZE },
51+
args: { ...filteringRequestParams, limit: DEFAULT_TABLE_PAGE_SIZE, job_submissions_limit: 1 },
5252
getPaginationParams: (lastRun) => ({ prev_submitted_at: lastRun.submitted_at }),
5353
});
5454

frontend/src/types/run.d.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ declare type TRunsRequestParams = {
77
prev_run_id?: string;
88
limit?: number;
99
ascending?: boolean;
10+
job_submissions_limit?: number;
1011
};
1112

1213
declare type TDeleteRunsRequestParams = {

runner/internal/executor/executor.go

Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"os/exec"
1212
osuser "os/user"
1313
"path/filepath"
14+
"runtime"
1415
"strconv"
1516
"strings"
1617
"sync"
@@ -27,6 +28,12 @@ import (
2728
"github.com/prometheus/procfs"
2829
)
2930

31+
type ConnectionTracker interface {
32+
GetNoConnectionsSecs() int64
33+
Track(ticker <-chan time.Time)
34+
Stop()
35+
}
36+
3037
type RunExecutor struct {
3138
tempDir string
3239
homeDir string
@@ -51,9 +58,16 @@ type RunExecutor struct {
5158
timestamp *MonotonicTimestamp
5259

5360
killDelay time.Duration
54-
connectionTracker *connections.ConnectionTracker
61+
connectionTracker ConnectionTracker
5562
}
5663

64+
// stubConnectionTracker is a no-op implementation for when procfs is not available (only required for tests on darwin)
65+
type stubConnectionTracker struct{}
66+
67+
func (s *stubConnectionTracker) GetNoConnectionsSecs() int64 { return 0 }
68+
func (s *stubConnectionTracker) Track(ticker <-chan time.Time) {}
69+
func (s *stubConnectionTracker) Stop() {}
70+
5771
func NewRunExecutor(tempDir string, homeDir string, workingDir string, sshPort int) (*RunExecutor, error) {
5872
mu := &sync.RWMutex{}
5973
timestamp := NewMonotonicTimestamp()
@@ -65,15 +79,25 @@ func NewRunExecutor(tempDir string, homeDir string, workingDir string, sshPort i
6579
if err != nil {
6680
return nil, fmt.Errorf("failed to parse current user uid: %w", err)
6781
}
68-
proc, err := procfs.NewDefaultFS()
69-
if err != nil {
70-
return nil, fmt.Errorf("failed to initialize procfs: %w", err)
82+
83+
// Try to initialize procfs, but don't fail if it's not available (e.g., on macOS)
84+
var connectionTracker ConnectionTracker
85+
86+
if runtime.GOOS == "linux" {
87+
proc, err := procfs.NewDefaultFS()
88+
if err != nil {
89+
return nil, fmt.Errorf("failed to initialize procfs: %w", err)
90+
}
91+
connectionTracker = connections.NewConnectionTracker(connections.ConnectionTrackerConfig{
92+
Port: uint64(sshPort),
93+
MinConnDuration: 10 * time.Second, // shorter connections are likely from dstack-server
94+
Procfs: proc,
95+
})
96+
} else {
97+
// Use stub connection tracker (only required for tests on darwin)
98+
connectionTracker = &stubConnectionTracker{}
7199
}
72-
connectionTracker := connections.NewConnectionTracker(connections.ConnectionTrackerConfig{
73-
Port: uint64(sshPort),
74-
MinConnDuration: 10 * time.Second, // shorter connections are likely from dstack-server
75-
Procfs: proc,
76-
})
100+
77101
return &RunExecutor{
78102
tempDir: tempDir,
79103
homeDir: homeDir,

runner/internal/metrics/metrics_test.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,17 @@
11
package metrics
22

33
import (
4+
"runtime"
45
"testing"
56

67
"github.com/dstackai/dstack/runner/internal/schemas"
78
"github.com/stretchr/testify/assert"
89
)
910

1011
func TestGetAMDGPUMetrics_OK(t *testing.T) {
12+
if runtime.GOOS == "darwin" {
13+
t.Skip("Skipping on macOS")
14+
}
1115
collector, err := NewMetricsCollector()
1216
assert.NoError(t, err)
1317

@@ -39,6 +43,9 @@ func TestGetAMDGPUMetrics_OK(t *testing.T) {
3943
}
4044

4145
func TestGetAMDGPUMetrics_ErrorGPUUtilNA(t *testing.T) {
46+
if runtime.GOOS == "darwin" {
47+
t.Skip("Skipping on macOS")
48+
}
4249
collector, err := NewMetricsCollector()
4350
assert.NoError(t, err)
4451
metrics, err := collector.getAMDGPUMetrics("gpu,gfx,gfx_clock,vram_used,vram_total\n0,N/A,N/A,283,196300\n")

src/dstack/_internal/core/backends/base/compute.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -559,7 +559,8 @@ def get_shim_commands(
559559
backend_shim_env: Optional[Dict[str, str]] = None,
560560
arch: Optional[str] = None,
561561
) -> List[str]:
562-
commands = get_shim_pre_start_commands(
562+
commands = get_setup_cloud_instance_commands()
563+
commands += get_shim_pre_start_commands(
563564
base_path=base_path,
564565
bin_path=bin_path,
565566
arch=arch,
@@ -641,6 +642,23 @@ def get_dstack_shim_download_url(arch: Optional[str] = None) -> str:
641642
return url_template.format(version=version, arch=arch)
642643

643644

645+
def get_setup_cloud_instance_commands() -> list[str]:
646+
return [
647+
# Workaround for https://github.com/NVIDIA/nvidia-container-toolkit/issues/48
648+
# Attempts to patch /etc/docker/daemon.json while keeping any custom settings it may have.
649+
(
650+
"/bin/sh -c '" # wrap in /bin/sh to avoid interfering with other cloud init commands
651+
" grep -q nvidia /etc/docker/daemon.json"
652+
" && ! grep -q native.cgroupdriver /etc/docker/daemon.json"
653+
" && jq '\\''.\"exec-opts\" = ((.\"exec-opts\" // []) + [\"native.cgroupdriver=cgroupfs\"])'\\'' /etc/docker/daemon.json > /tmp/daemon.json"
654+
" && sudo mv /tmp/daemon.json /etc/docker/daemon.json"
655+
" && sudo service docker restart"
656+
" || true"
657+
"'"
658+
),
659+
]
660+
661+
644662
def get_shim_pre_start_commands(
645663
base_path: Optional[PathLike] = None,
646664
bin_path: Optional[PathLike] = None,

src/dstack/_internal/core/backends/cudo/compute.py

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -65,12 +65,13 @@ def create_instance(
6565
public_keys = instance_config.get_public_keys()
6666
memory_size = round(instance_offer.instance.resources.memory_mib / 1024)
6767
disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024)
68-
commands = get_shim_commands(authorized_keys=public_keys)
6968
gpus_no = len(instance_offer.instance.resources.gpus)
70-
shim_commands = " ".join([" && ".join(commands)])
71-
startup_script = (
72-
shim_commands if gpus_no > 0 else f"{install_docker_script()} && {shim_commands}"
73-
)
69+
if gpus_no > 0:
70+
# we'll need jq for patching /etc/docker/daemon.json, see get_shim_commands()
71+
commands = install_jq_commands()
72+
else:
73+
commands = install_docker_commands()
74+
commands += get_shim_commands(authorized_keys=public_keys)
7475

7576
try:
7677
resp_data = self.api_client.create_virtual_machine(
@@ -85,7 +86,7 @@ def create_instance(
8586
memory_gib=memory_size,
8687
vcpus=instance_offer.instance.resources.cpus,
8788
vm_id=vm_id,
88-
start_script=startup_script,
89+
start_script=" && ".join(commands),
8990
password=None,
9091
customSshKeys=public_keys,
9192
)
@@ -151,6 +152,19 @@ def _get_image_id(cuda: bool) -> str:
151152
return image_name
152153

153154

154-
def install_docker_script():
155-
commands = 'export DEBIAN_FRONTEND="noninteractive" && mkdir -p /etc/apt/keyrings && curl --max-time 60 -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null && apt-get update && apt-get --assume-yes install docker-ce docker-ce-cli containerd.io docker-compose-plugin'
156-
return commands
155+
def install_jq_commands():
156+
return [
157+
"export DEBIAN_FRONTEND=noninteractive",
158+
"apt-get --assume-yes install jq",
159+
]
160+
161+
162+
def install_docker_commands():
163+
return [
164+
"export DEBIAN_FRONTEND=noninteractive",
165+
"mkdir -p /etc/apt/keyrings",
166+
"curl --max-time 60 -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg",
167+
'echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null',
168+
"apt-get update",
169+
"apt-get --assume-yes install docker-ce docker-ce-cli containerd.io docker-compose-plugin",
170+
]

src/dstack/_internal/core/backends/gcp/compute.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import google.cloud.compute_v1 as compute_v1
99
from cachetools import TTLCache, cachedmethod
1010
from google.cloud import tpu_v2
11+
from google.cloud.compute_v1.types.compute import Instance
1112
from gpuhunt import KNOWN_TPUS
1213

1314
import dstack._internal.core.backends.gcp.auth as auth
@@ -19,6 +20,7 @@
1920
ComputeWithGatewaySupport,
2021
ComputeWithMultinodeSupport,
2122
ComputeWithPlacementGroupSupport,
23+
ComputeWithPrivateGatewaySupport,
2224
ComputeWithVolumeSupport,
2325
generate_unique_gateway_instance_name,
2426
generate_unique_instance_name,
@@ -83,6 +85,7 @@ class GCPCompute(
8385
ComputeWithMultinodeSupport,
8486
ComputeWithPlacementGroupSupport,
8587
ComputeWithGatewaySupport,
88+
ComputeWithPrivateGatewaySupport,
8689
ComputeWithVolumeSupport,
8790
Compute,
8891
):
@@ -395,11 +398,7 @@ def update_provisioning_data(
395398
if instance.status in ["PROVISIONING", "STAGING"]:
396399
return
397400
if instance.status == "RUNNING":
398-
if allocate_public_ip:
399-
hostname = instance.network_interfaces[0].access_configs[0].nat_i_p
400-
else:
401-
hostname = instance.network_interfaces[0].network_i_p
402-
provisioning_data.hostname = hostname
401+
provisioning_data.hostname = _get_instance_ip(instance, allocate_public_ip)
403402
provisioning_data.internal_ip = instance.network_interfaces[0].network_i_p
404403
return
405404
raise ProvisioningError(
@@ -500,7 +499,7 @@ def create_gateway(
500499
request.instance_resource = gcp_resources.create_instance_struct(
501500
disk_size=10,
502501
image_id=_get_gateway_image_id(),
503-
machine_type="e2-small",
502+
machine_type="e2-medium",
504503
accelerators=[],
505504
spot=False,
506505
user_data=get_gateway_user_data(configuration.ssh_key_pub),
@@ -512,6 +511,7 @@ def create_gateway(
512511
service_account=self.config.vm_service_account,
513512
network=self.config.vpc_resource_name,
514513
subnetwork=subnetwork,
514+
allocate_public_ip=configuration.public_ip,
515515
)
516516
operation = self.instances_client.insert(request=request)
517517
gcp_resources.wait_for_extended_operation(operation, "instance creation")
@@ -522,7 +522,7 @@ def create_gateway(
522522
instance_id=instance_name,
523523
region=configuration.region, # used for instance termination
524524
availability_zone=zone,
525-
ip_address=instance.network_interfaces[0].access_configs[0].nat_i_p,
525+
ip_address=_get_instance_ip(instance, configuration.public_ip),
526526
backend_data=json.dumps({"zone": zone}),
527527
)
528528

@@ -1024,3 +1024,9 @@ def _is_tpu_provisioning_data(provisioning_data: JobProvisioningData) -> bool:
10241024
backend_data_dict = json.loads(provisioning_data.backend_data)
10251025
is_tpu = backend_data_dict.get("is_tpu", False)
10261026
return is_tpu
1027+
1028+
1029+
def _get_instance_ip(instance: Instance, public_ip: bool) -> str:
1030+
if public_ip:
1031+
return instance.network_interfaces[0].access_configs[0].nat_i_p
1032+
return instance.network_interfaces[0].network_i_p

src/dstack/_internal/core/backends/lambdalabs/compute.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import hashlib
2+
import shlex
23
import subprocess
34
import tempfile
45
from threading import Thread
@@ -98,7 +99,7 @@ def update_provisioning_data(
9899
arch=provisioning_data.instance_type.resources.cpu_arch,
99100
)
100101
# shim is assumed to be run under root
101-
launch_command = "sudo sh -c '" + "&& ".join(commands) + "'"
102+
launch_command = "sudo sh -c " + shlex.quote(" && ".join(commands))
102103
thread = Thread(
103104
target=_start_runner,
104105
kwargs={

src/dstack/_internal/core/models/runs.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -559,6 +559,10 @@ def _get_status_message(self) -> Optional[str]:
559559
return self.status.value
560560

561561
last_job = self.jobs[0]
562+
# FIXME: status_message should not require all job submissions for status calculation
563+
# since it's very expensive and is not required for anything else.
564+
# May return a different status if not all job submissions requested.
565+
# TODO: Calculate status_message by looking at job models directly instead of job submissions.
562566
last_job_termination_reason = last_job.get_last_termination_reason()
563567

564568
if len(self.jobs) == 1:

0 commit comments

Comments (0)