[Fix] Scenarios using GPU

FerTV · FerTV · commit 0695eb9d4e97 · 2025-06-19T13:57:23.000+02:00
diff --git a/app/deployer.py b/app/deployer.py
@@ -668,12 +668,6 @@ def run_frontend(self):
                     "/var/run/docker.sock not found, please check if Docker is running and Docker Compose is installed."
                 )
 
-        try:
-            subprocess.check_call(["nvidia-smi"])
-            self.gpu_available = True
-        except Exception:
-            logging.info("No GPU available for the frontend, nodes will be deploy in CPU mode")
-
         network_name = f"{os.environ['USER']}_nebula-net-base"
 
         # Create the Docker network
@@ -684,7 +678,6 @@ def run_frontend(self):
         environment = {
             "NEBULA_CONTROLLER_NAME": os.environ["USER"],
             "NEBULA_PRODUCTION": self.production,
-            "NEBULA_GPU_AVAILABLE": self.gpu_available,
             "NEBULA_ADVANCED_ANALYTICS": self.advanced_analytics,
             "NEBULA_FRONTEND_LOG": "/nebula/app/logs/frontend.log",
             "NEBULA_LOGS_DIR": "/nebula/app/logs/",
@@ -756,6 +749,12 @@ def run_controller(self):
                 )
 
         network_name = f"{os.environ['USER']}_nebula-net-base"
+        
+        try:
+            subprocess.check_call(["nvidia-smi"])
+            self.gpu_available = True
+        except Exception:
+            logging.info("No GPU available for the frontend, nodes will be deploy in CPU mode")
 
         # Create the Docker network
         base = DockerUtils.create_docker_network(network_name)
@@ -790,6 +789,11 @@ def run_controller(self):
             ],
             extra_hosts={"host.docker.internal": "host-gateway"},
             port_bindings={self.controller_port: self.controller_port},
+            device_requests=[{
+                "Driver": "nvidia",
+                "Count": -1,
+                "Capabilities": [["gpu"]],
+            }] if self.gpu_available else None,
         )
 
         networking_config = client.api.create_networking_config({
diff --git a/nebula/controller/Dockerfile b/nebula/controller/Dockerfile
@@ -1,4 +1,4 @@
-FROM ubuntu:22.04
+FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
 
 ENV DEBIAN_FRONTEND=noninteractive
 
diff --git a/nebula/frontend/app.py b/nebula/frontend/app.py
@@ -31,7 +31,6 @@ class Settings:
         resources_threshold (float): Threshold for resource usage alerts (default: 0.0).
         port (int): Port for the Nebula frontend service (default: 6060).
         production (bool): Whether the application is running in production mode.
-        gpu_available (bool): Whether GPU resources are available.
         advanced_analytics (bool): Whether advanced analytics features are enabled.
         host_platform (str): Underlying host operating platform (e.g., 'unix').
         log_dir (str): Directory path where application logs are stored.
@@ -51,7 +50,6 @@ class Settings:
     resources_threshold: float = 0.0
     port: int = os.environ.get("NEBULA_FRONTEND_PORT", 6060)
     production: bool = os.environ.get("NEBULA_PRODUCTION", "False") == "True"
-    gpu_available: bool = os.environ.get("NEBULA_GPU_AVAILABLE", "False") == "True"
     advanced_analytics: bool = os.environ.get("NEBULA_ADVANCED_ANALYTICS", "False") == "True"
     host_platform: str = os.environ.get("NEBULA_HOST_PLATFORM", "unix")
     log_dir: str = os.environ.get("NEBULA_LOGS_DIR")
@@ -2091,7 +2089,6 @@ async def nebula_dashboard_deployment(request: Request, session: dict = Depends(
             "request": request,
             "scenario_running": scenario_running,
             "user_logged_in": session.get("user"),
-            "gpu_available": settings.gpu_available,
         },
     )
 

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-FROM ubuntu:22.04`
	`1`	`+FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04`
`2`	`2`
`3`	`3`	`ENV DEBIAN_FRONTEND=noninteractive`
`4`	`4`