diff --git a/README.md b/README.md index ca2d3c255..701b7f856 100644 --- a/README.md +++ b/README.md @@ -139,6 +139,7 @@ Your node is now running, the control panel will be available at `http://localho - [Logging & accessing logs](docs/networking.md) - [Control Panel: Local development](controlpanel/README.md) - [Docker Deployment Guide](docs/dockerDeployment.md) +- [C2D GPU Guide](docs/GPU.md) ## Control Panel diff --git a/docs/GPU.md b/docs/GPU.md new file mode 100644 index 000000000..a83c742ef --- /dev/null +++ b/docs/GPU.md @@ -0,0 +1,484 @@ +Supporting GPUs for c2d jobs comes down to: + +- define gpu list for each c2d env +- pass docker args for each gpu +- set a price for each gpu + +## Nvidia GPU Example + +Start by installing nvidia cuda drivers (ie:https://docs.nvidia.com/cuda/cuda-installation-guide-linux/), then install nvidia container toolkit (https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) + +Once that is done, check if you can get gpu details by running 'nvidia-smi': + +``` +root@gpu-1:/repos/ocean/ocean-node# nvidia-smi +Fri Apr 25 06:00:34 2025 ++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 550.163.01 Driver Version: 550.163.01 CUDA Version: 12.4 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. 
| +|=========================================+========================+======================| +| 0 NVIDIA GeForce GTX 1060 3GB Off | 00000000:01:00.0 Off | N/A | +| 0% 39C P8 6W / 120W | 2MiB / 3072MiB | 0% Default | +| | | N/A | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| +| No running processes found | ++-----------------------------------------------------------------------------------------+ +``` + +Now, time to get the id of the gpu: + +```bash +root@gpu-1:/repos/ocean/ocean-node# nvidia-smi --query-gpu=name,uuid --format=csv +name, uuid +NVIDIA GeForce GTX 1060 3GB, GPU-294c6802-bb2f-fedb-f9e0-a26b9142dd81 +``` + +Now, we can define the gpu for node: + +```json +{ + "id": "myGPU", + "description": "NVIDIA GeForce GTX 1060 3GB", + "type": "gpu", + "total": 1, + "init": { + "deviceRequests": { + "Driver": "nvidia", + "DeviceIDs": ["GPU-294c6802-bb2f-fedb-f9e0-a26b9142dd81"], + "Capabilities": [["gpu"]] + } + } +} +``` + +Don't forget to add it to fees definition and free definition (if desired). 
+ +Here is the full definition of DOCKER_COMPUTE_ENVIRONMENTS: + +```json +[ + { + "socketPath": "/var/run/docker.sock", + "resources": [ + { + "id": "myGPU", + "description": "NVIDIA GeForce GTX 1060 3GB", + "type": "gpu", + "total": 1, + "init": { + "deviceRequests": { + "Driver": "nvidia", + "DeviceIDs": ["GPU-294c6802-bb2f-fedb-f9e0-a26b9142dd81"], + "Capabilities": [["gpu"]] + } + } + }, + { "id": "disk", "total": 1000000000 } + ], + "storageExpiry": 604800, + "maxJobDuration": 3600, + "fees": { + "1": [ + { + "feeToken": "0x123", + "prices": [ + { "id": "cpu", "price": 1 }, + { "id": "myGPU", "price": 3 } + ] + } + ] + }, + "free": { + "maxJobDuration": 60, + "maxJobs": 3, + "resources": [ + { "id": "cpu", "max": 1 }, + { "id": "ram", "max": 1000000000 }, + { "id": "disk", "max": 1000000000 }, + { "id": "myGPU", "max": 1 } + ] + } + } +] +``` + +And you should have it in your compute envs: + +```bash +root@gpu-1:/repos/ocean/ocean-node# curl http://localhost:8000/api/services/computeEnvironments +``` + +```json +[ + { + "id": "0xd6b10b27aab01a72070a5164c07d0517755838b9cb9857e2d5649287ec3aaaa2-0x66073c81f833deaa2f8e2a508f69cf78f8a99b17ba1a64f369af921750f93914", + "runningJobs": 0, + "consumerAddress": "0x4fb80776C8eb4cAbe7730dcBCdb1fa6ecD3c460E", + "platform": { "architecture": "x86_64", "os": "Ubuntu 22.04.3 LTS" }, + "fees": { + "1": [ + { + "feeToken": "0x123", + "prices": [ + { "id": "cpu", "price": 1 }, + { "id": "myGPU", "price": 3 } + ] + } + ] + }, + "storageExpiry": 604800, + "maxJobDuration": 3600, + "resources": [ + { "id": "cpu", "total": 8, "max": 8, "min": 1, "inUse": 0 }, + { + "id": "ram", + "total": 24888963072, + "max": 24888963072, + "min": 1000000000, + "inUse": 0 + }, + { + "id": "myGPU", + "description": "NVIDIA GeForce GTX 1060 3GB", + "type": "gpu", + "total": 1, + "init": { + "deviceRequests": { + "Driver": "nvidia", + "DeviceIDs": ["GPU-294c6802-bb2f-fedb-f9e0-a26b9142dd81"], + "Capabilities": [["gpu"]] + } + }, + "max": 1, + "min": 
0, + "inUse": 0 + }, + { "id": "disk", "total": 1000000000, "max": 1000000000, "min": 0, "inUse": 0 } + ], + "free": { + "maxJobDuration": 60, + "maxJobs": 3, + "resources": [ + { "id": "cpu", "max": 1, "inUse": 0 }, + { "id": "ram", "max": 1000000000, "inUse": 0 }, + { "id": "disk", "max": 1000000000, "inUse": 0 }, + { "id": "myGPU", "max": 1, "inUse": 0 } + ] + }, + "runningfreeJobs": 0 + } +] +``` + +Start a free job using: + +```json +{ + "command": "freeStartCompute", + "algorithm": { + "meta": { + "container": { + "image": "tensorflow/tensorflow", + "tag": "2.17.0-gpu", + "entrypoint": "python $ALGO" + }, + "rawcode": "import tensorflow as tf\nsess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))\nprint(\"Num GPUs Available: \", len(tf.config.list_physical_devices('GPU')))\ngpus = tf.config.list_physical_devices('GPU')\nfor gpu in gpus:\n\tprint('Name:', gpu.name, ' Type:', gpu.device_type)" + } + }, + "consumerAddress": "0xC7EC1970B09224B317c52d92f37F5e1E4fF6B687", + "signature": "123", + "nonce": 1, + "environment": "0xd6b10b27aab01a72070a5164c07d0517755838b9cb9857e2d5649287ec3aaaa2-0x66073c81f833deaa2f8e2a508f69cf78f8a99b17ba1a64f369af921750f93914", + "resources": [ + { + "id": "cpu", + "amount": 1 + }, + { + "id": "myGPU", + "amount": 1 + } + ] +} +``` + +And the output of `getComputeResult` should look like: + +```bash +2025-04-25 06:18:20.890217: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered +2025-04-25 06:18:21.192330: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered +2025-04-25 06:18:21.292230: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has 
already been registered +WARNING: All log messages before absl::InitializeLog() is called are written to STDERR +I0000 00:00:1745561915.985558 1 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355 +I0000 00:00:1745561915.993514 1 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355 +I0000 00:00:1745561915.993799 1 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355 +Num GPUs Available: 1 +Name: /physical_device:GPU:0 Type: GPU +``` + +## AMD Radeon 9070 XT ON WSL2 + +First, install ROCm (https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/wsl/install-radeon.html) + +Then define DOCKER_COMPUTE_ENVIRONMENTS with + +```json +[ + { + "socketPath": "/var/run/docker.sock", + "resources": [ + { + "id": "myGPU", + "description": "AMD Radeon RX 9070 XT", + "type": "gpu", + "total": 1, + "init": { + "advanced": { + "IpcMode": "host", + "ShmSize": 8589934592, + "CapAdd": ["SYS_PTRACE"], + "Devices": ["/dev/dxg", "/dev/dri/card0"], + "Binds": [ + "/usr/lib/wsl/lib/libdxcore.so:/usr/lib/libdxcore.so", + "/opt/rocm/lib/libhsa-runtime64.so.1:/opt/rocm/lib/libhsa-runtime64.so.1" + ], + "SecurityOpt": { + "seccomp": "unconfined" + } + } + } + }, + { + "id": "disk", + "total": 1000000000 + } + ], + "storageExpiry": 604800, + "maxJobDuration": 3600, + "fees": { + "1": [ + { + "feeToken": "0x123", + "prices": [ + { + "id": "cpu", + "price": 1 + }, + { + 
"id": "myGPU", + "price": 3 + } + ] + } + ] + }, + "free": { + "maxJobDuration": 60, + "maxJobs": 3, + "resources": [ + { + "id": "cpu", + "max": 1 + }, + { + "id": "ram", + "max": 1000000000 + }, + { + "id": "disk", + "max": 1000000000 + }, + { + "id": "myGPU", + "max": 1 + } + ] + } + } +] +``` + +aka + +```bash +export DOCKER_COMPUTE_ENVIRONMENTS="[{\"socketPath\":\"/var/run/docker.sock\",\"resources\":[{\"id\":\"myGPU\",\"description\":\"AMD Radeon RX 9070 XT\",\"type\":\"gpu\",\"total\":1,\"init\":{\"advanced\":{ +\"IpcMode\":\"host\",\"CapAdd\":[\"CAP_SYS_PTRACE\"],\"Devices\":[\"/dev/dxg\",\"/dev/dri/card0\"],\"Binds\":[\"/usr/lib/wsl/lib/libdxcore.so:/usr/lib/libdxcore.so\",\"/opt/rocm/lib/libhsa-runtime64.so.1:/opt/rocm/lib/libhsa-runtime64.so.1\"],\"SecurityOpt\":{\"seccomp\":\"unconfined\"}}}},{\"id\":\"disk\",\"total\":1000000000}],\"storageExpiry\":604800,\"maxJobDuration\":3600,\"fees\":{\"1\":[{\"feeToken\":\"0x123\",\"prices\":[{\"id\":\"cpu\",\"price\":1},{\"id\":\"myGPU\",\"price\":3}]}]},\"free\":{\"maxJobDuration\":60,\"maxJobs\":3,\"resources\":[{\"id\":\"cpu\",\"max\":1},{\"id\":\"ram\",\"max\":1000000000},{\"id\":\"disk\",\"max\":1000000000},{\"id\":\"myGPU\",\"max\":1}]}}]" +``` + +you should have it in your compute envs: + +```bash +root@gpu-1:/repos/ocean/ocean-node# curl http://localhost:8000/api/services/computeEnvironments +``` + +```json +[ + { + "id": "0xbb5773e734e1b188165dac88d9a3dc8ac28bc9f5624b45fa8bbd8fca043de7c1-0x2c2761f938cf186eeb81f71dee06ad7edb299493e39c316c390d0c0691e6585c", + "runningJobs": 0, + "consumerAddress": "0x4fb80776C8eb4cAbe7730dcBCdb1fa6ecD3c460E", + "platform": { + "architecture": "x86_64", + "os": "Ubuntu 24.04.2 LTS" + }, + "fees": { + "1": [ + { + "feeToken": "0x123", + "prices": [ + { + "id": "cpu", + "price": 1 + }, + { + "id": "myGPU", + "price": 3 + } + ] + } + ] + }, + "storageExpiry": 604800, + "maxJobDuration": 3600, + "resources": [ + { + "id": "cpu", + "total": 16, + "max": 16, + "min": 1, + 
"inUse": 0 + }, + { + "id": "ram", + "total": 33617674240, + "max": 33617674240, + "min": 1000000000, + "inUse": 0 + }, + { + "id": "myGPU", + "description": "AMD Radeon RX 9070 XT", + "type": "gpu", + "total": 1, + "init": { + "advanced": { + "IpcMode": "host", + "CapAdd": ["CAP_SYS_PTRACE"], + "Devices": ["/dev/dxg", "/dev/dri/card0"], + "Binds": [ + "/usr/lib/wsl/lib/libdxcore.so:/usr/lib/libdxcore.so", + "/opt/rocm/lib/libhsa-runtime64.so.1:/opt/rocm/lib/libhsa-runtime64.so.1" + ], + "SecurityOpt": { + "seccomp": "unconfined" + } + } + }, + "max": 1, + "min": 0, + "inUse": 0 + }, + { + "id": "disk", + "total": 1000000000, + "max": 1000000000, + "min": 0, + "inUse": 0 + } + ], + "free": { + "maxJobDuration": 60, + "maxJobs": 3, + "resources": [ + { + "id": "cpu", + "max": 1, + "inUse": 0 + }, + { + "id": "ram", + "max": 1000000000, + "inUse": 0 + }, + { + "id": "disk", + "max": 1000000000, + "inUse": 0 + }, + { + "id": "myGPU", + "max": 1, + "inUse": 0 + } + ] + }, + "runningfreeJobs": 0 + } +] +``` + +Start a free job with + +```json +{ + "command": "freeStartCompute", + "datasets": [ + { + "fileObject": { + "type": "url", + "url": "https://raw.githubusercontent.com/oceanprotocol/test-algorithm/master/javascript/algo.js", + "method": "get" + } + } + ], + "algorithm": { + "meta": { + "container": { + "image": "rocm/tensorflow", + "tag": "rocm6.4-py3.12-tf2.18-dev", + "entrypoint": "python $ALGO" + }, + "rawcode": "import tensorflow as tf\nsess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))\nprint(\"Num GPUs Available: \", len(tf.config.list_physical_devices('GPU')))\ngpus = tf.config.list_physical_devices('GPU')\nfor gpu in gpus:\n\tprint('Name:', gpu.name, ' Type:', gpu.device_type)" + } + }, + "consumerAddress": "0xC7EC1970B09224B317c52d92f37F5e1E4fF6B687", + "signature": "123", + "nonce": 1, + "environment": 
"0xbb5773e734e1b188165dac88d9a3dc8ac28bc9f5624b45fa8bbd8fca043de7c1-0x2c2761f938cf186eeb81f71dee06ad7edb299493e39c316c390d0c0691e6585c", + "resources": [ + { + "id": "cpu", + "amount": 1 + }, + { + "id": "myGPU", + "amount": 1 + } + ] +} +``` + +and get the results + +```bash +2025-04-25 15:16:15.218050: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. +To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. +WARNING: All log messages before absl::InitializeLog() is called are written to STDERR +I0000 00:00:1745594260.720023 1 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2874 MB memory: -> device: 0, name: AMD Radeon RX 9070 XT, pci bus id: 0000:0d:00.0 +2025-04-25 15:17:44.018225: I tensorflow/core/common_runtime/direct_session.cc:378] Device mapping: +/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: AMD Radeon RX 9070 XT, pci bus id: 0000:0d:00.0 + +Num GPUs Available: 1 +Name: /physical_device:GPU:0 Type: GPU +Warning: Resource leak detected by SharedSignalPool, 385 Signals leaked. +pid:1 tid:0x7f4476ac1740 [~VaMgr] frag_map_ size is not 1. 
+``` diff --git a/src/@types/C2D/C2D.ts b/src/@types/C2D/C2D.ts index 578c21c15..e7c6e30af 100644 --- a/src/@types/C2D/C2D.ts +++ b/src/@types/C2D/C2D.ts @@ -27,14 +27,36 @@ export interface ComputeResourcesPricingInfo { price: number // price per unit per minute } +export interface ArgumentValues { + [key: string]: string | number | boolean | any[] // Supports multiple value types +} + +export interface dockerDeviceRequest { + Driver: string + Count?: number + DeviceIDs: string[] + Capabilities?: any + Options?: any +} + +// docker hw can be defined either with deviceRequests (simpler, if you have a driver) or in an advanced way +// the advanced way means you have to define different params like devices, cgroups, caps, etc +export interface dockerHwInit { + deviceRequests?: dockerDeviceRequest + advanced?: ArgumentValues + runtime?: string +} + export interface ComputeResource { id: ComputeResourceType + description?: string type?: string kind?: string total: number // total number of specific resource min: number // min number of resource needed for a job max: number // max number of resource for a job inUse?: number // for display purposes + init?: dockerHwInit } export interface ComputeResourceRequest { id: string @@ -77,6 +99,12 @@ export interface ComputeEnvironmentBaseConfig { platform: RunningPlatform } +export interface ComputeRuntimes { + [key: string]: { + path?: string + runtimeArgs?: string[] // Optional runtime arguments + } +} export interface ComputeEnvironment extends ComputeEnvironmentBaseConfig { id: string // v1 runningJobs: number diff --git a/src/components/c2d/compute_engine_base.ts b/src/components/c2d/compute_engine_base.ts index b2f49ddbb..336698d25 100644 --- a/src/components/c2d/compute_engine_base.ts +++ b/src/components/c2d/compute_engine_base.ts @@ -12,7 +12,8 @@ import type { ComputeResource, ComputeResourcesPricingInfo, DBComputeJobPayment, - DBComputeJob + DBComputeJob, + dockerDeviceRequest } from '../../@types/C2D/C2D.js' import { 
C2DClusterType } from '../../@types/C2D/C2D.js' import { C2DDatabase } from '../database/C2DDatabase.js' @@ -145,6 +146,14 @@ export abstract class C2DEngine { isFree: boolean ): ComputeResource { const paid = this.getResource(env.resources, id) + if (!paid) { + return { + id, + total: 0, + max: 0, + min: 0 + } + } let free = null if (isFree && 'free' in env && 'resources' in env.free) { free = this.getResource(env.free.resources, id) @@ -192,7 +201,7 @@ export abstract class C2DEngine { for (const device of elements) { let desired = this.getResourceRequest(resources, device) const minMax = this.getMaxMinResource(device, env, isFree) - if (!desired && minMax.min > 0) { + if (!desired && minMax.min >= 0) { // it's required desired = minMax.min } else { @@ -208,7 +217,7 @@ export abstract class C2DEngine { ) } } - properResources.push({ id: device, amount: minMax.min }) + properResources.push({ id: device, amount: desired }) } return properResources @@ -295,6 +304,88 @@ export abstract class C2DEngine { return null } + public getDockerDeviceRequest( + requests: ComputeResourceRequest[], + resources: ComputeResource[] + ) { + if (!resources) return null + const ret: dockerDeviceRequest[] = [] + for (const resource of requests) { + const res = this.getResource(resources, resource.id) + if (res.init && res.init.deviceRequests) { + ret.push(res.init.deviceRequests) + } + } + return ret + } + + public getDockerAdvancedConfig( + requests: ComputeResourceRequest[], + resources: ComputeResource[] + ) { + const ret = { + Devices: [] as any[], + GroupAdd: [] as string[], + SecurityOpt: [] as string[], + Binds: [] as string[], + CapAdd: [] as string[], + CapDrop: [] as string[], + IpcMode: null as string, + ShmSize: 0 as number + } + for (const resource of requests) { + const res = this.getResource(resources, resource.id) + if (res.init && res.init.advanced) { + for (const [key, value] of Object.entries(res.init.advanced)) { + switch (key) { + case 'IpcMode': + ret.IpcMode = 
value as string + break + case 'ShmSize': + ret.ShmSize = value as number + break + case 'GroupAdd': + for (const grp of value as string[]) { + if (!ret.GroupAdd.includes(grp)) ret.GroupAdd.push(grp) + } + break + case 'CapAdd': + for (const grp of value as string[]) { + if (!ret.CapAdd.includes(grp)) ret.CapAdd.push(grp) + } + break + case 'CapDrop': + for (const grp of value as string[]) { + if (!ret.CapDrop.includes(grp)) ret.CapDrop.push(grp) + } + break + case 'Devices': + for (const device of value as string[]) { + if (!ret.Devices.find((d) => d.PathOnHost === device)) + ret.Devices.push({ + PathOnHost: device, + PathInContainer: device, + CgroupPermissions: 'rwm' + }) + } + break + case 'SecurityOpt': + for (const [secKeys, secValues] of Object.entries(value)) + if (!ret.SecurityOpt.includes(secKeys + '=' + secValues)) + ret.SecurityOpt.push(secKeys + '=' + secValues) + break + case 'Binds': + for (const grp of value as string[]) { + if (!ret.Binds.includes(grp)) ret.Binds.push(grp) + } + break + } + } + } + } + return ret + } + public getEnvPricesForToken( env: ComputeEnvironment, chainId: number, diff --git a/src/components/c2d/compute_engine_docker.ts b/src/components/c2d/compute_engine_docker.ts index 423ed9e93..18556baaf 100644 --- a/src/components/c2d/compute_engine_docker.ts +++ b/src/components/c2d/compute_engine_docker.ts @@ -178,6 +178,7 @@ export class C2DEngineDocker extends C2DEngine { max: sysinfo.MemTotal, min: 1e9 }) + if (envConfig.resources) { for (const res of envConfig.resources) { // allow user to add other resources @@ -188,6 +189,36 @@ export class C2DEngineDocker extends C2DEngine { } } } + /* TODO - get namedresources & discreete one + if (sysinfo.GenericResources) { + for (const [key, value] of Object.entries(sysinfo.GenericResources)) { + for (const [type, val] of Object.entries(value)) { + // for (const resType in sysinfo.GenericResources) { + if (type === 'NamedResourceSpec') { + // if we have it, ignore it + const resourceId = 
val.Value + const resourceType = val.Kind + let found = false + for (const res of this.envs[0].resources) { + if (res.id === resourceId) { + found = true + break + } + } + if (!found) { + this.envs[0].resources.push({ + id: resourceId, + kind: resourceType, + total: 1, + max: 1, + min: 0 + }) + } + } + } + } + } + */ // limits for free env if ('free' in envConfig) { this.envs[0].free = {} @@ -203,6 +234,11 @@ export class C2DEngineDocker extends C2DEngine { } this.envs[0].id = this.getC2DConfig().hash + '-' + create256Hash(JSON.stringify(this.envs[0])) + + // only now set the timer + if (!this.cronTimer) { + this.setNewTimer() + } } // eslint-disable-next-line require-await @@ -214,11 +250,19 @@ export class C2DEngineDocker extends C2DEngine { */ if (!this.docker) return [] const filteredEnvs = [] + // const systemInfo = this.docker ? await this.docker.info() : null for (const computeEnv of this.envs) { if ( !chainId || (computeEnv.fees && Object.hasOwn(computeEnv.fees, String(chainId))) ) { + // TO DO - At some point in time we need to handle multiple runtimes + // console.log('********************************') + // console.log(systemInfo.GenericResources) + // console.log('********************************') + // if (systemInfo.Runtimes) computeEnv.runtimes = systemInfo.Runtimes + // if (systemInfo.DefaultRuntime) + // computeEnv.defaultRuntime = systemInfo.DefaultRuntime const { totalJobs, totalFreeJobs, usedResources, usedFreeResources } = await this.getUsedResources(computeEnv) computeEnv.runningJobs = totalJobs @@ -313,6 +357,7 @@ export class C2DEngineDocker extends C2DEngine { jobId: string ): Promise { if (!this.docker) return [] + // TO DO - iterate over resources and get default runtime const isFree: boolean = !(payment && payment.lockTx) // C2D - Check image, check arhitecture, etc @@ -522,6 +567,7 @@ export class C2DEngineDocker extends C2DEngine { if (jobs.length === 0) { CORE_LOGGER.info('No C2D jobs found for engine ' + this.getC2DConfig().hash) + 
this.setNewTimer() return } else { CORE_LOGGER.info(`Got ${jobs.length} jobs for engine ${this.getC2DConfig().hash}`) @@ -577,7 +623,14 @@ export class C2DEngineDocker extends C2DEngine { ) { delete volume.DriverOpts CORE_LOGGER.info('Retrying again without DriverOpts options...') - return this.createDockerVolume(volume) + try { + return this.createDockerVolume(volume) + } catch (e) { + CORE_LOGGER.error( + `Unable to create docker volume without DriverOpts: ${e.message}` + ) + return false + } } return false } @@ -667,19 +720,19 @@ export class C2DEngineDocker extends C2DEngine { // create the volume & create container // TO DO C2D: Choose driver & size // get env info - // const environment = await this.getJobEnvironment(job) - + const envResource = this.envs[0].resources const volume: VolumeCreateOptions = { Name: job.jobId + '-volume' } // volume - const diskSize = this.getResourceRequest(job.resources, 'disk') - if (diskSize && diskSize > 0) { + /* const diskSize = this.getResourceRequest(job.resources, 'disk') + if (diskSize && diskSize > 0) { volume.DriverOpts = { - o: 'size=' + String(diskSize) + o: 'size=' + String(diskSize), + device: 'local', + type: 'local' } - } - + } */ const volumeCreated = await this.createDockerVolume(volume, true) if (!volumeCreated) { job.status = C2DStatusNumber.VolumeCreationFailed @@ -704,11 +757,11 @@ export class C2DEngineDocker extends C2DEngine { ] } // disk - if (diskSize && diskSize > 0) { - hostConfig.StorageOpt = { - size: String(diskSize) - } - } + // if (diskSize && diskSize > 0) { + // hostConfig.StorageOpt = { + // size: String(diskSize) + // } + // } // ram const ramSize = this.getResourceRequest(job.resources, 'ram') if (ramSize && ramSize > 0) { @@ -734,7 +787,27 @@ export class C2DEngineDocker extends C2DEngine { Volumes: mountVols, HostConfig: hostConfig } - + // TO DO - iterate over resources and get default runtime + // TO DO - check resources and pass devices + const dockerDeviceRequest = 
this.getDockerDeviceRequest(job.resources, envResource) + if (dockerDeviceRequest) { + containerInfo.HostConfig.DeviceRequests = dockerDeviceRequest + } + const advancedConfig = this.getDockerAdvancedConfig(job.resources, envResource) + if (advancedConfig.Devices) + containerInfo.HostConfig.Devices = advancedConfig.Devices + if (advancedConfig.GroupAdd) + containerInfo.HostConfig.GroupAdd = advancedConfig.GroupAdd + if (advancedConfig.SecurityOpt) + containerInfo.HostConfig.SecurityOpt = advancedConfig.SecurityOpt + if (advancedConfig.Binds) containerInfo.HostConfig.Binds = advancedConfig.Binds + if (advancedConfig.CapAdd) containerInfo.HostConfig.CapAdd = advancedConfig.CapAdd + if (advancedConfig.CapDrop) + containerInfo.HostConfig.CapDrop = advancedConfig.CapDrop + if (advancedConfig.IpcMode) + containerInfo.HostConfig.IpcMode = advancedConfig.IpcMode + if (advancedConfig.ShmSize) + containerInfo.HostConfig.ShmSize = advancedConfig.ShmSize if (job.algorithm.meta.container.entrypoint) { const newEntrypoint = job.algorithm.meta.container.entrypoint.replace( '$ALGO', @@ -742,11 +815,9 @@ export class C2DEngineDocker extends C2DEngine { ) containerInfo.Entrypoint = newEntrypoint.split(' ') } - console.log('CREATING CONTAINER') - console.log(containerInfo) const container = await this.createDockerContainer(containerInfo, true) if (container) { - console.log('container: ', container) + console.log('Container created: ', container) job.status = C2DStatusNumber.Provisioning job.statusText = C2DStatusText.Provisioning await this.db.updateJob(job) @@ -1004,7 +1075,7 @@ export class C2DEngineDocker extends C2DEngine { // So we cannot test this from the CLI for instance... 
Only Option is to actually send it encrypted // OR extract the files object from the passed DDO, decrypt it and use it - console.log(job.algorithm.fileObject) + // console.log(job.algorithm.fileObject) const fullAlgoPath = this.getC2DConfig().tempFolder + '/' + job.jobId + '/data/transformations/algorithm' try {