Skip to content

Commit ef4ce85

Browse files
saibulususaibulusu
andauthored
superbench NVIDIA A100 setup profile, ansible-core, and disk space (#626)
* Editing initialize.sh * NVIDIA in metadata * Minor changes * Updating test case to use the SETUP-NVIDIA-A100 profile. * Optional step for SuperBench * Using a conditional in initialize.sh to allow the user to set the Docker container path. --------- Co-authored-by: saibulusu <saibulusu@microsoft.com>
1 parent fa567e3 commit ef4ce85

6 files changed

Lines changed: 174 additions & 52 deletions

File tree

src/VirtualClient/VirtualClient.Actions.FunctionalTests/SuperBenchmarkProfileTests.cs

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ public void SuperBenchmarkWorkloadProfileParametersAreInlinedCorrectly(string pr
3939
}
4040

4141
[Test]
42-
[TestCase("PERF-GPU-SUPERBENCH.json")]
42+
[TestCase("SETUP-NVIDIA-A100.json")]
4343
public async Task SuperBenchmarkWorkloadProfileExecutesTheExpectedDependenciesAndReboot(string profile)
4444
{
4545
List<string> expectedCommands = new List<string>
@@ -74,7 +74,7 @@ public async Task SuperBenchmarkWorkloadProfileExecutesTheExpectedDependenciesAn
7474
}
7575

7676
[Test]
77-
[TestCase("PERF-GPU-SUPERBENCH.json")]
77+
[TestCase("SETUP-NVIDIA-A100.json")]
7878
public async Task SuperBenchmarkWorkloadProfileExecutesTheExpectedDependenciesAndWorkloadsAfterReboot(string profile)
7979
{
8080
IEnumerable<string> expectedCommands = this.GetProfileExpectedCommands(PlatformID.Unix);
@@ -123,12 +123,7 @@ private IEnumerable<string> GetProfileExpectedCommands(PlatformID platform)
123123
$"sudo bash -c \"{setupCommand}\"",
124124
$"sudo apt-get update",
125125
$"sudo apt-get install -y nvidia-container-toolkit",
126-
$"sudo systemctl restart docker",
127-
$"sudo chmod -R 2777 \"/home/user/tools/VirtualClient\"",
128-
$"sudo git clone -b v0.9.0 https://github.com/microsoft/superbenchmark",
129-
$"sudo bash initialize.sh",
130-
$"sb deploy --host-list localhost -i superbench/superbench:v0.9.0-cuda12.1",
131-
$"sb run --host-list localhost -c default.yaml"
126+
$"sudo systemctl restart docker"
132127
};
133128
}
134129
}

src/VirtualClient/VirtualClient.Actions.UnitTests/SuperBenchmark/SuperBenchmarkExecutorTests.cs

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,60 @@ public async Task SuperBenchmarkExecutorExecutesTheCorrectCommandsWithInstallati
260260
Assert.IsTrue(processCount == 5);
261261
}
262262

263+
[Test]
public async Task SuperBenchmarkExecutorExecutesTheCorrectCommandsWithInstallationAndDockerContainerPath()
{
    // Scenario: the workload has not been initialized yet and a 'DockerContainerPath'
    // parameter is supplied. The executor is expected to pass that path to initialize.sh
    // as a second argument, right after the username.
    this.mockFixture.Parameters = new Dictionary<string, IConvertible>()
    {
        { nameof(SuperBenchmarkExecutor.Version), "0.0.1" },
        { nameof(SuperBenchmarkExecutor.ContainerVersion), "testContainer" },
        { nameof(SuperBenchmarkExecutor.ConfigurationFile), "Test.yaml" },
        { nameof(SuperBenchmarkExecutor.Username), "testuser" },
        { nameof(SuperBenchmarkExecutor.DockerContainerPath), "/docker/path" }
    };

    // The commands are verified in the exact order in which the processes are created.
    List<string> expectedCommands = new List<string>
    {
        $"sudo chmod -R 2777 \"{this.mockFixture.PlatformSpecifics.CurrentDirectory}\"",
        $"sudo git clone -b v0.0.1 https://github.com/microsoft/superbenchmark",
        $"sudo bash initialize.sh testuser /docker/path",
        $"sb deploy --host-list localhost -i testContainer",
        $"sb run --host-list localhost -c Test.yaml"
    };

    int processCount = 0;
    this.mockFixture.ProcessManager.OnCreateProcess = (exe, arguments, workingDir) =>
    {
        Assert.AreEqual(expectedCommands[processCount], $"{exe} {arguments}");
        processCount++;

        return new InMemoryProcess
        {
            StartInfo = new ProcessStartInfo
            {
                FileName = exe,
                Arguments = arguments
            },
            ExitCode = 0,
            OnStart = () => true,
            OnHasExited = () => true
        };
    };

    // Report the workload as not yet initialized so that the installation steps are executed.
    this.mockFixture.StateManager.OnGetState().ReturnsAsync(JObject.FromObject(new SuperBenchmarkExecutor.SuperBenchmarkState()
    {
        SuperBenchmarkInitialized = false
    }));

    using (TestSuperBenchmarkExecutor superBenchmarkExecutor = new TestSuperBenchmarkExecutor(this.mockFixture.Dependencies, this.mockFixture.Parameters))
    {
        await superBenchmarkExecutor.ExecuteAsync(CancellationToken.None).ConfigureAwait(false);
    }

    // AreEqual (rather than IsTrue on a comparison) reports both counts when the assertion fails.
    Assert.AreEqual(expectedCommands.Count, processCount);
}
316+
263317
[Test]
264318
public async Task SuperBenchmarkExecutorSkipsInitializationOfTheWorkloadForExecutionAfterTheFirstRun()
265319
{

src/VirtualClient/VirtualClient.Actions/SuperBenchmark/SuperBenchmarkExecutor.cs

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,12 @@ namespace VirtualClient.Actions
77
using System.Collections.Generic;
88
using System.IO;
99
using System.IO.Abstractions;
10-
using System.Runtime.InteropServices;
1110
using System.Threading;
1211
using System.Threading.Tasks;
1312
using Microsoft.CodeAnalysis;
1413
using Microsoft.Extensions.DependencyInjection;
1514
using VirtualClient.Common;
1615
using VirtualClient.Common.Extensions;
17-
using VirtualClient.Common.Platform;
1816
using VirtualClient.Common.Telemetry;
1917
using VirtualClient.Contracts;
2018
using VirtualClient.Contracts.Metadata;
@@ -121,6 +119,18 @@ public string OutputDirectory
121119
}
122120
}
123121

122+
/// <summary>
/// Path to hold all docker container data. The value is read from the
/// 'DockerContainerPath' parameter; null is returned when that parameter
/// is not defined or has no value.
/// </summary>
public string DockerContainerPath
{
    get
    {
        bool isDefined = this.Parameters.TryGetValue(nameof(SuperBenchmarkExecutor.DockerContainerPath), out IConvertible pathValue);
        if (!isDefined || pathValue == null)
        {
            return null;
        }

        return pathValue.ToString();
    }
}
133+
124134
/// <summary>
125135
/// Executes the SuperBenchmark workload.
126136
/// </summary>
@@ -173,7 +183,14 @@ protected override async Task InitializeAsync(EventContext telemetryContext, Can
173183
true);
174184
}
175185

176-
await this.ExecuteSbCommandAsync("bash", $"initialize.sh {this.Username}", this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, true);
186+
string initializeArgs = $"initialize.sh {this.Username}";
187+
188+
if (!string.IsNullOrEmpty(this.DockerContainerPath))
189+
{
190+
initializeArgs = $"initialize.sh {this.Username} {this.DockerContainerPath}";
191+
}
192+
193+
await this.ExecuteSbCommandAsync("bash", initializeArgs, this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, true);
177194
await this.ExecuteSbCommandAsync("sb", $"deploy --host-list localhost -i {this.ContainerVersion}", this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, false);
178195

179196
state.SuperBenchmarkInitialized = true;
Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,49 @@
11
# Ansible will use sudo which needs explicit password input. This command removes that step.
22
# The sudoers entry must be built with double quotes so that $1 (the username passed as the
# first argument) is expanded. With single quotes the literal text "$1" is appended to the
# sudoers file and no rule is created for the intended user.
# The entry is skipped when no username is supplied, so that a rule without a user name is
# never appended to the sudoers file.
if [[ -n "${1:-}" ]]; then
    echo "$1 ALL=(ALL) NOPASSWD:ALL" | (sudo EDITOR='tee -a' visudo)
else
    echo "No username provided; skipping passwordless sudo configuration." >&2
fi
3-
# sb binary might be in this path. This command adds this path to the PATH variable.
4-
export PATH=$PATH:/home/$1/.local/bin
3+
4+
# Remove any existing system-installed Ansible to avoid version conflicts
5+
sudo apt remove -y ansible || true
6+
sudo pip3 uninstall -y ansible ansible-base ansible-core || true
7+
8+
# Install ansible-core compatible with Python 3.8 (Ubuntu 20.04)
9+
python3 -m pip install --user "ansible-core>=2.12,<2.14"
10+
11+
# Ensure the pip user-installed ansible is in PATH and takes precedence
12+
export PATH=/home/$1/.local/bin:$PATH
13+
14+
# Configure Docker to store its data at the path given as the second argument (e.g. a data disk), if one was provided
15+
if [[ -n "${2:-}" ]]; then
16+
DOCKER_DATA_ROOT="$2"
17+
echo "Configuring Docker data-root at ${DOCKER_DATA_ROOT} ..."
18+
19+
# Create target path and stop Docker cleanly
20+
sudo mkdir -p "${DOCKER_DATA_ROOT}"
21+
sudo systemctl stop docker || true
22+
23+
# Write/merge daemon.json to set data-root
24+
# If jq is present and an existing file exists, merge to preserve other keys; otherwise overwrite minimal file.
25+
if command -v jq >/dev/null 2>&1 && [[ -f /etc/docker/daemon.json ]]; then
26+
TMP_JSON=$(mktemp)
27+
sudo jq --arg dr "${DOCKER_DATA_ROOT}" '. + { "data-root": $dr }' /etc/docker/daemon.json | sudo tee "${TMP_JSON}" >/dev/null
28+
sudo mv "${TMP_JSON}" /etc/docker/daemon.json
29+
else
30+
echo "{\"data-root\": \"${DOCKER_DATA_ROOT}\"}" | sudo tee /etc/docker/daemon.json >/dev/null
31+
fi
32+
33+
# Start Docker back up
34+
sudo systemctl start docker
35+
36+
# (Optional) Check that Docker can access the NVIDIA devices after the restart by uncommenting the command below
37+
# sudo docker run --rm --gpus all nvidia/cuda:11.0.3-base nvidia-smi
38+
else
39+
echo "No second argument provided; skipping Docker data-root configuration."
40+
fi
41+
542
# Command to install sb dependencies.
643
python3 -m pip install .
44+
745
# Command to build sb.
846
make postinstall
47+
948
# This command creates the /dev/nvidiactl and /dev/nvidia-uvm device nodes, which sb checks for before running.
1049
sudo docker run --rm --gpus all nvidia/cuda:11.0.3-base nvidia-smi

src/VirtualClient/VirtualClient.Main/profiles/PERF-GPU-SUPERBENCH.json

Lines changed: 3 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,12 @@
55
"RecommendedMinimumExecutionTime": "08:00:00",
66
"SupportedPlatforms": "linux-x64",
77
"SupportedOperatingSystems": "Ubuntu",
8-
"SpecialRequirements": "This is an Nvidia GPU-specialized workload. It depends upon the system having an Nvidia GPU card/chip."
8+
"SpecialRequirements": "This is an Nvidia GPU-specialized workload. It depends upon the system having an Nvidia GPU card/chip. Use a SETUP profile for GPU setup."
99
},
1010
"Parameters": {
1111
"ConfigurationFile": "default.yaml",
1212
"Username": "",
13-
"LinuxCudaVersion": "12.0",
14-
"LinuxDriverVersion": "525",
15-
"LinuxLocalRunFile": "https://developer.download.nvidia.com/compute/cuda/12.0.0/local_installers/cuda_12.0.0_525.60.13_linux.run"
13+
"DockerContainerPath": null
1614
},
1715
"Actions": [
1816
{
@@ -21,44 +19,10 @@
2119
"Scenario": "Models",
2220
"Username": "$.Parameters.Username",
2321
"Version": "0.9.0",
22+
"DockerContainerPath": "$.Parameters.DockerContainerPath",
2423
"ConfigurationFile": "$.Parameters.ConfigurationFile",
2524
"ContainerVersion": "superbench/superbench:v0.9.0-cuda12.1"
2625
}
2726
}
28-
],
29-
"Dependencies": [
30-
{
31-
"Type": "NvidiaCudaInstallation",
32-
"Parameters": {
33-
"Scenario": "InstallNvidiaCuda",
34-
"LinuxCudaVersion": "$.Parameters.LinuxCudaVersion",
35-
"LinuxDriverVersion": "$.Parameters.LinuxDriverVersion",
36-
"Username": "$.Parameters.Username",
37-
"LinuxLocalRunFile": "$.Parameters.LinuxLocalRunFile"
38-
}
39-
},
40-
{
41-
"Type": "DockerInstallation",
42-
"Parameters": {
43-
"Scenario": "InstallDocker"
44-
}
45-
},
46-
{
47-
"Type": "NvidiaContainerToolkitInstallation",
48-
"Parameters": {
49-
"Scenario": "InstallNvidiaContainerToolkit"
50-
}
51-
},
52-
{
53-
"Type": "LinuxPackageInstallation",
54-
"Parameters": {
55-
"Scenario": "InstallLinuxPackages",
56-
"Packages": "sshpass,python3-pip",
57-
"Packages-Apt": "nvidia-common",
58-
"Packages-Dnf": "nvidia-driver",
59-
"Packages-Yum": "nvidia-driver",
60-
"Packages-Zypper": ""
61-
}
62-
}
6327
]
6428
}
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
{
2+
"Description": "NVIDIA A100 GPU Driver Installation Dependency",
3+
"Metadata": {
4+
"RecommendedMinimumExecutionTime": "00:10:00",
5+
"SupportedPlatforms": "linux-x64",
6+
"SupportedOperatingSystems": "Linux",
7+
"SupportedLinuxGpuModel": "NVIDIA A100",
8+
"SupportedLinuxDistros": "Ubuntu20",
9+
"SpecialRequirements": "This is an NVIDIA GPU Driver dependency. It can only be installed on the system having an NVIDIA A100 GPU card/chip."
10+
},
11+
"Parameters": {
12+
"ConfigurationFile": "default.yaml",
13+
"Username": "",
14+
"LinuxCudaVersion": "12.0",
15+
"LinuxDriverVersion": "525",
16+
"LinuxLocalRunFile": "https://developer.download.nvidia.com/compute/cuda/12.0.0/local_installers/cuda_12.0.0_525.60.13_linux.run"
17+
},
18+
"Dependencies": [
19+
{
20+
"Type": "DockerInstallation",
21+
"Parameters": {
22+
"Scenario": "InstallDocker"
23+
}
24+
},
25+
{
26+
"Type": "NvidiaCudaInstallation",
27+
"Parameters": {
28+
"Scenario": "InstallNvidiaCuda",
29+
"LinuxCudaVersion": "$.Parameters.LinuxCudaVersion",
30+
"LinuxDriverVersion": "$.Parameters.LinuxDriverVersion",
31+
"Username": "$.Parameters.Username",
32+
"LinuxLocalRunFile": "$.Parameters.LinuxLocalRunFile"
33+
}
34+
},
35+
{
36+
"Type": "NvidiaContainerToolkitInstallation",
37+
"Parameters": {
38+
"Scenario": "InstallNvidiaContainerToolkit"
39+
}
40+
},
41+
{
42+
"Type": "LinuxPackageInstallation",
43+
"Parameters": {
44+
"Scenario": "InstallLinuxPackages",
45+
"Packages": "sshpass,python3-pip",
46+
"Packages-Apt": "nvidia-common",
47+
"Packages-Dnf": "nvidia-driver",
48+
"Packages-Yum": "nvidia-driver",
49+
"Packages-Zypper": ""
50+
}
51+
}
52+
]
53+
}

0 commit comments

Comments
 (0)