forked from deeplearning4j/deeplearning4j
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbuild-deploy-linux-cuda-12.6.yml
More file actions
304 lines (265 loc) · 12.1 KB
/
build-deploy-linux-cuda-12.6.yml
File metadata and controls
304 lines (265 loc) · 12.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
# Manual trigger only. The job reads every value below through
# `github.event.inputs.*`, where all inputs are delivered as strings, so the
# defaults are written as explicit quoted strings (and '' instead of a bare
# `default:`) to avoid YAML implicit typing (int / bool / null).
on:
  workflow_dispatch:
    inputs:
      buildThreads:
        description: 'Build threads for libnd4j. Used to control memory usage of builds.'
        required: true
        default: '4'
      deployToReleaseStaging:
        description: 'Whether to deploy to release staging or not.'
        required: false
        # The build step compares this against 1 to decide whether to release.
        default: '0'
      releaseVersion:
        description: 'Release version target'
        required: false
        default: '1.0.0-M3'
      snapshotVersion:
        description: 'Snapshot version target'
        required: false
        default: '1.0.0-SNAPSHOT'
      releaseRepoId:
        description: 'Release repository id'
        required: false
        default: ''
      serverId:
        description: 'Server id to publish to'
        required: false
        default: 'central'
      mvnFlags:
        description: 'Extra maven flags (must escape input yourself if used)'
        required: false
        default: ''
      libnd4jUrl:
        description: 'Whether to download libnd4j using https://github.com/KonduitAI/gh-actions-libnd4j-urls/ for this build. LIBND4J_HOME will automatically be set. Should be used when only needing to build other modules.'
        required: false
        default: ''
      runsOn:
        description: 'System to run on'
        required: false
        default: 'ubuntu-22.04'
      debug_enabled:
        description: 'Run the build with tmate debugging enabled (https://github.com/marketplace/actions/debugging-with-tmate)'
        required: false
        default: 'false'
jobs:
  linux-x86_64-cuda-12-6:
    strategy:
      fail-fast: false
      matrix:
        # Base matrix: one job with helper=cudnn and one with no helper.
        helper: [cudnn, '']
        extension: ['']
        # None of the entries below sets `helper` or `extension`, so each one
        # adds its workflow input as an extra matrix value rather than creating
        # a new job. `experimental` and `name` are repeated in every entry, so
        # only the last entry's values remain; they are labels only and are
        # not read by any step in this job.
        include:
          - mvn_ext: ${{ github.event.inputs.mvnFlags }}
            experimental: true
            name: Extra maven flags
          - debug_enabled: ${{ github.event.inputs.debug_enabled }}
            experimental: true
            name: Debug enabled
          - runs_on: ${{ github.event.inputs.runsOn }}
            experimental: true
            name: OS to run on
          - libnd4j_file_download: ${{ github.event.inputs.libnd4jUrl }}
            experimental: true
            name: Whether to download a prebuilt libnd4j
          - deploy_to_release_staging: ${{ github.event.inputs.deployToReleaseStaging }}
            experimental: true
            name: Whether to deploy to release staging or not
          - release_version: ${{ github.event.inputs.releaseVersion }}
            experimental: true
            name: Release version
          - snapshot_version: ${{ github.event.inputs.snapshotVersion }}
            experimental: true
            name: Snapshot version
          - server_id: ${{ github.event.inputs.serverId }}
            experimental: true
            name: Server id
          - release_repo_id: ${{ github.event.inputs.releaseRepoId }}
            experimental: true
            name: The release repository to run on
          # Same input as `mvn_ext` above; this copy feeds the MODULES env var
          # and the diagnostic echo in the build step.
          - mvn_flags: ${{ github.event.inputs.mvnFlags }}
            experimental: true
            name: Extra maven flags to use as part of the build
          - build_threads: ${{ github.event.inputs.buildThreads }}
            experimental: true
            name: The number of threads to build libnd4j with
    runs-on: ${{ matrix.runs_on }}
    env:
      CUDA_PATH: /usr/local/cuda-12.6
      CUDNN_ROOT_DIR: /usr/local/cuda-12.6
    steps:
# Runs first, authenticated with the workflow's own token.
# NOTE(review): presumably cancels earlier in-progress runs of this workflow,
# going by the step and action name; confirm against the action's README.
- name: Cancel Previous Runs
  uses: styfle/cancel-workflow-action@0.8.0
  with:
    access_token: ${{ github.token }}
- name: Free Disk Space
  # NOTE(review): this action is referenced by the moving `main` branch rather
  # than a tag or commit SHA, so its behaviour can change between runs.
  uses: jlumbroso/free-disk-space@main
  with:
    # Every cleanup category is enabled except the tool cache.
    android: true
    dotnet: true
    haskell: true
    large-packages: true
    docker-images: true
    swap-storage: true
    tool-cache: false
# Adds a swap file (unless swap is already active) and tunes kernel memory
# settings before the build starts.
- name: Configure swap space
  shell: bash
  run: |
    echo "=== Initial system status ==="
    free -h
    df -h
    echo "Available disk space on root:"
    df -h /

    # Check if swap is already enabled
    if swapon --show | grep -q "/"; then
      echo "Swap already configured:"
      swapon --show
    else
      echo "Configuring swap space..."
      # Swap size is fixed here (it is not a workflow input). The single
      # variable keeps the fallocate size and the dd fallback size in sync.
      SWAP_SIZE_GB=12
      SWAP_SIZE="${SWAP_SIZE_GB}G"
      echo "Creating ${SWAP_SIZE} swap file..."

      # Create swap file; fall back to dd if fallocate fails
      sudo fallocate -l "${SWAP_SIZE}" /swapfile || {
        sudo dd if=/dev/zero of=/swapfile bs=1G count="${SWAP_SIZE_GB}" status=progress
      }

      # Set up swap
      sudo chmod 600 /swapfile
      sudo mkswap /swapfile
      sudo swapon /swapfile

      # Verify swap is active
      echo "=== Swap configuration completed ==="
      free -h
      swapon --show
    fi

    # Tune swappiness for build workloads
    # Lower values (10-20) prefer RAM, higher values (60+) use swap more aggressively
    echo "Current swappiness: $(cat /proc/sys/vm/swappiness)"
    sudo sysctl vm.swappiness=80
    echo "Adjusted swappiness: $(cat /proc/sys/vm/swappiness)"

    # Mode 2 is strict accounting, i.e. overcommit is DISABLED: the kernel
    # refuses allocations once committed memory would exceed
    # swap + overcommit_ratio% of RAM (80% with the setting below).
    sudo sysctl vm.overcommit_memory=2
    sudo sysctl vm.overcommit_ratio=80
    sudo sysctl vm.vfs_cache_pressure=50
    sudo sysctl vm.dirty_ratio=10
    sudo sysctl vm.dirty_background_ratio=5
    sudo sysctl kernel.shmmax=68719476736

    # Show final memory status
    echo "=== Final memory status ==="
    free -h
# Check out the repository: later steps use files from it
# (./.github/actions/install-protobuf-linux, ./change-cuda-versions.sh,
# release-specified-component.sh). Bumped from v2 to v4 to match the v4
# actions already used in this workflow (actions/cache, actions/setup-java).
- uses: actions/checkout@v4
# Builds the full Maven command line for this matrix combination and exports
# it as COMMAND (via GITHUB_ENV) for the build step to run.
- name: Set mvn build command based on matrix
  shell: bash
  run: |
    # Module selection:
    #  - a libnd4j download was requested: Java modules only, libnd4j is not built;
    #  - no helper: additionally build libnd4j and the -platform module;
    #  - helper set: additionally build libnd4j only.
    if [ "${{ matrix.libnd4j_file_download }}" != '' ]; then
      modules=':nd4j-cuda-12.6,:nd4j-cuda-12.6-preset'
    elif [ "${{ matrix.helper }}" == '' ]; then
      echo "Building libnd4j from source"
      modules=':nd4j-cuda-12.6,:nd4j-cuda-12.6-preset,:libnd4j,:nd4j-cuda-12.6-platform'
    else
      echo "Building libnd4j from source"
      modules=':nd4j-cuda-12.6,:nd4j-cuda-12.6-preset,:libnd4j'
    fi

    # NOTE: matrix.mvn_ext is the raw `mvnFlags` workflow input and is pasted
    # into the command unescaped (the input description asks the caller to
    # escape it). The single quotes around the compute list are kept in the
    # string on purpose; the build step runs COMMAND through `eval`.
    command="mvn ${{ matrix.mvn_ext }} -Pcuda -Dlibnd4j.generate.flatc=ON --no-transfer-progress -Dlibnd4j.cuda.compile.skip=false -Dlibnd4j.chip=cuda -pl ${modules} -Dlibnd4j.compute='8.6 9.0' -Dlibnd4j.cpu.compile.skip=true -Dhttp.keepAlive=false -Dmaven.wagon.http.pool=false -Dmaven.wagon.http.retryHandler.count=3 -Possrh -Dlibnd4j.buildthreads=${{ matrix.build_threads }} -Djavacpp.platform=linux-x86_64 --also-make clean --batch-mode package deploy -DskipTests"

    # Classifier / platform-extension flags depend on helper and extension.
    if [ "${{ matrix.helper }}" != '' ] && [ "${{ matrix.extension }}" != '' ]; then
      mvn_ext=" -Djavacpp.platform.extension=-${{ matrix.helper }}-${{ matrix.extension }} -Dlibnd4j.helper=${{ matrix.helper }} -Dlibnd4j.extension=${{ matrix.extension }} -Dlibnd4j.classifier=linux-x86_64-cuda-12.6-${{ matrix.helper }}-${{matrix.extension}}"
    elif [ "${{ matrix.helper }}" != '' ]; then
      mvn_ext=" -Djavacpp.platform.extension=-${{ matrix.helper }} -Dlibnd4j.helper=${{ matrix.helper }} -Dlibnd4j.classifier=linux-x86_64-cuda-12.6-${{ matrix.helper }}"
    else
      mvn_ext=" -Dlibnd4j.classifier=linux-x86_64-cuda-12.6"
    fi

    command="${command} ${mvn_ext}"
    echo "Setting command for helper ${{ matrix.helper }} and extension ${{ matrix.extension }} to ${command}"
    # Make the command available to later steps as $COMMAND.
    echo "COMMAND=${command}" >> "$GITHUB_ENV"
# Cache /opt/protobuf between runs, keyed by the runner image name.
- name: Cache protobuf install
  id: cache-protobuf
  uses: actions/cache@v4
  with:
    key: ${{ matrix.runs_on }}-protobuf
    restore-keys: ${{ matrix.runs_on }}-protobuf
    path: /opt/protobuf
# Repository-local action; needs the checkout step to have run.
# NOTE(review): whether it honours the cache hit above is not visible here.
- uses: ./.github/actions/install-protobuf-linux
# Installs Temurin JDK 11 and configures Maven publishing for the selected
# server id, with the GPG key taken from the repository secrets.
- name: Set up Java for publishing to GitHub Packages
  uses: actions/setup-java@v4
  with:
    distribution: 'temurin'
    java-version: '11'
    cache: 'maven'
    server-id: ${{ github.event.inputs.serverId }}
    # MAVEN_USERNAME, MAVEN_PASSWORD and MAVEN_GPG_PASSPHRASE are names of
    # environment variables, not values; the build step sets them from secrets.
    server-username: MAVEN_USERNAME
    server-password: MAVEN_PASSWORD
    gpg-private-key: ${{ secrets.SONATYPE_GPG_KEY }}
    gpg-passphrase: MAVEN_GPG_PASSPHRASE
# Runs the CUDA install action with the toolkit and GCC versions passed as
# environment variables. The former
# `if: steps.cache-cuda-126.outputs.cache-hit != 'true'` guard was removed:
# no step in this job has the id `cache-cuda-126`, so the condition was always
# true and only suggested a cache that does not exist.
- uses: konduitai/cuda-install/.github/actions/install-cuda-ubuntu@master
  env:
    cuda: '12.6.0'
    GCC: '11'
# Export the CUDA 12.6 bin and lib64 directories to all later steps by
# appending PATH / LD_LIBRARY_PATH definitions to GITHUB_ENV.
- name: Setup CUDA PATH
  run: |
    {
      echo "PATH=/usr/local/cuda-12.6/bin:$PATH"
      echo "LD_LIBRARY_PATH=/usr/local/cuda-12.6/lib64:$LD_LIBRARY_PATH"
    } >> "$GITHUB_ENV"
# Diagnostic step: print PATH and the CUDA bin directory, then probe nvcc.
# A missing nvcc is only reported here and does not fail the step.
- name: Verify CUDA Setup
  run: |
    echo "Verifying CUDA installation and PATH setup"
    echo $PATH
    ls -la /usr/local/cuda-12.6/bin
    if ! which nvcc; then
      echo "nvcc not found in PATH"
    fi
    if ! nvcc --version; then
      echo "nvcc command failed"
    fi
# Main build/deploy step: prints the effective inputs, checks the toolchain,
# switches the project to CUDA 12.6 and then either runs the release script or
# the Maven command prepared earlier ($COMMAND).
- name: Run cuda compilation on linux-x86_64
  shell: bash
  env:
    MAVEN_GPG_KEY: ${{ secrets.SONATYPE_GPG_KEY }}
    DEBIAN_FRONTEND: noninteractive
    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    PUBLISH_TO: central
    # Credentials referenced by name in the setup-java step.
    MAVEN_USERNAME: ${{ secrets.CENTRAL_SONATYPE_TOKEN_USERNAME }}
    MAVEN_PASSWORD: ${{ secrets.CENTRAL_SONATYPE_TOKEN_PASSWORD }}
    MAVEN_GPG_PASSPHRASE: ${{ secrets.PACKAGES_GPG_PASS }}
    PERFORM_RELEASE: ${{ matrix.deploy_to_release_staging }}
    RELEASE_VERSION: ${{ matrix.release_version }}
    SNAPSHOT_VERSION: ${{ matrix.snapshot_version }}
    RELEASE_REPO_ID: ${{ matrix.release_repo_id }}
    # NOTE(review): MODULES carries the extra maven flags input, not a module
    # list; confirm this is what the consuming scripts expect.
    MODULES: ${{ matrix.mvn_flags }}
    LIBND4J_HOME_SUFFIX: cuda
    MAVEN_OPTS: -Xmx2g
    HELPER: ${{ matrix.helper }}
    EXTENSION: ${{ matrix.extension }}
    LIBND4J_FILE_NAME: ${{ matrix.libnd4j_file_download }}
  run: |
    echo "libnd4j build threads ${{ matrix.build_threads }}"
    echo "deploy to release staging repo or not ${{ matrix.deploy_to_release_staging }}"
    echo "release version ${{ matrix.release_version }}"
    echo "snapshot version ${{ matrix.snapshot_version }}"
    echo "debug enabled ${{ matrix.debug_enabled }}"
    echo "libnd4j url ${{ matrix.libnd4j_file_download }}"
    echo "maven flags ${{ matrix.mvn_flags }}"
    echo "server id ${{ matrix.server_id }}"
    echo "release repo id ${{ matrix.release_repo_id }}"

    # Explicitly set PATH and other environment variables for this step
    export PATH=/usr/local/cuda-12.6/bin:/opt/protobuf/bin:/opt/cmake/bin:$PATH
    export LD_LIBRARY_PATH=/usr/local/cuda-12.6/lib64:$LD_LIBRARY_PATH

    # Test that nvcc is properly in the PATH. Unlike the earlier verify step,
    # these checks are not guarded with `||`, so a missing tool fails the job.
    echo "Checking nvcc availability:"
    which nvcc
    nvcc --version
    mvn --version
    cmake --version
    protoc --version

    # -y: there is no terminal to answer apt's confirmation prompt in CI;
    # without it autoremove aborts whenever it has packages to remove.
    sudo apt-get autoremove -y
    sudo apt-get clean

    bash ./change-cuda-versions.sh 12.6

    # Note: we need this for the cudnn helpers, our cmake can't find it otherwise.
    # See here: https://github.com/eclipse/deeplearning4j/blob/master/libnd4j/CMakeLists.txt#L298
    # NOTE(review): no cudnn-specific command follows the note above in this
    # step; it may be left over from a removed setup command.
    if [ "$PERFORM_RELEASE" == 1 ]; then
      echo "Performing release"
      bash "${GITHUB_WORKSPACE}/release-specified-component.sh" "${RELEASE_VERSION}" "${SNAPSHOT_VERSION}" "${RELEASE_REPO_ID}" "${COMMAND}"
    else
      echo "Running build and deploying to snapshots"
      # eval so the quoting embedded in COMMAND (e.g. the compute list) is
      # interpreted by the shell.
      eval "${COMMAND}"
    fi