|
1 | | -name: IXUCA-Smoke-Schedule |
| 1 | +# name: IXUCA-Smoke-Schedule |
2 | 2 |
|
3 | | -on: |
4 | | - workflow_dispatch: |
5 | | - inputs: |
6 | | - check_type: |
7 | | - description: "Type of check: scheduled or double_check" |
8 | | - required: false |
9 | | - default: "scheduled" |
10 | | - type: choice |
11 | | - options: |
12 | | - - scheduled |
13 | | - - double_check |
14 | | - schedule: |
15 | | - - cron: "0 * * * *" |
| 3 | +# on: |
| 4 | +# workflow_dispatch: |
| 5 | +# inputs: |
| 6 | +# check_type: |
| 7 | +# description: "Type of check: scheduled or double_check" |
| 8 | +# required: false |
| 9 | +# default: "scheduled" |
| 10 | +# type: choice |
| 11 | +# options: |
| 12 | +# - scheduled |
| 13 | +# - double_check |
| 14 | +# schedule: |
| 15 | +# - cron: "0 * * * *" |
16 | 16 |
|
17 | | -permissions: read-all |
| 17 | +# permissions: read-all |
18 | 18 |
|
19 | | -concurrency: |
20 | | - group: ixuca-smoke-schedule |
21 | | - cancel-in-progress: false |
| 19 | +# concurrency: |
| 20 | +# group: ixuca-smoke-schedule |
| 21 | +# cancel-in-progress: false |
22 | 22 |
|
23 | | -jobs: |
24 | | - smoke-check: |
25 | | - name: Smoke Check (run_check) |
26 | | - runs-on: iluvatar-gpu-2 |
27 | | - timeout-minutes: 20 |
28 | | - container: |
29 | | - image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0 |
30 | | - env: |
31 | | - LD_LIBRARY_PATH: /usr/local/corex/lib |
32 | | - LIBRARY_PATH: /usr/local/corex/lib |
33 | | - no_proxy: "bcebos.com,apiin.im.baidu.com,gitee.com,aliyun.com,.baidu.com,.tuna.tsinghua.edu.cn" |
34 | | - steps: |
35 | | - - name: Install paddle nightly |
36 | | - run: | |
37 | | - set -e |
38 | | - pip uninstall -y paddlepaddle || true |
39 | | - pip uninstall -y paddle-iluvatar-gpu || true |
40 | | - retry_count=0 |
41 | | - max_retries=3 |
42 | | - while [ $retry_count -lt $max_retries ]; do |
43 | | - if python3 -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/; then |
44 | | - echo "Paddle install success" |
45 | | - break |
46 | | - fi |
47 | | - retry_count=$((retry_count + 1)) |
48 | | - if [ $retry_count -lt $max_retries ]; then |
49 | | - echo "Install failed, retrying in 30 seconds... ($retry_count/$max_retries)" |
50 | | - sleep 30 |
51 | | - else |
52 | | - echo "Install failed after $max_retries attempts." |
53 | | - exit 1 |
54 | | - fi |
55 | | - done |
56 | | - pip show paddlepaddle |
57 | | - retry_count=0 |
58 | | - while [ $retry_count -lt $max_retries ]; do |
59 | | - if python3 -m pip install --pre paddle-iluvatar-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/; then |
60 | | - echo "paddle-iluvatar-gpu install success" |
61 | | - break |
62 | | - fi |
63 | | - retry_count=$((retry_count + 1)) |
64 | | - if [ $retry_count -lt $max_retries ]; then |
65 | | - echo "paddle-iluvatar-gpu install failed, retrying in 30 seconds... ($retry_count/$max_retries)" |
66 | | - sleep 30 |
67 | | - else |
68 | | - echo "paddle-iluvatar-gpu install failed after $max_retries attempts." |
69 | | - exit 1 |
70 | | - fi |
71 | | - done |
72 | | - pip show paddle-iluvatar-gpu |
| 23 | +# jobs: |
| 24 | +# smoke-check: |
| 25 | +# name: Smoke Check (run_check) |
| 26 | +# runs-on: iluvatar-gpu-2 |
| 27 | +# timeout-minutes: 20 |
| 28 | +# container: |
| 29 | +# image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0 |
| 30 | +# env: |
| 31 | +# LD_LIBRARY_PATH: /usr/local/corex/lib |
| 32 | +# LIBRARY_PATH: /usr/local/corex/lib |
| 33 | +# no_proxy: "bcebos.com,apiin.im.baidu.com,gitee.com,aliyun.com,.baidu.com,.tuna.tsinghua.edu.cn" |
| 34 | +# steps: |
| 35 | +# - name: Install paddle nightly |
| 36 | +# run: | |
| 37 | +# set -e |
| 38 | +# pip uninstall -y paddlepaddle || true |
| 39 | +# pip uninstall -y paddle-iluvatar-gpu || true |
| 40 | +# retry_count=0 |
| 41 | +# max_retries=3 |
| 42 | +# while [ $retry_count -lt $max_retries ]; do |
| 43 | +# if python3 -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/; then |
| 44 | +# echo "Paddle install success" |
| 45 | +# break |
| 46 | +# fi |
| 47 | +# retry_count=$((retry_count + 1)) |
| 48 | +# if [ $retry_count -lt $max_retries ]; then |
| 49 | +# echo "Install failed, retrying in 30 seconds... ($retry_count/$max_retries)" |
| 50 | +# sleep 30 |
| 51 | +# else |
| 52 | +# echo "Install failed after $max_retries attempts." |
| 53 | +# exit 1 |
| 54 | +# fi |
| 55 | +# done |
| 56 | +# pip show paddlepaddle |
| 57 | +# retry_count=0 |
| 58 | +# while [ $retry_count -lt $max_retries ]; do |
| 59 | +# if python3 -m pip install --pre paddle-iluvatar-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/; then |
| 60 | +# echo "paddle-iluvatar-gpu install success" |
| 61 | +# break |
| 62 | +# fi |
| 63 | +# retry_count=$((retry_count + 1)) |
| 64 | +# if [ $retry_count -lt $max_retries ]; then |
| 65 | +# echo "paddle-iluvatar-gpu install failed, retrying in 30 seconds... ($retry_count/$max_retries)" |
| 66 | +# sleep 30 |
| 67 | +# else |
| 68 | +# echo "paddle-iluvatar-gpu install failed after $max_retries attempts." |
| 69 | +# exit 1 |
| 70 | +# fi |
| 71 | +# done |
| 72 | +# pip show paddle-iluvatar-gpu |
73 | 73 |
|
74 | | - - name: Run Check |
75 | | - run: | |
76 | | - set -e |
77 | | - ixsmi |
78 | | - gpu_count=$(ixsmi --query-gpu=name --format=csv,noheader | wc -l) |
79 | | - echo "Detected GPU count: ${gpu_count}" |
80 | | - if [ "${gpu_count}" -le 1 ]; then |
81 | | - echo "GPU count is <= 1, card status is abnormal." |
82 | | - exit 1 |
83 | | - fi |
84 | | - export LD_LIBRARY_PATH=/usr/local/openmpi/lib/:$LD_LIBRARY_PATH |
85 | | - toolbox_bin=$(ls -d /usr/local/corex/corex-toolbox-*/bin 2>/dev/null | head -n 1) |
86 | | - if [ -z "${toolbox_bin}" ]; then |
87 | | - echo "Cannot find /usr/local/corex/corex-toolbox-*/bin" |
88 | | - exit 1 |
89 | | - fi |
90 | | - cd "${toolbox_bin}" |
91 | | - mpirun --allow-run-as-root --report-bindings -tag-output --prefix /usr/local -np 2 --bind-to none --map-by node -mca btl ^openib ./all_reduce_perf -b 8 -e 1G -f 2 -g 1 |
92 | | - cd - |
93 | | - python3 -c "import paddle; paddle.utils.run_check();" |
| 74 | +# - name: Run Check |
| 75 | +# run: | |
| 76 | +# set -e |
| 77 | +# ixsmi |
| 78 | +# gpu_count=$(ixsmi --query-gpu=name --format=csv,noheader | wc -l) |
| 79 | +# echo "Detected GPU count: ${gpu_count}" |
| 80 | +# if [ "${gpu_count}" -le 1 ]; then |
| 81 | +# echo "GPU count is <= 1, card status is abnormal." |
| 82 | +# exit 1 |
| 83 | +# fi |
| 84 | +# export LD_LIBRARY_PATH=/usr/local/openmpi/lib/:$LD_LIBRARY_PATH |
| 85 | +# toolbox_bin=$(ls -d /usr/local/corex/corex-toolbox-*/bin 2>/dev/null | head -n 1) |
| 86 | +# if [ -z "${toolbox_bin}" ]; then |
| 87 | +# echo "Cannot find /usr/local/corex/corex-toolbox-*/bin" |
| 88 | +# exit 1 |
| 89 | +# fi |
| 90 | +# cd "${toolbox_bin}" |
| 91 | +# mpirun --allow-run-as-root --report-bindings -tag-output --prefix /usr/local -np 2 --bind-to none --map-by node -mca btl ^openib ./all_reduce_perf -b 8 -e 1G -f 2 -g 1 |
| 92 | +# cd - |
| 93 | +# python3 -c "import paddle; paddle.utils.run_check();" |
0 commit comments