Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 31 additions & 5 deletions .github/workflows/_accuracy_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,27 @@ jobs:
if ls "${REPO_NAME}"* >/dev/null 2>&1; then
echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts"
ls -ld "${REPO_NAME}"*
exit 1
echo "Attempting force cleanup with find..."
find /workspace -mindepth 1 -maxdepth 1 -name "${REPO_NAME}*" -type d -exec chmod -R u+rwx {} \; -exec rm -rf {} + 2>/dev/null || true
if ls "${REPO_NAME}"* >/dev/null 2>&1; then
echo "ERROR: Force cleanup still failed"
exit 1
else
echo "Force cleanup succeeded"
fi
fi
'

wget -q --no-proxy ${fd_archive_url}
tar -xf FastDeploy.tar.gz
wget -q --no-proxy ${fd_archive_url} || {
echo "ERROR: Failed to download archive from ${fd_archive_url}"
exit 1
}

tar --no-same-owner -xf FastDeploy.tar.gz || {
echo "ERROR: Failed to extract archive"
exit 1
}

rm -rf FastDeploy.tar.gz
cd FastDeploy
git config --global user.name "FastDeployCI"
Expand Down Expand Up @@ -145,7 +160,10 @@ jobs:
docker rm -f ${runner_name} || true
fi

docker run --rm --ipc=host --pid=host --net=host \
docker run --rm --net=host \
--shm-size=64g \
--sysctl kernel.msgmax=1048576 \
--sysctl kernel.msgmnb=268435456 \
--name ${runner_name} \
-v $(pwd):/workspace \
-w /workspace \
Expand All @@ -160,10 +178,11 @@ jobs:
-v "${CACHE_DIR}/.cache:/root/.cache" \
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
-e TZ="Asia/Shanghai" \
-e "no_proxy=localhost,127.0.0.1,0.0.0.0,bcebos.com,.bcebos.com,bj.bcebos.com,su.bcebos.com,paddle-ci.gz.bcebos.com,apiin.im.baidu.com,baidu-int.com,.baidu.com,aliyun.com,gitee.com,pypi.tuna.tsinghua.edu.cn,.tuna.tsinghua.edu.cn" \
--gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
# Avoid using pip cache to ensure the wheel is updated to the latest version
wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/

pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple

Expand Down Expand Up @@ -206,3 +225,10 @@ jobs:
fi
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}"
exit ${TEST_EXIT_CODE}

- name: Terminate and delete the container
if: always()
run: |
set +e
docker exec -t ${{ runner.name }} /bin/bash -c 'find /workspace -mindepth 1 -delete'
docker rm -f ${{ runner.name }}
32 changes: 27 additions & 5 deletions .github/workflows/_base_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,14 @@ jobs:
if ls "${REPO_NAME}"* >/dev/null 2>&1; then
echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts"
ls -ld "${REPO_NAME}"*
exit 1
echo "Attempting force cleanup with find..."
find /workspace -mindepth 1 -maxdepth 1 -name "${REPO_NAME}*" -type d -exec chmod -R u+rwx {} \; -exec rm -rf {} + 2>/dev/null || true
if ls "${REPO_NAME}"* >/dev/null 2>&1; then
echo "ERROR: Force cleanup still failed"
exit 1
else
echo "Force cleanup succeeded"
fi
fi
'

Expand Down Expand Up @@ -111,7 +118,11 @@ jobs:
exit 1
fi

tar -xf FastDeploy.tar.gz
tar --no-same-owner -xf FastDeploy.tar.gz || {
echo "ERROR: Failed to extract archive"
exit 1
}

rm -rf FastDeploy.tar.gz
cd FastDeploy
git config --global user.name "FastDeployCI"
Expand Down Expand Up @@ -182,7 +193,10 @@ jobs:
docker rm -f ${runner_name} || true
fi

docker run --rm --ipc=host --pid=host --net=host \
docker run --rm --net=host \
--shm-size=64g \
--sysctl kernel.msgmax=1048576 \
--sysctl kernel.msgmnb=268435456 \
--name ${runner_name} \
-v $(pwd):/workspace \
-w /workspace \
Expand All @@ -197,17 +211,18 @@ jobs:
-v "${CACHE_DIR}/.cache:/root/.cache" \
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
-e TZ="Asia/Shanghai" \
-e "no_proxy=localhost,127.0.0.1,0.0.0.0,bcebos.com,.bcebos.com,bj.bcebos.com,su.bcebos.com,paddle-ci.gz.bcebos.com,apiin.im.baidu.com,baidu-int.com,.baidu.com,aliyun.com,gitee.com,pypi.tuna.tsinghua.edu.cn,.tuna.tsinghua.edu.cn" \
--gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
# Avoid using pip cache to ensure the wheel is updated to the latest version
wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/

pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple

python -m pip install ${fastdeploy_wheel_url}
python -m pip install pytest

wget https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64
wget --no-proxy https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64
chmod +x ./llm-deploy-linux-amd64
./llm-deploy-linux-amd64 -python python3.10 \
-model_name ERNIE-4.5-0.3B-Paddle \
Expand Down Expand Up @@ -279,3 +294,10 @@ jobs:
fi
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}"
exit ${TEST_EXIT_CODE}

- name: Terminate and delete the container
if: always()
run: |
set +e
docker exec -t ${{ runner.name }} /bin/bash -c 'find /workspace -mindepth 1 -delete'
docker rm -f ${{ runner.name }}
14 changes: 12 additions & 2 deletions .github/workflows/_build_linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ jobs:
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
git log -n 3 --oneline

- name: FastDeploy Build
shell: bash
env:
Expand Down Expand Up @@ -150,7 +151,8 @@ jobs:
PARENT_DIR=$(dirname "$WORKSPACE")
echo "PARENT_DIR:$PARENT_DIR"
docker run --rm --net=host \
--cap-add=SYS_PTRACE --privileged --shm-size=64G \
--cap-add=SYS_PTRACE --shm-size=64G \
--name ${runner_name} \
-v $(pwd):/workspace -w /workspace \
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
-v "${CACHE_DIR}/.cache:/root/.cache" \
Expand All @@ -164,6 +166,7 @@ jobs:
-e "PADDLE_WHL_URL=${PADDLE_WHL_URL}" \
-e "BRANCH_REF=${BRANCH_REF}" \
-e "CCACHE_MAXSIZE=50G" \
-e "no_proxy=localhost,127.0.0.1,0.0.0.0,bcebos.com,.bcebos.com,bj.bcebos.com,su.bcebos.com,paddle-ci.gz.bcebos.com,apiin.im.baidu.com,baidu-int.com,.baidu.com,aliyun.com,gitee.com,pypi.tuna.tsinghua.edu.cn,.tuna.tsinghua.edu.cn" \
--gpus "\"device=${gpu_id}\"" ${docker_image} /bin/bash -c '
if [[ -n "${FD_VERSION}" ]]; then
export FASTDEPLOY_VERSION=${FD_VERSION}
Expand All @@ -188,7 +191,7 @@ jobs:
else
# Avoid using pip cache to ensure the wheel is updated to the latest version
wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/
fi

pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
Expand Down Expand Up @@ -237,3 +240,10 @@ jobs:
target_path_stripped="${target_path#paddle-github-action/}"
WHEEL_PATH=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/${fd_wheel_name}
echo "wheel_path=${WHEEL_PATH}" >> $GITHUB_OUTPUT

- name: Terminate and delete the container
if: always()
run: |
set +e
docker exec -t ${{ runner.name }} /bin/bash -c 'find /workspace -mindepth 1 -delete'
docker rm -f ${{ runner.name }}
18 changes: 15 additions & 3 deletions .github/workflows/_build_linux_rl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ on:
description: "Build Images"
required: true
type: string
default: "iregistry.baidu-int.com/tiangexiao/base-images:paddlecloud-ubuntu24.04-gcc13.3-cuda12.9-cudnn9.9-bccl1.4.1.4-nccl2.26.5-openmpi4.1.5-FleetY13.0.0-rc2"
default: "iregistry.baidu-int.com/new_rl_infra/base-images:paddlecloud-ubuntu24.04-gcc13.3-cuda12.9-cudnn9.9-bccl1.4.1.4-nccl2.26.5-openmpi4.1.5-FleetY13.0.0-v2.4.0-rc1"
FASTDEPLOY_ARCHIVE_URL:
description: "URL of the compressed FastDeploy code archive."
required: true
Expand Down Expand Up @@ -52,9 +52,10 @@ on:
wheel_path_rl:
description: "Output path of the generated wheel"
value: ${{ jobs.fd-build-rl.outputs.wheel_path_rl }}

jobs:
fd-build-rl:
runs-on: [self-hosted, GPU-Build]
runs-on: [self-hosted, GPU-Build-RL]
timeout-minutes: 360
outputs:
wheel_path_rl: ${{ steps.set_output.outputs.wheel_path_rl }}
Expand Down Expand Up @@ -107,6 +108,7 @@ jobs:
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
git log -n 3 --oneline

- name: FastDeploy Build
shell: bash
env:
Expand Down Expand Up @@ -137,7 +139,8 @@ jobs:
PARENT_DIR=$(dirname "$WORKSPACE")
echo "PARENT_DIR:$PARENT_DIR"
docker run --rm --net=host \
--cap-add=SYS_PTRACE --privileged --shm-size=64G \
--cap-add=SYS_PTRACE --shm-size=64G \
--name ${runner_name} \
-v $(pwd):/workspace -w /workspace \
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
-v "${CACHE_DIR}/.cache_rl:/root/.cache" \
Expand All @@ -151,6 +154,7 @@ jobs:
-e "PADDLE_WHL_URL=${PADDLE_WHL_URL}" \
-e "BRANCH_REF=${BRANCH_REF}" \
-e "CCACHE_MAXSIZE=50G" \
-e "no_proxy=localhost,127.0.0.1,0.0.0.0,bcebos.com,.bcebos.com,bj.bcebos.com,su.bcebos.com,paddle-ci.gz.bcebos.com,apiin.im.baidu.com,baidu-int.com,.baidu.com,aliyun.com,gitee.com,pypi.tuna.tsinghua.edu.cn,.tuna.tsinghua.edu.cn" \
--gpus "\"device=${gpu_id}\"" ${docker_image} /bin/bash -c '
if [[ -n "${FD_VERSION}" ]]; then
export FASTDEPLOY_VERSION=${FD_VERSION}
Expand All @@ -162,6 +166,7 @@ jobs:
cd FastDeploy

# Avoid using pip cache to ensure the wheel is updated to the latest version
python -m pip uninstall paddlepaddle-gpu -y || true
wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Paddle-RL-Compile/release/3.3/latest/paddlepaddle_gpu-3.3.0.dev-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu*

Expand Down Expand Up @@ -202,3 +207,10 @@ jobs:
target_path_stripped="${target_path#paddle-github-action/}"
WHEEL_PATH=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/${fd_wheel_name}
echo "wheel_path_rl=${WHEEL_PATH}" >> $GITHUB_OUTPUT

- name: Terminate and delete the container
if: always()
run: |
set +e
docker exec -t ${{ runner.name }} /bin/bash -c 'find /workspace -mindepth 1 -delete'
docker rm -f ${{ runner.name }}
37 changes: 31 additions & 6 deletions .github/workflows/_gpu_4cards_case_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,27 @@ jobs:
if ls "${REPO_NAME}"* >/dev/null 2>&1; then
echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts"
ls -ld "${REPO_NAME}"*
exit 1
echo "Attempting force cleanup with find..."
find /workspace -mindepth 1 -maxdepth 1 -name "${REPO_NAME}*" -type d -exec chmod -R u+rwx {} \; -exec rm -rf {} + 2>/dev/null || true
if ls "${REPO_NAME}"* >/dev/null 2>&1; then
echo "ERROR: Force cleanup still failed"
exit 1
else
echo "Force cleanup succeeded"
fi
fi
'

wget -q --no-proxy ${fd_archive_url}
tar -xf FastDeploy.tar.gz
wget -q --no-proxy ${fd_archive_url} || {
echo "ERROR: Failed to download archive from ${fd_archive_url}"
exit 1
}

tar --no-same-owner -xf FastDeploy.tar.gz || {
echo "ERROR: Failed to extract archive"
exit 1
}

rm -rf FastDeploy.tar.gz
cd FastDeploy
git config --global user.name "FastDeployCI"
Expand Down Expand Up @@ -166,7 +181,10 @@ jobs:
docker rm -f ${runner_name} || true
fi

docker run --rm --ipc=host --net=host \
docker run --rm --net=host \
--shm-size=64g \
--sysctl kernel.msgmax=1048576 \
--sysctl kernel.msgmnb=268435456 \
--name ${runner_name} \
-v $(pwd):/workspace -w /workspace \
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
Expand All @@ -183,6 +201,7 @@ jobs:
-e "fd_wheel_url=${fd_wheel_url}" \
-e "BASE_REF=${BASE_REF}" \
-e "IS_PR=${IS_PR}" \
-e "no_proxy=localhost,127.0.0.1,0.0.0.0,bcebos.com,.bcebos.com,bj.bcebos.com,su.bcebos.com,paddle-ci.gz.bcebos.com,apiin.im.baidu.com,baidu-int.com,.baidu.com,aliyun.com,gitee.com,pypi.tuna.tsinghua.edu.cn,.tuna.tsinghua.edu.cn" \
--gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -c '

git config --global --add safe.directory /workspace/FastDeploy
Expand All @@ -191,8 +210,7 @@ jobs:

# Avoid using pip cache to ensure the wheel is updated to the latest version
wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl

python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/
pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple

python -m pip install -r scripts/unittest_requirement.txt
Expand All @@ -204,3 +222,10 @@ jobs:
export CUDA_VISIBLE_DEVICES=0,1,2,3
bash scripts/run_gpu_4cards.sh
'

- name: Terminate and delete the container
if: always()
run: |
set +e
docker exec -t ${{ runner.name }} /bin/bash -c 'find /workspace -mindepth 1 -delete'
docker rm -f ${{ runner.name }}
38 changes: 33 additions & 5 deletions .github/workflows/_logprob_test_linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,27 @@ jobs:
if ls /workspace/* >/dev/null 2>&1; then
echo "ERROR: Failed to clean /workspace/* after multiple attempts"
ls -ld /workspace/*
exit 1
echo "Attempting force cleanup with find..."
find /workspace -mindepth 1 -maxdepth 1 -type d -exec chmod -R u+rwx {} \; -exec rm -rf {} + 2>/dev/null || true
if ls /workspace/* >/dev/null 2>&1; then
echo "ERROR: Force cleanup failed. Exiting..."
exit 1
else
echo "Force cleanup succeeded."
fi
fi
'
wget -q --no-proxy ${paddletest_archive_url}
tar -xf PaddleTest.tar.gz

wget -q --no-proxy ${paddletest_archive_url} || {
echo "ERROR: Failed to download archive from ${paddletest_archive_url}"
exit 1
}

tar --no-same-owner -xf PaddleTest.tar.gz || {
echo "ERROR: Failed to extract archive"
exit 1
}

rm -rf PaddleTest.tar.gz
cd PaddleTest
git config --global user.name "FastDeployCI"
Expand Down Expand Up @@ -152,7 +168,11 @@ jobs:
echo "Removing stale container: ${runner_name}"
docker rm -f ${runner_name} || true
fi
docker run --rm --ipc=host --pid=host --net=host \

docker run --rm --net=host \
--shm-size=64g \
--sysctl kernel.msgmax=1048576 \
--sysctl kernel.msgmnb=268435456 \
--name ${runner_name} \
-v $(pwd):/workspace \
-w /workspace \
Expand All @@ -167,10 +187,11 @@ jobs:
-v "${CACHE_DIR}/.cache:/root/.cache" \
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
-e TZ="Asia/Shanghai" \
-e "no_proxy=localhost,127.0.0.1,0.0.0.0,bcebos.com,.bcebos.com,bj.bcebos.com,su.bcebos.com,paddle-ci.gz.bcebos.com,apiin.im.baidu.com,baidu-int.com,.baidu.com,aliyun.com,gitee.com,pypi.tuna.tsinghua.edu.cn,.tuna.tsinghua.edu.cn" \
--gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
# Avoid using pip cache to ensure the wheel is updated to the latest version
wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/

pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple

Expand Down Expand Up @@ -221,3 +242,10 @@ jobs:
run: |
echo "logprob test failed with exit code ${{ env.LOGPROB_EXIT_CODE }}"
exit 8

- name: Terminate and delete the container
if: always()
run: |
set +e
docker exec -t ${{ runner.name }} /bin/bash -c 'find /workspace -mindepth 1 -delete'
docker rm -f ${{ runner.name }}
Loading
Loading