diff --git a/.cuda_ext.json b/.cuda_ext.json index 01a30a9c1204..c83f633f7257 100644 --- a/.cuda_ext.json +++ b/.cuda_ext.json @@ -2,11 +2,11 @@ "build": [ { "torch_command": "pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu121", - "cuda_image": "hpcaitech/cuda-conda:12.1" + "cuda_image": "image-cloud.luchentech.com/hpcaitech/cuda-conda:12.1" }, { "torch_command": "pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124", - "cuda_image": "hpcaitech/cuda-conda:12.4" + "cuda_image": "image-cloud.luchentech.com/hpcaitech/cuda-conda:12.4" } ] } diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml index fd7dc42e579d..607013851760 100644 --- a/.github/workflows/build_on_schedule.yml +++ b/.github/workflows/build_on_schedule.yml @@ -12,7 +12,7 @@ jobs: if: github.repository == 'hpcaitech/ColossalAI' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 + image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm -v /dev/shm -v /data/scratch/:/data/scratch/ timeout-minutes: 90 steps: diff --git a/.github/workflows/close_inactive.yml b/.github/workflows/close_inactive.yml index e7dec4430930..a175661b427a 100644 --- a/.github/workflows/close_inactive.yml +++ b/.github/workflows/close_inactive.yml @@ -7,7 +7,7 @@ on: jobs: close-issues: if: github.event.pull_request.draft == false && github.base_ref == 'main' && github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] permissions: issues: write pull-requests: write diff --git a/.github/workflows/compatiblity_test_on_dispatch.yml b/.github/workflows/compatiblity_test_on_dispatch.yml index 1534fa7f6a9a..7f74f83c6be2 100644 --- a/.github/workflows/compatiblity_test_on_dispatch.yml +++ 
b/.github/workflows/compatiblity_test_on_dispatch.yml @@ -15,7 +15,7 @@ on: jobs: matrix_preparation: name: Prepare Container List - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} steps: @@ -31,7 +31,7 @@ jobs: do for cv in $CUDA_VERSIONS do - DOCKER_IMAGE+=("\"hpcaitech/pytorch-cuda:${tv}-${cv}\"") + DOCKER_IMAGE+=("\"image-cloud.luchentech.com/hpcaitech/pytorch-cuda:${tv}-${cv}\"") done done @@ -44,7 +44,7 @@ jobs: name: Test for PyTorch Compatibility needs: matrix_preparation if: github.repository == 'hpcaitech/ColossalAI' - runs-on: [self-hosted, 8-gpu] + runs-on: [self-hosted, ubuntu-latest] strategy: fail-fast: false matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} diff --git a/.github/workflows/compatiblity_test_on_pr.yml b/.github/workflows/compatiblity_test_on_pr.yml index c2cc85b3f619..bf9fb6ecb7e0 100644 --- a/.github/workflows/compatiblity_test_on_pr.yml +++ b/.github/workflows/compatiblity_test_on_pr.yml @@ -9,7 +9,7 @@ on: jobs: matrix_preparation: name: Prepare Container List - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} concurrency: @@ -23,7 +23,7 @@ jobs: DOCKER_IMAGE=() while read tag; do - DOCKER_IMAGE+=("\"hpcaitech/pytorch-cuda:${tag}\"") + DOCKER_IMAGE+=("\"image-cloud.luchentech.com/hpcaitech/pytorch-cuda:${tag}\"") done <.compatibility container=$( IFS=',' ; echo "${DOCKER_IMAGE[*]}" ) @@ -35,7 +35,7 @@ jobs: name: Test for PyTorch Compatibility needs: matrix_preparation if: github.repository == 'hpcaitech/ColossalAI' - runs-on: [self-hosted, 8-gpu] + runs-on: [self-hosted, ubuntu-latest] strategy: fail-fast: false matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} diff --git a/.github/workflows/compatiblity_test_on_schedule.yml b/.github/workflows/compatiblity_test_on_schedule.yml index 1bd24b0a236e..04928d7acbbb 100644 --- 
a/.github/workflows/compatiblity_test_on_schedule.yml +++ b/.github/workflows/compatiblity_test_on_schedule.yml @@ -9,7 +9,7 @@ on: jobs: matrix_preparation: name: Prepare Container List - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} steps: @@ -20,7 +20,7 @@ jobs: DOCKER_IMAGE=() while read tag; do - DOCKER_IMAGE+=("\"hpcaitech/pytorch-cuda:${tag}\"") + DOCKER_IMAGE+=("\"image-cloud.luchentech.com/hpcaitech/pytorch-cuda:${tag}\"") done <.compatibility container=$( IFS=',' ; echo "${DOCKER_IMAGE[*]}" ) @@ -32,7 +32,7 @@ jobs: name: Test for PyTorch Compatibility needs: matrix_preparation if: github.repository == 'hpcaitech/ColossalAI' - runs-on: [self-hosted, 8-gpu] + runs-on: [self-hosted, ubuntu-latest] strategy: fail-fast: false matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} diff --git a/.github/workflows/cuda_ext_check_before_merge.yml b/.github/workflows/cuda_ext_check_before_merge.yml index 65d9451018c0..a12fbfd825fe 100644 --- a/.github/workflows/cuda_ext_check_before_merge.yml +++ b/.github/workflows/cuda_ext_check_before_merge.yml @@ -10,7 +10,7 @@ jobs: matrix_preparation: name: Prepare Container List if: github.repository == 'hpcaitech/ColossalAI' - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} steps: @@ -24,7 +24,7 @@ jobs: build: name: Release bdist wheels needs: matrix_preparation - runs-on: [self-hosted, gpu] + runs-on: [self-hosted, ubuntu-latest] strategy: fail-fast: false matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} diff --git a/.github/workflows/doc_build_on_schedule_after_release.yml b/.github/workflows/doc_build_on_schedule_after_release.yml index 62dfdc67257c..863c216e7d14 100644 --- a/.github/workflows/doc_build_on_schedule_after_release.yml +++ b/.github/workflows/doc_build_on_schedule_after_release.yml @@ -11,7 +11,7 @@ jobs: build-doc: name: Trigger 
Documentation Build Workflow if: github.repository == 'hpcaitech/ColossalAI' - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] steps: - name: trigger workflow in ColossalAI-Documentation run: | diff --git a/.github/workflows/doc_check_on_pr.yml b/.github/workflows/doc_check_on_pr.yml index 68e13a971e7e..91fc16148c0e 100644 --- a/.github/workflows/doc_check_on_pr.yml +++ b/.github/workflows/doc_check_on_pr.yml @@ -15,7 +15,7 @@ jobs: if: | github.event.pull_request.draft == false && github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-check-i18n cancel-in-progress: true @@ -33,7 +33,7 @@ jobs: if: | github.event.pull_request.draft == false && github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-check-doc cancel-in-progress: true diff --git a/.github/workflows/doc_test_on_pr.yml b/.github/workflows/doc_test_on_pr.yml index 99a3f18a0d03..04a4c044f580 100644 --- a/.github/workflows/doc_test_on_pr.yml +++ b/.github/workflows/doc_test_on_pr.yml @@ -15,7 +15,7 @@ jobs: if: | github.event.pull_request.draft == false && github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] outputs: any_changed: ${{ steps.changed-files.outputs.any_changed }} changed_files: ${{ steps.changed-files.outputs.all_changed_files }} @@ -56,7 +56,7 @@ jobs: needs: detect-changed-doc runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 + image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm timeout-minutes: 30 
defaults: diff --git a/.github/workflows/doc_test_on_schedule.yml b/.github/workflows/doc_test_on_schedule.yml index 902aba77469a..42ec8a9de2c4 100644 --- a/.github/workflows/doc_test_on_schedule.yml +++ b/.github/workflows/doc_test_on_schedule.yml @@ -12,7 +12,7 @@ jobs: name: Test the changed Doc runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 + image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm timeout-minutes: 60 steps: diff --git a/.github/workflows/draft_github_release_post_after_merge.yml b/.github/workflows/draft_github_release_post_after_merge.yml index 53bfa9e8deb6..fbd7f735eb7b 100644 --- a/.github/workflows/draft_github_release_post_after_merge.yml +++ b/.github/workflows/draft_github_release_post_after_merge.yml @@ -12,7 +12,7 @@ jobs: release: name: Draft Release Post if: ( github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true ) && github.repository == 'hpcaitech/ColossalAI' - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] steps: - uses: actions/checkout@v2 with: diff --git a/.github/workflows/example_check_on_dispatch.yml b/.github/workflows/example_check_on_dispatch.yml index 7039ed9c285b..e5b0ec0ec057 100644 --- a/.github/workflows/example_check_on_dispatch.yml +++ b/.github/workflows/example_check_on_dispatch.yml @@ -14,7 +14,7 @@ jobs: github.base_ref == 'main' && github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' name: Check the examples user want - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} steps: @@ -45,7 +45,7 @@ jobs: fail-fast: false matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}} container: - image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 + image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm 
timeout-minutes: 15 steps: diff --git a/.github/workflows/example_check_on_pr.yml b/.github/workflows/example_check_on_pr.yml index af8da0383ebe..ff6e62b723da 100644 --- a/.github/workflows/example_check_on_pr.yml +++ b/.github/workflows/example_check_on_pr.yml @@ -17,7 +17,7 @@ jobs: if: | github.event.pull_request.draft == false && github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] outputs: matrix: ${{ steps.setup-matrix.outputs.matrix }} anyChanged: ${{ steps.setup-matrix.outputs.anyChanged }} @@ -90,7 +90,7 @@ jobs: fail-fast: false matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}} container: - image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 + image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm timeout-minutes: 30 concurrency: diff --git a/.github/workflows/example_check_on_schedule.yml b/.github/workflows/example_check_on_schedule.yml index db55c305be1d..cc17e9a30300 100644 --- a/.github/workflows/example_check_on_schedule.yml +++ b/.github/workflows/example_check_on_schedule.yml @@ -10,7 +10,7 @@ jobs: matrix_preparation: if: github.repository == 'hpcaitech/ColossalAI' name: Prepare matrix for weekly check - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] outputs: matrix: ${{ steps.setup-matrix.outputs.matrix }} steps: @@ -34,7 +34,7 @@ jobs: fail-fast: false matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} container: - image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 + image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm timeout-minutes: 30 steps: diff --git a/.github/workflows/release_docker_after_publish.yml b/.github/workflows/release_docker_after_publish.yml index 23aac9b544b0..fe37dfcbf070 
100644 --- a/.github/workflows/release_docker_after_publish.yml +++ b/.github/workflows/release_docker_after_publish.yml @@ -46,7 +46,7 @@ jobs: notify: name: Notify Lark via webhook needs: release - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] if: ${{ always() }} steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/release_nightly_on_schedule.yml b/.github/workflows/release_nightly_on_schedule.yml index 072a943aef19..b54a3859d845 100644 --- a/.github/workflows/release_nightly_on_schedule.yml +++ b/.github/workflows/release_nightly_on_schedule.yml @@ -9,7 +9,7 @@ jobs: publish: if: github.repository == 'hpcaitech/ColossalAI' name: Build and publish Python 🐍 distributions 📦 to PyPI - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] timeout-minutes: 20 outputs: status: ${{ steps.publish.outcome }} @@ -36,7 +36,7 @@ jobs: notify: name: Notify Lark via webhook needs: publish - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] if: ${{ always() }} && github.repository == 'hpcaitech/ColossalAI' steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/release_pypi_after_merge.yml b/.github/workflows/release_pypi_after_merge.yml index b987b4397c17..e60c3ce6f683 100644 --- a/.github/workflows/release_pypi_after_merge.yml +++ b/.github/workflows/release_pypi_after_merge.yml @@ -12,7 +12,7 @@ jobs: build-n-publish: if: github.event_name == 'workflow_dispatch' || github.repository == 'hpcaitech/ColossalAI' && github.event.pull_request.merged == true && github.base_ref == 'main' name: Build and publish Python 🐍 distributions 📦 to PyPI - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] timeout-minutes: 20 steps: - uses: actions/checkout@v2 @@ -35,7 +35,7 @@ jobs: notify: name: Notify Lark via webhook needs: build-n-publish - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] if: ${{ always() }} steps: - uses: actions/checkout@v2 diff --git 
a/.github/workflows/release_test_pypi_before_merge.yml b/.github/workflows/release_test_pypi_before_merge.yml index 3d3cfc696039..a7f53c64910a 100644 --- a/.github/workflows/release_test_pypi_before_merge.yml +++ b/.github/workflows/release_test_pypi_before_merge.yml @@ -9,7 +9,7 @@ jobs: build-n-publish: if: github.event_name == 'workflow_dispatch' || github.repository == 'hpcaitech/ColossalAI' name: Build and publish Python 🐍 distributions 📦 to Test PyPI - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] timeout-minutes: 20 steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/report_leaderboard_to_lark.yml b/.github/workflows/report_leaderboard_to_lark.yml index 00d8e9e1f5fd..70a13270d50c 100644 --- a/.github/workflows/report_leaderboard_to_lark.yml +++ b/.github/workflows/report_leaderboard_to_lark.yml @@ -10,7 +10,7 @@ jobs: generate-and-publish: if: github.repository == 'hpcaitech/ColossalAI' name: Generate leaderboard report and publish to Lark - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] timeout-minutes: 20 steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/report_test_coverage.yml b/.github/workflows/report_test_coverage.yml index c9dc541b8a33..1c17b63a9752 100644 --- a/.github/workflows/report_test_coverage.yml +++ b/.github/workflows/report_test_coverage.yml @@ -8,7 +8,7 @@ on: jobs: report-test-coverage: - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] if: ${{ github.event.workflow_run.conclusion == 'success' }} steps: - name: "Download artifact" diff --git a/.github/workflows/run_chatgpt_examples.yml b/.github/workflows/run_chatgpt_examples.yml index c6ac2b7bdec2..5f580e4c17c7 100644 --- a/.github/workflows/run_chatgpt_examples.yml +++ b/.github/workflows/run_chatgpt_examples.yml @@ -19,7 +19,7 @@ jobs: github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 
+ image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm -v /data/scratch/examples-data:/data/scratch/examples-data --shm-size=10.24gb timeout-minutes: 60 defaults: diff --git a/.github/workflows/run_chatgpt_unit_tests.yml b/.github/workflows/run_chatgpt_unit_tests.yml index 21545098af74..a67335690dac 100644 --- a/.github/workflows/run_chatgpt_unit_tests.yml +++ b/.github/workflows/run_chatgpt_unit_tests.yml @@ -19,7 +19,7 @@ jobs: github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 + image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm -v /data/scratch/examples-data:/data/scratch/examples-data timeout-minutes: 30 defaults: diff --git a/.github/workflows/run_colossalqa_unit_tests.yml b/.github/workflows/run_colossalqa_unit_tests.yml index 326ef4526a43..f08831e5f8fe 100644 --- a/.github/workflows/run_colossalqa_unit_tests.yml +++ b/.github/workflows/run_colossalqa_unit_tests.yml @@ -19,7 +19,7 @@ jobs: github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 + image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0 volumes: - /data/scratch/test_data_colossalqa:/data/scratch/test_data_colossalqa - /data/scratch/llama-tiny:/data/scratch/llama-tiny diff --git a/.github/workflows/submodule.yml b/.github/workflows/submodule.yml index 4ffb261183f1..14d85d1d99d3 100644 --- a/.github/workflows/submodule.yml +++ b/.github/workflows/submodule.yml @@ -7,7 +7,7 @@ on: jobs: sync-submodule: - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] if: github.repository == 'hpcaitech/ColossalAI' steps: - name: Checkout diff --git a/.github/workflows/translate_comment.yml b/.github/workflows/translate_comment.yml index 83c127b3caa4..36113aaad0bd 100644 --- 
a/.github/workflows/translate_comment.yml +++ b/.github/workflows/translate_comment.yml @@ -7,7 +7,7 @@ on: jobs: build: - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-latest] steps: - uses: usthe/issues-translate-action@v2.7 with: diff --git a/tests/kit/model_zoo/transformers/gpt.py b/tests/kit/model_zoo/transformers/gpt.py index f2b139beca83..44d859e48b17 100644 --- a/tests/kit/model_zoo/transformers/gpt.py +++ b/tests/kit/model_zoo/transformers/gpt.py @@ -113,6 +113,7 @@ def date_gen_for_double_heads(): problem_type="single_label_classification", pad_token_id=1022, tie_word_embeddings=True, + attn_implementation="eager", ) config_for_token_classification = copy.deepcopy(config) diff --git a/tests/test_zero/test_gemini/test_inference.py b/tests/test_zero/test_gemini/test_inference.py index 6da19d41030b..61e9e14f533e 100644 --- a/tests/test_zero/test_gemini/test_inference.py +++ b/tests/test_zero/test_gemini/test_inference.py @@ -114,7 +114,6 @@ def run_dist(rank, world_size, port): exam_inference() -@pytest.mark.skip("this test failed") @pytest.mark.dist @pytest.mark.parametrize("world_size", [1, 4]) def test_inference(world_size): diff --git a/version.txt b/version.txt index 76914ddc02f8..8f0916f768f0 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.4.9 +0.5.0