Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .cuda_ext.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
"build": [
{
"torch_command": "pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu121",
"cuda_image": "hpcaitech/cuda-conda:12.1"
"cuda_image": "image-cloud.luchentech.com/hpcaitech/cuda-conda:12.1"
},
{
"torch_command": "pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124",
"cuda_image": "hpcaitech/cuda-conda:12.4"
"cuda_image": "image-cloud.luchentech.com/hpcaitech/cuda-conda:12.4"
}
]
}
2 changes: 1 addition & 1 deletion .github/workflows/build_on_schedule.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
if: github.repository == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0
options: --gpus all --rm -v /dev/shm -v /data/scratch/:/data/scratch/
timeout-minutes: 90
steps:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/close_inactive.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ on:
jobs:
close-issues:
if: github.event.pull_request.draft == false && github.base_ref == 'main' && github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: ubuntu-latest
runs-on: [self-hosted, ubuntu-latest]
permissions:
issues: write
pull-requests: write
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/compatiblity_test_on_dispatch.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ on:
jobs:
matrix_preparation:
name: Prepare Container List
runs-on: ubuntu-latest
runs-on: [self-hosted, ubuntu-latest]
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
Expand All @@ -31,7 +31,7 @@ jobs:
do
for cv in $CUDA_VERSIONS
do
DOCKER_IMAGE+=("\"hpcaitech/pytorch-cuda:${tv}-${cv}\"")
DOCKER_IMAGE+=("\"image-cloud.luchentech.com/hpcaitech/pytorch-cuda:${tv}-${cv}\"")
done
done

Expand All @@ -44,7 +44,7 @@ jobs:
name: Test for PyTorch Compatibility
needs: matrix_preparation
if: github.repository == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, 8-gpu]
runs-on: [self-hosted, ubuntu-latest]
strategy:
fail-fast: false
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/compatiblity_test_on_pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ on:
jobs:
matrix_preparation:
name: Prepare Container List
runs-on: ubuntu-latest
runs-on: [self-hosted, ubuntu-latest]
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
concurrency:
Expand All @@ -23,7 +23,7 @@ jobs:
DOCKER_IMAGE=()
while read tag; do
DOCKER_IMAGE+=("\"hpcaitech/pytorch-cuda:${tag}\"")
DOCKER_IMAGE+=("\"image-cloud.luchentech.com/hpcaitech/pytorch-cuda:${tag}\"")
done <.compatibility
container=$( IFS=',' ; echo "${DOCKER_IMAGE[*]}" )
Expand All @@ -35,7 +35,7 @@ jobs:
name: Test for PyTorch Compatibility
needs: matrix_preparation
if: github.repository == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, 8-gpu]
runs-on: [self-hosted, ubuntu-latest]
strategy:
fail-fast: false
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/compatiblity_test_on_schedule.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ on:
jobs:
matrix_preparation:
name: Prepare Container List
runs-on: ubuntu-latest
runs-on: [self-hosted, ubuntu-latest]
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
Expand All @@ -20,7 +20,7 @@ jobs:
DOCKER_IMAGE=()

while read tag; do
DOCKER_IMAGE+=("\"hpcaitech/pytorch-cuda:${tag}\"")
DOCKER_IMAGE+=("\"image-cloud.luchentech.com/hpcaitech/pytorch-cuda:${tag}\"")
done <.compatibility

container=$( IFS=',' ; echo "${DOCKER_IMAGE[*]}" )
Expand All @@ -32,7 +32,7 @@ jobs:
name: Test for PyTorch Compatibility
needs: matrix_preparation
if: github.repository == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, 8-gpu]
runs-on: [self-hosted, ubuntu-latest]
strategy:
fail-fast: false
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/cuda_ext_check_before_merge.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
matrix_preparation:
name: Prepare Container List
if: github.repository == 'hpcaitech/ColossalAI'
runs-on: ubuntu-latest
runs-on: [self-hosted, ubuntu-latest]
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
Expand All @@ -24,7 +24,7 @@ jobs:
build:
name: Release bdist wheels
needs: matrix_preparation
runs-on: [self-hosted, gpu]
runs-on: [self-hosted, ubuntu-latest]
strategy:
fail-fast: false
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
build-doc:
name: Trigger Documentation Build Workflow
if: github.repository == 'hpcaitech/ColossalAI'
runs-on: ubuntu-latest
runs-on: [self-hosted, ubuntu-latest]
steps:
- name: trigger workflow in ColossalAI-Documentation
run: |
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/doc_check_on_pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
if: |
github.event.pull_request.draft == false &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: ubuntu-latest
runs-on: [self-hosted, ubuntu-latest]
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-check-i18n
cancel-in-progress: true
Expand All @@ -33,7 +33,7 @@ jobs:
if: |
github.event.pull_request.draft == false &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: ubuntu-latest
runs-on: [self-hosted, ubuntu-latest]
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-check-doc
cancel-in-progress: true
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/doc_test_on_pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
if: |
github.event.pull_request.draft == false &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request'
runs-on: ubuntu-latest
runs-on: [self-hosted, ubuntu-latest]
outputs:
any_changed: ${{ steps.changed-files.outputs.any_changed }}
changed_files: ${{ steps.changed-files.outputs.all_changed_files }}
Expand Down Expand Up @@ -56,7 +56,7 @@ jobs:
needs: detect-changed-doc
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0
options: --gpus all --rm
timeout-minutes: 30
defaults:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/doc_test_on_schedule.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
name: Test the changed Doc
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0
options: --gpus all --rm
timeout-minutes: 60
steps:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
release:
name: Draft Release Post
if: ( github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true ) && github.repository == 'hpcaitech/ColossalAI'
runs-on: ubuntu-latest
runs-on: [self-hosted, ubuntu-latest]
steps:
- uses: actions/checkout@v2
with:
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/example_check_on_dispatch.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
github.base_ref == 'main' &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
name: Check the examples user want
runs-on: ubuntu-latest
runs-on: [self-hosted, ubuntu-latest]
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
Expand Down Expand Up @@ -45,7 +45,7 @@ jobs:
fail-fast: false
matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}}
container:
image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0
options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm
timeout-minutes: 15
steps:
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/example_check_on_pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
if: |
github.event.pull_request.draft == false &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request'
runs-on: ubuntu-latest
runs-on: [self-hosted, ubuntu-latest]
outputs:
matrix: ${{ steps.setup-matrix.outputs.matrix }}
anyChanged: ${{ steps.setup-matrix.outputs.anyChanged }}
Expand Down Expand Up @@ -90,7 +90,7 @@ jobs:
fail-fast: false
matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}}
container:
image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0
options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm
timeout-minutes: 30
concurrency:
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/example_check_on_schedule.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
matrix_preparation:
if: github.repository == 'hpcaitech/ColossalAI'
name: Prepare matrix for weekly check
runs-on: ubuntu-latest
runs-on: [self-hosted, ubuntu-latest]
outputs:
matrix: ${{ steps.setup-matrix.outputs.matrix }}
steps:
Expand All @@ -34,7 +34,7 @@ jobs:
fail-fast: false
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
container:
image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0
options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm
timeout-minutes: 30
steps:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/release_docker_after_publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ jobs:
notify:
name: Notify Lark via webhook
needs: release
runs-on: ubuntu-latest
runs-on: [self-hosted, ubuntu-latest]
if: ${{ always() }}
steps:
- uses: actions/checkout@v2
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/release_nightly_on_schedule.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
publish:
if: github.repository == 'hpcaitech/ColossalAI'
name: Build and publish Python 🐍 distributions 📦 to PyPI
runs-on: ubuntu-latest
runs-on: [self-hosted, ubuntu-latest]
timeout-minutes: 20
outputs:
status: ${{ steps.publish.outcome }}
Expand All @@ -36,7 +36,7 @@ jobs:
notify:
name: Notify Lark via webhook
needs: publish
runs-on: ubuntu-latest
runs-on: [self-hosted, ubuntu-latest]
if: ${{ always() && github.repository == 'hpcaitech/ColossalAI' }}
steps:
- uses: actions/checkout@v2
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/release_pypi_after_merge.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
build-n-publish:
if: github.event_name == 'workflow_dispatch' || github.repository == 'hpcaitech/ColossalAI' && github.event.pull_request.merged == true && github.base_ref == 'main'
name: Build and publish Python 🐍 distributions 📦 to PyPI
runs-on: ubuntu-latest
runs-on: [self-hosted, ubuntu-latest]
timeout-minutes: 20
steps:
- uses: actions/checkout@v2
Expand All @@ -35,7 +35,7 @@ jobs:
notify:
name: Notify Lark via webhook
needs: build-n-publish
runs-on: ubuntu-latest
runs-on: [self-hosted, ubuntu-latest]
if: ${{ always() }}
steps:
- uses: actions/checkout@v2
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/release_test_pypi_before_merge.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
build-n-publish:
if: github.event_name == 'workflow_dispatch' || github.repository == 'hpcaitech/ColossalAI'
name: Build and publish Python 🐍 distributions 📦 to Test PyPI
runs-on: ubuntu-latest
runs-on: [self-hosted, ubuntu-latest]
timeout-minutes: 20
steps:
- uses: actions/checkout@v2
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/report_leaderboard_to_lark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
generate-and-publish:
if: github.repository == 'hpcaitech/ColossalAI'
name: Generate leaderboard report and publish to Lark
runs-on: ubuntu-latest
runs-on: [self-hosted, ubuntu-latest]
timeout-minutes: 20
steps:
- uses: actions/checkout@v2
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/report_test_coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ on:

jobs:
report-test-coverage:
runs-on: ubuntu-latest
runs-on: [self-hosted, ubuntu-latest]
if: ${{ github.event.workflow_run.conclusion == 'success' }}
steps:
- name: "Download artifact"
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/run_chatgpt_examples.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0
options: --gpus all --rm -v /data/scratch/examples-data:/data/scratch/examples-data --shm-size=10.24gb
timeout-minutes: 60
defaults:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/run_chatgpt_unit_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0
options: --gpus all --rm -v /data/scratch/examples-data:/data/scratch/examples-data
timeout-minutes: 30
defaults:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/run_colossalqa_unit_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0
volumes:
- /data/scratch/test_data_colossalqa:/data/scratch/test_data_colossalqa
- /data/scratch/llama-tiny:/data/scratch/llama-tiny
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/submodule.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ on:

jobs:
sync-submodule:
runs-on: ubuntu-latest
runs-on: [self-hosted, ubuntu-latest]
if: github.repository == 'hpcaitech/ColossalAI'
steps:
- name: Checkout
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/translate_comment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ on:

jobs:
build:
runs-on: ubuntu-latest
runs-on: [self-hosted, ubuntu-latest]
steps:
- uses: usthe/issues-translate-action@v2.7
with:
Expand Down
1 change: 1 addition & 0 deletions tests/kit/model_zoo/transformers/gpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ def date_gen_for_double_heads():
problem_type="single_label_classification",
pad_token_id=1022,
tie_word_embeddings=True,
attn_implementation="eager",
)

config_for_token_classification = copy.deepcopy(config)
Expand Down
1 change: 0 additions & 1 deletion tests/test_zero/test_gemini/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,6 @@ def run_dist(rank, world_size, port):
exam_inference()


@pytest.mark.skip("this test failed")
@pytest.mark.dist
@pytest.mark.parametrize("world_size", [1, 4])
def test_inference(world_size):
Expand Down
2 changes: 1 addition & 1 deletion version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.4.9
0.5.0