Skip to content

Commit cef5d55

Browse files
authored
Merge pull request #2012 from codalab/develop
Rebase "datasets" branch
2 parents 84a8712 + 2e95f84 commit cef5d55

303 files changed

Lines changed: 6084 additions & 1306 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.circleci/config.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ jobs:
3838
docker compose -f docker-compose.yml -f docker-compose.selenium.yml exec django python manage.py collectstatic --noinput
3939
4040
- run: docker-compose exec django flake8 src/
41+
- run: docker pull codalab/codalab-legacy:py37 # pull explicitly: this image is otherwise only pulled ahead of time by the "not e2e" tests
42+
- run: docker pull codalab/codalab-legacy:py3 # pull explicitly: this image is otherwise only pulled ahead of time by the "not e2e" tests
43+
- run: docker pull vergilgxw/autotable:v2 # pull explicitly: this image is otherwise only pulled ahead of time by the "not e2e" tests
4144

4245
- run:
4346
name: pytest
@@ -48,6 +51,13 @@ jobs:
4851
command: docker compose -f docker-compose.yml -f docker-compose.selenium.yml exec django py.test src/tests/functional/ -m e2e
4952
no_output_timeout: 60m
5053

54+
# Example: run a specific set of tests (useful for debugging individual tests from a larger batch)
55+
# - run:
56+
# name: e2e tests - competitions
57+
# command: docker compose -f docker-compose.yml -f docker-compose.selenium.yml exec django py.test src/tests/functional/test_competitions.py -m e2e
58+
# no_output_timeout: 60m
59+
60+
5161
- store_artifacts:
5262
path: artifacts/
5363

.github/workflows/ghPages-dev.yml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
name: github-pages-dev
2+
on:
3+
push:
4+
branches:
5+
- develop
6+
permissions:
7+
contents: write
8+
jobs:
9+
deploy-dev:
10+
runs-on: ubuntu-latest
11+
defaults:
12+
run:
13+
working-directory: documentation/
14+
steps:
15+
- uses: actions/checkout@v4
16+
- name: Configure Git Credentials
17+
run: |
18+
git config user.name github-actions[bot]
19+
git config user.email 41898282+github-actions[bot]@users.noreply.github.com
20+
- uses: actions/setup-python@v5
21+
with:
22+
python-version: 3.x
23+
- run: curl -LsSf https://astral.sh/uv/install.sh | sh
24+
- run: /home/runner/.local/bin/uv sync
25+
- run: git fetch origin gh-pages --depth=1 && PDF=1 /home/runner/.local/bin/uv run mike deploy -u dev --push

.github/workflows/ghPages-prod.yml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
name: github-pages-prod
2+
on:
3+
push:
4+
tags:
5+
- '*'
6+
permissions:
7+
contents: write
8+
jobs:
9+
deploy-prod:
10+
runs-on: ubuntu-latest
11+
defaults:
12+
run:
13+
working-directory: documentation/
14+
steps:
15+
- uses: actions/checkout@v4
16+
- name: Configure Git Credentials
17+
run: |
18+
git config user.name github-actions[bot]
19+
git config user.email 41898282+github-actions[bot]@users.noreply.github.com
20+
- uses: actions/setup-python@v5
21+
with:
22+
python-version: 3.x
23+
- run: curl -LsSf https://astral.sh/uv/install.sh | sh
24+
- run: /home/runner/.local/bin/uv sync
25+
- run: git fetch origin gh-pages --depth=1 && PDF=1 /home/runner/.local/bin/uv run mike deploy -u ${{ github.ref_name }} latest --push

Dockerfile.compute_worker

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,28 @@
1-
FROM --platform=linux/amd64 python:3.9
1+
FROM --platform=linux/amd64 fedora:42
22

33
# Disable output buffering so results show up in stdout immediately
44
ENV PYTHONUNBUFFERED 1
55

66
# Install Docker
7-
RUN apt-get update && curl -fsSL https://get.docker.com | sh
7+
RUN dnf -y install dnf-plugins-core && \
8+
dnf-3 config-manager --add-repo https://download.docker.com/linux/fedora/docker-ce.repo && \
9+
dnf -y update && \
10+
dnf install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin && \
11+
dnf install -y python3.9 && \
12+
dnf clean all && \
13+
rm -rf /var/cache /var/log/dnf* /var/log/yum.*
814

915

10-
RUN curl -sSL https://install.python-poetry.org | python3 - --version 1.8.3
16+
RUN curl -sSL https://install.python-poetry.org | python3.9 - --version 1.8.3
1117
# Poetry location so future commands (below) work
1218
ENV PATH $PATH:/root/.local/bin
1319
# Want poetry to use system python of docker container
1420
RUN poetry config virtualenvs.create false
1521
RUN poetry config virtualenvs.in-project false
1622
COPY ./compute_worker/pyproject.toml ./
1723
COPY ./compute_worker/poetry.lock ./
18-
RUN poetry install
24+
# Make poetry use python3.9 (installed above) instead of the image's default system python
25+
RUN poetry config virtualenvs.prefer-active-python true && poetry install
1926

2027
ADD compute_worker .
2128

Dockerfile.compute_worker_gpu

Lines changed: 7 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,12 @@
1-
FROM --platform=linux/amd64 python:3.9
2-
3-
# This makes output not buffer and return immediately, nice for seeing results in stdout
4-
ENV PYTHONUNBUFFERED 1
5-
6-
# Install Docker
7-
RUN apt-get update && curl -fsSL https://get.docker.com | sh
8-
9-
1+
FROM --platform=linux/amd64 codalab/competitions-v2-compute-worker:latest
102
# Nvidia Container Toolkit for cuda use with docker
113
# [source](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
12-
RUN curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
13-
&& curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
14-
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
15-
tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
16-
RUN apt-get update -y;
17-
RUN apt-get install -y nvidia-container-toolkit
4+
# Include deps
5+
RUN dnf -y config-manager addrepo --from-repofile=https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo && \
6+
dnf -y update && \
7+
dnf -y install nvidia-container-runtime nvidia-container-toolkit --exclude container-selinux && \
8+
dnf clean all && \
9+
rm -rf /var/cache /var/log/dnf* /var/log/yum.*
1810
# Make it explicit that we're using GPUs
1911
# BB - not convinced we need this
2012
ENV USE_GPU 1
21-
22-
RUN curl -sSL https://install.python-poetry.org | python3 - --version 1.8.3
23-
# Poetry location so future commands (below) work
24-
ENV PATH $PATH:/root/.local/bin
25-
# Want poetry to use system python of docker container
26-
RUN poetry config virtualenvs.create false
27-
RUN poetry config virtualenvs.in-project false
28-
COPY ./compute_worker/pyproject.toml ./
29-
COPY ./compute_worker/poetry.lock ./
30-
RUN poetry install
31-
32-
ADD compute_worker .
33-
34-
CMD celery -A compute_worker worker \
35-
-l info \
36-
-Q compute-worker \
37-
-n compute-worker@%n \
38-
--concurrency=1

compute_worker/compute_worker.py

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,9 @@ def get_detailed_results_file_path(self):
302302
async def send_detailed_results(self, file_path):
303303
logger.info(f"Updating detailed results {file_path} - {self.detailed_results_url}")
304304
self._put_file(self.detailed_results_url, file=file_path, content_type='text/html')
305-
async with websockets.connect(self.websocket_url) as websocket:
305+
websocket_url = f"{self.websocket_url}?kind=detailed_results"
306+
logger.info(f"Connecting to {websocket_url} for detailed results")
307+
async with websockets.connect(websocket_url) as websocket:
306308
await websocket.send(json.dumps({
307309
"kind": 'detailed_result_update',
308310
}))
@@ -390,10 +392,14 @@ async def _send_data_through_socket(self, error_message):
390392
- Docker image pull failure logs
391393
- Execution time limit exceeded logs
392394
"""
393-
logger.info(f"Connecting to {self.websocket_url} to send docker image pull error")
395+
# Create a unique websocket URL for error messages
396+
websocket_url = f"{self.websocket_url}?kind=error_logs"
397+
logger.info(f"Connecting to {websocket_url} to send error message")
398+
399+
logger.info(f"Connecting to {websocket_url} to send docker image pull error")
394400

395401
# connect to web socket
396-
websocket = await websockets.connect(self.websocket_url)
402+
websocket = await websockets.connect(websocket_url)
397403

398404
# define websocket errors
399405
websocket_errors = (socket.gaierror, websockets.WebSocketException, websockets.ConnectionClosedError, ConnectionRefusedError)
@@ -416,7 +422,7 @@ async def _send_data_through_socket(self, error_message):
416422
# no error in websocket message sending
417423
logger.info(f"Error sent successfully through websocket")
418424

419-
logger.info(f"Disconnecting from websocket {self.websocket_url}")
425+
logger.info(f"Disconnecting from websocket {websocket_url}")
420426

421427
# close websocket
422428
await websocket.close()
@@ -500,8 +506,11 @@ async def _run_container_engine_cmd(self, engine_cmd, kind):
500506
}
501507

502508
# Start websocket, it will reconnect in the stdout/stderr listener loop below
503-
logger.info(f"Connecting to {self.websocket_url}")
504-
websocket = await websockets.connect(self.websocket_url)
509+
# This ensures each task has its own independent WebSocket connection
510+
websocket_url = f"{self.websocket_url}?kind={kind}"
511+
logger.debug(f"WORKER_MARKER: Connecting to {websocket_url}")
512+
websocket = await websockets.connect(websocket_url)
513+
# websocket = await websockets.connect(self.websocket_url) # old BB
505514
websocket_errors = (socket.gaierror, websockets.WebSocketException, websockets.ConnectionClosedError, ConnectionRefusedError)
506515

507516
# Function to read a line, if the line is larger than the buffer size we will
@@ -522,7 +531,7 @@ async def _readline_or_chunk(stream):
522531
logs = [self.logs[kind][key] for key in ('stdout', 'stderr')]
523532
for value in logs:
524533
try:
525-
out = await asyncio.wait_for(_readline_or_chunk(value["stream"]), timeout=.1)
534+
out = await asyncio.wait_for(_readline_or_chunk(value["stream"]), timeout=0.1)
526535
if out:
527536
value["data"] += out
528537
print("WS: " + str(out))
@@ -535,32 +544,36 @@ async def _readline_or_chunk(stream):
535544
except asyncio.TimeoutError:
536545
continue
537546
except websocket_errors:
547+
logger.debug("\n\nWebsocket error (line 538)\n\n")
538548
try:
539549
# Do we need to await websocket.close() on the old socket before making a new one? Probably not.
540550
await websocket.close()
541551
except Exception as e:
542552
logger.error(e)
543-
logger.info(e)
544553
# TODO: catch specific exceptions here. What can go wrong when closing the socket fails?
545554
pass
546555

547556
# try to reconnect a few times
548557
tries = 0
549558
while tries < 3 and not websocket.open:
550559
try:
551-
websocket = await websockets.connect(self.websocket_url)
560+
logger.debug(f"\n\nAttempting to reconnect in 2 seconds (attempt {tries+1}/3)")
561+
websocket = await websockets.connect(websocket_url)
562+
logger.debug(f"\n\nSuccessfully reconnected to {websocket_url}")
552563
except websocket_errors:
564+
logger.error(f"\n\nReconnection attempt {tries+1} failed: {websocket_errors}")
553565
await asyncio.sleep(2)
554566
tries += 1
555567

556568
self.logs[kind]["end"] = time.time()
557569

558-
logger.info(f"Process exited with {proc.returncode}")
559-
logger.info(f"Disconnecting from websocket {self.websocket_url}")
570+
logger.debug(f"Process exited with {proc.returncode}")
571+
logger.debug(f"Disconnecting from websocket {websocket_url}")
560572

561573
# Communicate that the program is closing
562574
self.completed_program_counter += 1
563575

576+
logger.debug(f"WORKER_MARKER: Disconnecting from {websocket_url}, program counter = {self.completed_program_counter}")
564577
await websocket.close()
565578

566579
def _get_host_path(self, *paths):
@@ -609,6 +622,9 @@ async def _run_program_directory(self, program_dir, kind):
609622
logger.info(
610623
"Program directory missing metadata, assuming it's going to be handled by ingestion"
611624
)
625+
# Copy submission files into prediction output
626+
# This is useful for results submissions but wrongly uses storage
627+
shutil.copytree(program_dir, self.output_dir)
612628
return
613629
else:
614630
raise SubmissionException("Program directory missing 'metadata.yaml/metadata'")

docker-compose.selenium.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,12 @@ services:
44
environment:
55
- SELENIUM_HOSTNAME=selenium
66
- SUBMISSIONS_API_URL=http://django:36475/api
7+
- WEBSOCKET_ALLOWED_ORIGINS=*
78
ports:
89
- 36475:36475
910

1011
selenium:
11-
image: selenium/standalone-firefox:124.0
12+
image: selenium/standalone-firefox:120.0
1213
volumes:
1314
- ./src/tests/functional/test_files:/test_files/
1415
- ./artifacts:/artifacts/:z

docker-compose.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ services:
5353
# Minio local storage helper
5454
#-----------------------------------------------
5555
minio:
56-
image: minio/minio:RELEASE.2020-10-03T02-19-42Z
56+
image: minio/minio:RELEASE.2025-04-22T22-12-26Z
5757
command: server /export
5858
volumes:
5959
- ./var/minio:/export
@@ -62,11 +62,11 @@ services:
6262
- $MINIO_PORT:9000
6363
env_file: .env
6464
healthcheck:
65-
test: ["CMD", "nc", "-z", "minio", "9000"]
65+
test: ["CMD", "curl", "-I", "http://minio:9000/minio/health/live"]
6666
interval: 5s
6767
retries: 5
6868
createbuckets:
69-
image: minio/mc
69+
image: minio/mc:RELEASE.2025-07-21T05-28-08Z
7070
depends_on:
7171
minio:
7272
condition: service_healthy
@@ -78,7 +78,7 @@ services:
7878
/bin/sh -c "
7979
set -x;
8080
if [ -n \"$MINIO_ACCESS_KEY\" ] && [ -n \"$MINIO_SECRET_KEY\" ] && [ -n \"$MINIO_PORT\" ]; then
81-
until /usr/bin/mc config host add minio_docker http://minio:$MINIO_PORT $MINIO_ACCESS_KEY $MINIO_SECRET_KEY && break; do
81+
until /usr/bin/mc alias set minio_docker http://minio:$MINIO_PORT $MINIO_ACCESS_KEY $MINIO_SECRET_KEY && break; do
8282
echo '...waiting...' && sleep 5;
8383
done;
8484
/usr/bin/mc mb minio_docker/$AWS_STORAGE_BUCKET_NAME || echo 'Bucket $AWS_STORAGE_BUCKET_NAME already exists.';

documentation/.gitignore

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Python-generated files
2+
__pycache__/
3+
*.py[oc]
4+
build/
5+
dist/
6+
wheels/
7+
*.egg-info
8+
.cache/
9+
# Virtual environments
10+
.venv
11+
12+
uv.lock
13+
14+
site/*

documentation/.python-version

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
3.12

0 commit comments

Comments
 (0)