Skip to content

Commit c4a160d

Browse files
authored
Add path sanitizing checks to bionemo.core.data.load() [BIO-409] (#1546)
### Description Make sure that tar paths from files downloaded by bionemo.core.data.load are safe and local. ### Type of changes <!-- Mark the relevant option with an [x] --> - [x] Bug fix (non-breaking change which fixes an issue) - [ ] New feature (non-breaking change which adds functionality) - [ ] Refactor - [ ] Documentation update - [ ] Other (please describe): ### Pre-submit Checklist <!--- Ensure all items are completed before submitting --> - [x] I have tested these changes locally - [x] I have updated the documentation accordingly - [x] I have added/updated tests as needed - [x] All existing tests pass successfully <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **Bug Fixes** * Implemented path-traversal protection for ZIP and TAR archive extraction, preventing malicious archives from writing files outside their designated extraction directory during loading. * **Tests** * Added comprehensive test coverage for archive extraction security validations, including path traversal attack scenarios and safe extraction verification. <!-- end of auto-generated comment: release notes by coderabbit.ai --> Signed-off-by: John St John <jstjohn@nvidia.com>
1 parent 9d772b2 commit c4a160d

2 files changed

Lines changed: 150 additions & 3 deletions

File tree

sub-packages/bionemo-core/src/bionemo/core/data/load.py

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@
1818
import os
1919
import shutil
2020
import sys
21+
import tarfile
2122
import tempfile
23+
import zipfile
2224
from dataclasses import dataclass
2325
from pathlib import Path
2426
from typing import TYPE_CHECKING, Literal, Optional, Sequence, TextIO
@@ -225,6 +227,47 @@ def load(
225227
return path
226228

227229

230+
def _validate_archive_members(member_names: list[str], extract_dir: str) -> None:
231+
"""Validate that no archive members would be extracted outside the target directory.
232+
233+
This prevents Zip Slip / path traversal attacks where malicious archives contain entries
234+
with relative paths (e.g., ``../../etc/passwd``) that escape the extraction directory.
235+
236+
Args:
237+
member_names: List of member names from the archive.
238+
extract_dir: The directory where files will be extracted.
239+
240+
Raises:
241+
ValueError: If any member would be extracted outside the target directory.
242+
"""
243+
safe_dir = os.path.normpath(extract_dir)
244+
for name in member_names:
245+
member_path = os.path.normpath(os.path.join(safe_dir, name))
246+
if not (member_path == safe_dir or member_path.startswith(safe_dir + os.sep)):
247+
raise ValueError(
248+
f"Archive member '{name}' would be extracted outside "
249+
f"the target directory '{safe_dir}'. This is a potential Zip Slip security risk."
250+
)
251+
252+
253+
class _SafeUntar(pooch.Untar):
254+
"""Untar processor with path traversal validation."""
255+
256+
def _extract_file(self, fname, extract_dir):
257+
with tarfile.open(fname, "r") as tar_file:
258+
_validate_archive_members([m.name for m in tar_file.getmembers()], extract_dir)
259+
super()._extract_file(fname, extract_dir)
260+
261+
262+
class _SafeUnzip(pooch.Unzip):
263+
"""Unzip processor with path traversal validation."""
264+
265+
def _extract_file(self, fname, extract_dir):
266+
with zipfile.ZipFile(fname, "r") as zip_file:
267+
_validate_archive_members(zip_file.namelist(), extract_dir)
268+
super()._extract_file(fname, extract_dir)
269+
270+
228271
def _get_processor(extension: str, unpack: bool | None, decompress: bool | None):
229272
"""Get the processor for a given file extension.
230273
@@ -242,10 +285,10 @@ def _get_processor(extension: str, unpack: bool | None, decompress: bool | None)
242285
return pooch.Decompress()
243286

244287
elif extension in {".tar", ".tar.gz"} and unpack is None:
245-
return pooch.Untar()
288+
return _SafeUntar()
246289

247290
elif extension == ".zip" and unpack is None:
248-
return pooch.Unzip()
291+
return _SafeUnzip()
249292

250293
else:
251294
return None

sub-packages/bionemo-core/tests/bionemo/core/data/test_load.py

Lines changed: 105 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,21 @@
1818
import io
1919
import subprocess
2020
import tarfile
21+
import zipfile
2122
from pathlib import Path
2223
from unittest.mock import Mock, patch
2324

2425
import ngcbase.environ
2526
import pytest
2627

27-
from bionemo.core.data.load import default_ngc_client, default_pbss_client, load
28+
from bionemo.core.data.load import (
29+
_SafeUntar,
30+
_SafeUnzip,
31+
_validate_archive_members,
32+
default_ngc_client,
33+
default_pbss_client,
34+
load,
35+
)
2836
from bionemo.core.data.resource import get_all_resources
2937

3038

@@ -344,3 +352,99 @@ def mocked_ngc_download(url, destination, file_patterns):
344352
assert file_path.read_text() == "test"
345353

346354
mocked_ngc_client.registry.resource.download_version.assert_called_once()
355+
356+
357+
# --- Zip Slip / path traversal tests ---
358+
359+
360+
def test_validate_archive_members_rejects_path_traversal():
361+
with pytest.raises(ValueError, match="Zip Slip"):
362+
_validate_archive_members(["../../etc/passwd"], "/tmp/extract")
363+
364+
365+
def test_validate_archive_members_rejects_absolute_path():
366+
with pytest.raises(ValueError, match="Zip Slip"):
367+
_validate_archive_members(["/etc/passwd"], "/tmp/extract")
368+
369+
370+
def test_validate_archive_members_accepts_safe_paths():
371+
_validate_archive_members(["subdir/file.txt", "file.txt", "a/b/c/d.txt"], "/tmp/extract")
372+
373+
374+
def test_safe_untar_rejects_path_traversal(tmp_path):
375+
"""Verify _SafeUntar rejects tar archives with path traversal entries."""
376+
malicious_tar = tmp_path / "malicious.tar"
377+
with tarfile.open(malicious_tar, "w") as tar:
378+
data = b"malicious content"
379+
info = tarfile.TarInfo(name="../../escaped.txt")
380+
info.size = len(data)
381+
tar.addfile(info, io.BytesIO(data))
382+
383+
extract_dir = str(tmp_path / "extracted")
384+
processor = _SafeUntar()
385+
with pytest.raises(ValueError, match="Zip Slip"):
386+
processor._extract_file(str(malicious_tar), extract_dir)
387+
388+
389+
def test_safe_untar_allows_safe_archive(tmp_path):
390+
"""Verify _SafeUntar allows normal tar archives."""
391+
safe_tar = tmp_path / "safe.tar"
392+
with tarfile.open(safe_tar, "w") as tar:
393+
data = b"safe content"
394+
info = tarfile.TarInfo(name="subdir/file.txt")
395+
info.size = len(data)
396+
tar.addfile(info, io.BytesIO(data))
397+
398+
extract_dir = str(tmp_path / "extracted")
399+
processor = _SafeUntar()
400+
processor._extract_file(str(safe_tar), extract_dir)
401+
assert (tmp_path / "extracted" / "subdir" / "file.txt").read_text() == "safe content"
402+
403+
404+
def test_safe_unzip_rejects_path_traversal(tmp_path):
405+
"""Verify _SafeUnzip rejects zip archives with path traversal entries."""
406+
malicious_zip = tmp_path / "malicious.zip"
407+
with zipfile.ZipFile(malicious_zip, "w") as zf:
408+
zf.writestr("../../escaped.txt", "malicious content")
409+
410+
extract_dir = str(tmp_path / "extracted")
411+
processor = _SafeUnzip()
412+
with pytest.raises(ValueError, match="Zip Slip"):
413+
processor._extract_file(str(malicious_zip), extract_dir)
414+
415+
416+
def test_safe_unzip_allows_safe_archive(tmp_path):
417+
"""Verify _SafeUnzip allows normal zip archives."""
418+
safe_zip = tmp_path / "safe.zip"
419+
with zipfile.ZipFile(safe_zip, "w") as zf:
420+
zf.writestr("subdir/file.txt", "safe content")
421+
422+
extract_dir = str(tmp_path / "extracted")
423+
processor = _SafeUnzip()
424+
processor._extract_file(str(safe_zip), extract_dir)
425+
assert (tmp_path / "extracted" / "subdir" / "file.txt").read_text() == "safe content"
426+
427+
428+
@patch("bionemo.core.data.load._s3_download")
429+
def test_load_rejects_malicious_tar(mocked_s3_download, tmp_path):
430+
"""End-to-end test: loading a resource with a malicious tar archive raises ValueError."""
431+
(tmp_path / "foo.yaml").write_text(
432+
"""
433+
- tag: "evil"
434+
pbss: "s3://test/evil.tar"
435+
owner: Test <test@test.com>
436+
sha256: null
437+
"""
438+
)
439+
440+
def write_malicious_tar(_1, output_file: str, _2):
441+
with tarfile.open(output_file, "w") as tar:
442+
data = b"malicious"
443+
info = tarfile.TarInfo(name="../../escaped.txt")
444+
info.size = len(data)
445+
tar.addfile(info, io.BytesIO(data))
446+
447+
mocked_s3_download.side_effect = write_malicious_tar
448+
449+
with pytest.raises(ValueError, match="Zip Slip"):
450+
load("foo/evil", resources=get_all_resources(tmp_path), cache_dir=tmp_path, source="pbss")

0 commit comments

Comments
 (0)