From 26c22163f3c27adc52b351323c46d7bbb4c92291 Mon Sep 17 00:00:00 2001 From: David Stern Date: Wed, 13 May 2026 15:27:11 -0400 Subject: [PATCH 1/8] Sync LFS objects across repos. --- src/repo_sync/stack/git_ops.py | 59 +++++++++++ src/repo_sync/stack/lfs.py | 110 +++++++++++++++++++++ src/repo_sync/workflows/create_sync_prs.py | 51 ++++++++++ tests/__init__.py | 1 + tests/stack/test_lfs.py | 52 ++++++++++ tests/workflows/test_lfs_mirroring.py | 64 ++++++++++++ 6 files changed, 337 insertions(+) create mode 100644 src/repo_sync/stack/lfs.py create mode 100644 tests/__init__.py create mode 100644 tests/stack/test_lfs.py create mode 100644 tests/workflows/test_lfs_mirroring.py diff --git a/src/repo_sync/stack/git_ops.py b/src/repo_sync/stack/git_ops.py index c9a8707..c5c5caa 100644 --- a/src/repo_sync/stack/git_ops.py +++ b/src/repo_sync/stack/git_ops.py @@ -322,6 +322,14 @@ def remote_add_or_update(self, name: str, url: str) -> None: else: self._run(["remote", "add", name, url]) + def remote_remove(self, name: str) -> None: + """Remove a remote if it exists.""" + self._run(["remote", "remove", name], check=False) + + def remote_url(self, name: str) -> str: + """Return the configured URL for a remote.""" + return self._run(["remote", "get-url", name]).stdout + def log_shas(self, ref: str = "HEAD") -> list[str]: """Return all commit SHAs reachable from ref (newest first).""" result = self._run(["log", "--format=%H", ref]) @@ -352,3 +360,54 @@ def diff_patch(self, ref_a: str, ref_b: str) -> str: result.stderr, ) return result.stdout.decode("utf-8", errors="replace").strip() + + def diff_name_only(self, ref_a: str, ref_b: str) -> list[str]: + """Return changed paths between two refs.""" + env = {**os.environ, **self._env_additions} if self._env_additions else None + result = subprocess.run( + ["git", "diff", "--name-only", "-z", ref_a, ref_b], + cwd=self.repo_dir, + capture_output=True, + env=env, + ) + if result.returncode != 0: + raise VerboseCalledProcessError( + result.returncode, + ["git", "diff", "--name-only", "-z", ref_a, ref_b], + result.stdout, + result.stderr, + ) + if not result.stdout: + return [] + return [ + path + for path in result.stdout.decode("utf-8", errors="surrogateescape").split("\0") + if path + ] + + def lfs_fetch_ref(self, remote: str, ref: str) -> None: + """Fetch LFS objects referenced by a ref from a remote.""" + self._run(["lfs", "fetch", remote, ref]) + + def lfs_push_oids(self, remote: str, oids: list[str]) -> None: + """Push specific LFS object IDs to a remote.""" + if not oids: + return + + env = {**os.environ, **self._env_additions} if self._env_additions else None + input_text = "".join(f"{oid}\n" for oid in oids) + result = subprocess.run( + ["git", "lfs", "push", "--object-id", remote, "--stdin"], + cwd=self.repo_dir, + input=input_text, + capture_output=True, + text=True, + env=env, + ) + if result.returncode != 0: + raise VerboseCalledProcessError( + result.returncode, + ["git", "lfs", "push", "--object-id", remote, "--stdin"], + result.stdout, + result.stderr, + ) diff --git a/src/repo_sync/stack/lfs.py b/src/repo_sync/stack/lfs.py new file mode 100644 index 0000000..d3a09c6 --- /dev/null +++ b/src/repo_sync/stack/lfs.py @@ -0,0 +1,110 @@ +"""Helpers for detecting Git LFS pointer files in synced trees.""" + +from __future__ import annotations + +import os +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable + + +_POINTER_VERSION = "version https://git-lfs.github.com/spec/v1" +_OID_RE = re.compile(r"^oid sha256:([0-9a-f]{64})$") +_SIZE_RE = re.compile(r"^size ([0-9]+)$") +_MAX_POINTER_BYTES = 4096 + + +@dataclass(frozen=True) +class LfsPointer: + """A Git LFS pointer found in a synced tree.""" + + path: str + oid: str + size: int + + +def parse_lfs_pointer(data: bytes, path: str) -> LfsPointer | None: + """Parse a Git LFS pointer file, returning None for ordinary files.""" + if len(data) > _MAX_POINTER_BYTES: + return None + + try: + text = data.decode("utf-8") + except UnicodeDecodeError: + return None + + lines = text.splitlines() + if not lines or lines[0] != _POINTER_VERSION: + return None + + oid: str | None = None + size: int | None = None + for line in lines[1:]: + oid_match = _OID_RE.match(line) + if oid_match: + oid = oid_match.group(1) + continue + + size_match = _SIZE_RE.match(line) + if size_match: + size = int(size_match.group(1)) + + if oid is None or size is None: + return None + + return LfsPointer(path=path, oid=oid, size=size) + + +def collect_lfs_pointers( + root: str, + paths: Iterable[str] | None = None, +) -> list[LfsPointer]: + """Collect LFS pointers under root, optionally restricted to relative paths.""" + candidates = _candidate_paths(root, paths) + pointers: list[LfsPointer] = [] + + for relpath, fullpath in candidates: + try: + if not os.path.isfile(fullpath) or os.path.islink(fullpath): + continue + if os.path.getsize(fullpath) > _MAX_POINTER_BYTES: + continue + data = Path(fullpath).read_bytes() + except OSError: + continue + + pointer = parse_lfs_pointer(data, relpath) + if pointer is not None: + pointers.append(pointer) + + return pointers + + +def _candidate_paths( + root: str, + paths: Iterable[str] | None, +) -> list[tuple[str, str]]: + """Return normalized relative and absolute paths to inspect.""" + root_path = Path(root) + if paths is not None: + result: list[tuple[str, str]] = [] + for path in paths: + relpath = os.path.normpath(path) + if os.path.isabs(relpath) or relpath == ".." or relpath.startswith("../"): + continue + result.append((relpath, str(root_path / relpath))) + return result + + result = [] + for dirpath, dirnames, filenames in os.walk(root, followlinks=False): + dirnames[:] = [ + name + for name in dirnames + if not os.path.islink(os.path.join(dirpath, name)) + ] + for filename in filenames: + fullpath = os.path.join(dirpath, filename) + relpath = os.path.relpath(fullpath, root) + result.append((relpath, fullpath)) + return result diff --git a/src/repo_sync/workflows/create_sync_prs.py b/src/repo_sync/workflows/create_sync_prs.py index 71448e0..1e64f09 100644 --- a/src/repo_sync/workflows/create_sync_prs.py +++ b/src/repo_sync/workflows/create_sync_prs.py @@ -27,6 +27,7 @@ from repo_sync.stack.gh_ops import GhOps from repo_sync.stack.git_ops import GitOps +from repo_sync.stack.lfs import collect_lfs_pointers from repo_sync.stack.trailers import ( SyncOrigin, append_trailer, @@ -143,6 +144,35 @@ def _run_fixup_script(script_path: str, working_dir: str) -> bool: return False +def _mirror_lfs_objects( + source_git: GitOps, + peer_git: GitOps, + source_ref: str, + snapshot_dir: str, + changed_paths: list[str], +) -> None: + """Mirror LFS objects referenced by changed pointer files in a snapshot.""" + pointers = collect_lfs_pointers(snapshot_dir, changed_paths) + if not pointers: + return + + oids = sorted({pointer.oid for pointer in pointers}) + logger.info( + "Mirroring %d Git LFS object(s) referenced by %d changed pointer file(s).", + len(oids), len(pointers), + ) + + source_git.lfs_fetch_ref("origin", source_ref) + + target_remote = f"repo_sync_lfs_target_{os.getpid()}" + peer_url = peer_git.remote_url("origin") + source_git.remote_add_or_update(target_remote, peer_url) + try: + source_git.lfs_push_oids(target_remote, oids) + finally: + source_git.remote_remove(target_remote) + + # Docker image for the PR description agent. Built locally by the sync # workflow from docker/pr-description/Dockerfile (not pushed to a registry). # Can be overridden via environment variable for testing. @@ -473,6 +503,7 @@ def _sync_private_to_public( return False # Empty diff — all changes were internal-only. snapshot_dir, prev_snapshot_dir, diff_repo, patch_file, diff_commit = diff_result + changed_paths = GitOps(diff_repo).diff_name_only("HEAD~1", "HEAD") try: # Apply the delta to the peer repo by cherry-picking from the temp repo. @@ -542,6 +573,13 @@ def _sync_private_to_public( ) else: peer_git.commit_amend_message("repo-sync: sync from private", origin_trailer) + _mirror_lfs_objects( + source_git=source_git, + peer_git=peer_git, + source_ref=source_sha, + snapshot_dir=snapshot_dir, + changed_paths=changed_paths, + ) # Build the PR description (identical for conflict and non-conflict). if pr_desc_cache.source_sha == source_sha: @@ -626,6 +664,7 @@ def _sync_public_to_private( Returns True if a PR was created. Raises TransientSyncError on base-branch-deleted failures. """ + changed_paths = source_git.diff_name_only(f"{source_sha}^", source_sha) # Add the source repo as a remote. Use the absolute path because # peer_git runs with cwd=peer_repo_dir, so a relative path would # resolve to the wrong location. @@ -685,6 +724,18 @@ def _sync_public_to_private( peer_git.commit_amend_message( current_msg, origin_trailer, allow_empty=True ) + lfs_snapshot_dir = tempfile.mkdtemp(prefix=f"repo-sync-lfs-{short_sha}-") + try: + source_git.archive_to_dir(source_sha, lfs_snapshot_dir) + _mirror_lfs_objects( + source_git=source_git, + peer_git=peer_git, + source_ref=source_sha, + snapshot_dir=lfs_snapshot_dir, + changed_paths=changed_paths, + ) + finally: + shutil.rmtree(lfs_snapshot_dir, ignore_errors=True) # Build the PR description (identical for conflict and non-conflict). source_gh = GhOps(source_repo, token=os.environ.get("GH_TOKEN")) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..98f3c07 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +"""Test package for repo-sync.""" diff --git a/tests/stack/test_lfs.py b/tests/stack/test_lfs.py new file mode 100644 index 0000000..c3d5889 --- /dev/null +++ b/tests/stack/test_lfs.py @@ -0,0 +1,52 @@ +"""Tests for Git LFS pointer detection helpers.""" + +from __future__ import annotations + +from pathlib import Path + +from repo_sync.stack.lfs import LfsPointer, collect_lfs_pointers, parse_lfs_pointer + + +OID = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + + +def _pointer(oid: str = OID, size: int = 1234) -> bytes: + """Return valid Git LFS pointer file content.""" + return ( + "version https://git-lfs.github.com/spec/v1\n" + f"oid sha256:{oid}\n" + f"size {size}\n" + ).encode("utf-8") + + +def test_parse_lfs_pointer() -> None: + pointer = parse_lfs_pointer(_pointer(), "asset.bin") + + assert pointer == LfsPointer(path="asset.bin", oid=OID, size=1234) + + +def test_parse_lfs_pointer_rejects_ordinary_file() -> None: + assert parse_lfs_pointer(b"hello\n", "hello.txt") is None + + +def test_parse_lfs_pointer_rejects_invalid_oid() -> None: + data = ( + "version https://git-lfs.github.com/spec/v1\n" + "oid sha256:not-a-sha\n" + "size 1234\n" + ).encode("utf-8") + + assert parse_lfs_pointer(data, "asset.bin") is None + + +def test_collect_lfs_pointers_filters_to_changed_paths(tmp_path: Path) -> None: + (tmp_path / "asset.bin").write_bytes(_pointer()) + (tmp_path / "unchanged.bin").write_bytes(_pointer("f" * 64)) + (tmp_path / "ordinary.txt").write_text("hello\n", encoding="utf-8") + + pointers = collect_lfs_pointers( + str(tmp_path), + ["asset.bin", "ordinary.txt", "deleted.bin"], + ) + + assert pointers == [LfsPointer(path="asset.bin", oid=OID, size=1234)] diff --git a/tests/workflows/test_lfs_mirroring.py b/tests/workflows/test_lfs_mirroring.py new file mode 100644 index 0000000..a5e2834 --- /dev/null +++ b/tests/workflows/test_lfs_mirroring.py @@ -0,0 +1,64 @@ +"""Tests for Git LFS object mirroring during sync.""" + +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock, patch + +from repo_sync.stack.git_ops import GitOps +from repo_sync.workflows.create_sync_prs import _mirror_lfs_objects + + +OID = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + + +def test_mirror_lfs_objects_pushes_changed_pointer_oids(tmp_path: Path) -> None: + pointer = ( + "version https://git-lfs.github.com/spec/v1\n" + f"oid sha256:{OID}\n" + "size 1234\n" + ) + (tmp_path / "asset.bin").write_text(pointer, encoding="utf-8") + + source_git = MagicMock(spec=GitOps) + peer_git = MagicMock(spec=GitOps) + peer_git.remote_url.return_value = "https://github.com/org/peer.git" + + with patch("repo_sync.workflows.create_sync_prs.os.getpid", return_value=42): + _mirror_lfs_objects( + source_git=source_git, + peer_git=peer_git, + source_ref="abc123", + snapshot_dir=str(tmp_path), + changed_paths=["asset.bin"], + ) + + remote = "repo_sync_lfs_target_42" + source_git.lfs_fetch_ref.assert_called_once_with("origin", "abc123") + peer_git.remote_url.assert_called_once_with("origin") + source_git.remote_add_or_update.assert_called_once_with( + remote, + "https://github.com/org/peer.git", + ) + source_git.lfs_push_oids.assert_called_once_with(remote, [OID]) + source_git.remote_remove.assert_called_once_with(remote) + + +def test_mirror_lfs_objects_skips_when_no_changed_pointer(tmp_path: Path) -> None: + (tmp_path / "ordinary.txt").write_text("hello\n", encoding="utf-8") + source_git = MagicMock(spec=GitOps) + peer_git = MagicMock(spec=GitOps) + + _mirror_lfs_objects( + source_git=source_git, + peer_git=peer_git, + source_ref="abc123", + snapshot_dir=str(tmp_path), + changed_paths=["ordinary.txt"], + ) + + source_git.lfs_fetch_ref.assert_not_called() + peer_git.remote_url.assert_not_called() + source_git.remote_add_or_update.assert_not_called() + source_git.lfs_push_oids.assert_not_called() + source_git.remote_remove.assert_not_called() From cdad67b2673cc129ed6479c60056ed1a226f9596 Mon Sep 17 00:00:00 2001 From: David Stern Date: Wed, 13 May 2026 15:35:05 -0400 Subject: [PATCH 2/8] pr review --- src/repo_sync/workflows/create_sync_prs.py | 12 ++++- tests/workflows/test_lfs_mirroring.py | 53 ++++++++++++++++++++-- 2 files changed, 60 insertions(+), 5 deletions(-) diff --git a/src/repo_sync/workflows/create_sync_prs.py b/src/repo_sync/workflows/create_sync_prs.py index 1e64f09..0285979 100644 --- a/src/repo_sync/workflows/create_sync_prs.py +++ b/src/repo_sync/workflows/create_sync_prs.py @@ -156,11 +156,21 @@ def _mirror_lfs_objects( if not pointers: return - oids = sorted({pointer.oid for pointer in pointers}) + paths_by_oid: dict[str, list[str]] = {} + for pointer in pointers: + paths_by_oid.setdefault(pointer.oid, []).append(pointer.path) + + oids = sorted(paths_by_oid) logger.info( "Mirroring %d Git LFS object(s) referenced by %d changed pointer file(s).", len(oids), len(pointers), ) + for oid in oids: + logger.info( + "Mirroring Git LFS object %s for path(s): %s.", + oid, + ", ".join(sorted(paths_by_oid[oid])), + ) source_git.lfs_fetch_ref("origin", source_ref) diff --git a/tests/workflows/test_lfs_mirroring.py b/tests/workflows/test_lfs_mirroring.py index a5e2834..92bf685 100644 --- a/tests/workflows/test_lfs_mirroring.py +++ b/tests/workflows/test_lfs_mirroring.py @@ -2,27 +2,39 @@ from __future__ import annotations +import logging from pathlib import Path from unittest.mock import MagicMock, patch +from pytest import LogCaptureFixture + from repo_sync.stack.git_ops import GitOps from repo_sync.workflows.create_sync_prs import _mirror_lfs_objects OID = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" +PRIVATE_OID = "fedcba9876543210fedcba9876543210fedcba9876543210fedcba9876543210" -def test_mirror_lfs_objects_pushes_changed_pointer_oids(tmp_path: Path) -> None: - pointer = ( +def _pointer(oid: str) -> str: + """Return valid Git LFS pointer file content.""" + return ( "version https://git-lfs.github.com/spec/v1\n" - f"oid sha256:{OID}\n" + f"oid sha256:{oid}\n" "size 1234\n" ) - (tmp_path / "asset.bin").write_text(pointer, encoding="utf-8") + + +def test_mirror_lfs_objects_pushes_changed_pointer_oids( + tmp_path: Path, + caplog: LogCaptureFixture, +) -> None: + (tmp_path / "asset.bin").write_text(_pointer(OID), encoding="utf-8") source_git = MagicMock(spec=GitOps) peer_git = MagicMock(spec=GitOps) peer_git.remote_url.return_value = "https://github.com/org/peer.git" + caplog.set_level(logging.INFO, logger="repo_sync.workflows.create_sync_prs") with patch("repo_sync.workflows.create_sync_prs.os.getpid", return_value=42): _mirror_lfs_objects( @@ -42,6 +54,8 @@ def test_mirror_lfs_objects_pushes_changed_pointer_oids(tmp_path: Path) -> None: ) source_git.lfs_push_oids.assert_called_once_with(remote, [OID]) source_git.remote_remove.assert_called_once_with(remote) + assert OID in caplog.text + assert "asset.bin" in caplog.text def test_mirror_lfs_objects_skips_when_no_changed_pointer(tmp_path: Path) -> None: @@ -62,3 +76,34 @@ def test_mirror_lfs_objects_skips_when_no_changed_pointer(tmp_path: Path) -> Non source_git.remote_add_or_update.assert_not_called() source_git.lfs_push_oids.assert_not_called() source_git.remote_remove.assert_not_called() + + +def test_mirror_lfs_objects_does_not_push_private_stripped_pointer( + tmp_path: Path, +) -> None: + source_tree = tmp_path / "source" + source_tree.joinpath("private").mkdir(parents=True) + source_tree.joinpath("private/secret.bin").write_text( + _pointer(PRIVATE_OID), + encoding="utf-8", + ) + + snapshot = tmp_path / "stripped-snapshot" + snapshot.mkdir() + snapshot.joinpath("asset.bin").write_text(_pointer(OID), encoding="utf-8") + source_git = MagicMock(spec=GitOps) + peer_git = MagicMock(spec=GitOps) + peer_git.remote_url.return_value = "https://github.com/org/public.git" + + with patch("repo_sync.workflows.create_sync_prs.os.getpid", return_value=42): + _mirror_lfs_objects( + source_git=source_git, + peer_git=peer_git, + source_ref="abc123", + snapshot_dir=str(snapshot), + changed_paths=["asset.bin", "private/secret.bin"], + ) + + remote = "repo_sync_lfs_target_42" + source_git.lfs_push_oids.assert_called_once_with(remote, [OID]) + assert PRIVATE_OID not in source_git.lfs_push_oids.call_args.args[1] From ab1b496651b22a81fb198962fa54af390e6d02f5 Mon Sep 17 00:00:00 2001 From: David Stern Date: Wed, 13 May 2026 16:26:49 -0400 Subject: [PATCH 3/8] agentic pr review --- src/repo_sync/stack/git_ops.py | 76 ++++++++++++++++- src/repo_sync/stack/lfs.py | 20 ++++- src/repo_sync/workflows/create_sync_prs.py | 26 +++++- tests/stack/test_git_ops.py | 27 +++++++ tests/stack/test_lfs.py | 20 +++++ tests/workflows/test_lfs_mirroring.py | 94 +++++++++++++++++++++- 6 files changed, 254 insertions(+), 9 deletions(-) create mode 100644 tests/stack/test_git_ops.py diff --git a/src/repo_sync/stack/git_ops.py b/src/repo_sync/stack/git_ops.py index c5c5caa..4d02e5a 100644 --- a/src/repo_sync/stack/git_ops.py +++ b/src/repo_sync/stack/git_ops.py @@ -5,6 +5,7 @@ import os import subprocess from dataclasses import dataclass +from pathlib import Path from repo_sync.errors import VerboseCalledProcessError @@ -51,7 +52,6 @@ def _run(self, args: list[str], check: bool = True) -> CommandResult: stderr=result.stderr.strip(), ) - def rev_parse(self, ref: str) -> str: """Resolve a ref to a full SHA.""" return self._run(["rev-parse", ref]).stdout @@ -385,9 +385,79 @@ def diff_name_only(self, ref_a: str, ref_b: str) -> list[str]: if path ] - def lfs_fetch_ref(self, remote: str, ref: str) -> None: + def lfs_tracked_paths( + self, + paths: list[str], + source_ref: str | None = None, + ) -> set[str]: + """Return paths that are configured with the Git LFS filter.""" + if not paths: + return set() + + args = ["check-attr", "-z"] + if source_ref is not None: + args.extend(["--source", source_ref]) + args.extend(["--stdin", "filter"]) + + env = {**os.environ, **self._env_additions} if self._env_additions else None + input_text = "".join(f"{path}\0" for path in paths) + result = subprocess.run( + ["git", *args], + cwd=self.repo_dir, + input=input_text, + capture_output=True, + text=True, + env=env, + ) + if result.returncode != 0: + raise VerboseCalledProcessError( + result.returncode, + ["git", *args], + result.stdout, + result.stderr, + ) + + tracked_paths: set[str] = set() + fields = [field for field in result.stdout.split("\0") if field] + for index in range(0, len(fields), 3): + if index + 2 >= len(fields): + break + path, attr, value = fields[index:index + 3] + if attr == "filter" and value == "lfs": + tracked_paths.add(path) + return tracked_paths + + def lfs_fetch_ref( + self, + remote: str, + ref: str, + include_paths: list[str] | None = None, + ) -> None: """Fetch LFS objects referenced by a ref from a remote.""" - self._run(["lfs", "fetch", remote, ref]) + args = ["lfs", "fetch"] + if include_paths: + args.append(f"--include={','.join(include_paths)}") + args.extend([remote, ref]) + self._run(args) + + def lfs_missing_oids(self, oids: list[str]) -> list[str]: + """Return LFS object IDs that are not present in the local LFS store.""" + if not oids: + return [] + + git_common_dir = self._run(["rev-parse", "--git-common-dir"]).stdout + git_common_path = Path(git_common_dir) + if not git_common_path.is_absolute(): + git_common_path = Path(self.repo_dir) / git_common_path + + missing: list[str] = [] + for oid in oids: + object_path = ( + git_common_path / "lfs" / "objects" / oid[:2] / oid[2:4] / oid + ) + if not object_path.is_file(): + missing.append(oid) + return missing def lfs_push_oids(self, remote: str, oids: list[str]) -> None: """Push specific LFS object IDs to a remote.""" diff --git a/src/repo_sync/stack/lfs.py b/src/repo_sync/stack/lfs.py index d3a09c6..2ea2166 100644 --- a/src/repo_sync/stack/lfs.py +++ b/src/repo_sync/stack/lfs.py @@ -4,6 +4,7 @@ import os import re +import stat from dataclasses import dataclass from pathlib import Path from typing import Iterable @@ -63,15 +64,28 @@ def collect_lfs_pointers( """Collect LFS pointers under root, optionally restricted to relative paths.""" candidates = _candidate_paths(root, paths) pointers: list[LfsPointer] = [] + fail_on_read_error = paths is not None for relpath, fullpath in candidates: try: - if not os.path.isfile(fullpath) or os.path.islink(fullpath): - continue - if os.path.getsize(fullpath) > _MAX_POINTER_BYTES: + path_stat = os.lstat(fullpath) + except FileNotFoundError: + continue + except OSError: + if fail_on_read_error: + raise + continue + + if stat.S_ISLNK(path_stat.st_mode) or not stat.S_ISREG(path_stat.st_mode): + continue + + try: + if path_stat.st_size > _MAX_POINTER_BYTES: continue data = Path(fullpath).read_bytes() except OSError: + if fail_on_read_error: + raise continue pointer = parse_lfs_pointer(data, relpath) diff --git a/src/repo_sync/workflows/create_sync_prs.py b/src/repo_sync/workflows/create_sync_prs.py index 0285979..2836733 100644 --- a/src/repo_sync/workflows/create_sync_prs.py +++ b/src/repo_sync/workflows/create_sync_prs.py @@ -150,9 +150,22 @@ def _mirror_lfs_objects( source_ref: str, snapshot_dir: str, changed_paths: list[str], + attributes_git: GitOps, + attributes_ref: str, ) -> None: """Mirror LFS objects referenced by changed pointer files in a snapshot.""" pointers = collect_lfs_pointers(snapshot_dir, changed_paths) + if not pointers: + return + lfs_tracked_paths = attributes_git.lfs_tracked_paths( + sorted({pointer.path for pointer in pointers}), + source_ref=attributes_ref, + ) + pointers = [ + pointer + for pointer in pointers + if pointer.path in lfs_tracked_paths + ] if not pointers: return @@ -172,7 +185,14 @@ def _mirror_lfs_objects( ", ".join(sorted(paths_by_oid[oid])), ) - source_git.lfs_fetch_ref("origin", source_ref) + fetch_paths = sorted({pointer.path for pointer in pointers}) + source_git.lfs_fetch_ref("origin", source_ref, include_paths=fetch_paths) + missing_oids = source_git.lfs_missing_oids(oids) + if missing_oids: + raise PermanentSyncError( + "Missing Git LFS object(s) after fetch: " + + ", ".join(sorted(missing_oids)) + ) target_remote = f"repo_sync_lfs_target_{os.getpid()}" peer_url = peer_git.remote_url("origin") @@ -589,6 +609,8 @@ def _sync_private_to_public( source_ref=source_sha, snapshot_dir=snapshot_dir, changed_paths=changed_paths, + attributes_git=GitOps(diff_repo), + attributes_ref="HEAD", ) # Build the PR description (identical for conflict and non-conflict). @@ -743,6 +765,8 @@ def _sync_public_to_private( source_ref=source_sha, snapshot_dir=lfs_snapshot_dir, changed_paths=changed_paths, + attributes_git=source_git, + attributes_ref=source_sha, ) finally: shutil.rmtree(lfs_snapshot_dir, ignore_errors=True) diff --git a/tests/stack/test_git_ops.py b/tests/stack/test_git_ops.py new file mode 100644 index 0000000..5039177 --- /dev/null +++ b/tests/stack/test_git_ops.py @@ -0,0 +1,27 @@ +"""Tests for git operation helpers.""" + +from __future__ import annotations + +from pathlib import Path + +from repo_sync.stack.git_ops import GitOps + + +def test_lfs_tracked_paths_uses_source_ref(tmp_git_repo: GitOps) -> None: + repo_dir = Path(tmp_git_repo.repo_dir) + (repo_dir / ".gitattributes").write_text("*.bin filter=lfs\n", encoding="utf-8") + tmp_git_repo._run(["add", ".gitattributes"]) + tmp_git_repo._run(["commit", "-m", "track bin files"]) + bin_attrs_ref = tmp_git_repo.rev_parse("HEAD") + + (repo_dir / ".gitattributes").write_text("*.dat filter=lfs\n", encoding="utf-8") + tmp_git_repo._run(["add", ".gitattributes"]) + tmp_git_repo._run(["commit", "-m", "track dat files"]) + + assert tmp_git_repo.lfs_tracked_paths( + ["asset.bin", "asset.dat"], + source_ref=bin_attrs_ref, + ) == {"asset.bin"} + assert tmp_git_repo.lfs_tracked_paths(["asset.bin", "asset.dat"]) == { + "asset.dat" + } diff --git a/tests/stack/test_lfs.py b/tests/stack/test_lfs.py index c3d5889..3e3611f 100644 --- a/tests/stack/test_lfs.py +++ b/tests/stack/test_lfs.py @@ -4,6 +4,8 @@ from pathlib import Path +import pytest + from repo_sync.stack.lfs import LfsPointer, collect_lfs_pointers, parse_lfs_pointer @@ -50,3 +52,21 @@ def test_collect_lfs_pointers_filters_to_changed_paths(tmp_path: Path) -> None: ) assert pointers == [LfsPointer(path="asset.bin", oid=OID, size=1234)] + + +def test_collect_lfs_pointers_raises_read_errors_for_changed_paths( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + (tmp_path / "asset.bin").write_bytes(_pointer()) + original_read_bytes = Path.read_bytes + + def read_bytes(path: Path) -> bytes: + if path.name == "asset.bin": + raise PermissionError("cannot read") + return original_read_bytes(path) + + monkeypatch.setattr(Path, "read_bytes", read_bytes) + + with pytest.raises(PermissionError): + collect_lfs_pointers(str(tmp_path), ["asset.bin"]) diff --git a/tests/workflows/test_lfs_mirroring.py b/tests/workflows/test_lfs_mirroring.py index 92bf685..c7c7d41 100644 --- a/tests/workflows/test_lfs_mirroring.py +++ b/tests/workflows/test_lfs_mirroring.py @@ -6,10 +6,11 @@ from pathlib import Path from unittest.mock import MagicMock, patch +import pytest from pytest import LogCaptureFixture from repo_sync.stack.git_ops import GitOps -from repo_sync.workflows.create_sync_prs import _mirror_lfs_objects +from repo_sync.workflows.create_sync_prs import PermanentSyncError, _mirror_lfs_objects OID = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" @@ -33,7 +34,10 @@ def test_mirror_lfs_objects_pushes_changed_pointer_oids( source_git = MagicMock(spec=GitOps) peer_git = MagicMock(spec=GitOps) + attributes_git = MagicMock(spec=GitOps) peer_git.remote_url.return_value = "https://github.com/org/peer.git" + attributes_git.lfs_tracked_paths.return_value = {"asset.bin"} + source_git.lfs_missing_oids.return_value = [] caplog.set_level(logging.INFO, logger="repo_sync.workflows.create_sync_prs") with patch("repo_sync.workflows.create_sync_prs.os.getpid", return_value=42): @@ -43,10 +47,21 @@ def test_mirror_lfs_objects_pushes_changed_pointer_oids( source_ref="abc123", snapshot_dir=str(tmp_path), changed_paths=["asset.bin"], + attributes_git=attributes_git, + attributes_ref="attrs-ref", ) remote = "repo_sync_lfs_target_42" - source_git.lfs_fetch_ref.assert_called_once_with("origin", "abc123") + attributes_git.lfs_tracked_paths.assert_called_once_with( + ["asset.bin"], + source_ref="attrs-ref", + ) + source_git.lfs_fetch_ref.assert_called_once_with( + "origin", + "abc123", + include_paths=["asset.bin"], + ) + source_git.lfs_missing_oids.assert_called_once_with([OID]) peer_git.remote_url.assert_called_once_with("origin") source_git.remote_add_or_update.assert_called_once_with( remote, @@ -62,6 +77,7 @@ def test_mirror_lfs_objects_skips_when_no_changed_pointer(tmp_path: Path) -> Non (tmp_path / "ordinary.txt").write_text("hello\n", encoding="utf-8") source_git = MagicMock(spec=GitOps) peer_git = MagicMock(spec=GitOps) + attributes_git = MagicMock(spec=GitOps) _mirror_lfs_objects( source_git=source_git, @@ -69,9 +85,13 @@ def test_mirror_lfs_objects_skips_when_no_changed_pointer(tmp_path: Path) -> Non source_ref="abc123", snapshot_dir=str(tmp_path), changed_paths=["ordinary.txt"], + attributes_git=attributes_git, + attributes_ref="attrs-ref", ) + attributes_git.lfs_tracked_paths.assert_not_called() source_git.lfs_fetch_ref.assert_not_called() + source_git.lfs_missing_oids.assert_not_called() peer_git.remote_url.assert_not_called() source_git.remote_add_or_update.assert_not_called() source_git.lfs_push_oids.assert_not_called() @@ -93,7 +113,10 @@ def test_mirror_lfs_objects_does_not_push_private_stripped_pointer( snapshot.joinpath("asset.bin").write_text(_pointer(OID), encoding="utf-8") source_git = MagicMock(spec=GitOps) peer_git = MagicMock(spec=GitOps) + attributes_git = MagicMock(spec=GitOps) peer_git.remote_url.return_value = "https://github.com/org/public.git" + attributes_git.lfs_tracked_paths.return_value = {"asset.bin"} + source_git.lfs_missing_oids.return_value = [] with patch("repo_sync.workflows.create_sync_prs.os.getpid", return_value=42): _mirror_lfs_objects( @@ -102,8 +125,75 @@ def test_mirror_lfs_objects_does_not_push_private_stripped_pointer( source_ref="abc123", snapshot_dir=str(snapshot), changed_paths=["asset.bin", "private/secret.bin"], + attributes_git=attributes_git, + attributes_ref="attrs-ref", ) remote = "repo_sync_lfs_target_42" + source_git.lfs_fetch_ref.assert_called_once_with( + "origin", + "abc123", + include_paths=["asset.bin"], + ) source_git.lfs_push_oids.assert_called_once_with(remote, [OID]) assert PRIVATE_OID not in source_git.lfs_push_oids.call_args.args[1] + + +def test_mirror_lfs_objects_skips_pointer_shaped_non_lfs_file( + tmp_path: Path, +) -> None: + (tmp_path / "looks-like-pointer.txt").write_text( + _pointer(PRIVATE_OID), + encoding="utf-8", + ) + source_git = MagicMock(spec=GitOps) + peer_git = MagicMock(spec=GitOps) + attributes_git = MagicMock(spec=GitOps) + attributes_git.lfs_tracked_paths.return_value = set() + + _mirror_lfs_objects( + source_git=source_git, + peer_git=peer_git, + source_ref="abc123", + snapshot_dir=str(tmp_path), + changed_paths=["looks-like-pointer.txt"], + attributes_git=attributes_git, + attributes_ref="attrs-ref", + ) + + attributes_git.lfs_tracked_paths.assert_called_once_with( + ["looks-like-pointer.txt"], + source_ref="attrs-ref", + ) + source_git.lfs_fetch_ref.assert_not_called() + source_git.lfs_missing_oids.assert_not_called() + source_git.lfs_push_oids.assert_not_called() + + +def test_mirror_lfs_objects_fails_when_fetched_object_is_missing( + tmp_path: Path, +) -> None: + (tmp_path / "asset.bin").write_text(_pointer(OID), encoding="utf-8") + source_git = MagicMock(spec=GitOps) + peer_git = MagicMock(spec=GitOps) + attributes_git = MagicMock(spec=GitOps) + attributes_git.lfs_tracked_paths.return_value = {"asset.bin"} + source_git.lfs_missing_oids.return_value = [OID] + + with pytest.raises(PermanentSyncError): + _mirror_lfs_objects( + source_git=source_git, + peer_git=peer_git, + source_ref="abc123", + snapshot_dir=str(tmp_path), + changed_paths=["asset.bin"], + attributes_git=attributes_git, + attributes_ref="attrs-ref", + ) + + source_git.lfs_fetch_ref.assert_called_once_with( + "origin", + "abc123", + include_paths=["asset.bin"], + ) + source_git.lfs_push_oids.assert_not_called() \ No newline at end of file From 1adaa95afa1ff77826a96a50c4e44ac68c94cc07 Mon Sep 17 00:00:00 2001 From: David Stern Date: Thu, 14 May 2026 14:08:59 -0400 Subject: [PATCH 4/8] agentic pr review 2 --- src/repo_sync/stack/git_ops.py | 31 +++++--- src/repo_sync/stack/lfs.py | 7 +- src/repo_sync/workflows/create_sync_prs.py | 17 ++++- tests/stack/test_git_ops.py | 20 +++++ tests/workflows/test_lfs_mirroring.py | 89 +++++++++++++++++++--- 5 files changed, 137 insertions(+), 27 deletions(-) diff --git a/src/repo_sync/stack/git_ops.py b/src/repo_sync/stack/git_ops.py index 4d02e5a..24d57e8 100644 --- a/src/repo_sync/stack/git_ops.py +++ b/src/repo_sync/stack/git_ops.py @@ -427,18 +427,25 @@ def lfs_tracked_paths( tracked_paths.add(path) return tracked_paths - def lfs_fetch_ref( - self, - remote: str, - ref: str, - include_paths: list[str] | None = None, - ) -> None: - """Fetch LFS objects referenced by a ref from a remote.""" - args = ["lfs", "fetch"] - if include_paths: - args.append(f"--include={','.join(include_paths)}") - args.extend([remote, ref]) - self._run(args) + def lfs_fetch_paths(self, ref: str, paths: list[str]) -> None: + """Fetch LFS objects for exact paths at a ref.""" + env = {**os.environ, **self._env_additions} if self._env_additions else None + for path in paths: + result = subprocess.run( + ["git", "cat-file", "--filters", f"{ref}:{path}"], + cwd=self.repo_dir, + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + text=True, + env=env, + ) + if result.returncode != 0: + raise VerboseCalledProcessError( + result.returncode, + ["git", "cat-file", "--filters", f"{ref}:{path}"], + "", + result.stderr, + ) def lfs_missing_oids(self, oids: list[str]) -> list[str]: """Return LFS object IDs that are not present in the local LFS store.""" diff --git a/src/repo_sync/stack/lfs.py b/src/repo_sync/stack/lfs.py index 2ea2166..78aa1ef 100644 --- a/src/repo_sync/stack/lfs.py +++ b/src/repo_sync/stack/lfs.py @@ -60,11 +60,16 @@ def parse_lfs_pointer(data: bytes, path: str) -> LfsPointer | None: def collect_lfs_pointers( root: str, paths: Iterable[str] | None = None, + fail_on_read_error: bool | None = None, ) -> list[LfsPointer]: """Collect LFS pointers under root, optionally restricted to relative paths.""" candidates = _candidate_paths(root, paths) pointers: list[LfsPointer] = [] - fail_on_read_error = paths is not None + fail_on_read_error = ( + paths is not None + if fail_on_read_error is None + else fail_on_read_error + ) for relpath, fullpath in candidates: try: diff --git a/src/repo_sync/workflows/create_sync_prs.py b/src/repo_sync/workflows/create_sync_prs.py index 2836733..1a4fdfa 100644 --- a/src/repo_sync/workflows/create_sync_prs.py +++ b/src/repo_sync/workflows/create_sync_prs.py @@ -154,7 +154,12 @@ def _mirror_lfs_objects( attributes_ref: str, ) -> None: """Mirror LFS objects referenced by changed pointer files in a snapshot.""" - pointers = collect_lfs_pointers(snapshot_dir, changed_paths) + scan_paths = None if _lfs_attributes_changed(changed_paths) else changed_paths + pointers = collect_lfs_pointers( + snapshot_dir, + scan_paths, + fail_on_read_error=True, + ) if not pointers: return lfs_tracked_paths = attributes_git.lfs_tracked_paths( @@ -186,7 +191,7 @@ def _mirror_lfs_objects( ) fetch_paths = sorted({pointer.path for pointer in pointers}) - source_git.lfs_fetch_ref("origin", source_ref, include_paths=fetch_paths) + source_git.lfs_fetch_paths(source_ref, fetch_paths) missing_oids = source_git.lfs_missing_oids(oids) if missing_oids: raise PermanentSyncError( @@ -203,6 +208,14 @@ def _mirror_lfs_objects( source_git.remote_remove(target_remote) +def _lfs_attributes_changed(changed_paths: list[str]) -> bool: + """Return True when changed paths may affect LFS tracking rules.""" + return any( + path == ".gitattributes" or path.endswith("/.gitattributes") + for path in changed_paths + ) + + # Docker image for the PR description agent. Built locally by the sync # workflow from docker/pr-description/Dockerfile (not pushed to a registry). # Can be overridden via environment variable for testing. diff --git a/tests/stack/test_git_ops.py b/tests/stack/test_git_ops.py index 5039177..ca40183 100644 --- a/tests/stack/test_git_ops.py +++ b/tests/stack/test_git_ops.py @@ -3,6 +3,7 @@ from __future__ import annotations from pathlib import Path +from unittest.mock import MagicMock, patch from repo_sync.stack.git_ops import GitOps @@ -25,3 +26,22 @@ def test_lfs_tracked_paths_uses_source_ref(tmp_git_repo: GitOps) -> None: assert tmp_git_repo.lfs_tracked_paths(["asset.bin", "asset.dat"]) == { "asset.dat" } + + +def test_lfs_fetch_paths_uses_cat_file_filters_for_exact_paths( + tmp_git_repo: GitOps, +) -> None: + result = MagicMock() + result.returncode = 0 + result.stderr = "" + + with patch("repo_sync.stack.git_ops.subprocess.run", return_value=result) as run: + tmp_git_repo.lfs_fetch_paths("abc123", ["asset,with-comma.bin"]) + + run.assert_called_once() + assert run.call_args.args[0] == [ + "git", + "cat-file", + "--filters", + "abc123:asset,with-comma.bin", + ] diff --git a/tests/workflows/test_lfs_mirroring.py b/tests/workflows/test_lfs_mirroring.py index c7c7d41..3028062 100644 --- a/tests/workflows/test_lfs_mirroring.py +++ b/tests/workflows/test_lfs_mirroring.py @@ -56,10 +56,9 @@ def test_mirror_lfs_objects_pushes_changed_pointer_oids( ["asset.bin"], source_ref="attrs-ref", ) - source_git.lfs_fetch_ref.assert_called_once_with( - "origin", + source_git.lfs_fetch_paths.assert_called_once_with( "abc123", - include_paths=["asset.bin"], + ["asset.bin"], ) source_git.lfs_missing_oids.assert_called_once_with([OID]) peer_git.remote_url.assert_called_once_with("origin") @@ -90,7 +89,7 @@ def test_mirror_lfs_objects_skips_when_no_changed_pointer(tmp_path: Path) -> Non ) attributes_git.lfs_tracked_paths.assert_not_called() - source_git.lfs_fetch_ref.assert_not_called() + source_git.lfs_fetch_paths.assert_not_called() source_git.lfs_missing_oids.assert_not_called() peer_git.remote_url.assert_not_called() source_git.remote_add_or_update.assert_not_called() @@ -130,10 +129,9 @@ def test_mirror_lfs_objects_does_not_push_private_stripped_pointer( ) remote = "repo_sync_lfs_target_42" - source_git.lfs_fetch_ref.assert_called_once_with( - "origin", + source_git.lfs_fetch_paths.assert_called_once_with( "abc123", - include_paths=["asset.bin"], + ["asset.bin"], ) source_git.lfs_push_oids.assert_called_once_with(remote, [OID]) assert PRIVATE_OID not in source_git.lfs_push_oids.call_args.args[1] @@ -165,7 +163,7 @@ def test_mirror_lfs_objects_skips_pointer_shaped_non_lfs_file( ["looks-like-pointer.txt"], source_ref="attrs-ref", ) - source_git.lfs_fetch_ref.assert_not_called() + source_git.lfs_fetch_paths.assert_not_called() source_git.lfs_missing_oids.assert_not_called() source_git.lfs_push_oids.assert_not_called() @@ -191,9 +189,76 @@ def test_mirror_lfs_objects_fails_when_fetched_object_is_missing( attributes_ref="attrs-ref", ) - source_git.lfs_fetch_ref.assert_called_once_with( - "origin", + source_git.lfs_fetch_paths.assert_called_once_with( + "abc123", + ["asset.bin"], + ) + source_git.lfs_push_oids.assert_not_called() + + +def test_mirror_lfs_objects_fetches_exact_comma_path( + tmp_path: Path, +) -> None: + (tmp_path / "asset,with-comma.bin").write_text( + _pointer(OID), + encoding="utf-8", + ) + source_git = MagicMock(spec=GitOps) + peer_git = MagicMock(spec=GitOps) + attributes_git = MagicMock(spec=GitOps) + peer_git.remote_url.return_value = "https://github.com/org/peer.git" + attributes_git.lfs_tracked_paths.return_value = {"asset,with-comma.bin"} + source_git.lfs_missing_oids.return_value = [] + + with patch("repo_sync.workflows.create_sync_prs.os.getpid", return_value=42): + _mirror_lfs_objects( + source_git=source_git, + peer_git=peer_git, + source_ref="abc123", + snapshot_dir=str(tmp_path), + changed_paths=["asset,with-comma.bin"], + attributes_git=attributes_git, + attributes_ref="attrs-ref", + ) + + source_git.lfs_fetch_paths.assert_called_once_with( + "abc123", + ["asset,with-comma.bin"], + ) + + +def test_mirror_lfs_objects_scans_all_pointers_when_attributes_change( + tmp_path: Path, +) -> None: + (tmp_path / ".gitattributes").write_text( + "*.bin filter=lfs\n", + encoding="utf-8", + ) + (tmp_path / "existing.bin").write_text(_pointer(OID), encoding="utf-8") + (tmp_path / "ordinary.txt").write_text("hello\n", encoding="utf-8") + source_git = MagicMock(spec=GitOps) + peer_git = MagicMock(spec=GitOps) + attributes_git = MagicMock(spec=GitOps) + peer_git.remote_url.return_value = "https://github.com/org/peer.git" + attributes_git.lfs_tracked_paths.return_value = {"existing.bin"} + source_git.lfs_missing_oids.return_value = [] + + with patch("repo_sync.workflows.create_sync_prs.os.getpid", return_value=42): + _mirror_lfs_objects( + source_git=source_git, + peer_git=peer_git, + source_ref="abc123", + snapshot_dir=str(tmp_path), + changed_paths=[".gitattributes"], + attributes_git=attributes_git, + attributes_ref="attrs-ref", + ) + + attributes_git.lfs_tracked_paths.assert_called_once_with( + ["existing.bin"], + source_ref="attrs-ref", + ) + source_git.lfs_fetch_paths.assert_called_once_with( "abc123", - include_paths=["asset.bin"], + ["existing.bin"], ) - source_git.lfs_push_oids.assert_not_called() \ No newline at end of file From 7fc79399004b15e8b8f989882a37572dd68976f1 Mon Sep 17 00:00:00 2001 From: David Stern Date: Thu, 14 May 2026 15:00:37 -0400 Subject: [PATCH 5/8] Make sure LFS text objects don't contain repo-sync markers. --- README.md | 2 +- actions/validate-markers/action.yml | 9 +- src/repo_sync/stack/git_ops.py | 20 +++ src/repo_sync/strip/cli.py | 19 +++ src/repo_sync/strip/lfs.py | 165 +++++++++++++++++++++ src/repo_sync/workflows/create_sync_prs.py | 28 +++- tests/stack/test_git_ops.py | 25 ++++ tests/test_cli.py | 12 ++ tests/test_lfs_payload_validation.py | 78 ++++++++++ tests/workflows/test_lfs_mirroring.py | 43 ++++++ 10 files changed, 397 insertions(+), 4 deletions(-) create mode 100644 src/repo_sync/strip/lfs.py create mode 100644 tests/test_lfs_payload_validation.py diff --git a/README.md b/README.md index 2747f95..3a21df1 100644 --- a/README.md +++ b/README.md @@ -87,7 +87,7 @@ jobs: - uses: warpdotdev/repo-sync/actions/validate-markers@main ``` -this validates that all `!repo-sync` markers are properly paired, not nested, and that no symlinks exist in the repo. +this validates that all `!repo-sync` markers are properly paired, not nested, that no symlinks exist in the repo, and that text Git LFS payloads do not contain repo-sync markers. ### step 3: add the sync workflow (both repos) diff --git a/actions/validate-markers/action.yml b/actions/validate-markers/action.yml index 76996cd..ca613fd 100644 --- a/actions/validate-markers/action.yml +++ b/actions/validate-markers/action.yml @@ -1,7 +1,8 @@ name: "Validate repo-sync markers" description: > Validates that all !repo-sync markers are properly paired and non-nested, - and that no symlinks target private/ directories or escape the repository root. + that no symlinks target private/ directories or escape the repository root, + and that text Git LFS payloads do not contain repo-sync markers. inputs: paths: @@ -23,12 +24,16 @@ runs: shell: bash run: pip install "${{ github.action_path }}/../../" + - name: Configure Git LFS + shell: bash + run: git lfs install --local + - name: Run marker validation shell: bash env: PATHS_INPUT: ${{ inputs.paths }} run: | - args=("--validate-only" "${{ github.workspace }}") + args=("--validate-only" "--validate-lfs-payloads" "${{ github.workspace }}") if [ -n "$PATHS_INPUT" ]; then for p in $PATHS_INPUT; do args+=("$p") diff --git a/src/repo_sync/stack/git_ops.py b/src/repo_sync/stack/git_ops.py index 24d57e8..a294760 100644 --- a/src/repo_sync/stack/git_ops.py +++ b/src/repo_sync/stack/git_ops.py @@ -447,6 +447,26 @@ def lfs_fetch_paths(self, ref: str, paths: list[str]) -> None: result.stderr, ) + def lfs_write_path(self, ref: str, path: str, output_path: str) -> None: + """Write the LFS-smudged content for an exact path at a ref.""" + env = {**os.environ, **self._env_additions} if self._env_additions else None + command = ["git", "cat-file", "--filters", f"{ref}:{path}"] + with open(output_path, "wb") as output: + result = subprocess.run( + command, + cwd=self.repo_dir, + stdout=output, + stderr=subprocess.PIPE, + env=env, + ) + if result.returncode != 0: + raise VerboseCalledProcessError( + result.returncode, + command, + b"", + result.stderr, + ) + def lfs_missing_oids(self, oids: list[str]) -> list[str]: """Return LFS object IDs that are not present in the local LFS store.""" if not oids: diff --git a/src/repo_sync/strip/cli.py b/src/repo_sync/strip/cli.py index 2da4e07..d876434 100644 --- a/src/repo_sync/strip/cli.py +++ b/src/repo_sync/strip/cli.py @@ -14,6 +14,7 @@ import argparse import sys +from repo_sync.strip.lfs import validate_lfs_payloads from repo_sync.strip.tree import StrippingError, strip_tree @@ -41,6 +42,12 @@ def main(argv: list[str] | None = None) -> int: default=False, help="Only validate markers; do not modify files.", ) + parser.add_argument( + "--validate-lfs-payloads", + action="store_true", + default=False, + help="Validate that Git LFS payloads do not contain repo-sync markers.", + ) args = parser.parse_args(argv) @@ -48,6 +55,12 @@ def main(argv: list[str] | None = None) -> int: if args.validate_only: result = strip_tree(args.directory, validate_only=True, paths=paths) + if args.validate_lfs_payloads: + lfs_result = validate_lfs_payloads(args.directory, paths=paths) + result = result._replace( + errors=[*result.errors, *lfs_result.errors], + warnings=[*result.warnings, *lfs_result.warnings], + ) for w in result.warnings: print(f"warning: {w}", file=sys.stderr) if result.errors: @@ -55,6 +68,12 @@ def main(argv: list[str] | None = None) -> int: print(err, file=sys.stderr) return 1 return 0 + if args.validate_lfs_payloads: + print( + "--validate-lfs-payloads requires --validate-only", + file=sys.stderr, + ) + return 1 try: result = strip_tree(args.directory) diff --git a/src/repo_sync/strip/lfs.py b/src/repo_sync/strip/lfs.py new file mode 100644 index 0000000..b59e821 --- /dev/null +++ b/src/repo_sync/strip/lfs.py @@ -0,0 +1,165 @@ +"""Git LFS payload validation for repo-sync markers.""" + +from __future__ import annotations + +import json +import os +from pathlib import Path, PurePosixPath +import subprocess +import tempfile + +from repo_sync.strip.detect import is_binary +from repo_sync.strip.markers import ( + MarkerError, + has_private_file_marker, + strip_private_regions, + validate_markers, +) +from repo_sync.strip.tree import StripResult, _expand_paths + + +def validate_lfs_payload_file(payload_path: str, *, filepath: str) -> list[str]: + """Return errors if an LFS payload contains repo-sync private markers.""" + if is_binary(payload_path): + return [] + + try: + text = Path(payload_path).read_bytes().decode("utf-8") + except UnicodeDecodeError: + return [] + + lines = text.splitlines(keepends=True) + errors = validate_markers(lines, filepath=filepath) + if errors: + return [ + f"{error} (Git LFS payloads cannot contain repo-sync private markers)" + for error in errors + ] + + if has_private_file_marker(lines): + return [ + f"{filepath}: Git LFS payload contains a private-file marker; " + "LFS payloads cannot be stripped during repo sync" + ] + + try: + stripped = strip_private_regions(lines, filepath=filepath) + except MarkerError as exc: + return [str(exc)] + + if stripped != lines: + return [ + f"{filepath}: Git LFS payload contains private region markers; " + "LFS payloads cannot be stripped during repo sync" + ] + + return [] + + +def validate_lfs_payloads( + root: str, + *, + paths: list[str] | None = None, + ref: str = "HEAD", +) -> StripResult: + """Validate repo-sync marker invariants for Git LFS payloads in a repo.""" + errors: list[str] = [] + warnings: list[str] = [] + try: + lfs_paths = _git_lfs_paths(root, ref) + except subprocess.CalledProcessError as exc: + stderr = ( + exc.stderr + if isinstance(exc.stderr, str) + else (exc.stderr or b"").decode("utf-8", errors="replace") + ) + return StripResult( + [f"failed to list Git LFS files: {stderr.strip() or exc}"], + warnings, + ) + + selected_paths = _select_lfs_paths(root, lfs_paths, paths) + if not selected_paths: + return StripResult(errors, warnings) + + with tempfile.TemporaryDirectory(prefix="repo-sync-lfs-validate-") as temp_dir: + for index, relpath in enumerate(selected_paths): + payload_path = os.path.join(temp_dir, f"payload-{index}") + try: + _write_lfs_payload(root, ref, relpath, payload_path) + except subprocess.CalledProcessError as exc: + stderr = ( + exc.stderr + if isinstance(exc.stderr, str) + else (exc.stderr or b"").decode("utf-8", errors="replace") + ) + errors.append( + f"{relpath}: failed to materialize Git LFS payload: " + f"{stderr.strip() or exc}" + ) + continue + errors.extend( + validate_lfs_payload_file(payload_path, filepath=relpath) + ) + + return StripResult(errors, warnings) + + +def _git_lfs_paths(root: str, ref: str) -> list[str]: + """Return Git LFS paths present at a ref.""" + result = subprocess.run( + ["git", "lfs", "ls-files", "--json", ref], + cwd=root, + capture_output=True, + text=True, + check=True, + ) + data = json.loads(result.stdout or "{}") + files = data.get("files") or [] + paths: list[str] = [] + for entry in files: + path = entry.get("name") or entry.get("path") + if path: + paths.append(path) + return sorted(paths) + + +def _select_lfs_paths( + root: str, + lfs_paths: list[str], + paths: list[str] | None, +) -> list[str]: + """Return the LFS paths selected by the action's path filters.""" + if paths is None: + return lfs_paths + if any( + path == ".gitattributes" or path.endswith("/.gitattributes") + for path in paths + ): + return lfs_paths + + expanded = {os.path.relpath(path, root) for path in _expand_paths(root, paths)} + selected: list[str] = [] + for lfs_path in lfs_paths: + if lfs_path in expanded or _matches_any_pattern(lfs_path, paths): + selected.append(lfs_path) + return selected + + +def _matches_any_pattern(path: str, patterns: list[str]) -> bool: + """Return True if a repo-relative path matches any glob pattern.""" + pure_path = PurePosixPath(path) + return any(pure_path.match(pattern) for pattern in patterns) + + +def _write_lfs_payload(root: str, ref: str, relpath: str, output_path: str) -> None: + """Write the LFS-smudged payload for a path to a temporary file.""" + command = ["git", "cat-file", "--filters", f"{ref}:{relpath}"] + with open(output_path, "wb") as output: + subprocess.run( + command, + cwd=root, + stdout=output, + stderr=subprocess.PIPE, + check=True, + ) diff --git a/src/repo_sync/workflows/create_sync_prs.py b/src/repo_sync/workflows/create_sync_prs.py index 1a4fdfa..27e10bc 100644 --- a/src/repo_sync/workflows/create_sync_prs.py +++ b/src/repo_sync/workflows/create_sync_prs.py @@ -27,12 +27,13 @@ from repo_sync.stack.gh_ops import GhOps from repo_sync.stack.git_ops import GitOps -from repo_sync.stack.lfs import collect_lfs_pointers +from repo_sync.stack.lfs import LfsPointer, collect_lfs_pointers from repo_sync.stack.trailers import ( SyncOrigin, append_trailer, format_conflict_trailer, ) +from repo_sync.strip.lfs import validate_lfs_payload_file from repo_sync.workflows.conflict import ( add_conflict_label, assign_conflict_reviewer, @@ -152,6 +153,7 @@ def _mirror_lfs_objects( changed_paths: list[str], attributes_git: GitOps, attributes_ref: str, + validate_payload_markers: bool = False, ) -> None: """Mirror LFS objects referenced by changed pointer files in a snapshot.""" scan_paths = None if _lfs_attributes_changed(changed_paths) else changed_paths @@ -198,6 +200,8 @@ def _mirror_lfs_objects( "Missing Git LFS object(s) after fetch: " + ", ".join(sorted(missing_oids)) ) + if validate_payload_markers: + _validate_lfs_payload_markers(source_git, source_ref, pointers) target_remote = f"repo_sync_lfs_target_{os.getpid()}" peer_url = peer_git.remote_url("origin") @@ -216,6 +220,27 @@ def _lfs_attributes_changed(changed_paths: list[str]) -> bool: ) +def _validate_lfs_payload_markers( + source_git: GitOps, + source_ref: str, + pointers: list[LfsPointer], +) -> None: + """Reject text LFS payloads that rely on repo-sync stripping markers.""" + errors: list[str] = [] + with tempfile.TemporaryDirectory(prefix="repo-sync-lfs-payload-") as temp_dir: + for index, pointer in enumerate(pointers): + payload_path = os.path.join(temp_dir, f"payload-{index}") + source_git.lfs_write_path(source_ref, pointer.path, payload_path) + errors.extend( + validate_lfs_payload_file(payload_path, filepath=pointer.path) + ) + if errors: + raise PermanentSyncError( + "Git LFS payloads cannot contain repo-sync private markers:\n" + + "\n".join(errors) + ) + + # Docker image for the PR description agent. Built locally by the sync # workflow from docker/pr-description/Dockerfile (not pushed to a registry). # Can be overridden via environment variable for testing. @@ -624,6 +649,7 @@ def _sync_private_to_public( changed_paths=changed_paths, attributes_git=GitOps(diff_repo), attributes_ref="HEAD", + validate_payload_markers=True, ) # Build the PR description (identical for conflict and non-conflict). diff --git a/tests/stack/test_git_ops.py b/tests/stack/test_git_ops.py index ca40183..e0019d4 100644 --- a/tests/stack/test_git_ops.py +++ b/tests/stack/test_git_ops.py @@ -45,3 +45,28 @@ def test_lfs_fetch_paths_uses_cat_file_filters_for_exact_paths( "--filters", "abc123:asset,with-comma.bin", ] + + +def test_lfs_write_path_uses_cat_file_filters_for_exact_path( + tmp_git_repo: GitOps, + tmp_path: Path, +) -> None: + result = MagicMock() + result.returncode = 0 + result.stderr = b"" + output_path = tmp_path / "payload" + + with patch("repo_sync.stack.git_ops.subprocess.run", return_value=result) as run: + tmp_git_repo.lfs_write_path( + "abc123", + "asset,with-comma.bin", + str(output_path), + ) + + run.assert_called_once() + assert run.call_args.args[0] == [ + "git", + "cat-file", + "--filters", + "abc123:asset,with-comma.bin", + ] diff --git a/tests/test_cli.py b/tests/test_cli.py index 8bc74cf..4568c9c 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -3,8 +3,10 @@ from __future__ import annotations from pathlib import Path +from unittest.mock import patch from repo_sync.strip.cli import main +from repo_sync.strip.tree import StripResult def _write(root: Path, relpath: str, content: str) -> Path: @@ -62,3 +64,13 @@ def test_validate_only_with_paths(self, tmp_path: Path) -> None: # Only validate good.rs. rc = main(["--validate-only", str(tmp_path), "good.rs"]) assert rc == 0 + + def test_validate_only_with_lfs_payload_errors(self, tmp_path: Path) -> None: + """CLI --validate-only can include Git LFS payload marker validation.""" + with patch( + "repo_sync.strip.cli.validate_lfs_payloads", + return_value=StripResult(["asset.txt: LFS marker error"], []), + ): + rc = main(["--validate-only", "--validate-lfs-payloads", str(tmp_path)]) + + assert rc == 1 diff --git a/tests/test_lfs_payload_validation.py b/tests/test_lfs_payload_validation.py new file mode 100644 index 0000000..3914ce2 --- /dev/null +++ b/tests/test_lfs_payload_validation.py @@ -0,0 +1,78 @@ +"""Tests for validating repo-sync markers in Git LFS payloads.""" + +from __future__ import annotations + +import subprocess +from pathlib import Path +from unittest.mock import patch + +from repo_sync.strip.lfs import validate_lfs_payload_file, validate_lfs_payloads + + +def test_validate_lfs_payload_file_rejects_private_region_markers( + tmp_path: Path, +) -> None: + payload = tmp_path / "payload.txt" + payload.write_text( + "public\n" + "# !repo-sync: private-start\n" + "secret\n" + "# !repo-sync: private-end\n", + encoding="utf-8", + ) + + errors = validate_lfs_payload_file(str(payload), filepath="asset.txt") + + assert len(errors) == 1 + assert "asset.txt" in errors[0] + assert "private region markers" in errors[0] + + +def test_validate_lfs_payload_file_allows_binary_marker_bytes( + tmp_path: Path, +) -> None: + payload = tmp_path / "payload.bin" + payload.write_bytes(b"\x00!repo-sync: private-file") + + assert validate_lfs_payload_file(str(payload), filepath="asset.bin") == [] + + +def test_validate_lfs_payload_file_allows_marker_substrings( + tmp_path: Path, +) -> None: + payload = tmp_path / "payload.txt" + payload.write_text( + "Docs mention !repo-sync: private-start/end as prose.\n", + encoding="utf-8", + ) + + assert validate_lfs_payload_file(str(payload), filepath="asset.txt") == [] + + +def test_validate_lfs_payloads_materializes_lfs_paths( + tmp_path: Path, +) -> None: + def run(command: list[str], **kwargs: object) -> subprocess.CompletedProcess: + if command[:4] == ["git", "lfs", "ls-files", "--json"]: + return subprocess.CompletedProcess( + command, + 0, + stdout='{"files":[{"name":"asset.txt"}]}', + stderr="", + ) + if command[:3] == ["git", "cat-file", "--filters"]: + stdout = kwargs["stdout"] + stdout.write( + b"public\n" + b"# !repo-sync: private-start\n" + b"secret\n" + b"# !repo-sync: private-end\n" + ) + return subprocess.CompletedProcess(command, 0, stderr=b"") + raise AssertionError(f"unexpected command: {command}") + + with patch("repo_sync.strip.lfs.subprocess.run", side_effect=run): + result = validate_lfs_payloads(str(tmp_path)) + + assert len(result.errors) == 1 + assert "asset.txt" in result.errors[0] diff --git a/tests/workflows/test_lfs_mirroring.py b/tests/workflows/test_lfs_mirroring.py index 3028062..6e24e40 100644 --- a/tests/workflows/test_lfs_mirroring.py +++ b/tests/workflows/test_lfs_mirroring.py @@ -196,6 +196,49 @@ def test_mirror_lfs_objects_fails_when_fetched_object_is_missing( source_git.lfs_push_oids.assert_not_called() +def test_mirror_lfs_objects_fails_before_push_for_lfs_payload_markers( + tmp_path: Path, +) -> None: + (tmp_path / "asset.txt").write_text(_pointer(OID), encoding="utf-8") + source_git = MagicMock(spec=GitOps) + peer_git = MagicMock(spec=GitOps) + attributes_git = MagicMock(spec=GitOps) + attributes_git.lfs_tracked_paths.return_value = {"asset.txt"} + source_git.lfs_missing_oids.return_value = [] + + def write_payload(_ref: str, _path: str, output_path: str) -> None: + Path(output_path).write_text( + "public\n" + "# !repo-sync: private-start\n" + "secret\n" + "# !repo-sync: private-end\n", + encoding="utf-8", + ) + + source_git.lfs_write_path.side_effect = write_payload + + with pytest.raises(PermanentSyncError, match="asset.txt"): + _mirror_lfs_objects( + source_git=source_git, + peer_git=peer_git, + source_ref="abc123", + snapshot_dir=str(tmp_path), + changed_paths=["asset.txt"], + attributes_git=attributes_git, + attributes_ref="attrs-ref", + validate_payload_markers=True, + ) + + source_git.lfs_fetch_paths.assert_called_once_with( + "abc123", + ["asset.txt"], + ) + source_git.lfs_write_path.assert_called_once() + peer_git.remote_url.assert_not_called() + source_git.remote_add_or_update.assert_not_called() + source_git.lfs_push_oids.assert_not_called() + + def test_mirror_lfs_objects_fetches_exact_comma_path( tmp_path: Path, ) -> None: From 764470a24f8c7f231112397a4e8d9728bee182c6 Mon Sep 17 00:00:00 2001 From: David Stern Date: Thu, 14 May 2026 23:35:15 -0400 Subject: [PATCH 6/8] agentic pr review --- README.md | 8 ++++++++ actions/validate-markers/action.yml | 6 ++---- src/repo_sync/stack/git_ops.py | 12 +++++++++-- src/repo_sync/strip/cli.py | 30 +++++++++++++++++++++++++++- src/repo_sync/strip/lfs.py | 2 ++ tests/stack/test_git_ops.py | 21 +++++++++++++++++++ tests/test_cli.py | 26 ++++++++++++++++++++++++ tests/test_lfs_payload_validation.py | 1 + 8 files changed, 99 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 3a21df1..bfc00a4 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,14 @@ jobs: this validates that all `!repo-sync` markers are properly paired, not nested, that no symlinks exist in the repo, and that text Git LFS payloads do not contain repo-sync markers. +to validate only selected paths, pass `paths` as a JSON array so file names containing spaces or shell metacharacters are preserved exactly: + +```yaml +- uses: warpdotdev/repo-sync/actions/validate-markers@main + with: + paths: '["src/file with spaces.txt", "assets/*.bin"]' +``` + ### step 3: add the sync workflow (both repos) add the sync workflow to both repos. see [examples/consuming-repo-sync.yml](examples/consuming-repo-sync.yml) for a complete example. the key pieces are: diff --git a/actions/validate-markers/action.yml b/actions/validate-markers/action.yml index ca613fd..76ec395 100644 --- a/actions/validate-markers/action.yml +++ b/actions/validate-markers/action.yml @@ -7,7 +7,7 @@ description: > inputs: paths: description: > - Space-separated list of relative file paths or globs to validate. + JSON array of relative file paths or globs to validate. Defaults to all files in the repository. required: false default: "" @@ -35,8 +35,6 @@ runs: run: | args=("--validate-only" "--validate-lfs-payloads" "${{ github.workspace }}") if [ -n "$PATHS_INPUT" ]; then - for p in $PATHS_INPUT; do - args+=("$p") - done + args+=("--paths-json" "$PATHS_INPUT") fi repo-sync-strip "${args[@]}" diff --git a/src/repo_sync/stack/git_ops.py b/src/repo_sync/stack/git_ops.py index a294760..42fb074 100644 --- a/src/repo_sync/stack/git_ops.py +++ b/src/repo_sync/stack/git_ops.py @@ -429,7 +429,11 @@ def lfs_tracked_paths( def lfs_fetch_paths(self, ref: str, paths: list[str]) -> None: """Fetch LFS objects for exact paths at a ref.""" - env = {**os.environ, **self._env_additions} if self._env_additions else None + env = { + **os.environ, + **self._env_additions, + "GIT_ATTR_SOURCE": ref, + } for path in paths: result = subprocess.run( ["git", "cat-file", "--filters", f"{ref}:{path}"], @@ -449,7 +453,11 @@ def lfs_fetch_paths(self, ref: str, paths: list[str]) -> None: def lfs_write_path(self, ref: str, path: str, output_path: str) -> None: """Write the LFS-smudged content for an exact path at a ref.""" - env = {**os.environ, **self._env_additions} if self._env_additions else None + env = { + **os.environ, + **self._env_additions, + "GIT_ATTR_SOURCE": ref, + } command = ["git", "cat-file", "--filters", f"{ref}:{path}"] with open(output_path, "wb") as output: result = subprocess.run( diff --git a/src/repo_sync/strip/cli.py b/src/repo_sync/strip/cli.py index d876434..ec32497 100644 --- a/src/repo_sync/strip/cli.py +++ b/src/repo_sync/strip/cli.py @@ -12,6 +12,7 @@ from __future__ import annotations import argparse +import json import sys from repo_sync.strip.lfs import validate_lfs_payloads @@ -36,6 +37,14 @@ def main(argv: list[str] | None = None) -> int: "(only meaningful with --validate-only)." ), ) + parser.add_argument( + "--paths-json", + default="", + help=( + "JSON array of relative file paths to validate " + "(only meaningful with --validate-only)." + ), + ) parser.add_argument( "--validate-only", action="store_true", @@ -51,7 +60,26 @@ def main(argv: list[str] | None = None) -> int: args = parser.parse_args(argv) - paths = args.paths if args.paths else None + if args.paths_json: + try: + paths_json = json.loads(args.paths_json) + except json.JSONDecodeError as exc: + print(f"--paths-json must be a JSON array: {exc}", file=sys.stderr) + return 1 + if not isinstance(paths_json, list) or not all( + isinstance(path, str) for path in paths_json + ): + print("--paths-json must be a JSON array of strings", file=sys.stderr) + return 1 + if args.paths: + print( + "--paths-json cannot be combined with positional paths", + file=sys.stderr, + ) + return 1 + paths = paths_json if paths_json else None + else: + paths = args.paths if args.paths else None if args.validate_only: result = strip_tree(args.directory, validate_only=True, paths=paths) diff --git a/src/repo_sync/strip/lfs.py b/src/repo_sync/strip/lfs.py index b59e821..ede3ec0 100644 --- a/src/repo_sync/strip/lfs.py +++ b/src/repo_sync/strip/lfs.py @@ -155,11 +155,13 @@ def _matches_any_pattern(path: str, patterns: list[str]) -> bool: def _write_lfs_payload(root: str, ref: str, relpath: str, output_path: str) -> None: """Write the LFS-smudged payload for a path to a temporary file.""" command = ["git", "cat-file", "--filters", f"{ref}:{relpath}"] + env = {**os.environ, "GIT_ATTR_SOURCE": ref} with open(output_path, "wb") as output: subprocess.run( command, cwd=root, stdout=output, stderr=subprocess.PIPE, + env=env, check=True, ) diff --git a/tests/stack/test_git_ops.py b/tests/stack/test_git_ops.py index e0019d4..ec13f50 100644 --- a/tests/stack/test_git_ops.py +++ b/tests/stack/test_git_ops.py @@ -45,6 +45,26 @@ def test_lfs_fetch_paths_uses_cat_file_filters_for_exact_paths( "--filters", "abc123:asset,with-comma.bin", ] + assert run.call_args.kwargs["env"]["GIT_ATTR_SOURCE"] == "abc123" + + +def test_lfs_write_path_uses_attributes_from_ref(tmp_git_repo: GitOps) -> None: + repo_dir = Path(tmp_git_repo.repo_dir) + tmp_git_repo._run(["lfs", "install", "--local"]) + tmp_git_repo._run(["lfs", "track", "*.txt"]) + (repo_dir / "asset.txt").write_text("payload\n", encoding="utf-8") + tmp_git_repo._run(["add", ".gitattributes", "asset.txt"]) + tmp_git_repo._run(["commit", "-m", "add lfs asset"]) + lfs_ref = tmp_git_repo.rev_parse("HEAD") + + (repo_dir / ".gitattributes").unlink() + tmp_git_repo._run(["add", ".gitattributes"]) + tmp_git_repo._run(["commit", "-m", "stop tracking txt files"]) + + output_path = repo_dir / "payload.out" + tmp_git_repo.lfs_write_path(lfs_ref, "asset.txt", str(output_path)) + + assert output_path.read_text(encoding="utf-8") == "payload\n" def test_lfs_write_path_uses_cat_file_filters_for_exact_path( @@ -70,3 +90,4 @@ def test_lfs_write_path_uses_cat_file_filters_for_exact_path( "--filters", "abc123:asset,with-comma.bin", ] + assert run.call_args.kwargs["env"]["GIT_ATTR_SOURCE"] == "abc123" diff --git a/tests/test_cli.py b/tests/test_cli.py index 4568c9c..74ea8ed 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -74,3 +74,29 @@ def test_validate_only_with_lfs_payload_errors(self, tmp_path: Path) -> None: rc = main(["--validate-only", "--validate-lfs-payloads", str(tmp_path)]) assert rc == 1 + + def test_validate_only_with_paths_json_preserves_spaces( + self, + tmp_path: Path, + ) -> None: + """CLI --paths-json preserves exact file paths with whitespace.""" + _write(tmp_path, "a b.txt", "hello\n") + with patch( + "repo_sync.strip.cli.validate_lfs_payloads", + return_value=StripResult([], []), + ) as validate_lfs_payloads: + rc = main( + [ + "--validate-only", + "--validate-lfs-payloads", + str(tmp_path), + "--paths-json", + '["a b.txt"]', + ] + ) + + assert rc == 0 + validate_lfs_payloads.assert_called_once_with( + str(tmp_path), + paths=["a b.txt"], + ) diff --git a/tests/test_lfs_payload_validation.py b/tests/test_lfs_payload_validation.py index 3914ce2..23506bb 100644 --- a/tests/test_lfs_payload_validation.py +++ b/tests/test_lfs_payload_validation.py @@ -61,6 +61,7 @@ def run(command: list[str], **kwargs: object) -> subprocess.CompletedProcess: stderr="", ) if command[:3] == ["git", "cat-file", "--filters"]: + assert kwargs["env"]["GIT_ATTR_SOURCE"] == "HEAD" stdout = kwargs["stdout"] stdout.write( b"public\n" From 1a3baf3683e1edb418db7b82424919b7125bcc0f Mon Sep 17 00:00:00 2001 From: David Stern Date: Fri, 15 May 2026 00:13:35 -0400 Subject: [PATCH 7/8] agentic pr review --- .github/workflows/sync.yml | 4 ++ src/repo_sync/stack/git_ops.py | 56 ++++++++++++-------- src/repo_sync/strip/lfs.py | 45 ++++++++++++---- src/repo_sync/workflows/create_sync_prs.py | 10 +++- tests/stack/test_git_ops.py | 35 ++++++++++++ tests/test_lfs_payload_validation.py | 35 ++++++++++++ tests/workflows/test_lfs_mirroring.py | 13 ++++- tests/workflows/test_sync_workflow_config.py | 16 ++++++ 8 files changed, 181 insertions(+), 33 deletions(-) create mode 100644 tests/workflows/test_sync_workflow_config.py diff --git a/.github/workflows/sync.yml b/.github/workflows/sync.yml index 614facf..c99cbe7 100644 --- a/.github/workflows/sync.yml +++ b/.github/workflows/sync.yml @@ -132,6 +132,10 @@ jobs: - name: Build conflict resolution agent image run: docker build -f .repo-sync/docker/conflict-resolution/Dockerfile -t repo-sync-conflict-resolution .repo-sync + - name: Configure Git LFS + run: | + git lfs install --local + git -C peer lfs install --local - name: Check if SSH key is provided id: check-ssh-key diff --git a/src/repo_sync/stack/git_ops.py b/src/repo_sync/stack/git_ops.py index 42fb074..4e1e59f 100644 --- a/src/repo_sync/stack/git_ops.py +++ b/src/repo_sync/stack/git_ops.py @@ -4,10 +4,12 @@ import os import subprocess +import tempfile from dataclasses import dataclass from pathlib import Path from repo_sync.errors import VerboseCalledProcessError +from repo_sync.stack.lfs import parse_lfs_pointer @dataclass @@ -427,31 +429,29 @@ def lfs_tracked_paths( tracked_paths.add(path) return tracked_paths - def lfs_fetch_paths(self, ref: str, paths: list[str]) -> None: + def lfs_fetch_paths( + self, + ref: str, + paths: list[str], + expected_oids: dict[str, str] | None = None, + ) -> None: """Fetch LFS objects for exact paths at a ref.""" - env = { - **os.environ, - **self._env_additions, - "GIT_ATTR_SOURCE": ref, - } for path in paths: - result = subprocess.run( - ["git", "cat-file", "--filters", f"{ref}:{path}"], - cwd=self.repo_dir, - stdout=subprocess.DEVNULL, - stderr=subprocess.PIPE, - text=True, - env=env, - ) - if result.returncode != 0: - raise VerboseCalledProcessError( - result.returncode, - ["git", "cat-file", "--filters", f"{ref}:{path}"], - "", - result.stderr, + with tempfile.NamedTemporaryFile() as output: + self.lfs_write_path( + ref, + path, + output.name, + expected_oid=(expected_oids or {}).get(path), ) - def lfs_write_path(self, ref: str, path: str, output_path: str) -> None: + def lfs_write_path( + self, + ref: str, + path: str, + output_path: str, + expected_oid: str | None = None, + ) -> None: """Write the LFS-smudged content for an exact path at a ref.""" env = { **os.environ, @@ -474,6 +474,20 @@ def lfs_write_path(self, ref: str, path: str, output_path: str) -> None: b"", result.stderr, ) + pointer = parse_lfs_pointer(Path(output_path).read_bytes(), path) + if pointer is not None and ( + expected_oid is None or pointer.oid == expected_oid + ): + raise VerboseCalledProcessError( + 1, + command, + b"", + ( + "git cat-file --filters returned an LFS pointer instead " + "of payload bytes. Ensure Git LFS filters are configured " + "with `git lfs install --local`." + ), + ) def lfs_missing_oids(self, oids: list[str]) -> list[str]: """Return LFS object IDs that are not present in the local LFS store.""" diff --git a/src/repo_sync/strip/lfs.py b/src/repo_sync/strip/lfs.py index ede3ec0..6d5d2d2 100644 --- a/src/repo_sync/strip/lfs.py +++ b/src/repo_sync/strip/lfs.py @@ -8,6 +8,7 @@ import subprocess import tempfile +from repo_sync.stack.lfs import parse_lfs_pointer from repo_sync.strip.detect import is_binary from repo_sync.strip.markers import ( MarkerError, @@ -66,7 +67,7 @@ def validate_lfs_payloads( errors: list[str] = [] warnings: list[str] = [] try: - lfs_paths = _git_lfs_paths(root, ref) + lfs_oids_by_path = _git_lfs_paths(root, ref) except subprocess.CalledProcessError as exc: stderr = ( exc.stderr @@ -78,7 +79,7 @@ def validate_lfs_payloads( warnings, ) - selected_paths = _select_lfs_paths(root, lfs_paths, paths) + selected_paths = _select_lfs_paths(root, sorted(lfs_oids_by_path), paths) if not selected_paths: return StripResult(errors, warnings) @@ -86,7 +87,13 @@ def validate_lfs_payloads( for index, relpath in enumerate(selected_paths): payload_path = os.path.join(temp_dir, f"payload-{index}") try: - _write_lfs_payload(root, ref, relpath, payload_path) + _write_lfs_payload( + root, + ref, + relpath, + payload_path, + expected_oid=lfs_oids_by_path.get(relpath), + ) except subprocess.CalledProcessError as exc: stderr = ( exc.stderr @@ -105,8 +112,8 @@ def validate_lfs_payloads( return StripResult(errors, warnings) -def _git_lfs_paths(root: str, ref: str) -> list[str]: - """Return Git LFS paths present at a ref.""" +def _git_lfs_paths(root: str, ref: str) -> dict[str, str | None]: + """Return Git LFS paths and object IDs present at a ref.""" result = subprocess.run( ["git", "lfs", "ls-files", "--json", ref], cwd=root, @@ -116,12 +123,13 @@ def _git_lfs_paths(root: str, ref: str) -> list[str]: ) data = json.loads(result.stdout or "{}") files = data.get("files") or [] - paths: list[str] = [] + paths: dict[str, str | None] = {} for entry in files: path = entry.get("name") or entry.get("path") + oid = entry.get("oid") if path: - paths.append(path) - return sorted(paths) + paths[path] = oid if isinstance(oid, str) else None + return paths def _select_lfs_paths( @@ -152,7 +160,13 @@ def _matches_any_pattern(path: str, patterns: list[str]) -> bool: return any(pure_path.match(pattern) for pattern in patterns) -def _write_lfs_payload(root: str, ref: str, relpath: str, output_path: str) -> None: +def _write_lfs_payload( + root: str, + ref: str, + relpath: str, + output_path: str, + expected_oid: str | None = None, +) -> None: """Write the LFS-smudged payload for a path to a temporary file.""" command = ["git", "cat-file", "--filters", f"{ref}:{relpath}"] env = {**os.environ, "GIT_ATTR_SOURCE": ref} @@ -165,3 +179,16 @@ def _write_lfs_payload(root: str, ref: str, relpath: str, output_path: str) -> N env=env, check=True, ) + pointer = parse_lfs_pointer(Path(output_path).read_bytes(), relpath) + if pointer is not None and ( + expected_oid is None or pointer.oid == expected_oid + ): + raise subprocess.CalledProcessError( + 1, + command, + stderr=( + "git cat-file --filters returned an LFS pointer instead " + "of payload bytes. Ensure Git LFS filters are configured " + "with `git lfs install --local`." + ), + ) diff --git a/src/repo_sync/workflows/create_sync_prs.py b/src/repo_sync/workflows/create_sync_prs.py index 27e10bc..ba45107 100644 --- a/src/repo_sync/workflows/create_sync_prs.py +++ b/src/repo_sync/workflows/create_sync_prs.py @@ -193,7 +193,8 @@ def _mirror_lfs_objects( ) fetch_paths = sorted({pointer.path for pointer in pointers}) - source_git.lfs_fetch_paths(source_ref, fetch_paths) + expected_oids = {pointer.path: pointer.oid for pointer in pointers} + source_git.lfs_fetch_paths(source_ref, fetch_paths, expected_oids=expected_oids) missing_oids = source_git.lfs_missing_oids(oids) if missing_oids: raise PermanentSyncError( @@ -230,7 +231,12 @@ def _validate_lfs_payload_markers( with tempfile.TemporaryDirectory(prefix="repo-sync-lfs-payload-") as temp_dir: for index, pointer in enumerate(pointers): payload_path = os.path.join(temp_dir, f"payload-{index}") - source_git.lfs_write_path(source_ref, pointer.path, payload_path) + source_git.lfs_write_path( + source_ref, + pointer.path, + payload_path, + expected_oid=pointer.oid, + ) errors.extend( validate_lfs_payload_file(payload_path, filepath=pointer.path) ) diff --git a/tests/stack/test_git_ops.py b/tests/stack/test_git_ops.py index ec13f50..d213bf5 100644 --- a/tests/stack/test_git_ops.py +++ b/tests/stack/test_git_ops.py @@ -5,6 +5,9 @@ from pathlib import Path from unittest.mock import MagicMock, patch +import pytest + +from repo_sync.errors import VerboseCalledProcessError from repo_sync.stack.git_ops import GitOps @@ -48,6 +51,38 @@ def test_lfs_fetch_paths_uses_cat_file_filters_for_exact_paths( assert run.call_args.kwargs["env"]["GIT_ATTR_SOURCE"] == "abc123" +def test_lfs_write_path_fails_when_filters_return_expected_pointer( + tmp_git_repo: GitOps, + tmp_path: Path, +) -> None: + oid = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + result = MagicMock() + result.returncode = 0 + result.stderr = b"" + output_path = tmp_path / "payload" + + def run(_command: list[str], **kwargs: object) -> MagicMock: + stdout = kwargs["stdout"] + stdout.write( + b"version https://git-lfs.github.com/spec/v1\n" + + f"oid sha256:{oid}\n".encode() + + b"size 1234\n" + ) + return result + + with patch("repo_sync.stack.git_ops.subprocess.run", side_effect=run): + with pytest.raises( + VerboseCalledProcessError, + match="git lfs install --local", + ): + tmp_git_repo.lfs_write_path( + "abc123", + "asset.bin", + str(output_path), + expected_oid=oid, + ) + + def test_lfs_write_path_uses_attributes_from_ref(tmp_git_repo: GitOps) -> None: repo_dir = Path(tmp_git_repo.repo_dir) tmp_git_repo._run(["lfs", "install", "--local"]) diff --git a/tests/test_lfs_payload_validation.py b/tests/test_lfs_payload_validation.py index 23506bb..b1c85b7 100644 --- a/tests/test_lfs_payload_validation.py +++ b/tests/test_lfs_payload_validation.py @@ -77,3 +77,38 @@ def run(command: list[str], **kwargs: object) -> subprocess.CompletedProcess: assert len(result.errors) == 1 assert "asset.txt" in result.errors[0] + + +def test_validate_lfs_payloads_fails_when_filters_return_pointer( + tmp_path: Path, +) -> None: + oid = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + + def run(command: list[str], **kwargs: object) -> subprocess.CompletedProcess: + if command[:4] == ["git", "lfs", "ls-files", "--json"]: + return subprocess.CompletedProcess( + command, + 0, + stdout=( + '{"files":[{"name":"asset.bin",' + f'"oid":"{oid}"' + "}]}" + ), + stderr="", + ) + if command[:3] == ["git", "cat-file", "--filters"]: + stdout = kwargs["stdout"] + stdout.write( + b"version https://git-lfs.github.com/spec/v1\n" + + f"oid sha256:{oid}\n".encode() + + b"size 1234\n" + ) + return subprocess.CompletedProcess(command, 0, stderr=b"") + raise AssertionError(f"unexpected command: {command}") + + with patch("repo_sync.strip.lfs.subprocess.run", side_effect=run): + result = validate_lfs_payloads(str(tmp_path)) + + assert len(result.errors) == 1 + assert "asset.bin" in result.errors[0] + assert "git lfs install --local" in result.errors[0] diff --git a/tests/workflows/test_lfs_mirroring.py b/tests/workflows/test_lfs_mirroring.py index 6e24e40..501ad48 100644 --- a/tests/workflows/test_lfs_mirroring.py +++ b/tests/workflows/test_lfs_mirroring.py @@ -59,6 +59,7 @@ def test_mirror_lfs_objects_pushes_changed_pointer_oids( source_git.lfs_fetch_paths.assert_called_once_with( "abc123", ["asset.bin"], + expected_oids={"asset.bin": OID}, ) source_git.lfs_missing_oids.assert_called_once_with([OID]) peer_git.remote_url.assert_called_once_with("origin") @@ -132,6 +133,7 @@ def test_mirror_lfs_objects_does_not_push_private_stripped_pointer( source_git.lfs_fetch_paths.assert_called_once_with( "abc123", ["asset.bin"], + expected_oids={"asset.bin": OID}, ) source_git.lfs_push_oids.assert_called_once_with(remote, [OID]) assert PRIVATE_OID not in source_git.lfs_push_oids.call_args.args[1] @@ -192,6 +194,7 @@ def test_mirror_lfs_objects_fails_when_fetched_object_is_missing( source_git.lfs_fetch_paths.assert_called_once_with( "abc123", ["asset.bin"], + expected_oids={"asset.bin": OID}, ) source_git.lfs_push_oids.assert_not_called() @@ -206,7 +209,12 @@ def test_mirror_lfs_objects_fails_before_push_for_lfs_payload_markers( attributes_git.lfs_tracked_paths.return_value = {"asset.txt"} source_git.lfs_missing_oids.return_value = [] - def write_payload(_ref: str, _path: str, output_path: str) -> None: + def write_payload( + _ref: str, + _path: str, + output_path: str, + expected_oid: str | None = None, + ) -> None: Path(output_path).write_text( "public\n" "# !repo-sync: private-start\n" @@ -232,6 +240,7 @@ def write_payload(_ref: str, _path: str, output_path: str) -> None: source_git.lfs_fetch_paths.assert_called_once_with( "abc123", ["asset.txt"], + expected_oids={"asset.txt": OID}, ) source_git.lfs_write_path.assert_called_once() peer_git.remote_url.assert_not_called() @@ -267,6 +276,7 @@ def test_mirror_lfs_objects_fetches_exact_comma_path( source_git.lfs_fetch_paths.assert_called_once_with( "abc123", ["asset,with-comma.bin"], + expected_oids={"asset,with-comma.bin": OID}, ) @@ -304,4 +314,5 @@ def test_mirror_lfs_objects_scans_all_pointers_when_attributes_change( source_git.lfs_fetch_paths.assert_called_once_with( "abc123", ["existing.bin"], + expected_oids={"existing.bin": OID}, ) diff --git a/tests/workflows/test_sync_workflow_config.py b/tests/workflows/test_sync_workflow_config.py new file mode 100644 index 0000000..87248db --- /dev/null +++ b/tests/workflows/test_sync_workflow_config.py @@ -0,0 +1,16 @@ +"""Tests for reusable sync workflow configuration.""" + +from __future__ import annotations + +from pathlib import Path + + +def test_sync_workflow_configures_lfs_for_both_checkouts() -> None: + workflow = Path(".github/workflows/sync.yml").read_text(encoding="utf-8") + + assert "- name: Configure Git LFS" in workflow + assert "git lfs install --local" in workflow + assert "git -C peer lfs install --local" in workflow + assert workflow.index("- name: Configure Git LFS") < workflow.index( + "- name: Run sync" + ) From ba6890d3e111f45fae670e16cce17335544e18a4 Mon Sep 17 00:00:00 2001 From: David Stern Date: Fri, 15 May 2026 00:23:49 -0400 Subject: [PATCH 8/8] agentic code review --- src/repo_sync/stack/git_ops.py | 4 ++-- src/repo_sync/stack/lfs.py | 7 +++++++ src/repo_sync/strip/lfs.py | 4 ++-- tests/stack/test_lfs.py | 20 +++++++++++++++++++- 4 files changed, 30 insertions(+), 5 deletions(-) diff --git a/src/repo_sync/stack/git_ops.py b/src/repo_sync/stack/git_ops.py index 4e1e59f..b793193 100644 --- a/src/repo_sync/stack/git_ops.py +++ b/src/repo_sync/stack/git_ops.py @@ -9,7 +9,7 @@ from pathlib import Path from repo_sync.errors import VerboseCalledProcessError -from repo_sync.stack.lfs import parse_lfs_pointer +from repo_sync.stack.lfs import parse_lfs_pointer_file @dataclass @@ -474,7 +474,7 @@ def lfs_write_path( b"", result.stderr, ) - pointer = parse_lfs_pointer(Path(output_path).read_bytes(), path) + pointer = parse_lfs_pointer_file(output_path, path) if pointer is not None and ( expected_oid is None or pointer.oid == expected_oid ): diff --git a/src/repo_sync/stack/lfs.py b/src/repo_sync/stack/lfs.py index 78aa1ef..7f3b01b 100644 --- a/src/repo_sync/stack/lfs.py +++ b/src/repo_sync/stack/lfs.py @@ -57,6 +57,13 @@ def parse_lfs_pointer(data: bytes, path: str) -> LfsPointer | None: return LfsPointer(path=path, oid=oid, size=size) +def parse_lfs_pointer_file(file_path: str | os.PathLike[str], path: str) -> LfsPointer | None: + """Parse a Git LFS pointer file without reading large payloads into memory.""" + with open(file_path, "rb") as file: + data = file.read(_MAX_POINTER_BYTES + 1) + return parse_lfs_pointer(data, path) + + def collect_lfs_pointers( root: str, paths: Iterable[str] | None = None, diff --git a/src/repo_sync/strip/lfs.py b/src/repo_sync/strip/lfs.py index 6d5d2d2..d387c97 100644 --- a/src/repo_sync/strip/lfs.py +++ b/src/repo_sync/strip/lfs.py @@ -8,7 +8,7 @@ import subprocess import tempfile -from repo_sync.stack.lfs import parse_lfs_pointer +from repo_sync.stack.lfs import parse_lfs_pointer_file from repo_sync.strip.detect import is_binary from repo_sync.strip.markers import ( MarkerError, @@ -179,7 +179,7 @@ def _write_lfs_payload( env=env, check=True, ) - pointer = parse_lfs_pointer(Path(output_path).read_bytes(), relpath) + pointer = parse_lfs_pointer_file(output_path, relpath) if pointer is not None and ( expected_oid is None or pointer.oid == expected_oid ): diff --git a/tests/stack/test_lfs.py b/tests/stack/test_lfs.py index 3e3611f..2373d1d 100644 --- a/tests/stack/test_lfs.py +++ b/tests/stack/test_lfs.py @@ -3,10 +3,17 @@ from __future__ import annotations from pathlib import Path +from unittest.mock import mock_open, patch import pytest -from repo_sync.stack.lfs import LfsPointer, collect_lfs_pointers, parse_lfs_pointer +from repo_sync.stack.lfs import ( + _MAX_POINTER_BYTES, + LfsPointer, + collect_lfs_pointers, + parse_lfs_pointer, + parse_lfs_pointer_file, +) OID = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" @@ -27,6 +34,17 @@ def test_parse_lfs_pointer() -> None: assert pointer == LfsPointer(path="asset.bin", oid=OID, size=1234) +def test_parse_lfs_pointer_file_reads_only_pointer_sized_prefix() -> None: + data = _pointer() + b"x" * (_MAX_POINTER_BYTES * 2) + file_open = mock_open(read_data=data) + + with patch("builtins.open", file_open): + assert parse_lfs_pointer_file("payload.bin", "asset.bin") is None + + handle = file_open() + handle.read.assert_called_once_with(_MAX_POINTER_BYTES + 1) + + def test_parse_lfs_pointer_rejects_ordinary_file() -> None: assert parse_lfs_pointer(b"hello\n", "hello.txt") is None