Skip to content

Commit 8d36866

Browse files
Merge pull request #24 from TemoaProject/develop
2 parents a24e433 + 72e26ba commit 8d36866

4 files changed

Lines changed: 115 additions & 63 deletions

File tree

src/datamanager/__main__.py

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -288,18 +288,23 @@ def _run_prepare_logic(ctx: typer.Context, name: str, file: Path) -> None:
288288
old_path = Path(tempdir) / "prev.sqlite"
289289
# Download from the PRODUCTION bucket
290290
core.download_from_r2(client, latest_version["r2_object_key"], old_path)
291-
diff_text = core.generate_sql_diff(old_path, file)
292-
293-
if diff_text.count("\n") <= settings.max_diff_lines:
294-
diff_filename = f"diff-{latest_version['version']}-to-{new_version}.diff"
295-
# Store diffs in a dedicated top-level directory
296-
diff_git_path = Path("diffs") / name / diff_filename
297-
diff_git_path.parent.mkdir(parents=True, exist_ok=True)
298-
diff_git_path.write_text(diff_text)
299-
subprocess.run(["git", "add", str(diff_git_path)])
300-
console.print(f"📝 Diff stored in Git at: [green]{diff_git_path}[/]")
291+
292+
full_diff, summary = core.generate_sql_diff(old_path, file)
293+
294+
if full_diff.count("\n") <= settings.max_diff_lines:
295+
to_save = summary + full_diff
296+
msg = "📝 Full diff stored in Git at:"
301297
else:
302-
console.print("📝 Diff is too large – omitted from Git.")
298+
to_save = summary
299+
msg = "📝 Full diff too large; summary stored in Git at:"
300+
301+
diff_filename = f"diff-{latest_version['version']}-to-{new_version}.diff"
302+
diff_git_path = Path("diffs") / name / diff_filename
303+
diff_git_path.parent.mkdir(parents=True, exist_ok=True)
304+
diff_git_path.write_text(to_save)
305+
subprocess.run(["git", "add", str(diff_git_path)])
306+
307+
console.print(f"{msg} [green]{diff_git_path}[/]")
303308

304309
new_entry = {
305310
"version": new_version,

src/datamanager/core.py

Lines changed: 44 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
import shutil
66
import sqlite3
77
import subprocess
8-
import tempfile
98
from pathlib import Path, PurePath
109
import os
1110
import uuid
@@ -127,40 +126,37 @@ def pull_and_verify(object_key: str, expected_hash: str, output_path: Path) -> b
127126
return False
128127

129128

130-
def generate_sql_diff(old_file: Path, new_file: Path) -> str:
129+
def generate_sql_diff(old_file: Path, new_file: Path) -> tuple[str, str]:
131130
"""
132-
Return a unified diff of the SQL schema/data between two SQLite files.
131+
Return (full_diff, summary) between two SQLite files.
133132
134-
- If both sqlite cli and diff are available, use them (fast path).
135-
- Otherwise fall back to std-lib `sqlite3` + `difflib` (pure-Python).
133+
- If `sqldiff` CLI is available, use that for both full and summary.
134+
- Otherwise fall back to sqlite3 + difflib, and synthesize a summary.
136135
"""
137136

138-
has_cli = shutil.which("sqlite3") and shutil.which("diff")
139-
140-
if has_cli:
141-
with tempfile.TemporaryDirectory() as tmp:
142-
old_dump, new_dump = Path(tmp) / "old.sql", Path(tmp) / "new.sql"
143-
subprocess.run(
144-
["sqlite3", str(old_file), ".dump"],
145-
text=True,
146-
check=True,
147-
stdout=old_dump.open("w"),
148-
)
149-
subprocess.run(
150-
["sqlite3", str(new_file), ".dump"],
151-
text=True,
152-
check=True,
153-
stdout=new_dump.open("w"),
154-
)
155-
proc = subprocess.run(
156-
["diff", "-u", str(old_dump), str(new_dump)],
157-
text=True,
158-
capture_output=True,
159-
)
160-
return proc.stdout
137+
# Try sqldiff CLI first
138+
if shutil.which("sqldiff"):
139+
# Full diff
140+
proc_full = subprocess.run(
141+
["sqldiff", str(old_file), str(new_file)],
142+
text=True,
143+
capture_output=True,
144+
check=True,
145+
)
146+
full_diff = proc_full.stdout
147+
148+
# Summary (sqldiff --summary)
149+
proc_sum = subprocess.run(
150+
["sqldiff", "--summary", str(old_file), str(new_file)],
151+
text=True,
152+
capture_output=True,
153+
check=True,
154+
)
155+
summary = proc_sum.stdout
161156

162-
# pure-Python fallback
157+
return full_diff, summary
163158

159+
# Pure-Python fallback: dump both DBs and diff
164160
def _dump(db: Path) -> str:
165161
buf = io.StringIO()
166162
con = sqlite3.connect(db)
@@ -169,14 +165,31 @@ def _dump(db: Path) -> str:
169165
con.close()
170166
return buf.getvalue()
171167

172-
old_sql, new_sql = _dump(old_file), _dump(new_file)
168+
old_sql = _dump(old_file)
169+
new_sql = _dump(new_file)
170+
173171
diff_iter = difflib.unified_diff(
174172
old_sql.splitlines(keepends=True),
175173
new_sql.splitlines(keepends=True),
176174
fromfile=str(PurePath(old_file).name),
177175
tofile=str(PurePath(new_file).name),
178176
)
179-
return "".join(diff_iter)
177+
full_diff = "".join(diff_iter)
178+
179+
# Synthesize a summary: count ADDs and DELs
180+
adds = sum(
181+
1
182+
for ln in full_diff.splitlines()
183+
if ln.startswith("+") and not ln.startswith("+++")
184+
)
185+
dels = sum(
186+
1
187+
for ln in full_diff.splitlines()
188+
if ln.startswith("-") and not ln.startswith("---")
189+
)
190+
summary = f"# summary: {adds} additions, {dels} deletions\n"
191+
192+
return full_diff, summary
180193

181194

182195
def delete_from_r2(client: S3Client, object_key: str) -> None:

tests/test_core.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,21 @@ def test_generate_sql_diff(tmp_path: Path) -> None:
4040
con.commit()
4141
con.close()
4242

43-
diff = core.generate_sql_diff(old_db_path, new_db_path)
44-
assert "-INSERT INTO users VALUES(1,'Alice');" in diff
45-
assert "+INSERT INTO users VALUES(2,'Bob');" in diff
43+
full_diff, summary = core.generate_sql_diff(old_db_path, new_db_path)
44+
45+
# We should see evidence of the change in full_diff:
46+
# - either an UPDATE statement from sqldiff,
47+
# - or a "+INSERT ..." / "-INSERT ..." style from the fallback.
48+
assert "Bob" in full_diff, f"full_diff did not mention Bob:\n{full_diff}"
49+
assert (
50+
"UPDATE users SET" in full_diff
51+
or "+INSERT INTO" in full_diff
52+
or "-INSERT INTO" in full_diff
53+
), f"unexpected diff format:\n{full_diff}"
54+
55+
# And summary should be a non‐empty string
56+
assert isinstance(summary, str)
57+
assert summary.strip(), "Summary should not be empty"
4658

4759

4860
def test_r2_interactions(mocker: MockerFixture, tmp_path: Path) -> None:

tests/test_main.py

Lines changed: 40 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -37,55 +37,77 @@ def test_prepare_for_create_success(test_repo: Path, mocker: MockerFixture) -> N
3737
def test_prepare_for_update_with_small_diff(
3838
test_repo: Path, mocker: MockerFixture
3939
) -> None:
40-
"""Test an update that generates and stores a small diff."""
40+
"""Test an update that generates and stores both summary+full diff."""
4141
os.chdir(test_repo)
4242
mock_r2_client = mocker.patch("datamanager.core.get_r2_client").return_value
43-
# Configure head_object to prevent TypeError in the download progress bar
4443
mock_r2_client.head_object.return_value = {"ContentLength": 1024}
4544
mocker.patch("datamanager.core.upload_to_staging")
4645
mocker.patch("datamanager.core.download_from_r2")
47-
mocker.patch("datamanager.core.generate_sql_diff", return_value="--- a\n+++ b")
46+
47+
# Prepare a fake summary and full diff
48+
fake_summary = "# summary: 1 add, 1 del\n"
49+
fake_full = "--- a\n+++ b\n-foo\n+bar\n"
50+
mocker.patch(
51+
"datamanager.core.generate_sql_diff",
52+
return_value=(fake_full, fake_summary),
53+
)
4854

4955
v3_file = test_repo / "v3_data.sqlite"
5056
v3_file.write_text("this is v3")
5157

5258
result = runner.invoke(app, ["prepare", "core-dataset.sqlite", str(v3_file)])
53-
5459
assert result.exit_code == 0, result.stdout
55-
assert "Diff stored in Git" in result.stdout
60+
assert "Full diff stored in Git" in result.stdout
5661

57-
expected_diff_path = Path("diffs/core-dataset.sqlite/diff-v2-to-v3.diff")
58-
assert expected_diff_path.exists()
62+
expected = Path("diffs/core-dataset.sqlite/diff-v2-to-v3.diff")
63+
assert expected.exists()
5964

60-
dataset = manifest.get_dataset("core-dataset.sqlite")
61-
assert dataset is not None, "Dataset should not be None after preparation."
62-
assert dataset["history"][0]["diffFromPrevious"] == str(expected_diff_path)
65+
# Now verify the file contains summary first, then full diff
66+
content = expected.read_text()
67+
assert content == fake_summary + fake_full
68+
69+
ds = manifest.get_dataset("core-dataset.sqlite")
70+
assert ds is not None
71+
assert ds["history"][0]["diffFromPrevious"] == str(expected)
6372

6473

6574
def test_prepare_for_update_with_large_diff(
6675
test_repo: Path, mocker: MockerFixture
6776
) -> None:
68-
"""Test an update where the diff is too large and is omitted."""
77+
"""Test an update where the full diff is too large and only summary is stored."""
6978
os.chdir(test_repo)
7079
mock_r2_client = mocker.patch("datamanager.core.get_r2_client").return_value
7180
mock_r2_client.head_object.return_value = {"ContentLength": 1024}
7281
mocker.patch("datamanager.core.upload_to_staging")
7382
mocker.patch("datamanager.core.download_from_r2")
74-
# Make the diff larger than the default MAX_DIFF_LINES
75-
large_diff = "line\n" * (settings.max_diff_lines + 1)
76-
mocker.patch("datamanager.core.generate_sql_diff", return_value=large_diff)
83+
# Make the full diff larger than the default limit, but still provide a summary
84+
large_full = "line\n" * (settings.max_diff_lines + 1)
85+
small_summary = "# summary: huge diff, see details in PR\n"
86+
mocker.patch(
87+
"datamanager.core.generate_sql_diff",
88+
return_value=(large_full, small_summary),
89+
)
7790

7891
v3_file = test_repo / "v3_data.sqlite"
7992
v3_file.write_text("this is v3")
8093

8194
result = runner.invoke(app, ["prepare", "core-dataset.sqlite", str(v3_file)])
82-
8395
assert result.exit_code == 0, result.stdout
84-
assert "Diff is too large" in result.stdout
8596

97+
# We should see that the full diff was too large but a summary was stored
98+
assert "too large" in result.stdout.lower()
99+
100+
expected_diff_path = Path("diffs/core-dataset.sqlite/diff-v2-to-v3.diff")
101+
assert expected_diff_path.exists(), "Summary file should have been written"
102+
103+
# The manifest should record the path
86104
dataset = manifest.get_dataset("core-dataset.sqlite")
87-
assert dataset is not None, "Dataset should not be None after preparation."
88-
assert dataset["history"][0]["diffFromPrevious"] is None
105+
assert dataset is not None
106+
assert dataset["history"][0]["diffFromPrevious"] == str(expected_diff_path)
107+
108+
# And the contents of that file should be exactly our summary
109+
content = expected_diff_path.read_text()
110+
assert content == small_summary
89111

90112

91113
def test_prepare_no_changes(test_repo: Path, mocker: MockerFixture) -> None:

0 commit comments

Comments
 (0)