Skip to content

Commit d991180

Browse files
fix(loaders): strip git log -p commit metadata before parsing (#228)
Co-authored-by: Ben Vinegar <2153+benvinegar@users.noreply.github.com>
1 parent 834abfe commit d991180

2 files changed

Lines changed: 354 additions & 1 deletion

File tree

src/core/loaders.gitLog.test.ts

Lines changed: 293 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,293 @@
1+
import { describe, expect, test } from "bun:test";
2+
import { parsePatchFiles } from "@pierre/diffs";
3+
import { stripGitLogMetadata } from "./loaders";
4+
5+
describe("stripGitLogMetadata", () => {
6+
test("returns input unchanged when no commit boundary is present", () => {
7+
const patch = [
8+
"diff --git a/foo b/foo",
9+
"index 0000000..1111111 100644",
10+
"--- a/foo",
11+
"+++ b/foo",
12+
"@@ -1,1 +1,1 @@",
13+
"-old",
14+
"+new",
15+
"",
16+
].join("\n");
17+
18+
expect(stripGitLogMetadata(patch)).toBe(patch);
19+
});
20+
21+
test("strips a single commit's metadata header", () => {
22+
const input = [
23+
"commit 1a2b3c4d5e6f7890abcdef1234567890abcdef12",
24+
"Author: Someone <me@example.com>",
25+
"Date: Tue Mar 3 12:00:00 2026 +0100",
26+
"",
27+
" feat: do thing",
28+
"",
29+
"diff --git a/foo b/foo",
30+
"@@ -1,1 +1,1 @@",
31+
"-old",
32+
"+new",
33+
"",
34+
].join("\n");
35+
36+
const expected = ["diff --git a/foo b/foo", "@@ -1,1 +1,1 @@", "-old", "+new", ""].join("\n");
37+
38+
expect(stripGitLogMetadata(input)).toBe(expected);
39+
});
40+
41+
test("handles multiple commits in `git log -p` output", () => {
42+
const input = [
43+
"commit 1a2b3c4d5e6f7890abcdef1234567890abcdef12",
44+
"Author: A <a@x>",
45+
"Date: Tue Mar 3 12:00:00 2026 +0100",
46+
"",
47+
" first",
48+
"",
49+
"diff --git a/foo b/foo",
50+
"@@ -1,1 +1,1 @@",
51+
"-a",
52+
"+b",
53+
"",
54+
"commit aaaabbbbccccddddeeeeffff0000111122223333",
55+
"Author: B <b@x>",
56+
"Date: Wed Mar 4 12:00:00 2026 +0100",
57+
"",
58+
" second",
59+
"",
60+
"diff --git a/bar b/bar",
61+
"@@ -1,1 +1,1 @@",
62+
"-c",
63+
"+d",
64+
"",
65+
].join("\n");
66+
67+
const expected = [
68+
"diff --git a/foo b/foo",
69+
"@@ -1,1 +1,1 @@",
70+
"-a",
71+
"+b",
72+
"",
73+
"diff --git a/bar b/bar",
74+
"@@ -1,1 +1,1 @@",
75+
"-c",
76+
"+d",
77+
"",
78+
].join("\n");
79+
80+
expect(stripGitLogMetadata(input)).toBe(expected);
81+
});
82+
83+
test("accepts decorated commit headers (refs in parens)", () => {
84+
const input = [
85+
"commit 1a2b3c4d5e6f7890abcdef1234567890abcdef12 (HEAD -> main, origin/main)",
86+
"Author: A <a@x>",
87+
"Date: Tue Mar 3 12:00:00 2026 +0100",
88+
"",
89+
" msg",
90+
"",
91+
"diff --git a/foo b/foo",
92+
"@@ -1,1 +1,1 @@",
93+
" ctx",
94+
"",
95+
].join("\n");
96+
97+
expect(stripGitLogMetadata(input)).toBe(
98+
["diff --git a/foo b/foo", "@@ -1,1 +1,1 @@", " ctx", ""].join("\n"),
99+
);
100+
});
101+
102+
test("accepts abbreviated SHAs (--abbrev)", () => {
103+
const input = [
104+
"commit 1a2b3c4",
105+
"Author: A <a@x>",
106+
"Date: Tue Mar 3 12:00:00 2026 +0100",
107+
"",
108+
" msg",
109+
"",
110+
"diff --git a/foo b/foo",
111+
"@@ -1,1 +1,1 @@",
112+
" ctx",
113+
"",
114+
].join("\n");
115+
116+
expect(stripGitLogMetadata(input)).toContain("diff --git a/foo b/foo");
117+
expect(stripGitLogMetadata(input)).not.toContain("commit 1a2b3c4");
118+
expect(stripGitLogMetadata(input)).not.toContain("Author:");
119+
});
120+
121+
test("drops --stat blocks between header and patch (they aren't valid hunk lines)", () => {
122+
const input = [
123+
"commit 1a2b3c4d5e6f7890abcdef1234567890abcdef12",
124+
"Author: A <a@x>",
125+
"Date: Tue Mar 3 12:00:00 2026 +0100",
126+
"",
127+
" msg",
128+
"",
129+
" foo | 2 +-",
130+
" 1 file changed, 1 insertion(+), 1 deletion(-)",
131+
"",
132+
"diff --git a/foo b/foo",
133+
"@@ -1,1 +1,1 @@",
134+
"-old",
135+
"+new",
136+
"",
137+
].join("\n");
138+
139+
const result = stripGitLogMetadata(input);
140+
expect(result).not.toContain("1 file changed");
141+
expect(result).not.toContain("foo | 2 +-");
142+
expect(result.startsWith("diff --git a/foo b/foo")).toBe(true);
143+
});
144+
145+
test("drops merge-commit metadata", () => {
146+
const input = [
147+
"commit 1a2b3c4d5e6f7890abcdef1234567890abcdef12",
148+
"Merge: aaaaaaa bbbbbbb",
149+
"Author: A <a@x>",
150+
"Date: Tue Mar 3 12:00:00 2026 +0100",
151+
"",
152+
" Merge branch 'topic'",
153+
"",
154+
"diff --git a/foo b/foo",
155+
"@@ -1,1 +1,1 @@",
156+
" ctx",
157+
"",
158+
].join("\n");
159+
160+
const result = stripGitLogMetadata(input);
161+
expect(result).not.toContain("Merge:");
162+
expect(result.startsWith("diff --git a/foo b/foo")).toBe(true);
163+
});
164+
165+
test("strips boundaries with SHA-256 (64-char) hashes", () => {
166+
// git init --object-format=sha256 emits 64-char hex SHAs.
167+
const sha256 = "a".repeat(64);
168+
const input = [
169+
`commit ${sha256}`,
170+
"Author: A <a@x>",
171+
"Date: Tue Mar 3 12:00:00 2026 +0100",
172+
"",
173+
" msg",
174+
"",
175+
"diff --git a/foo b/foo",
176+
"@@ -1,1 +1,1 @@",
177+
"-old",
178+
"+new",
179+
"",
180+
].join("\n");
181+
182+
const result = stripGitLogMetadata(input);
183+
expect(result).not.toContain(`commit ${sha256}`);
184+
expect(result).not.toContain("Author:");
185+
expect(result.startsWith("diff --git a/foo b/foo")).toBe(true);
186+
});
187+
188+
test("preserves context lines that mention the word 'commit'", () => {
189+
// A real hunk line that begins with a space-then-'commit' must NOT be
190+
// treated as a commit boundary — its leading space is the diff
191+
// line-type marker.
192+
const input = [
193+
"commit 1a2b3c4d5e6f7890abcdef1234567890abcdef12",
194+
"Author: A <a@x>",
195+
"Date: Tue Mar 3 12:00:00 2026 +0100",
196+
"",
197+
" msg",
198+
"",
199+
"diff --git a/foo b/foo",
200+
"@@ -1,2 +1,2 @@",
201+
" commit deadbeefcafebabe1234567890abcdef12345678 looks like a sha",
202+
"-old",
203+
"+new",
204+
"",
205+
].join("\n");
206+
207+
const result = stripGitLogMetadata(input);
208+
expect(result).toContain(" commit deadbeefcafebabe1234567890abcdef12345678 looks like a sha");
209+
});
210+
211+
// Integration-style: real `git log -p`-shaped input should round-trip
212+
// through @pierre/diffs without triggering any
213+
// `parseLineType: Invalid firstChar` warnings, which is the bug this
214+
// helper exists to fix.
215+
test("stripped output parses via @pierre/diffs without parseLineType warnings", () => {
216+
const gitLogOutput = [
217+
"commit ed3dcb9406b1a169ef4740c858f6dff3dde146a0",
218+
"Author: t <t@t>",
219+
"Date: Thu May 7 17:30:17 2026 +0200",
220+
"",
221+
" update file",
222+
"",
223+
"diff --git a/file.txt b/file.txt",
224+
"index 9c59e24..e019be0 100644",
225+
"--- a/file.txt",
226+
"+++ b/file.txt",
227+
"@@ -1 +1 @@",
228+
"-first",
229+
"+second",
230+
"",
231+
"commit 2d037adaddaa53bbb9f037ba36a8e0ed57632f20",
232+
"Author: t <t@t>",
233+
"Date: Thu May 7 17:30:17 2026 +0200",
234+
"",
235+
" add file",
236+
"",
237+
"diff --git a/file.txt b/file.txt",
238+
"new file mode 100644",
239+
"index 0000000..9c59e24",
240+
"--- /dev/null",
241+
"+++ b/file.txt",
242+
"@@ -0,0 +1 @@",
243+
"+first",
244+
"",
245+
].join("\n");
246+
247+
const captured: string[] = [];
248+
const originalError = console.error;
249+
console.error = (...args: unknown[]) => {
250+
captured.push(args.map((a) => String(a)).join(" "));
251+
};
252+
253+
try {
254+
const stripped = stripGitLogMetadata(gitLogOutput);
255+
const parsed = parsePatchFiles(stripped, "patch", true);
256+
expect(parsed.flatMap((entry) => entry.files).length).toBe(2);
257+
} finally {
258+
console.error = originalError;
259+
}
260+
261+
const noisy = captured.filter(
262+
(line) => line.includes("Invalid firstChar") || line.includes("invalid rawLine"),
263+
);
264+
expect(noisy).toEqual([]);
265+
});
266+
267+
test("drops trailing commit with no diff (e.g. empty merge)", () => {
268+
const input = [
269+
"commit 1a2b3c4d5e6f7890abcdef1234567890abcdef12",
270+
"Author: A <a@x>",
271+
"Date: Tue Mar 3 12:00:00 2026 +0100",
272+
"",
273+
" diff-bearing commit",
274+
"",
275+
"diff --git a/foo b/foo",
276+
"@@ -1,1 +1,1 @@",
277+
"-a",
278+
"+b",
279+
"",
280+
"commit aaaabbbbccccddddeeeeffff0000111122223333",
281+
"Author: B <b@x>",
282+
"Date: Wed Mar 4 12:00:00 2026 +0100",
283+
"",
284+
" diff-less commit",
285+
"",
286+
].join("\n");
287+
288+
const result = stripGitLogMetadata(input);
289+
expect(result).toContain("diff --git a/foo b/foo");
290+
expect(result).not.toContain("diff-less commit");
291+
expect(result).not.toContain("aaaabbbb");
292+
});
293+
});

src/core/loaders.ts

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,64 @@ function stripTerminalControl(text: string) {
6464
.replace(/\x1b[@-_]/g, "");
6565
}
6666

67+
/**
68+
* Strip `git log -p` / `git show -p` commit metadata so the surviving text
69+
* is a plain patch stream that `@pierre/diffs` can parse without spamming
70+
* `parseLineType: Invalid firstChar` warnings on every commit boundary.
71+
*
72+
* Each commit in `git log -p` looks like:
73+
*
74+
* ```
75+
* commit <sha>[ (refs)]
76+
* Author: ...
77+
* Date: ...
78+
*
79+
* <commit message>
80+
*
81+
* diff --git a/foo b/foo
82+
* ...
83+
* ```
84+
*
85+
* Lines from `commit ` through the first patch header (`diff --git `,
86+
* `--- `, or `+++ `) are dropped. Hunk-body lines always start with
87+
* `+`, `-`, ` ` or `\`, so a real context line that begins with the word
88+
* "commit" is unaffected (its leading space prevents the regex match).
89+
*
90+
* Returns the input unchanged when no `commit <sha>` boundary is present,
91+
* keeping the regular patch path zero-cost.
92+
*/
93+
export function stripGitLogMetadata(text: string) {
94+
// Hex range up to 64 covers both SHA-1 (40) and SHA-256 (64) repos.
95+
const COMMIT_BOUNDARY = /^commit [0-9a-f]{4,64}(?: |$)/m;
96+
if (!COMMIT_BOUNDARY.test(text)) {
97+
return text;
98+
}
99+
100+
const lines = text.split("\n");
101+
const out: string[] = [];
102+
let inHeader = false;
103+
104+
for (const line of lines) {
105+
if (COMMIT_BOUNDARY.test(line)) {
106+
inHeader = true;
107+
continue;
108+
}
109+
if (inHeader) {
110+
// The header section ends at the first patch line. `diff --git `
111+
// is the canonical Git start; `--- `/`+++ ` cover unified-diff
112+
// input where someone synthesised log output without it.
113+
if (line.startsWith("diff --git ") || line.startsWith("--- ") || line.startsWith("+++ ")) {
114+
inHeader = false;
115+
out.push(line);
116+
}
117+
continue;
118+
}
119+
out.push(line);
120+
}
121+
122+
return out.join("\n");
123+
}
124+
67125
/** Split a multi-file patch into per-file chunks so each diff file keeps its original patch text. */
68126
function splitPatchIntoFileChunks(rawPatch: string) {
69127
const patch = rawPatch.replaceAll("\r\n", "\n");
@@ -308,7 +366,9 @@ function normalizePatchChangeset(
308366
sourceLabel: string,
309367
agentContext: AgentContext | null,
310368
): Changeset {
311-
const normalizedPatchText = stripTerminalControl(patchText.replaceAll("\r\n", "\n"));
369+
const normalizedPatchText = stripGitLogMetadata(
370+
stripTerminalControl(patchText.replaceAll("\r\n", "\n")),
371+
);
312372

313373
let parsedPatches: ReturnType<typeof parsePatchFiles>;
314374
try {

0 commit comments

Comments
 (0)