Skip to content

Commit 244ad4f

Browse files
perf: replace O(mn) LCS diff with Myers O(ND) algorithm
Replace the quadratic LCS DP table with Myers' shortest edit script algorithm, reducing time from O(mn) to O((n+m)d), where d is the edit distance, and memory from O(mn) to at most O((n+m)d) (one V array over the n+m diagonals is kept per edit step so the script can be recovered by backtracking). Add proper multi-hunk support for unified diff output with configurable context lines. Eliminates the code duplication between lcs_diff and unified_diff.
1 parent 1e675ff commit 244ad4f

1 file changed

Lines changed: 203 additions & 43 deletions

File tree

src/applets/diff.cpp

Lines changed: 203 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
#include <algorithm>
12
#include <cstdio>
3+
#include <cstring>
24
#include <string>
35
#include <vector>
46

@@ -16,59 +18,216 @@ constexpr cfbox::help::HelpEntry HELP = {
1618
.extra = "",
1719
};
1820

19-
static auto lcs_diff(const std::vector<std::string>& a, const std::vector<std::string>& b) -> void {
20-
auto m = a.size(), n = b.size();
21-
std::vector<std::vector<int>> dp(m + 1, std::vector<int>(n + 1, 0));
22-
for (std::size_t i = 1; i <= m; ++i)
23-
for (std::size_t j = 1; j <= n; ++j)
24-
dp[i][j] = (a[i-1] == b[j-1]) ? dp[i-1][j-1] + 1 : std::max(dp[i-1][j], dp[i][j-1]);
25-
26-
std::vector<std::pair<char, std::string>> edits;
27-
std::size_t i = m, j = n;
28-
while (i > 0 || j > 0) {
29-
if (i > 0 && j > 0 && a[i-1] == b[j-1]) {
30-
edits.push_back({' ', a[i-1]});
31-
--i; --j;
32-
} else if (j > 0 && (i == 0 || dp[i][j-1] >= dp[i-1][j])) {
33-
edits.push_back({'+', b[j-1]});
34-
--j;
21+
// One step of an edit script that turns sequence `a` into sequence `b`.
struct Edit {
    char op;           // ' ' (line kept), '+' (line inserted), '-' (line deleted)
    std::size_t line;  // 0-based index into a for ' ' and '-', into b for '+'
};

// Myers O(ND) diff: compute a shortest edit script that turns `a` into `b`.
//
// Returns the script in forward order. Every line of `a` appears exactly once
// as ' ' or '-', and every line of `b` appears exactly once as ' ' or '+'.
//
// The forward pass keeps, for each edit count d, a snapshot of the furthest
// x reached on diagonals -d..d; the backward pass replays those snapshots
// from (N, M) to (0, 0). Snapshot storage is therefore O(D^2) ints, where D
// is the edit distance.
static auto myers_diff(const std::vector<std::string>& a, const std::vector<std::string>& b)
    -> std::vector<Edit> {
    const auto N = static_cast<int>(a.size());
    const auto M = static_cast<int>(b.size());

    std::vector<Edit> edits;
    edits.reserve(a.size() + b.size());

    // Trivial cases: with one side empty the script is all deletes or all
    // inserts (and empty when both sides are empty).
    if (N == 0 || M == 0) {
        for (std::size_t i = 0; i < a.size(); ++i) edits.push_back({'-', i});
        for (std::size_t j = 0; j < b.size(); ++j) edits.push_back({'+', j});
        return edits;
    }

    const int max_d = N + M;
    const int off = max_d;  // shifts diagonal k in [-max_d, max_d] to a non-negative index
    const auto at = [off](int k) { return static_cast<std::size_t>(k + off); };

    // v[at(k)] = furthest x reached so far on diagonal k (k = x - y).
    // Round d only writes diagonals with the parity of d and only reads
    // diagonals of the other parity, so it can be updated in place.
    std::vector<int> v(static_cast<std::size_t>(2 * max_d + 1), 0);

    // trace[d] holds v for k in [-d, d] after round d, stored at index k + d.
    std::vector<std::vector<int>> trace;

    bool reached_end = false;
    for (int d = 0; d <= max_d && !reached_end; ++d) {
        for (int k = -d; k <= d; k += 2) {
            int x = 0;
            if (k == -d || (k != d && v[at(k - 1)] < v[at(k + 1)])) {
                x = v[at(k + 1)];      // step down from diagonal k+1: insert
            } else {
                x = v[at(k - 1)] + 1;  // step right from diagonal k-1: delete
            }
            int y = x - k;
            // Follow the snake of equal lines.
            while (x < N && y < M && a[static_cast<std::size_t>(x)] == b[static_cast<std::size_t>(y)]) {
                ++x;
                ++y;
            }
            v[at(k)] = x;
            if (x >= N && y >= M) {
                reached_end = true;
                break;
            }
        }
        trace.emplace_back(v.begin() + (off - d), v.begin() + (off + d + 1));
    }

    // Backtrack from (N, M). Edits are collected in reverse order.
    int x = N;
    int y = M;
    for (int d = static_cast<int>(trace.size()) - 1; d > 0; --d) {
        const auto& prev = trace[static_cast<std::size_t>(d - 1)];
        const auto prev_at = [&prev, d](int k) { return prev[static_cast<std::size_t>(k + d - 1)]; };

        const int k = x - y;
        // Same decision the forward pass took on diagonal k in round d.
        const bool inserted = (k == -d) || (k != d && prev_at(k - 1) < prev_at(k + 1));

        // Endpoint of round d-1 that the non-diagonal step started from.
        const int prev_k = inserted ? k + 1 : k - 1;
        const int prev_x = prev_at(prev_k);
        const int prev_y = prev_x - prev_k;

        // Undo the snake. It ends one non-diagonal step past (prev_x, prev_y),
        // so exactly one of the two comparisons stops the loop at that point.
        while (x > prev_x && y > prev_y) {
            --x;
            --y;
            edits.push_back({' ', static_cast<std::size_t>(x)});
        }

        // Undo the non-diagonal step itself.
        if (inserted) {
            edits.push_back({'+', static_cast<std::size_t>(prev_y)});  // consumed b[prev_y]
        } else {
            edits.push_back({'-', static_cast<std::size_t>(prev_x)});  // consumed a[prev_x]
        }
        x = prev_x;
        y = prev_y;
    }

    // Round 0 is a pure snake starting at (0, 0).
    while (x > 0 && y > 0) {
        --x;
        --y;
        edits.push_back({' ', static_cast<std::size_t>(x)});
    }

    std::reverse(edits.begin(), edits.end());
    return edits;
}
44130

45-
static auto unified_diff(const std::string& file1, const std::string& file2,
46-
const std::vector<std::string>& a, const std::vector<std::string>& b) -> void {
47-
std::printf("--- %s\n+++ %s\n@@ -1,%zu +1,%zu @@\n", file1.c_str(), file2.c_str(), a.size(), b.size());
48-
auto m = a.size(), n = b.size();
49-
std::vector<std::vector<int>> dp(m + 1, std::vector<int>(n + 1, 0));
50-
for (std::size_t i = 1; i <= m; ++i)
51-
for (std::size_t j = 1; j <= n; ++j)
52-
dp[i][j] = (a[i-1] == b[j-1]) ? dp[i-1][j-1] + 1 : std::max(dp[i-1][j], dp[i][j-1]);
53-
54-
std::vector<std::pair<char, std::string>> edits;
55-
std::size_t i = m, j = n;
56-
while (i > 0 || j > 0) {
57-
if (i > 0 && j > 0 && a[i-1] == b[j-1]) {
58-
edits.push_back({' ', a[i-1]});
59-
--i; --j;
60-
} else if (j > 0 && (i == 0 || dp[i][j-1] >= dp[i-1][j])) {
61-
edits.push_back({'+', b[j-1]});
62-
--j;
131+
static auto print_edits(const std::vector<Edit>& edits,
132+
const std::vector<std::string>& a,
133+
const std::vector<std::string>& b) -> void {
134+
for (auto& e : edits) {
135+
if (e.op == ' ' || e.op == '-') {
136+
std::printf("%c%s\n", e.op, a[e.line].c_str());
63137
} else {
64-
edits.push_back({'-', a[i-1]});
65-
--i;
138+
std::printf("+%s\n", b[e.line].c_str());
66139
}
67140
}
68-
for (auto it = edits.rbegin(); it != edits.rend(); ++it) {
69-
std::printf("%c%s\n", it->first, it->second.c_str());
141+
}
142+
143+
struct Hunk {
144+
int a_start, a_count;
145+
int b_start, b_count;
146+
std::vector<Edit> edits;
147+
};
148+
149+
// Group an edit script into unified-diff hunks.
//
// edits   - a complete edit script in forward order, starting at the first
//           line of both files (as returned by myers_diff).
// context - number of unchanged lines kept around each change; negative
//           values are treated as 0.
//
// Returns the hunks in file order; empty when the script contains no change.
// Two changes share a hunk unless more than 2*context unchanged lines lie
// between them.
//
// Start lines are derived by counting the old/new lines consumed before the
// hunk. A context edit only carries its index into the old file, so the
// new-file start cannot be read from Edit::line. When a hunk covers no lines
// on one side, that side's start is the line preceding the hunk (0 at the top
// of the file), following the unified-format convention.
static auto build_hunks(const std::vector<Edit>& edits,
                        int context = 3) -> std::vector<Hunk> {
    const int total = static_cast<int>(edits.size());
    const int ctx = std::max(context, 0);

    // Positions of all non-context edits.
    std::vector<int> change_idx;
    for (int i = 0; i < total; ++i) {
        if (edits[static_cast<std::size_t>(i)].op != ' ') change_idx.push_back(i);
    }
    if (change_idx.empty()) return {};

    std::vector<Hunk> hunks;
    int cursor = 0;  // next edit not yet counted
    int a_seen = 0;  // old-file lines consumed by edits[0, cursor)
    int b_seen = 0;  // new-file lines consumed by edits[0, cursor)

    // Advance the cursor to `upto`, counting the lines each edit consumes.
    const auto consume = [&](int upto) {
        for (; cursor < upto; ++cursor) {
            const char op = edits[static_cast<std::size_t>(cursor)].op;
            if (op == ' ' || op == '-') ++a_seen;
            if (op == ' ' || op == '+') ++b_seen;
        }
    };

    // Append the hunk covering edits[first..last]. Hunks are emitted in
    // increasing, non-overlapping order, so the cursor only moves forward.
    const auto emit = [&](int first, int last) {
        consume(first);
        const int a_before = a_seen;
        const int b_before = b_seen;
        consume(last + 1);

        Hunk h;
        h.edits.assign(edits.begin() + first, edits.begin() + last + 1);
        h.a_count = a_seen - a_before;
        h.b_count = b_seen - b_before;
        h.a_start = (h.a_count > 0) ? a_before + 1 : a_before;
        h.b_start = (h.b_count > 0) ? b_before + 1 : b_before;
        hunks.push_back(std::move(h));
    };

    int hunk_start = std::max(0, change_idx.front() - ctx);
    for (std::size_t ci = 1; ci < change_idx.size(); ++ci) {
        const int prev_change = change_idx[ci - 1];
        const int next_change = change_idx[ci];
        const int unchanged_between = next_change - prev_change - 1;
        // Too far apart to share context: close the current hunk, open a new one.
        if (unchanged_between > 2 * ctx) {
            emit(hunk_start, std::min(total - 1, prev_change + ctx));
            hunk_start = std::max(0, next_change - ctx);
        }
    }
    emit(hunk_start, std::min(total - 1, change_idx.back() + ctx));
    return hunks;
}
213+
214+
static auto unified_diff(const std::string& file1, const std::string& file2,
215+
const std::vector<std::string>& a, const std::vector<std::string>& b) -> void {
216+
std::printf("--- %s\n+++ %s\n", file1.c_str(), file2.c_str());
217+
auto edits = myers_diff(a, b);
218+
auto hunks = build_hunks(edits);
219+
for (auto& h : hunks) {
220+
std::printf("@@ -%d,%d +%d,%d @@\n",
221+
h.a_start, h.a_count, h.b_start, h.b_count);
222+
for (auto& e : h.edits) {
223+
if (e.op == ' ' || e.op == '-')
224+
std::printf("%c%s\n", e.op, a[e.line].c_str());
225+
else
226+
std::printf("+%s\n", b[e.line].c_str());
227+
}
70228
}
71229
}
230+
72231
} // namespace
73232

74233
auto diff_main(int argc, char* argv[]) -> int {
@@ -96,7 +255,8 @@ auto diff_main(int argc, char* argv[]) -> int {
96255
if (unified) {
97256
unified_diff(std::string{pos[0]}, std::string{pos[1]}, *a_result, *b_result);
98257
} else {
99-
lcs_diff(*a_result, *b_result);
258+
auto edits = myers_diff(*a_result, *b_result);
259+
print_edits(edits, *a_result, *b_result);
100260
}
101261
return 1;
102262
}

0 commit comments

Comments
 (0)