Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions coopy/CompareTable.hx
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,81 @@ class CompareTable {
scale = -1;
}
}
// Fallback matching for rows that couldn't be matched via key equality.
// For each still-unmatched row xa in A, locate the nearest matched rows
// before and after it, translate those anchors into B, and derive the single
// position in B that xa would occupy if row order were preserved. The pair
// is linked only when that B row is also unmatched, no other unmatched B row
// shares its key (which would make the match ambiguous), and strictly more
// than half of the common columns hold equal values.
// Runs only while both sides still have pending rows and an index exists.
if (pending_ct>0 && reverse_pending_ct>0 && index_top!=null) {
// Candidates start at row 1; row 0 is never a candidate here
// (presumably the header row -- confirm against the caller).
for (xa in 1...ha) {
if (used_reverse.exists(xa)) continue; // already matched
// Find the previous matched row. The scan may stop at row 0, and ends at
// -1 when nothing before xa is matched. Rows linked by earlier iterations
// of this loop are in used_reverse too, so they can serve as anchors.
var prev_xa : Int = xa - 1;
while (prev_xa >= 0 && !used_reverse.exists(prev_xa)) {
prev_xa--;
}
// Find the next matched row; ends at ha when nothing after xa is matched.
var next_xa : Int = xa + 1;
while (next_xa < ha && !used_reverse.exists(next_xa)) {
next_xa++;
}
// Get their mappings in B (null when the anchor does not exist).
// NOTE(review): a2b may presumably also yield null for an anchor that is
// marked in used_reverse but has no link -- confirm.
var prev_xb : Null<Int> = (prev_xa >= 0) ? align.a2b(prev_xa) : null;
var next_xb : Null<Int> = (next_xa < ha) ? align.a2b(next_xa) : null;
// Calculate expected position in B
var expected_xb : Null<Int> = null;
if (prev_xb != null && next_xb != null) {
// Both anchors known: require the anchors to be the same distance apart
// in B as in A, so every row between them lines up by offset. The gap may
// span more than one row; it just has to be the same size on both sides.
if (next_xb - prev_xb == next_xa - prev_xa) {
expected_xb = prev_xb + (xa - prev_xa);
}
} else if (prev_xb != null) {
// Only a preceding anchor: extrapolate forward by the same offset.
expected_xb = prev_xb + (xa - prev_xa);
} else if (next_xb != null) {
// Only a following anchor: extrapolate backward by the same offset.
expected_xb = next_xb - (next_xa - xa);
}
// The expected row must lie inside B and, like xa, must not be row 0.
if (expected_xb != null && expected_xb >= 1 && expected_xb < hb) {
if (!used.exists(expected_xb)) {
// Ambiguity guard: count the OTHER rows in B that are still unmatched
// and share the candidate's key. Rows that are already matched are
// ignored. This rescans all of B for every candidate row.
// NOTE(review): this scan starts at row 0 although candidates start at
// row 1 -- confirm that including row 0 here is intended.
var kb : String = index_top.remoteKey(expected_xb);
var unmatched_with_same_key : Int = 0;
for (xb in 0...hb) {
if (xb == expected_xb) continue;
if (used.exists(xb)) continue; // already matched
if (index_top.remoteKey(xb) == kb) unmatched_with_same_key++;
}
// Any such row makes the positional guess ambiguous; leave xa unmatched.
if (unmatched_with_same_key > 0) continue;

// Check if rows have substantial content in common.
// dominated_score counts the common columns whose cells are equal;
// common_count counts the common columns compared.
var dominated_score : Int = 0;
var common_count : Int = 0;
for (cunit in common_units) {
var ca : Int = cunit.l;
var cb : Int = cunit.r;
// Only units with a column index on both sides are compared.
if (ca >= 0 && cb >= 0) {
common_count++;
var va : Dynamic = a.getCell(ca, xa);
var vb : Dynamic = b.getCell(cb, expected_xb);
if (av.equals(va, vb)) {
dominated_score++;
}
}
}
// Only match if strictly more than half the columns match
// (an exact half is not enough).
if (common_count > 0 && dominated_score * 2 > common_count) {
// Record the link and update the bookkeeping on both sides so later
// iterations see this pair as matched.
align.link(xa, expected_xb);
used.set(expected_xb, 1);
used_reverse.set(xa, 1);
pending_ct--;
reverse_pending_ct--;
}
}
}
}
}
// for consistency, explicitly mark unaligned things
for (i in 1...ha) {
if (!used_reverse.exists(i)) {
Expand Down
116 changes: 116 additions & 0 deletions test/test_row_alignment_heuristic.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
// Test for row alignment when column selection heuristic excludes distinguishing columns.
// This tests the fix for a bug where rows that differ by only one column were incorrectly
// reported as delete+insert instead of modification, when:
// 1. The table has many columns (>5) triggering the column selection heuristic
// 2. The selected columns cannot distinguish similar rows within the same table
// 3. The only distinguishing column is also the one that differs between tables

var coopy = require('daff');
var tester = require('tester');

// Test case 1: Original broken case (broken-case/a.csv vs b.csv)
// Row 5 (Transport & storage, Bexley) differs only in column "2023" (7000 vs 8000).
// With 9 columns and 7 rows (header included), the column selection heuristic
// might not select columns that distinguish this row from row 4
// (Construction, Bexley). The row must still be aligned with its counterpart
// so the change is reported as a modification rather than delete+insert.
{
    var t1 = new coopy.TableView([
        ["1971","2023","sector_ID","sector","1973","gss_code","borough","1985","2019"],
        ["250","1250","16","Other services","500","E09000002","Barking and Dagenham","600","1000"],
        ["6000","10000","3","Construction","7000","E09000003","Barnet","4500","9000"],
        ["12000","15000","5","Retail","12000","E09000003","Barnet","15000","15000"],
        ["3500","6000","3","Construction","4000","E09000004","Bexley","3500","6000"],
        ["3500","7000","6","Transport & storage","4000","E09000004","Bexley","3500","6000"],
        ["1250","2500","16","Other services","1000","E09000005","Brent","1250","2250"]
    ]);
    var t2 = new coopy.TableView([
        ["1971","2023","sector_ID","sector","1973","gss_code","borough","1985","2019"],
        ["250","1250","16","Other services","500","E09000002","Barking and Dagenham","600","1000"],
        ["6000","10000","3","Construction","7000","E09000003","Barnet","4500","9000"],
        ["12000","15000","5","Retail","12000","E09000003","Barnet","15000","15000"],
        ["3500","6000","3","Construction","4000","E09000004","Bexley","3500","6000"],
        ["3500","8000","6","Transport & storage","4000","E09000004","Bexley","3500","6000"],
        ["1250","2500","16","Other services","1000","E09000005","Brent","1250","2250"]
    ]);

    // compareTables is called as a plain function (no `new`): it hands back
    // the comparison object itself, so a constructor call is misleading.
    var ct = coopy.Coopy.compareTables(t1, t2);
    var align = ct.align();

    // Every row in t1 should align with the row at the same position in t2.
    tester.align_asserts(align, [
        [0, 0], // header
        [1, 1], // Barking and Dagenham
        [2, 2], // Barnet Construction
        [3, 3], // Barnet Retail
        [4, 4], // Bexley Construction
        [5, 5], // Bexley Transport & storage (the row that was incorrectly unmatched)
        [6, 6]  // Brent
    ]);

    // Verify round-trip patching works
    tester.bi_round_trip(t1, t2, "row alignment with many columns - single cell change");
}

// Test case 2: Verify that truly different rows are NOT matched.
// The last row of t2 shares no cell value with the last row of t1, so the
// positional fallback must not pair them: the expected result is
// delete+insert, not a modification.
{
    var t1 = new coopy.TableView([
        ["1971","2023","sector_ID","sector","1973","gss_code","borough","1985","2019"],
        ["250","1250","16","Other services","500","E09000002","Barking and Dagenham","600","1000"],
        ["6000","10000","3","Construction","7000","E09000003","Barnet","4500","9000"],
        ["3500","6000","3","Construction","4000","E09000004","Bexley","3500","6000"]
    ]);
    var t2 = new coopy.TableView([
        ["1971","2023","sector_ID","sector","1973","gss_code","borough","1985","2019"],
        ["250","1250","16","Other services","500","E09000002","Barking and Dagenham","600","1000"],
        ["6000","10000","3","Construction","7000","E09000003","Barnet","4500","9000"],
        ["9999","9999","99","Completely Different","9999","E09000099","Nowhere","9999","9999"]
    ]);

    // compareTables is called as a plain function (no `new`): it hands back
    // the comparison object itself, so a constructor call is misleading.
    var ct = coopy.Coopy.compareTables(t1, t2);
    var align = ct.align();

    // Row 3 should NOT match because it's completely different
    tester.align_asserts(align, [
        [0, 0],
        [1, 1],
        [2, 2],
        [3, null] // Bexley Construction deleted
    ]);
}

// Test case 3: Multiple rows with same index key values.
// Every data row shares the values of year1..year4, so rows that are
// different can look alike in the indexed columns. The fallback should still
// pair the one row left unmatched (Delta), which differs between the tables
// only in year5.
{
    var t1 = new coopy.TableView([
        ["year1","year2","year3","year4","year5","sector","code"],
        ["1000","2000","3000","4000","5000","Alpha","A1"],
        ["1000","2000","3000","4000","5000","Beta","B1"],
        ["1000","2000","3000","4000","5000","Gamma","C1"],
        ["1000","2000","3000","4000","9999","Delta","D1"]
    ]);
    var t2 = new coopy.TableView([
        ["year1","year2","year3","year4","year5","sector","code"],
        ["1000","2000","3000","4000","5000","Alpha","A1"],
        ["1000","2000","3000","4000","5000","Beta","B1"],
        ["1000","2000","3000","4000","5000","Gamma","C1"],
        ["1000","2000","3000","4000","8888","Delta","D1"]
    ]);

    // compareTables is called as a plain function (no `new`): it hands back
    // the comparison object itself, so a constructor call is misleading.
    var ct = coopy.Coopy.compareTables(t1, t2);
    var align = ct.align();

    // All rows should match despite having similar year columns
    // Row 4 (Delta) differs only in year5 (9999 vs 8888)
    tester.align_asserts(align, [
        [0, 0],
        [1, 1],
        [2, 2],
        [3, 3],
        [4, 4]
    ]);

    tester.bi_round_trip(t1, t2, "rows with similar index key values");
}

// Printed once every test block above has run to completion.
// NOTE(review): this only signals success if the tester helpers throw on a
// failed assertion -- confirm against the tester module.
console.log("All row alignment heuristic tests passed!");