diff --git a/coopy/CompareTable.hx b/coopy/CompareTable.hx
index 3540e60..2b835e3 100644
--- a/coopy/CompareTable.hx
+++ b/coopy/CompareTable.hx
@@ -389,6 +389,81 @@ class CompareTable {
                 scale = -1;
             }
         }
+        // Fallback matching for rows that couldn't be matched via key equality.
+        // The nearest matched rows above/below an unmatched row in A predict its
+        // position in B (when both exist they must be equally spaced in A and B).
+        // If that position in B is also unmatched, link the two rows IF more than
+        // half of the common columns agree AND no other unmatched B row shares its key.
+        if (pending_ct>0 && reverse_pending_ct>0 && index_top!=null) {
+            for (xa in 1...ha) {
+                if (used_reverse.exists(xa)) continue; // already matched
+                // Find the previous matched row
+                var prev_xa : Int = xa - 1;
+                while (prev_xa >= 0 && !used_reverse.exists(prev_xa)) {
+                    prev_xa--;
+                }
+                // Find the next matched row
+                var next_xa : Int = xa + 1;
+                while (next_xa < ha && !used_reverse.exists(next_xa)) {
+                    next_xa++;
+                }
+                // Get their mappings in B
+                var prev_xb : Null<Int> = (prev_xa >= 0) ? align.a2b(prev_xa) : null;
+                var next_xb : Null<Int> = (next_xa < ha) ? align.a2b(next_xa) : null;
+                // Calculate expected position in B
+                var expected_xb : Null<Int> = null;
+                if (prev_xb != null && next_xb != null) {
+                    // Only interpolate if the neighbours are equally far apart in A and B
+                    if (next_xb - prev_xb == next_xa - prev_xa) {
+                        expected_xb = prev_xb + (xa - prev_xa);
+                    }
+                } else if (prev_xb != null) {
+                    expected_xb = prev_xb + (xa - prev_xa);
+                } else if (next_xb != null) {
+                    expected_xb = next_xb - (next_xa - xa);
+                }
+                if (expected_xb != null && expected_xb >= 1 && expected_xb < hb) {
+                    if (!used.exists(expected_xb)) {
+                        // Check if there are other unmatched rows in B with the same key
+                        // that could create ambiguity. We only skip if there are multiple
+                        // UNMATCHED rows with the same key.
+                        var kb : String = index_top.remoteKey(expected_xb);
+                        var unmatched_with_same_key : Int = 0;
+                        for (xb in 0...hb) {
+                            if (xb == expected_xb) continue;
+                            if (used.exists(xb)) continue; // already matched
+                            if (index_top.remoteKey(xb) == kb) unmatched_with_same_key++;
+                        }
+                        // Only skip if there are other unmatched rows with same key
+                        if (unmatched_with_same_key > 0) continue;
+
+                        // Check if rows have substantial content in common
+                        var dominated_score : Int = 0;
+                        var common_count : Int = 0;
+                        for (cunit in common_units) {
+                            var ca : Int = cunit.l;
+                            var cb : Int = cunit.r;
+                            if (ca >= 0 && cb >= 0) {
+                                common_count++;
+                                var va : Dynamic = a.getCell(ca, xa);
+                                var vb : Dynamic = b.getCell(cb, expected_xb);
+                                if (av.equals(va, vb)) {
+                                    dominated_score++;
+                                }
+                            }
+                        }
+                        // Only match if strictly more than half the columns match
+                        if (common_count > 0 && dominated_score * 2 > common_count) {
+                            align.link(xa, expected_xb);
+                            used.set(expected_xb, 1);
+                            used_reverse.set(xa, 1);
+                            pending_ct--;
+                            reverse_pending_ct--;
+                        }
+                    }
+                }
+            }
+        }
         // for consistency, explicitly mark unaligned things
         for (i in 1...ha) {
             if (!used_reverse.exists(i)) {
diff --git a/test/test_row_alignment_heuristic.js b/test/test_row_alignment_heuristic.js
new file mode 100644
index 0000000..2988cac
--- /dev/null
+++ b/test/test_row_alignment_heuristic.js
@@ -0,0 +1,116 @@
+// Test for row alignment when column selection heuristic excludes distinguishing columns.
+// This tests the fix for a bug where rows that differ by only one column were incorrectly
+// reported as delete+insert instead of modification, when:
+// 1. The table has many columns (>5) triggering the column selection heuristic
+// 2. The selected columns cannot distinguish similar rows within the same table
+// 3. The only distinguishing column is also the one that differs between tables
+
+var coopy = require('daff');
+var tester = require('tester');
+
+// Test case 1: Original broken case (broken-case/a.csv vs b.csv)
+// Row 5 (Transport & storage, Bexley) differs only in column "2023" (7000 vs 8000)
+// With 9 columns and 7 rows, the heuristic might not select columns that
+// distinguish this row from row 4 (Construction, Bexley)
+{
+    var t1 = new coopy.TableView([
+        ["1971","2023","sector_ID","sector","1973","gss_code","borough","1985","2019"],
+        ["250","1250","16","Other services","500","E09000002","Barking and Dagenham","600","1000"],
+        ["6000","10000","3","Construction","7000","E09000003","Barnet","4500","9000"],
+        ["12000","15000","5","Retail","12000","E09000003","Barnet","15000","15000"],
+        ["3500","6000","3","Construction","4000","E09000004","Bexley","3500","6000"],
+        ["3500","7000","6","Transport & storage","4000","E09000004","Bexley","3500","6000"],
+        ["1250","2500","16","Other services","1000","E09000005","Brent","1250","2250"]
+    ]);
+    var t2 = new coopy.TableView([
+        ["1971","2023","sector_ID","sector","1973","gss_code","borough","1985","2019"],
+        ["250","1250","16","Other services","500","E09000002","Barking and Dagenham","600","1000"],
+        ["6000","10000","3","Construction","7000","E09000003","Barnet","4500","9000"],
+        ["12000","15000","5","Retail","12000","E09000003","Barnet","15000","15000"],
+        ["3500","6000","3","Construction","4000","E09000004","Bexley","3500","6000"],
+        ["3500","8000","6","Transport & storage","4000","E09000004","Bexley","3500","6000"],
+        ["1250","2500","16","Other services","1000","E09000005","Brent","1250","2250"]
+    ]);
+
+    var ct = new coopy.Coopy.compareTables(t1, t2);
+    var align = ct.align();
+
+    // All rows should match to the same position
+    tester.align_asserts(align, [
+        [0, 0],  // header
+        [1, 1],  // Barking and Dagenham
+        [2, 2],  // Barnet Construction
+        [3, 3],  // Barnet Retail
+        [4, 4],  // Bexley Construction
+        [5, 5],  // Bexley Transport & storage (the row that was incorrectly unmatched)
+        [6, 6]   // Brent
+    ]);
+
+    // Verify round-trip patching works
+    tester.bi_round_trip(t1, t2, "row alignment with many columns - single cell change");
+}
+
+// Test case 2: Verify that truly different rows are NOT matched
+// When the last row is completely different, it should be delete+insert, not modify
+{
+    var t1 = new coopy.TableView([
+        ["1971","2023","sector_ID","sector","1973","gss_code","borough","1985","2019"],
+        ["250","1250","16","Other services","500","E09000002","Barking and Dagenham","600","1000"],
+        ["6000","10000","3","Construction","7000","E09000003","Barnet","4500","9000"],
+        ["3500","6000","3","Construction","4000","E09000004","Bexley","3500","6000"]
+    ]);
+    var t2 = new coopy.TableView([
+        ["1971","2023","sector_ID","sector","1973","gss_code","borough","1985","2019"],
+        ["250","1250","16","Other services","500","E09000002","Barking and Dagenham","600","1000"],
+        ["6000","10000","3","Construction","7000","E09000003","Barnet","4500","9000"],
+        ["9999","9999","99","Completely Different","9999","E09000099","Nowhere","9999","9999"]
+    ]);
+
+    var ct = new coopy.Coopy.compareTables(t1, t2);
+    var align = ct.align();
+
+    // Row 3 should NOT match because it's completely different
+    tester.align_asserts(align, [
+        [0, 0],
+        [1, 1],
+        [2, 2],
+        [3, null]  // Bexley Construction deleted
+    ]);
+}
+
+// Test case 3: Multiple rows with same index key values
+// When rows share values in the indexed columns but are different rows,
+// the fallback should still work if only one is unmatched
+{
+    var t1 = new coopy.TableView([
+        ["year1","year2","year3","year4","year5","sector","code"],
+        ["1000","2000","3000","4000","5000","Alpha","A1"],
+        ["1000","2000","3000","4000","5000","Beta","B1"],
+        ["1000","2000","3000","4000","5000","Gamma","C1"],
+        ["1000","2000","3000","4000","9999","Delta","D1"]
+    ]);
+    var t2 = new coopy.TableView([
+        ["year1","year2","year3","year4","year5","sector","code"],
+        ["1000","2000","3000","4000","5000","Alpha","A1"],
+        ["1000","2000","3000","4000","5000","Beta","B1"],
+        ["1000","2000","3000","4000","5000","Gamma","C1"],
+        ["1000","2000","3000","4000","8888","Delta","D1"]
+    ]);
+
+    var ct = new coopy.Coopy.compareTables(t1, t2);
+    var align = ct.align();
+
+    // All rows should match despite having similar year columns
+    // Row 4 (Delta) differs only in year5 (9999 vs 8888)
+    tester.align_asserts(align, [
+        [0, 0],
+        [1, 1],
+        [2, 2],
+        [3, 3],
+        [4, 4]
+    ]);
+
+    tester.bi_round_trip(t1, t2, "rows with similar index key values");
+}
+
+console.log("All row alignment heuristic tests passed!");