Skip to content

Commit 336a269

Browse files
author
Bartosz Smoczynski
committed
Optionally return match operations in get_all_paths
1 parent 3231815 commit 336a269

6 files changed

Lines changed: 49 additions & 29 deletions

File tree

editdistance/_edit_distance_osa.cpp

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -91,19 +91,30 @@ std::vector<std::vector<CppEditop>> backtrack_all_paths(
9191
all_paths.insert(all_paths.end(), paths.begin(), paths.end());
9292
current_path.pop_back();
9393
}
94-
95-
if (i > 0 && j > 0) {
96-
double sub_cost = (a[i-1] == b[j-1]) ? 0.0 : replace_weight;
97-
if (std::abs((dp[i-1][j-1] + sub_cost) - current_cost) < tol) {
98-
std::string out_char = (sub_cost == 0.0) ? std::string(1, a[i-1]) : std::string(1, b[j-1]);
99-
CppEditop op(REPLACE, i-1, j-1, sub_cost, out_char);
94+
95+
if (i > 0 && j > 0 && a[i-1] != b[j-1]) {
96+
if (std::abs((dp[i-1][j-1] + replace_weight) - current_cost) < tol) {
97+
std::string out_char = std::string(1, b[j-1]);
98+
CppEditop op(REPLACE, i-1, j-1, replace_weight, out_char);
10099
current_path.push_back(op);
101100
auto paths = backtrack_all_paths(a, b, dp, i-1, j-1, current_path, replace_weight, insert_weight, delete_weight, swap_weight);
102101
all_paths.insert(all_paths.end(), paths.begin(), paths.end());
103102
current_path.pop_back();
104103
}
105104
}
106-
105+
106+
if (i > 0 && j > 0 && a[i-1] == b[j-1]) {
107+
double match_weight = 0.0; // We might want to make this non-zero in the future
108+
if (std::abs((dp[i-1][j-1] + match_weight) - current_cost) < tol) {
109+
std::string out_char = std::string(1, a[i-1]);
110+
CppEditop op(MATCH, i-1, j-1, match_weight, out_char);
111+
current_path.push_back(op);
112+
auto paths = backtrack_all_paths(a, b, dp, i-1, j-1, current_path, replace_weight, insert_weight, delete_weight, swap_weight);
113+
all_paths.insert(all_paths.end(), paths.begin(), paths.end());
114+
current_path.pop_back();
115+
}
116+
}
117+
107118
if (i > 1 && j > 1 &&
108119
a[i-1] == b[j-2] && a[i-2] == b[j-1] &&
109120
std::abs((dp[i-2][j-2] + swap_weight) - current_cost) < tol) {
@@ -161,6 +172,7 @@ std::string editop_name_to_string(CppEditopName name) {
161172
case DELETE: return "DELETE";
162173
case REPLACE: return "REPLACE";
163174
case SWAP: return "SWAP";
175+
case MATCH: return "MATCH";
164176
default: return "UNKNOWN";
165177
}
166178
}

editdistance/_edit_distance_osa.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@ enum CppEditopName {
1111
INSERT,
1212
DELETE,
1313
REPLACE,
14-
SWAP
14+
SWAP,
15+
MATCH
1516
};
1617

1718
struct CppEditop {

editdistance/edit_distance_osa.pyx

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# distutils: language = c++
22
# distutils: sources = ./editdistance/_edit_distance_osa.cpp
33

4+
from libcpp cimport bool
45
from libcpp.map cimport map
56
from libcpp.string cimport string
67
from libcpp.vector cimport vector
@@ -14,6 +15,7 @@ cdef extern from "_edit_distance_osa.hpp":
1415
DELETE
1516
REPLACE
1617
SWAP
18+
MATCH
1719

1820
cdef struct CppEditop:
1921
CppEditopName name
@@ -38,6 +40,7 @@ class EditopName(Enum):
3840
DELETE = 1
3941
REPLACE = 2
4042
SWAP = 3
43+
MATCH = 4
4144

4245

4346
cdef class Editop:
@@ -64,7 +67,8 @@ def get_all_paths(
6467
double replace_weight=1.0,
6568
double insert_weight=1.0,
6669
double delete_weight=1.0,
67-
double swap_weight=1.0
70+
double swap_weight=1.0,
71+
bool return_matches=False,
6872
):
6973
cdef string cpp_a = a.encode("utf-8")
7074
cdef string cpp_b = b.encode("utf-8")
@@ -76,8 +80,6 @@ def get_all_paths(
7680
for cpp_path in cpp_paths:
7781
python_path = []
7882
for cpp_op in cpp_path:
79-
if cpp_op.cost == 0:
80-
continue
8183
if cpp_op.name == INSERT:
8284
py_name = EditopName.INSERT
8385
elif cpp_op.name == DELETE:
@@ -86,6 +88,11 @@ def get_all_paths(
8688
py_name = EditopName.REPLACE
8789
elif cpp_op.name == SWAP:
8890
py_name = EditopName.SWAP
91+
elif cpp_op.name == MATCH:
92+
if return_matches:
93+
py_name = EditopName.MATCH
94+
else:
95+
continue
8996
else:
9097
py_name = None
9198
python_path.append(Editop(
@@ -99,12 +106,11 @@ def get_all_paths(
99106
return python_paths
100107

101108
def apply_editops(src, dst, editops):
109+
# assumes editops are sorted from left to right
110+
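# assumes MATCH operations are included (e.g. paths from get_all_paths(..., return_matches=True)); source characters not covered by an editop are not copied to the result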
102111
src_idx = 0
103112
s = ""
104113
for op in editops:
105-
while src_idx < op.src_idx:
106-
s += src[src_idx]
107-
src_idx += 1
108114
if op.name == EditopName.INSERT:
109115
s += dst[op.dst_idx]
110116
elif op.name == EditopName.DELETE:
@@ -116,9 +122,9 @@ def apply_editops(src, dst, editops):
116122
s += src[op.src_idx + 1]
117123
s += src[op.src_idx]
118124
src_idx += 2
119-
while src_idx < len(src):
120-
s += src[src_idx]
121-
src_idx += 1
125+
elif op.name == EditopName.MATCH:
126+
s += src[op.src_idx]
127+
src_idx += 1
122128
return s
123129

124130

examples/osa_example.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,7 @@
55
"""
66

77
try:
8-
from editdistance.osa import (
9-
compute_distance,
10-
get_all_paths,
11-
)
8+
from editdistance.osa import compute_distance, get_all_paths
129

1310
def main():
1411
# Test case from original Python code
@@ -25,12 +22,20 @@ def main():
2522
print(f"Distance: {distance}")
2623

2724
paths = get_all_paths(source, target)
25+
paths_with_matches = get_all_paths(source, target, return_matches=True)
2826
print(f"Number of optimal edit sequences: {len(paths)}")
2927

28+
print("Paths without match editops:")
3029
for i, path in enumerate(paths, 1):
31-
print(f"Path {i}:")
30+
print(f" Path {i}:")
3231
for op in path:
33-
print(f" {op}")
32+
print(f" {op}")
33+
print()
34+
print("Paths with match editops:")
35+
for i, path in enumerate(paths_with_matches, 1):
36+
print(f" Path {i}:")
37+
for op in path:
38+
print(f" {op}")
3439
print()
3540

3641
if __name__ == "__main__":

tests/__init__.py

Whitespace-only changes.

tests/tests_osa.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,6 @@
11
import unittest
22

3-
from editdistance.osa import (
4-
apply_editops,
5-
compute_distance,
6-
get_all_paths,
7-
)
3+
from editdistance.osa import apply_editops, compute_distance, get_all_paths
84

95
COMPUTE_DISTANCE_TEST_CASES = [
106
("single character", "a", "b", 1.0),
@@ -76,7 +72,7 @@ def test_get_all_paths(self):
7672
def test_editops_transform(self):
7773
for src, dst in EDITOPS_TRANSFORM_TEST_CASES:
7874
with self.subTest(src=src, dst=dst):
79-
paths = get_all_paths(src, dst)
75+
paths = get_all_paths(src, dst, return_matches=True)
8076
self.assertTrue(paths, f"No paths found for {src} -> {dst}")
8177
for path in paths:
8278
result = apply_editops(src, dst, path)

0 commit comments

Comments
 (0)