@@ -116,12 +116,13 @@ defmodule Dmp.Match do
116116 s = alphabet ( pattern )
117117
118118 # Initialise the bit arrays.
119- matchmask = 1 <<< ( pattern_length - 1 )
120- shiftmask = ( 1 <<< pattern_length ) - 1
119+ match_mask = 1 <<< ( pattern_length - 1 )
120+ overflow_mask = ( 1 <<< pattern_length ) - 1
121121 text_length = String . length ( text )
122122
123123 constants =
124- { text , pattern , loc , s , matchmask , shiftmask , text_length , pattern_length , match_distance }
124+ { text , pattern , loc , s , match_mask , overflow_mask , text_length , pattern_length ,
125+ match_distance }
125126
126127 # Uncomment to see the bitarray
127128 # debug_alphabet(pattern, s) |> Enum.join("\n") |> IO.puts
@@ -207,14 +208,14 @@ defmodule Dmp.Match do
207208 # previous `d-1` error level.
208209 # `text_length` - $$n$$ in the Wu and Manber paper.
209210 # `pattern_length` - $$m$$ in the Wu and Manber paper.
210- # `matchmask ` - Bitmask to test the value of $$R_j+1 [m]$$.
211+ # `match_mask ` - Bitmask to test the value of $$R_{j+1} [m]$$.
211212 #
212213 # Returns updated `best_loc`, `score_threshold`, and `max_distance`,
213214 # and the calculated $$R_j^d$$ bitarray from this level.
214215 defp search_at_error_level (
215216 d ,
216217 { best_loc , score_threshold , max_distance , last_rd } ,
217- { text , _pattern , loc , s , matchmask , shiftmask , text_length , pattern_length ,
218+ { text , _pattern , loc , s , match_mask , overflow_mask , text_length , pattern_length ,
218219 match_distance }
219220 ) do
220221 distance =
@@ -232,7 +233,9 @@ defmodule Dmp.Match do
232233 # for debugging we store the "size" of the array at index -1
233234 rd = % { ( finish + 1 ) => ( 1 <<< d ) - 1 , - 1 => finish + 2 }
234235 acc2 = { best_loc , score_threshold , start , rd }
235- constants2 = { d , text , loc , last_rd , s , matchmask , shiftmask , pattern_length , match_distance }
236+
237+ constants2 =
238+ { d , text , loc , last_rd , s , match_mask , overflow_mask , pattern_length , match_distance }
236239
237240 { best_loc , score_threshold , _j , rd } =
238241 Enum . reduce_while ( finish .. 0 // - 1 , acc2 , fn j , acc ->
@@ -263,26 +266,15 @@ defmodule Dmp.Match do
263266 # `bin_mid` - Midpoint between `bin_min` and `bin_max`
264267 #
265268 # Returns `bin_mid`, where `loc + bin_mid` has the lowest bitap score.
266- defp best_distance (
267- bin_min ,
268- bin_mid ,
269- _bin_max ,
270- _acc
271- )
272- when bin_min >= bin_mid do
273- # Done
274- bin_mid
275- end
269+ defp best_distance ( bin_min , bin_mid , _ , _ ) when bin_min >= bin_mid , do: bin_mid
276270
277271 defp best_distance (
278272 bin_min ,
279273 bin_mid ,
280274 bin_max ,
281275 { d , loc , pattern_length , score_threshold , match_distance }
282276 ) do
283- # Loop
284- mid_loc = loc + bin_mid
285- score = bitap_score ( d , mid_loc , loc , pattern_length , match_distance )
277+ score = bitap_score ( d , loc + bin_mid , loc , pattern_length , match_distance )
286278
287279 { bin_min , bin_max } =
288280 if score <= score_threshold do
@@ -301,55 +293,104 @@ defmodule Dmp.Match do
301293 )
302294 end
303295
304- # This is the heart of the bitap algorithm.
305- #
306- # Updates the `rd` bitarray for the current error level `d` at the index `j`
307- # (representing the zero-based location `j - 1` in `text`), and
308- # then tests for a match (an exact match if `d == 0` or a match with
309- # `d` errors).
310- #
311- # If a match is found, we calculate the error score (number of errors and
312- # distance from expected location) and if it's lower than the current
313- # threshold, we stop calculating the update if we have already gone
314- # below the minimum possible location, or continue the update
315- # going with a smaller range of indices.
316- defp bitap_update (
317- j ,
318- { best_loc , score_threshold , start , rd } ,
319- _constants
320- )
321- when j < start do
296+ @ typedoc "Accumulator for `bitap_update/3`."
297+ @ type update_acc ( ) :: { integer ( ) , float ( ) , non_neg_integer ( ) , bitap_array ( ) }
298+ @ typep update_constants ( ) ::
299+ { non_neg_integer ( ) , String . t ( ) , integer ( ) , bitap_array ( ) , alpha ( ) , non_neg_integer ( ) ,
300+ non_neg_integer ( ) , non_neg_integer ( ) , non_neg_integer ( ) }
301+
302+ @ doc """
303+ Perform the bitap algorithm and calculate error score if a match is found.
304+
305+ * `acc` - Accumulator tuple, with `best_loc`, `score_threshold`, `start`, and `rd` elements.
306+ * `constants` - Other constant values needed for calculations.
307+
308+ Updates the `rd` bitarray for the current error level `d` at the index `j`
309+ (representing the zero-based location `j - 1` in `text`), and
310+ then tests for a match (an exact match if `d == 0` or a match with
311+ `d` errors).
312+
313+ If a match is found at position `j`, calculate the error score
314+ (based on the error level `d` and the distance from expected location).
315+ If the score is lower than the current threshold, stop calculating the update
316+ if we have already gone below the minimum possible location,
317+ or continue the update, limiting the range of `j` (increasing the
318+ `start` value).
319+
320+ ## Notes
321+
322+ The `j` index is decremented from the end of the text to the start of the text.
323+ Since the iteration is moving from high `j` to low, `bitap_update` does "Lshift"
324+ operations, not the "Rshift" operations in the Wu and Manber paper, and uses
325+ the previous values that were set at `j + 1`, not `j`.
326+
327+ Here the calculations are:
328+
329+ $$Rsubscptj^d = \\ begin{cases}
330+ Lshift [ Rsubscpt{j+1}^d ] \\ text{ AND } S_c &\\ text{if } d = 0 \\ cr
331+ Lshift [ Rsubscpt{j+1}^d ] \\ text{ AND } S_c \\ text{ OR } Lshift [ Rsubscptj^{d-1} \\ text{ OR } Rsubscpt{j+1}^{d-1} ] \\ text{ OR } Rsubscpt{j+1}^{d-1} &\\ text{otherwise}
332+ \\ end{cases}$$
333+
334+ versus in Wu and Manber's paper:
335+
336+ $$Rsubscpt{j+1}^d = \\ begin{cases}
337+ Rshift [ Rsubscpt{j}^d ] \\ text{ AND } S_c &\\ text{if } d = 0 \\ cr
338+ Rshift [ Rsubscpt{j}^d ] \\ text{ AND } S_c \\ text{ OR } Rshift [ Rsubscptj^{d-1} \\ text{ OR } Rsubscpt{j+1}^{d-1} ] \\ text{ OR } Rsubscpt{j}^{d-1} &\\ text{otherwise}
339+ \\ end{cases}$$
340+
341+ """
342+ @ spec bitap_update ( non_neg_integer ( ) , update_acc ( ) , update_constants ( ) ) ::
343+ { :cont | :halt , update_acc ( ) }
344+ def bitap_update ( j , acc , constants )
345+
346+ def bitap_update (
347+ j ,
348+ { best_loc , score_threshold , start , rd } ,
349+ _constants
350+ )
351+ when j < start do
322352 # Exceeded our current distance from loc. Done.
323353 # Return `j + 1` in the `start` position and break the iteration.
324354 { :halt , { best_loc , score_threshold , j + 1 , rd } }
325355 end
326356
327- defp bitap_update (
328- j ,
329- { best_loc , score_threshold , start , rd } ,
330- { d , text , loc , last_rd , s , matchmask , shiftmask , pattern_length , match_distance }
331- ) do
357+ def bitap_update (
358+ j ,
359+ { best_loc , score_threshold , start , rd } ,
360+ { d , text , loc , last_rd , s , match_mask , overflow_mask , pattern_length , match_distance }
361+ ) do
362+ # $$S_c$$
332363 char_match = s_c ( s , String . at ( text , j - 1 ) )
333364
334- rd_j_1 = Map . get ( rd , j + 1 , 0 ) |> shift_left ( shiftmask )
335- rd_j_1 = rd_j_1 &&& char_match
365+ # Perform shift-OR update
366+
367+ # $$Lshift[R_{j+1}^d] AND S_c$$
368+ shift_d_and_s_c = ( Map . get ( rd , j + 1 , 0 ) <<< 1 ||| 1 ) &&& char_match
336369
337370 rd_j =
338371 if d == 0 do
339372 # First pass: exact match.
340- rd_j_1
373+ shift_d_and_s_c
341374 else
342375 # Subsequent passes: fuzzy match.
343- last_rd_j_1 = Map . get ( last_rd , j + 1 , 0 )
344- last_rd_j = Map . get ( last_rd , j , 0 )
345- last_rd_j = ( last_rd_j ||| last_rd_j_1 ) |> shift_left ( shiftmask )
346- rd_j_1 ||| last_rd_j ||| last_rd_j_1
376+ # $$R_{j+1}^{d-1}$$
377+ rd_d1_j1 = Map . get ( last_rd , j + 1 , 0 )
378+ # $$R_j^{d-1}$$
379+ rd_d1_j = Map . get ( last_rd , j , 0 )
380+
381+ # Restrict shifted values to pattern_length with $$AND overflow_mask$$
382+ # $$Lshift[R_j^{d-1} OR R_{j+1}^{d-1}]$$
383+ shift_d1 = ( ( rd_d1_j ||| rd_d1_j1 ) <<< 1 ||| 1 ) &&& overflow_mask
384+
385+ # $$Lshift[R_{j+1}^d] AND S_c OR Lshift[R_j^{d-1} OR R_{j+1}^{d-1}] OR R_{j+1}^{d-1}$$
386+ shift_d_and_s_c ||| shift_d1 ||| rd_d1_j1
347387 end
348388
349389 # Update mask array
350390 rd = Map . put ( rd , j , rd_j )
351391
352- if ( rd_j &&& matchmask ) != 0 do
392+ # Test for a match: $$if R_{j+1}^d[m] = 1$$
393+ if ( rd_j &&& match_mask ) != 0 do
353394 # Found a match
354395 test_score_at_match (
355396 d ,
@@ -363,11 +404,6 @@ defmodule Dmp.Match do
363404 end
364405 end
365406
366- # Keep values during shifts from overflowing by ANDing with `shiftmask`
367- defp shift_left ( val , shiftmask ) do
368- ( val <<< 1 &&& shiftmask ) ||| 1
369- end
370-
371407 # We found a match during `bitap_update/3`. Verify
372408 # that it is the best match and either stop the `rd` array update
373409 # if we have already passed `loc` or reduce the range of indices
0 commit comments