Skip to content

Commit 02bc3d3

Browse files
committed
Add elastic-like fuzziness parameter
1 parent 490854a commit 02bc3d3

2 files changed

Lines changed: 84 additions & 9 deletions

File tree

internal/routes/dictionary_fix.go

Lines changed: 56 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -2,8 +2,12 @@ package routes
22

33
import (
44
"context"
5+
"encoding/json"
56
"errors"
7+
"fmt"
68
"regexp"
9+
"strconv"
10+
"strings"
711
"unicode/utf8"
812

913
"github.com/f1monkey/spellchecker-web/internal/spellchecker"
@@ -17,12 +21,12 @@ type dictionaryGetter interface {
1721
}
1822

1923
// DictionaryFixRequest is the request payload for the dictionary fix handler.
// Code is bound from the URL path; the remaining fields are bound from the
// JSON body. Fuzziness is kept as a raw FuzzinessValue and is parsed by the
// handler before suggestions are scored.
type DictionaryFixRequest struct {
20-
Code string `path:"code" minLength:"1"`
21-
22-
Text string `json:"text" description:"Phrase to be checked"`
23-
Limit int `json:"limit" default:"5" desciption:"Max suggestions per word"`
24-
MaxErrors int `json:"maxErrors" default:"2" desciption:"Max spellchecker errors allowed"`
25-
SimilarityThreshold float64 `json:"similarityThreshold" minimum:"0" maximum:"1" desciption:"Word similarity percent required"`
24+
Code string `path:"code" minLength:"1" description:"Dictionary code to use for spellchecking."`
25+
Text string `json:"text" description:"Input text to be checked and corrected."`
26+
Limit int `json:"limit" default:"5" description:"Maximum number of suggestions to return per word."`
27+
MaxErrors int `json:"maxErrors" default:"2" description:"Maximum number of bit-level differences allowed between the input word and a dictionary word. Examples: deletion=1 bit (proble→problem), insertion=1 bit (problemm→problem), substitution=2 bits (problam→problem), transposition=0 bits (problme→problem). Not recommended to set higher than 2, as it can impact performance."`
28+
Fuzziness FuzzinessValue `json:"fuzziness" description:"Maximum allowed Levenshtein edit distance. Allowed values: '0','1','2'... (fixed distance), 'AUTO' (auto by word length, default AUTO:3,6), 'AUTO:low,high' (custom range). See: https://www.elastic.co/docs/reference/elasticsearch/rest-apis/common-options#fuzziness"`
29+
SimilarityThreshold float64 `json:"similarityThreshold" minimum:"0" maximum:"1" description:"Required similarity ratio between input word and candidate suggestion (0.0–1.0). Example: 0.6 = candidate must be at least 60% similar to input."`
2630
}
2731

2832
type DictionaryFixResponse struct {
@@ -62,6 +66,11 @@ func dictionaryFix(registry dictionaryGetter, splitter *regexp.Regexp) usecase.I
6266
return status.Wrap(err, status.Internal)
6367
}
6468

69+
fuzziness, err := input.Fuzziness.Parse()
70+
if err != nil {
71+
return status.Wrap(err, status.InvalidArgument)
72+
}
73+
6574
if input.Text == "" {
6675
output.Fixes = make([]Fix, 0)
6776
return nil
@@ -85,7 +94,7 @@ func dictionaryFix(registry dictionaryGetter, splitter *regexp.Regexp) usecase.I
8594

8695
suggestions := sc.Suggest(&f1mspellchecker.SearchOptions{
8796
MaxErrors: input.MaxErrors,
88-
FilterFunc: spellchecker.ScoringFunc(input.MaxErrors, input.SimilarityThreshold),
97+
FilterFunc: spellchecker.ScoringFunc(fuzziness, input.SimilarityThreshold),
8998
}, word, input.Limit)
9099

91100
if suggestions.ExactMatch {
@@ -129,3 +138,43 @@ func dictionaryFix(registry dictionaryGetter, splitter *regexp.Regexp) usecase.I
129138

130139
return u
131140
}
141+
142+
type FuzzinessValue string
143+
144+
// Parse converts FuzzinessValue into spellchecker.Fuzziness.
145+
func (fv FuzzinessValue) Parse() (spellchecker.Fuzziness, error) {
146+
raw := strings.TrimSpace(strings.ToUpper(string(fv)))
147+
148+
switch {
149+
case raw == "":
150+
return spellchecker.FixedFuzziness(0), nil
151+
case raw == "AUTO":
152+
return spellchecker.AutoFuzziness{Low: 3, High: 6}, nil
153+
case strings.HasPrefix(raw, "AUTO:"):
154+
parts := strings.Split(strings.TrimPrefix(raw, "AUTO:"), ",")
155+
if len(parts) != 2 {
156+
return nil, fmt.Errorf("invalid AUTO fuzziness format: %q", raw)
157+
}
158+
low, err1 := strconv.Atoi(parts[0])
159+
high, err2 := strconv.Atoi(parts[1])
160+
if err1 != nil || err2 != nil {
161+
return nil, fmt.Errorf("invalid AUTO fuzziness values: %q", raw)
162+
}
163+
return spellchecker.AutoFuzziness{Low: low, High: high}, nil
164+
default:
165+
if n, err := strconv.Atoi(raw); err == nil {
166+
return spellchecker.FixedFuzziness(n), nil
167+
}
168+
return nil, fmt.Errorf("unknown fuzziness value: %q", raw)
169+
}
170+
}
171+
172+
// UnmarshalJSON validates that fuzziness is passed as a JSON string.
173+
func (fv *FuzzinessValue) UnmarshalJSON(data []byte) error {
174+
var raw string
175+
if err := json.Unmarshal(data, &raw); err != nil {
176+
return fmt.Errorf("fuzziness must be a string: %w", err)
177+
}
178+
*fv = FuzzinessValue(raw)
179+
return nil
180+
}

internal/spellchecker/scoring.go

Lines changed: 28 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7,10 +7,36 @@ import (
77
"github.com/f1monkey/spellchecker/v2"
88
)
99

10-
func ScoringFunc(maxErrors int, similarityThreshold float64) spellchecker.FilterFunc {
10+
// Fuzziness decides how many edits a candidate may differ by from the source
// word and still be accepted.
type Fuzziness interface {
	// MaxAllowedErrors returns the largest accepted edit distance for a
	// source word of length wordLen.
	MaxAllowedErrors(wordLen int) int
}
13+
14+
// FixedFuzziness is a Fuzziness that allows the same number of edits for
// every word, whatever its length.
type FixedFuzziness int

// MaxAllowedErrors returns the fixed edit budget; the word length is ignored.
func (f FixedFuzziness) MaxAllowedErrors(int) int {
	budget := int(f)
	return budget
}
19+
20+
// AutoFuzziness is a Fuzziness whose edit budget grows with word length,
// using two length thresholds.
type AutoFuzziness struct {
	Low, High int
}

// MaxAllowedErrors returns 0 for words shorter than Low, 1 for words shorter
// than High, and 2 for everything else. The Low check is evaluated first.
func (a AutoFuzziness) MaxAllowedErrors(wordLen int) int {
	switch {
	case wordLen < a.Low:
		return 0
	case wordLen < a.High:
		return 1
	default:
		return 2
	}
}
35+
36+
func ScoringFunc(fuzziness Fuzziness, similarityThreshold float64) spellchecker.FilterFunc {
1137
return func(src, candidate []rune, count uint) (float64, bool) {
1238
distance, prefixLen, suffixLen := levenshtein.Calculate(src, candidate, 0, 1, 1, 1)
13-
if distance > maxErrors {
39+
if distance > fuzziness.MaxAllowedErrors(len(src)) {
1440
return 0, false
1541
}
1642

0 commit comments

Comments
 (0)