Skip to content

Commit fa67a3e

Browse files
committed
feat: accept multi-word phrases in the vocabulary (#1035)
Signed-off-by: Joseph Kato <joseph@jdkato.io>
1 parent 2ff92e0 commit fa67a3e

11 files changed

Lines changed: 112 additions & 8 deletions

File tree

internal/check/conditional.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ type Conditional struct {
1515
First string
1616
Second string
1717
exceptRe *regexp2.Regexp
18+
phraseRe *regexp2.Regexp
1819
Ignorecase bool
1920
Vocab bool
2021
}
@@ -39,6 +40,7 @@ func NewConditional(cfg *core.Config, generic baseCheck, path string) (Condition
3940
return rule, core.NewE201FromPosition(err.Error(), path, 1)
4041
}
4142
rule.exceptRe = re
43+
rule.phraseRe = buildPhraseRe(rule.Exceptions, cfg.AcceptedTokens, rule.Vocab)
4244

4345
re, err = regexp2.CompileStd(rule.Second)
4446
if err != nil {
@@ -91,7 +93,7 @@ func (c Conditional) Run(blk nlp.Block, f *core.File, cfg *core.Config) ([]core.
9193
return alerts, err
9294
}
9395

94-
if !core.StringInSlice(s, f.Sequences) && !isMatch(c.exceptRe, s) {
96+
if !core.StringInSlice(s, f.Sequences) && !isMatch(c.exceptRe, s) && !withinPhrase(c.phraseRe, txt, loc) {
9597
// If we've found one (e.g., "WHO") and we haven't marked it as
9698
// being defined previously, send an Alert.
9799
a, erra := makeAlert(c.Definition, loc, txt, cfg)

internal/check/definition.go

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,53 @@ func updateExceptions(previous []string, current []string, vocab bool) (*regexp2
375375
return &regexp2.Regexp{}, nil
376376
}
377377

378+
// buildPhraseRe compiles a regex matching the multi-word entries among the
379+
// given exception terms (a rule's `exceptions` plus, when `vocab` is set, the
380+
// project's accepted vocabulary). It lets a rule suppress a finding that falls
381+
// within an accepted phrase even when the rule matched only one of the phrase's
382+
// component words -- e.g. `mea` within an accepted `mea culpa`. See #1035.
383+
//
384+
// Returns nil when there are no multi-word terms (or one fails to compile, in
385+
// which case `updateExceptions` surfaces the error).
386+
func buildPhraseRe(previous, current []string, vocab bool) *regexp2.Regexp {
387+
terms := append([]string{}, previous...)
388+
if vocab {
389+
terms = append(terms, current...)
390+
}
391+
392+
phrases := []string{}
393+
for _, term := range terms {
394+
if strings.ContainsAny(term, " \t") || strings.Contains(term, `\s`) {
395+
phrases = append(phrases, term)
396+
}
397+
}
398+
399+
if len(phrases) == 0 {
400+
return nil
401+
}
402+
403+
re, err := regexp2.CompileStd(ignoreCase + `\b(?:` + strings.Join(phrases, "|") + `)\b`)
404+
if err != nil {
405+
return nil
406+
}
407+
return re
408+
}
409+
410+
// withinPhrase reports whether the span `loc` falls entirely within a match of
411+
// `phraseRe` (an accepted multi-word phrase) in `txt`. Both `loc` and the
412+
// phrase spans are rune offsets, as returned by regexp2's FindAllStringIndex.
413+
func withinPhrase(phraseRe *regexp2.Regexp, txt string, loc []int) bool {
414+
if phraseRe == nil {
415+
return false
416+
}
417+
for _, span := range phraseRe.FindAllStringIndex(txt, -1) {
418+
if loc[0] >= span[0] && loc[1] <= span[1] {
419+
return true
420+
}
421+
}
422+
return false
423+
}
424+
378425
func decodeRule(input interface{}, output interface{}) error {
379426
config := mapstructure.DecoderConfig{
380427
ErrorUnused: true,

internal/check/existence.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ type Existence struct {
1818
// `exceptions` (`array`): An array of strings to be ignored.
1919
Exceptions []string
2020
exceptRe *regexp2.Regexp
21+
phraseRe *regexp2.Regexp
2122
pattern *regexp2.Regexp
2223
Append bool
2324
IgnoreCase bool
@@ -50,6 +51,7 @@ func NewExistence(cfg *core.Config, generic baseCheck, path string) (Existence,
5051
return rule, core.NewE201FromPosition(err.Error(), path, 1)
5152
}
5253
rule.exceptRe = re
54+
rule.phraseRe = buildPhraseRe(rule.Exceptions, cfg.AcceptedTokens, rule.Vocab && !rule.Nonword)
5355

5456
regex := makeRegexp(
5557
cfg.WordTemplate,
@@ -90,7 +92,7 @@ func (e Existence) Run(blk nlp.Block, _ *core.File, cfg *core.Config) ([]core.Al
9092
}
9193

9294
observed := strings.TrimSpace(converted)
93-
if !isMatch(e.exceptRe, observed) {
95+
if !isMatch(e.exceptRe, observed) && !withinPhrase(e.phraseRe, blk.Text, loc) {
9496
a, erra := makeAlert(e.Definition, loc, blk.Text, cfg)
9597
if erra != nil {
9698
return alerts, erra

internal/check/repetition.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ type Repetition struct {
2020
Exceptions []string
2121

2222
exceptRe *regexp2.Regexp
23+
phraseRe *regexp2.Regexp
2324
pattern *regexp2.Regexp
2425
}
2526

@@ -42,6 +43,7 @@ func NewRepetition(cfg *core.Config, generic baseCheck, path string) (Repetition
4243
return rule, core.NewE201FromPosition(err.Error(), path, 1)
4344
}
4445
rule.exceptRe = re
46+
rule.phraseRe = buildPhraseRe(rule.Exceptions, cfg.AcceptedTokens, rule.Vocab)
4547

4648
regex := ""
4749
if rule.Ignorecase {
@@ -102,9 +104,8 @@ func (o Repetition) Run(blk nlp.Block, _ *core.File, cfg *core.Config) ([]core.A
102104
// I almost forgot about that. That is important.
103105
//
104106
// All plans except a Personal plan can use Redis. Redis ...
105-
if !isMatch(o.exceptRe, converted) {
106-
floc := []int{ploc[0], loc[1]}
107-
107+
floc := []int{ploc[0], loc[1]}
108+
if !isMatch(o.exceptRe, converted) && !withinPhrase(o.phraseRe, txt, floc) {
108109
a, erra := makeAlert(o.Definition, floc, txt, cfg)
109110
if erra != nil {
110111
return alerts, erra

internal/check/spelling.go

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ type Spelling struct {
3535
Dicpath string
3636
Threshold int
3737
exceptRe *regexp2.Regexp
38+
phraseRe *regexp2.Regexp
3839
gs *spell.Checker
3940
Custom bool
4041
Append bool
@@ -85,6 +86,12 @@ func addExceptions(s *Spelling, generic baseCheck, cfg *core.Config) error { //n
8586
ignoreCase + strings.Join(s.Exceptions, "|"))
8687
}
8788

89+
// A multi-word term (e.g. `mea culpa`) is accepted only as a phrase; its
90+
// component words are still spell-checked on their own. We mask these in
91+
// `Run` via `phraseRe`, built from the same vocabulary as every other
92+
// Vocab-aware rule. See #1035.
93+
s.phraseRe = buildPhraseRe(nil, cfg.AcceptedTokens, true)
94+
8895
return nil
8996
}
9097

@@ -173,16 +180,30 @@ func (s Spelling) Run(blk nlp.Block, _ *core.File, _ *core.Config) ([]core.Alert
173180
// See https://github.com/errata-ai/vale/v2/issues/148.
174181
txt = s.gs.Convert(txt)
175182

183+
// Mask any accepted multi-word phrases (e.g. `mea culpa`) so their
184+
// component words aren't spell-checked individually, while the same words
185+
// elsewhere still are. We replace each match with an equal-length run of
186+
// spaces, which preserves the byte offsets of every other word. See #1035.
187+
checkTxt := txt
188+
if s.phraseRe != nil {
189+
masked, err := s.phraseRe.ReplaceFunc(txt, func(m regexp2.Match) string {
190+
return strings.Repeat(" ", len(m.String()))
191+
}, -1, -1)
192+
if err == nil {
193+
checkTxt = masked
194+
}
195+
}
196+
176197
OUTER:
177-
for _, word := range nlp.WordTokenizer.Tokenize(txt) {
198+
for _, word := range nlp.WordTokenizer.Tokenize(checkTxt) {
178199
for _, filter := range s.Filters {
179200
if filter.MatchString(word) {
180201
continue OUTER
181202
}
182203
}
183204

184205
if !s.gs.Spell(word) && !isMatch(s.exceptRe, word) {
185-
offset := strings.Index(txt, word)
206+
offset := strings.Index(checkTxt, word)
186207
loc := []int{offset, offset + len(word)}
187208

188209
a := core.Alert{Check: s.Name, Severity: s.Level, Span: loc,

internal/check/substitution.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ type Substitution struct {
1919
repl []string
2020
Swap map[string]string
2121
exceptRe *regexp2.Regexp
22+
phraseRe *regexp2.Regexp
2223
pattern *regexp2.Regexp
2324
Ignorecase bool
2425
Nonword bool
@@ -50,6 +51,7 @@ func NewSubstitution(cfg *core.Config, generic baseCheck, path string) (Substitu
5051
return rule, core.NewE201FromPosition(err.Error(), path, 1)
5152
}
5253
rule.exceptRe = re
54+
rule.phraseRe = buildPhraseRe(rule.Exceptions, cfg.AcceptedTokens, rule.Vocab)
5355

5456
regex := makeRegexp(
5557
cfg.WordTemplate,
@@ -136,7 +138,7 @@ func (s Substitution) Run(blk nlp.Block, _ *core.File, cfg *core.Config) ([]core
136138
} else {
137139
same = matchToken(expected, observed, false)
138140
}
139-
if !same && !isMatch(s.exceptRe, observed) {
141+
if !same && !isMatch(s.exceptRe, observed) && !withinPhrase(s.phraseRe, txt, loc) {
140142
action := s.Fields().Action
141143
if action.Name == "replace" && len(action.Params) == 0 {
142144
action.Params = getOptions(expected)

testdata/features/misc.feature

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,17 @@ Feature: Misc
3636
test.md:19:5:Vale.Terms:Use 'MyProduct Enterprise' instead of 'MyProduct enterprise'.
3737
"""
3838

39+
Scenario: Multi-word vocab phrases (#1035)
40+
# An accepted phrase is exempt across every Vocab-aware rule (here both
41+
# Vale.Spelling and a custom existence rule), while its component words
42+
# are still flagged on their own.
43+
When I use Vocab "Phrases"
44+
Then the output should contain exactly:
45+
"""
46+
test.md:3:8:Test.Place:Avoid 'place'.
47+
test.md:5:3:Vale.Spelling:Did you really mean 'flooberg'?
48+
"""
49+
3950
Scenario: Line Endings
4051
When I test "misc/line-endings"
4152
Then the output should contain exactly:
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
StylesPath = ../styles
2+
MinAlertLevel = suggestion
3+
4+
Vocab = Phrases
5+
6+
[*.md]
7+
BasedOnStyles = Vale, Test
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
A flooberg place appeared today.
2+
3+
Just a place on its own here.
4+
5+
A flooberg by itself there.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
extends: existence
2+
message: "Avoid 'place'."
3+
level: warning
4+
tokens:
5+
- 'place'

0 commit comments

Comments
 (0)