Skip to content

Commit e48efcb

Browse files
committed
feat(filter): 增强WordsLenFilter支持中英文混合文本
改进WordsLenFilter函数,使其能够正确处理中英文混合的提交信息: 1. 对于中文文本统计字符数 2. 对于英文文本统计单词数 3. 对于混合文本同时统计中文字符和英文单词
1 parent 334e4de commit e48efcb

2 files changed

Lines changed: 54 additions & 3 deletions

File tree

chlog/filter.go

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package chlog
22

33
import (
44
"strings"
5+
"unicode"
56

67
"github.com/gookit/goutil/strutil"
78
)
@@ -12,8 +13,7 @@ type ItemFilter interface {
1213
Handle(li *LogItem) bool
1314
}
1415

15-
// ItemFilterFunc define. return false to filter item.
16-
// type LineFilterFunc func(line string) bool
16+
// ItemFilterFunc defines a filter function. It returns false to filter out (discard) the item.
1717
type ItemFilterFunc func(li *LogItem) bool
1818

1919
// Handle filtering
@@ -37,9 +37,36 @@ func MsgLenFilter(minLen int) ItemFilterFunc {
3737
}
3838

3939
// WordsLenFilter handler
40+
// - For English text: counts words separated by whitespace
41+
// - For Chinese text: counts characters (runes) since Chinese doesn't use spaces
42+
// - For mixed text: counts both English words and Chinese characters
4043
func WordsLenFilter(minLen int) ItemFilterFunc {
4144
return func(li *LogItem) bool {
42-
return len(strutil.Split(li.Msg, " ")) > minLen
45+
msg := li.Msg
46+
wordCount := 0
47+
inWord := false
48+
hasChinese := false
49+
50+
for _, r := range msg {
51+
if unicode.Is(unicode.Han, r) {
52+
hasChinese = true
53+
wordCount++
54+
inWord = false
55+
} else if unicode.IsSpace(r) {
56+
inWord = false
57+
} else {
58+
if !inWord {
59+
wordCount++
60+
inWord = true
61+
}
62+
}
63+
}
64+
65+
if !hasChinese {
66+
return len(strutil.Split(msg, " ")) > minLen
67+
}
68+
69+
return wordCount > minLen
4370
}
4471
}
4572

chlog/filter_test.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,27 @@ func TestKeywordsFilter(t *testing.T) {
1616
li = &chlog.LogItem{Msg: "chore: fix gh action script error"}
1717
assert.True(t, fl(li))
1818
}
19+
20+
func TestWordsLenFilter(t *testing.T) {
21+
fl := chlog.WordsLenFilter(3)
22+
23+
// 英文测试:4个单词,应该通过
24+
li := &chlog.LogItem{Msg: "fix: update config file"}
25+
assert.True(t, fl(li))
26+
27+
// 英文测试:2个单词,应该被过滤
28+
li = &chlog.LogItem{Msg: "fix bug"}
29+
assert.False(t, fl(li))
30+
31+
// 中文测试:超过3个字符,应该通过
32+
li = &chlog.LogItem{Msg: "修复了一个重要的配置问题"}
33+
assert.True(t, fl(li))
34+
35+
// 中文测试:少于3个字符,应该被过滤
36+
li = &chlog.LogItem{Msg: "修bug"}
37+
assert.False(t, fl(li))
38+
39+
// 中英混合测试
40+
li = &chlog.LogItem{Msg: "fix 修复了问题"}
41+
assert.True(t, fl(li))
42+
}

0 commit comments

Comments
 (0)