Skip to content

Commit 0df645a

Browse files
committed
update Kiwi to v0.21.0
1 parent ef73aeb commit 0df645a

7 files changed

Lines changed: 152 additions & 80 deletions

File tree

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,5 @@ __debug_bin
1919

2020
ModelGenerator/
2121
.idea/
22+
models/
23+
include/

Makefile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
1-
KIWI_VERSION := "v0.10.3"
1+
KIWI_VERSION := "v0.21.0"
22

33
.PHONY: test
44
test: ModelGenerator/default.dict
5-
go test ./...
5+
go test -count=1 ./...
66

77
ModelGenerator/default.dict:
8-
curl -L https://github.com/bab2min/Kiwi/releases/download/$(KIWI_VERSION)/kiwi_model_$(KIWI_VERSION).tgz --output model.tgz
8+
curl -L https://github.com/bab2min/Kiwi/releases/download/$(KIWI_VERSION)/kiwi_model_$(KIWI_VERSION)_base.tgz --output model.tgz
99
tar -xzvf model.tgz
1010
rm -f model.tgz
1111

kiwi.go

Lines changed: 48 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22
package kiwi
33

44
/*
5-
#cgo LDFLAGS: -l kiwi
5+
#cgo CFLAGS: -I/usr/local/include
6+
#cgo LDFLAGS: -Wl,-rpath,/usr/local/lib
7+
68
#include <stdlib.h>
79
#include <string.h>
810
#include <stdint.h> // for uintptr_t
@@ -14,6 +16,7 @@ extern int KiwiReaderBridge(int lineNumber, char *buffer, void *userData);
1416
import "C"
1517

1618
import (
19+
"fmt"
1720
"io"
1821
"runtime/cgo"
1922
"unsafe"
@@ -91,17 +94,36 @@ type TokenResult struct {
9194

9295
// Analyze returns the result of the analysis.
9396
func (k *Kiwi) Analyze(text string, topN int, options AnalyzeOption) ([]TokenResult, error) {
94-
kiwiResH := C.kiwi_analyze(k.handler, C.CString(text), C.int(topN), C.int(options))
97+
var (
98+
blocklist C.kiwi_morphset_h
99+
pretokenized C.kiwi_pretokenized_h
100+
cText = C.CString(text)
101+
)
102+
103+
defer C.free(unsafe.Pointer(cText))
95104

105+
kiwiResH := C.kiwi_analyze(k.handler, cText, C.int(topN), C.int(options), blocklist, pretokenized)
106+
if kiwiResH == nil {
107+
return nil, fmt.Errorf("failed to analyze text")
108+
}
96109
defer C.kiwi_res_close(kiwiResH)
97110

98111
resSize := int(C.kiwi_res_size(kiwiResH))
112+
if resSize < 0 {
113+
return nil, fmt.Errorf("invalid result size: %d", resSize)
114+
}
115+
99116
res := make([]TokenResult, resSize)
100117

101118
for i := 0; i < resSize; i++ {
102-
tokens := make([]TokenInfo, int(C.kiwi_res_word_num(kiwiResH, C.int(i))))
119+
wordNum := int(C.kiwi_res_word_num(kiwiResH, C.int(i)))
120+
if wordNum < 0 {
121+
return nil, fmt.Errorf("invalid word number: %d", wordNum)
122+
}
103123

104-
for j := 0; j < len(tokens); j++ {
124+
tokens := make([]TokenInfo, wordNum)
125+
126+
for j := 0; j < wordNum; j++ {
105127
pos, err := ParsePOSType(C.GoString(C.kiwi_res_tag(kiwiResH, C.int(i), C.int(j))))
106128
if err != nil {
107129
return nil, err
@@ -131,15 +153,30 @@ type SplitResult struct {
131153

132154
// SplitSentence splits the text into sentences and returns them.
133155
func (k *Kiwi) SplitSentence(text string, options AnalyzeOption) ([]SplitResult, error) {
134-
kiwiSsH := C.kiwi_split_into_sents(k.handler, C.CString(text), C.int(options), nil)
156+
var cText = C.CString(text)
157+
defer C.free(unsafe.Pointer(cText))
158+
159+
kiwiSsH := C.kiwi_split_into_sents(k.handler, cText, C.int(options), nil)
160+
if kiwiSsH == nil {
161+
return nil, fmt.Errorf("failed to split sentences")
162+
}
135163
defer C.kiwi_ss_close(kiwiSsH)
136164

137165
resSize := int(C.kiwi_ss_size(kiwiSsH))
166+
if resSize < 0 {
167+
return nil, fmt.Errorf("invalid result size: %d", resSize)
168+
}
169+
138170
res := make([]SplitResult, resSize)
139171

140172
for i := 0; i < resSize; i++ {
141173
begin := int(C.kiwi_ss_begin_position(kiwiSsH, C.int(i)))
142174
end := int(C.kiwi_ss_end_position(kiwiSsH, C.int(i)))
175+
176+
if begin < 0 || end < begin || end > len(text) {
177+
return nil, fmt.Errorf("invalid position range: begin=%d, end=%d", begin, end)
178+
}
179+
143180
res[i] = SplitResult{
144181
Text: text[begin:end],
145182
Begin: begin,
@@ -188,7 +225,12 @@ func (kb *KiwiBuilder) LoadDict(dictPath string) int {
188225

189226
// Build creates a Kiwi instance with the registered user words etc. and closes the builder.
190227
func (kb *KiwiBuilder) Build() *Kiwi {
191-
h := C.kiwi_builder_build(kb.handler)
228+
var (
229+
typos C.kiwi_typo_h
230+
typoCostThreshold = C.float(1.0)
231+
)
232+
233+
h := C.kiwi_builder_build(kb.handler, typos, typoCostThreshold)
192234
defer kb.Close()
193235
return &Kiwi{
194236
handler: h,

kiwi_example_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,5 +16,5 @@ func Example() {
1616
results, _ := k.Analyze("안녕하세요 코딩냄비입니다. 부글부글.", 1 /*=topN*/, kiwi.KIWI_MATCH_ALL)
1717
fmt.Println(results)
1818
// Output:
19-
// [{[{0 NNG 안녕} {2 XSA 하} {4 EP 시} {3 EC 어요} {6 NNP 코딩냄비} {10 VCP 이} {11 EF ᆸ니다} {13 SF .} {15 NNP 부글부} {18 NNG 글} {19 SF .}] -69.74997}]
19+
// [{[{0 NNG 안녕} {2 XSA 하} {3 EF 세요} {6 NNP 코딩냄비} {10 VCP 이} {10 EF ᆸ니다} {13 SF .} {15 MAG 부글부글} {19 SF .}] -55.869953}]
2020
}

kiwi_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ import (
99
)
1010

1111
func TestKiwiVersion(t *testing.T) {
12-
assert.Equal(t, KiwiVersion(), "0.10.3")
12+
assert.Equal(t, KiwiVersion(), "0.21.0")
1313
}
1414

1515
func TestAnalyze(t *testing.T) {
@@ -50,12 +50,12 @@ func TestAnalyze(t *testing.T) {
5050
Form: "시",
5151
},
5252
{
53-
Position: 12,
53+
Position: 11,
5454
Tag: POS_EF,
5555
Form: "ᆫ다",
5656
},
5757
},
58-
Score: -38.967132568359375,
58+
Score: -34.55623,
5959
},
6060
}
6161

postype.go

Lines changed: 83 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -11,46 +11,21 @@ const (
1111
POS_NNG POSType = "NNG"
1212
POS_NNP POSType = "NNP"
1313
POS_NNB POSType = "NNB"
14+
POS_NR POSType = "NR"
15+
POS_NP POSType = "NP"
1416

15-
POS_VV POSType = "VV"
16-
POS_VA POSType = "VA"
17-
18-
POS_MAG POSType = "MAG"
19-
20-
POS_NR POSType = "NR"
21-
POS_NP POSType = "NP"
22-
23-
POS_VX POSType = "VX"
24-
25-
POS_MM POSType = "MM"
26-
POS_MAJ POSType = "MAJ"
27-
28-
POS_IC POSType = "IC"
29-
30-
POS_XPN POSType = "XPN"
31-
POS_XSN POSType = "XSN"
32-
POS_XSV POSType = "XSV"
33-
POS_XSA POSType = "XSA"
34-
POS_XR POSType = "XR"
35-
17+
POS_VV POSType = "VV"
18+
POS_VA POSType = "VA"
19+
POS_VX POSType = "VX"
3620
POS_VCP POSType = "VCP"
3721
POS_VCN POSType = "VCN"
3822

39-
POS_SF POSType = "SF"
40-
POS_SP POSType = "SP"
41-
POS_SS POSType = "SS"
42-
POS_SE POSType = "SE"
43-
POS_SO POSType = "SO"
44-
POS_SW POSType = "SW"
23+
POS_MM POSType = "MM"
4524

46-
POS_SL POSType = "SL"
47-
POS_SH POSType = "SH"
48-
POS_SN POSType = "SN"
25+
POS_MAG POSType = "MAG"
26+
POS_MAJ POSType = "MAJ"
4927

50-
POS_W_URL POSType = "W_URL"
51-
POS_W_EMAIL POSType = "W_EMAIL"
52-
POS_W_MENTION POSType = "W_MENTION"
53-
POS_W_HASHTAG POSType = "W_HASHTAG"
28+
POS_IC POSType = "IC"
5429

5530
POS_JKS POSType = "JKS"
5631
POS_JKC POSType = "JKC"
@@ -68,62 +43,111 @@ const (
6843
POS_ETN POSType = "ETN"
6944
POS_ETM POSType = "ETM"
7045

71-
POS_V POSType = "V"
46+
POS_XPN POSType = "XPN"
47+
48+
POS_XSN POSType = "XSN"
49+
POS_XSV POSType = "XSV"
50+
POS_XSA POSType = "XSA"
51+
POS_XSM POSType = "XSM"
52+
53+
POS_XR POSType = "XR"
54+
55+
POS_SF POSType = "SF"
56+
POS_SP POSType = "SP"
57+
POS_SS POSType = "SS"
58+
POS_SSO POSType = "SSO"
59+
POS_SSC POSType = "SSC"
60+
POS_SE POSType = "SE"
61+
POS_SO POSType = "SO"
62+
POS_SW POSType = "SW"
63+
POS_SL POSType = "SL"
64+
POS_SH POSType = "SH"
65+
POS_SN POSType = "SN"
66+
POS_SB POSType = "SB"
67+
68+
POS_UN POSType = "UN"
7269

73-
POS_MAX POSType = "MAX"
70+
POS_W_URL POSType = "W_URL"
71+
POS_W_EMAIL POSType = "W_EMAIL"
72+
POS_W_HASHTAG POSType = "W_HASHTAG"
73+
POS_W_MENTION POSType = "W_MENTION"
74+
POS_W_SERIAL POSType = "W_SERIAL"
75+
POS_W_EMOJI POSType = "W_EMOJI"
76+
77+
POS_Z_CODA POSType = "Z_CODA"
78+
POS_Z_SIOT POSType = "Z_SIOT"
79+
80+
POS_USER_0 POSType = "USER0"
81+
POS_USER_1 POSType = "USER1"
82+
POS_USER_2 POSType = "USER2"
83+
POS_USER_3 POSType = "USER3"
84+
POS_USER_4 POSType = "USER4"
7485
)
7586

7687
func (p POSType) isValid() bool {
7788
switch p {
78-
case POS_UNKNOWN,
89+
case
90+
POS_UNKNOWN,
7991
POS_NNG,
8092
POS_NNP,
8193
POS_NNB,
82-
POS_VV,
83-
POS_VA,
84-
POS_MAG,
8594
POS_NR,
8695
POS_NP,
96+
POS_VV,
97+
POS_VA,
8798
POS_VX,
99+
POS_VCP,
100+
POS_VCN,
88101
POS_MM,
102+
POS_MAG,
89103
POS_MAJ,
90104
POS_IC,
105+
POS_JKS,
106+
POS_JKC,
107+
POS_JKG,
108+
POS_JKO,
109+
POS_JKB,
110+
POS_JKV,
111+
POS_JKQ,
112+
POS_JX,
113+
POS_JC,
114+
POS_EP,
115+
POS_EF,
116+
POS_EC,
117+
POS_ETN,
118+
POS_ETM,
91119
POS_XPN,
92120
POS_XSN,
93121
POS_XSV,
94122
POS_XSA,
123+
POS_XSM,
95124
POS_XR,
96-
POS_VCP,
97-
POS_VCN,
98125
POS_SF,
99126
POS_SP,
100127
POS_SS,
128+
POS_SSO,
129+
POS_SSC,
101130
POS_SE,
102131
POS_SO,
103132
POS_SW,
104133
POS_SL,
105134
POS_SH,
106135
POS_SN,
136+
POS_SB,
137+
POS_UN,
107138
POS_W_URL,
108139
POS_W_EMAIL,
109-
POS_W_MENTION,
110140
POS_W_HASHTAG,
111-
POS_JKS,
112-
POS_JKC,
113-
POS_JKG,
114-
POS_JKO,
115-
POS_JKB,
116-
POS_JKV,
117-
POS_JKQ,
118-
POS_JX,
119-
POS_JC,
120-
POS_EP,
121-
POS_EF,
122-
POS_EC,
123-
POS_ETN,
124-
POS_ETM,
125-
POS_V,
126-
POS_MAX:
141+
POS_W_MENTION,
142+
POS_W_SERIAL,
143+
POS_W_EMOJI,
144+
POS_Z_CODA,
145+
POS_Z_SIOT,
146+
POS_USER_0,
147+
POS_USER_1,
148+
POS_USER_2,
149+
POS_USER_3,
150+
POS_USER_4:
127151
return true
128152
default:
129153
return false

scripts/install_kiwi.bash

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,19 @@ elif [ "$(uname)" == "Windows" ]; then
1111
OS='win'
1212
fi
1313

14+
if [ "$(uname -m)" == "arm64" ]; then
15+
ARCH="arm64"
16+
else
17+
ARCH="x86_64"
18+
fi
19+
1420
echo "set OS env to ${OS:?}"
1521
echo "installing Kiwi version ${KIWI_VERSION:?}"
1622

17-
wget -O kiwi.tgz "https://github.com/bab2min/Kiwi/releases/download/${KIWI_VERSION}/kiwi_${OS}_x86_64_${KIWI_VERSION}.tgz" &&
18-
tar xzvf kiwi.tgz &&
19-
sudo mv build/libkiwi* /usr/local/lib/ &&
23+
wget -O kiwi.tgz "https://github.com/bab2min/Kiwi/releases/download/${KIWI_VERSION}/kiwi_${OS}_${ARCH}_${KIWI_VERSION}.tgz" &&
24+
sudo mkdir -p /usr/local/kiwi &&
25+
sudo tar xzvf kiwi.tgz &&
26+
sudo cp lib/libkiwi* /usr/local/lib &&
27+
sudo cp -rf include/kiwi /usr/local/include &&
2028
[[ "$(uname)" == "Linux" ]] && sudo ldconfig || echo 'skip' &&
21-
rm -rf kiwi.tgz build &&
22-
wget -O source.tgz https://github.com/bab2min/Kiwi/archive/refs/tags/${KIWI_VERSION}.tar.gz &&
23-
tar xzvf source.tgz &&
24-
sudo cp -r Kiwi-${KIWI_VERSION/v/}/include/kiwi /usr/local/include/ &&
25-
rm -rf source.tgz Kiwi-*
29+
rm -rf kiwi.tgz bin lib include

0 commit comments

Comments
 (0)