-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathalgorithm_test.go
More file actions
220 lines (204 loc) · 7.32 KB
/
algorithm_test.go
File metadata and controls
220 lines (204 loc) · 7.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
package compress_test
import (
"strings"
"testing"
"github.com/GrayCodeAI/tok/internal/compress"
)
// TestCompress_Lite runs Compress at the Lite intensity on a sentence that
// opens with pleasantries. It requires that "Sure," is absent from the output,
// that "help you with that" is still present, and that the returned stats
// report the Lite intensity.
func TestCompress_Lite(t *testing.T) {
	in := "Sure, I can help you with that. Of course, the answer is yes."
	out, stats := compress.Compress(in, compress.Lite)

	// Lite drops only pleasantries.
	if strings.Contains(out, "Sure,") {
		t.Errorf("expected 'Sure,' to be dropped, got %q", out)
	}
	if !strings.Contains(out, "help you with that") {
		t.Errorf("expected pleasantry-free text to keep 'help you with that', got %q", out)
	}

	// The message names the intensity that was actually requested (Lite), so a
	// failure here does not point the reader at the wrong level.
	if stats.Intensity != compress.Lite {
		t.Errorf("expected intensity=Lite, got %v", stats.Intensity)
	}
}
// TestCompress_Full feeds a table of inputs through Compress at the Full
// intensity, one subtest per row. For each row, every substring listed in
// absent must not occur in the output, every substring listed in present must
// occur in it, and stats.OriginalBytes must be non-zero.
func TestCompress_Full(t *testing.T) {
	type fullCase struct {
		name    string
		input   string
		absent  []string // substrings that must not appear in the output
		present []string // substrings that must appear in the output
	}

	cases := []fullCase{
		{
			name:    "drops articles",
			input:   "The quick brown fox jumps over the lazy dog.",
			absent:  []string{"The ", "the "},
			present: []string{"quick brown fox", "lazy dog"},
		},
		{
			name:    "drops filler words",
			input:   "It is just a test. I literally have no idea, actually.",
			absent:  []string{"just ", "literally ", "actually"},
			present: []string{"test", "no idea"},
		},
		{
			name:    "drops pleasantries",
			input:   "Sure, of course, happy to help.",
			absent:  []string{"Sure", "of course", "happy to"},
			present: []string{"help"},
		},
		{
			name:    "drops hedging",
			input:   "I think perhaps maybe we should consider it. It seems somewhat slow.",
			absent:  []string{"I think", "perhaps", "maybe", "It seems", "somewhat"},
			present: []string{"we should consider", "slow"},
		},
		{
			name:    "dictionary substitutions",
			input:   "In order to test the function, we need to make use of the data. Due to the fact that the data is large, we will analyze it.",
			absent:  []string{"In order to", "make use of", "Due to the fact that"},
			present: []string{"test", "analyze"},
		},
		{
			name:    "case-insensitive",
			input:   "THE big Fox. AN apple. The Cat.",
			absent:  []string{"THE", "AN", "The"},
			present: []string{"big Fox", "apple", "Cat"},
		},
		{
			// Nothing is required to disappear here; absent stays nil and the
			// row only checks that the content words survive.
			name:    "no over-dropping",
			input:   "We need a strategy. They have a plan. I have a pen.",
			present: []string{"need", "strategy", "have", "plan", "pen"},
		},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			got, st := compress.Compress(c.input, compress.Full)

			for _, phrase := range c.absent {
				if strings.Contains(got, phrase) {
					t.Errorf("expected %q to be dropped, but found in output %q", phrase, got)
				}
			}
			for _, phrase := range c.present {
				if !strings.Contains(got, phrase) {
					t.Errorf("expected %q to be preserved, but not found in output %q", phrase, got)
				}
			}

			// Every row has a non-empty input, so a zero here means the
			// stats were not filled in.
			if st.OriginalBytes == 0 {
				t.Error("expected non-zero original bytes")
			}
		})
	}
}
// TestCompress_Ultra runs Compress at the Ultra intensity on three sentences
// that each open with a conjunction. It requires that "However," and
// "Moreover," are absent from the output and that stats.DroppedConjunctions is
// non-zero.
func TestCompress_Ultra(t *testing.T) {
	const input = "However, the system is basically quite slow. Moreover, it is essentially a bottleneck. Therefore, we should redesign it."

	compressed, st := compress.Compress(input, compress.Ultra)

	// Ultra is expected to strip leading conjunctions in addition to what the
	// lower intensities do.
	hasHowever := strings.Contains(compressed, "However,")
	if hasHowever {
		t.Errorf("expected 'However,' to be dropped, got %q", compressed)
	}
	hasMoreover := strings.Contains(compressed, "Moreover,")
	if hasMoreover {
		t.Errorf("expected 'Moreover,' to be dropped, got %q", compressed)
	}

	// The counter must reflect that at least one conjunction was removed.
	if st.DroppedConjunctions == 0 {
		t.Error("expected DroppedConjunctions > 0 for Ultra")
	}
}
// TestCompress_SafetyPassThrough runs Compress at the Full intensity on
// sentences that mention dangerous commands, credentials, or warnings. Each
// input must come back byte-for-byte unchanged, and
// stats.PassThroughSegments must be non-zero.
func TestCompress_SafetyPassThrough(t *testing.T) {
	tests := []struct {
		name  string
		input string
	}{
		{"rm -rf", "Be careful, you don't want to run rm -rf / on the server."},
		{"sudo", "Use sudo apt update to install the package."},
		{"force push", "Please don't force push to main, it will break CI."},
		{"private key", "The private key should never be committed."},
		{"api key", "Your API key is stored in ~/.aws/credentials."},
		{"dd command", "Use dd if=/dev/zero of=/dev/null to test throughput."},
		{"shutdown", "Run shutdown -h now to power off the system."},
		{"warning", "Warning: this command will delete all files."},
		// Subtest names must be unique: a second "private key" row would be
		// reported with a "#01" suffix and could not be selected by name
		// with -run.
		{"private key in repository", "Never commit your private key to the repository."},
	}

	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			out, stats := compress.Compress(tc.input, compress.Full)

			// Any rewrite at all is a failure: the whole input is expected
			// to be treated as sensitive.
			if out != tc.input {
				t.Errorf("expected sensitive segment to pass through verbatim\ninput: %q\noutput: %q", tc.input, out)
			}
			if stats.PassThroughSegments == 0 {
				t.Errorf("expected PassThroughSegments > 0 for %q", tc.input)
			}
		})
	}
}
// TestCompress_Empty passes the empty string to Compress at the Full
// intensity and requires an empty result with stats.OriginalBytes equal to
// zero.
func TestCompress_Empty(t *testing.T) {
	got, st := compress.Compress("", compress.Full)

	if got != "" {
		t.Errorf("expected empty output, got %q", got)
	}
	if n := st.OriginalBytes; n != 0 {
		t.Errorf("expected zero bytes, got %d", n)
	}
}
// TestCompress_CJKPassthrough passes a Chinese sentence to Compress at the
// Full intensity and requires the output to equal the input exactly.
func TestCompress_CJKPassthrough(t *testing.T) {
	const input = "今天天气很好,我们去公园散步吧。"

	got, _ := compress.Compress(input, compress.Full)
	if got == input {
		return
	}
	t.Errorf("expected CJK to pass through unchanged, got %q", got)
}
// TestCompress_StatsPercentOff compresses a phrase repeated 100 times at the
// Full intensity and requires both stats.PercentOff and stats.BytesSaved to be
// strictly positive.
func TestCompress_StatsPercentOff(t *testing.T) {
	in := strings.Repeat("the quick brown fox ", 100)
	_, stats := compress.Compress(in, compress.Full)

	// The check rejects zero and negative values alike, so the message says
	// "positive" rather than "non-zero".
	if stats.PercentOff <= 0 {
		t.Errorf("expected positive percent-off, got %f", stats.PercentOff)
	}
	if stats.BytesSaved <= 0 {
		t.Errorf("expected positive bytes saved, got %d", stats.BytesSaved)
	}
}
// TestCompress_CodePreservation compresses prose that contains an inline code
// span and a mention of sudo at the Full intensity. It requires that the code
// span appears unchanged in the output and that at least one segment is
// counted in stats.PassThroughSegments.
func TestCompress_CodePreservation(t *testing.T) {
	// Code-bearing text should be preserved (or at least not break syntactically)
	in := "Run `rm -rf /tmp` to clean up. Also, just be careful with the sudo commands."
	out, stats := compress.Compress(in, compress.Full)

	// Check the output itself, not just the counter: the backtick-quoted
	// command must come through intact.
	const codeSpan = "`rm -rf /tmp`"
	if !strings.Contains(out, codeSpan) {
		t.Errorf("expected code span %q to be preserved, got %q", codeSpan, out)
	}

	// The sudo segment should be marked pass-through (security keyword)
	if stats.PassThroughSegments == 0 {
		t.Error("expected at least one segment to be marked pass-through due to sudo keyword")
	}
}
// TestCompress_DictionaryExample compresses two sentences built from wordy
// phrases at the Full intensity. It requires that none of "In order to",
// "make use of", or "Due to the fact that" remains in the output. It checks
// only that each phrase is gone, not what replaced it.
func TestCompress_DictionaryExample(t *testing.T) {
	const input = "In order to install the package, you need to make use of the installer. Due to the fact that the system is up to date, we will proceed."

	got, _ := compress.Compress(input, compress.Full)

	// Intended substitution: "In order to" -> "to".
	keptInOrderTo := strings.Contains(got, "In order to")
	if keptInOrderTo {
		t.Errorf("'In order to' should be replaced, got %q", got)
	}

	// Intended substitution: "make use of" -> "use".
	keptMakeUseOf := strings.Contains(got, "make use of")
	if keptMakeUseOf {
		t.Errorf("'make use of' should be replaced, got %q", got)
	}

	// Intended substitution: "Due to the fact that" -> "because".
	keptDueToTheFact := strings.Contains(got, "Due to the fact that")
	if keptDueToTheFact {
		t.Errorf("'Due to the fact that' should be replaced, got %q", got)
	}
}
// TestCompress_MultipleIntensityComparison compresses one input at the Lite,
// Full, and Ultra intensities and requires the output byte lengths to be
// strictly decreasing in that order.
func TestCompress_MultipleIntensityComparison(t *testing.T) {
	const input = "Sure, I think we should perhaps maybe consider the option. The system is basically quite slow."

	liteOut, _ := compress.Compress(input, compress.Lite)
	fullOut, _ := compress.Compress(input, compress.Full)
	ultraOut, _ := compress.Compress(input, compress.Ultra)

	liteLen, fullLen, ultraLen := len(liteOut), len(fullOut), len(ultraOut)

	// Equal lengths count as failures: each step up must remove something.
	if fullLen >= liteLen {
		t.Errorf("expected Lite to be longer than Full, lite=%d full=%d", liteLen, fullLen)
	}
	if ultraLen >= fullLen {
		t.Errorf("expected Full to be longer than Ultra, full=%d ultra=%d", fullLen, ultraLen)
	}
}
// TestIntensityString requires the String method of the Lite, Full, and Ultra
// intensities to return "lite", "full", and "ultra" respectively.
func TestIntensityString(t *testing.T) {
	liteName := compress.Lite.String()
	if liteName != "lite" {
		t.Errorf("expected 'lite', got %q", liteName)
	}

	fullName := compress.Full.String()
	if fullName != "full" {
		t.Errorf("expected 'full', got %q", fullName)
	}

	ultraName := compress.Ultra.String()
	if ultraName != "ultra" {
		t.Errorf("expected 'ultra', got %q", ultraName)
	}
}