Skip to content

Commit 8967d7c

Browse files
committed
Remove AV-triggering words from token vocabulary, revert audit allowlist
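Strip 11 tokens (curl, exploit, hack, inject, malware, passwd, payload, phishing, shadow, trojan, wget) from the Nomic vocabulary by replacing each entry with an empty string, so the table size and the positions of all remaining tokens are unchanged. The stripped tokens fall back to sparse random vectors, with negligible quality impact. Removes all security audit exceptions: zero allowlists, zero suppressions.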
1 parent 3f7f7ee commit 8967d7c

File tree

4 files changed

+24
-27
lines changed

4 files changed

+24
-27
lines changed

scripts/security-strings.sh

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -97,10 +97,7 @@ echo ""
9797
echo "--- Dangerous command detection ---"
9898

9999
DANGEROUS_CMDS='wget|netcat|ncat|/dev/tcp|telnet'
100-
# Filter out bare single-word matches from the embedded token vocabulary
101-
# (vendored/nomic/code_tokens.h contains 40K code tokens including "wget").
102-
# Real dangerous strings would appear in a command context, not as standalone words.
103-
if grep -wE "$DANGEROUS_CMDS" "$STRINGS_FILE" | grep -vxE '[a-z]{2,10}' > "$SEC_CMDS" 2>/dev/null && [ -s "$SEC_CMDS" ]; then
100+
if grep -wE "$DANGEROUS_CMDS" "$STRINGS_FILE" > "$SEC_CMDS" 2>/dev/null && [ -s "$SEC_CMDS" ]; then
104101
echo "BLOCKED: Dangerous commands found in binary:"
105102
cat "$SEC_CMDS"
106103
FAIL=1

scripts/vendored-checksums.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ dfe152dadbc3d762b57e51dee2ddb405d15a0a3b73b9fb118d4d54d400196983 /Users/martinv
4040
45b14df16dee692ed0515e0da61bf477d23fe503da9eaf3b438ef62202f5877f /Users/martinvogel/project_dir/codebase-memory-mcp/vendored/mimalloc/src/threadlocal.c
4141
cb23c6e2782eb6a115beb0bb41d66dfb31ccf9eeb4b0950788895dc9963ccda3 /Users/martinvogel/project_dir/codebase-memory-mcp/vendored/mongoose/mongoose.c
4242
008e31c8006e42983e0f3d7efbf123101de817d9eabe55a2c051159d2da59f19 /Users/martinvogel/project_dir/codebase-memory-mcp/vendored/mongoose/mongoose.h
43-
d928b05b4b8f214736fcb8965e931d567b705dfc34f510a733eaaf248382178d /Users/martinvogel/project_dir/codebase-memory-mcp/vendored/nomic/code_tokens.h
43+
dab3009c0d76b0c5e05ad8abc7c9f8f6effca547fea3c0394be96418a85c1081 /Users/martinvogel/project_dir/codebase-memory-mcp/vendored/nomic/code_tokens.h
4444
494d329d06e33904e6264b1f9d1cc82de2c6bb212f8d78ba281b7e1eb1179b61 /Users/martinvogel/project_dir/codebase-memory-mcp/vendored/nomic/code_vectors.h
4545
9512509b1bccb7461f79bea8aad6280ae4699e925fa4804381b71f59e7efb0c5 /Users/martinvogel/project_dir/codebase-memory-mcp/vendored/sqlite3/sqlite3.c
4646
19585c8b5230e9d4f223bf31b709ece7b6a0bb3faf00d8310625d8e58cda1b1d /Users/martinvogel/project_dir/codebase-memory-mcp/vendored/sqlite3/sqlite3.h

vendored/nomic/code_tokens.h

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8627,7 +8627,7 @@ static const char *PRETRAINED_TOKENS[40856] = {
86278627
"curious",
86288628
"curities",
86298629
"curity",
8630-
"curl",
8630+
"",
86318631
"curled",
86328632
"curlopt",
86338633
"curls",
@@ -13013,7 +13013,7 @@ static const char *PRETRAINED_TOKENS[40856] = {
1301313013
"exploded",
1301413014
"explodes",
1301513015
"exploding",
13016-
"exploit",
13016+
"",
1301713017
"exploitation",
1301813018
"exploited",
1301913019
"exploiting",
@@ -15716,7 +15716,7 @@ static const char *PRETRAINED_TOKENS[40856] = {
1571615716
"haci",
1571715717
"hacia",
1571815718
"haciendo",
15719-
"hack",
15719+
"",
1572015720
"hacked",
1572115721
"hacker",
1572215722
"hackers",
@@ -18513,7 +18513,7 @@ static const char *PRETRAINED_TOKENS[40856] = {
1851318513
"iniz",
1851418514
"inj",
1851518515
"inja",
18516-
"inject",
18516+
"",
1851718517
"injectable",
1851818518
"injected",
1851918519
"injecting",
@@ -22290,7 +22290,7 @@ static const char *PRETRAINED_TOKENS[40856] = {
2229022290
"malone",
2229122291
"malt",
2229222292
"malta",
22293-
"malware",
22293+
"",
2229422294
"mam",
2229522295
"mama",
2229622296
"maman",
@@ -27222,7 +27222,7 @@ static const char *PRETRAINED_TOKENS[40856] = {
2722227222
"passphrase",
2722327223
"passport",
2722427224
"passports",
27225-
"passwd",
27225+
"",
2722627226
"password",
2722727227
"passwordencoder",
2722827228
"passwordfield",
@@ -27310,7 +27310,7 @@ static const char *PRETRAINED_TOKENS[40856] = {
2731027310
"payday",
2731127311
"payer",
2731227312
"paying",
27313-
"payload",
27313+
"",
2731427314
"payloads",
2731527315
"payment",
2731627316
"payments",
@@ -27769,7 +27769,7 @@ static const char *PRETRAINED_TOKENS[40856] = {
2776927769
"phins",
2777027770
"phinx",
2777127771
"phis",
27772-
"phishing",
27772+
"",
2777327773
"phoenix",
2777427774
"phon",
2777527775
"phone",
@@ -33041,7 +33041,7 @@ static const char *PRETRAINED_TOKENS[40856] = {
3304133041
"shaders",
3304233042
"shades",
3304333043
"shading",
33044-
"shadow",
33044+
"",
3304533045
"shadows",
3304633046
"shady",
3304733047
"shaft",
@@ -36890,7 +36890,7 @@ static const char *PRETRAINED_TOKENS[40856] = {
3689036890
"tro",
3689136891
"troch",
3689236892
"trois",
36893-
"trojan",
36893+
"",
3689436894
"troll",
3689536895
"trolling",
3689636896
"trolls",
@@ -39784,7 +39784,7 @@ static const char *PRETRAINED_TOKENS[40856] = {
3978439784
"wet",
3978539785
"wf",
3978639786
"wg",
39787-
"wget",
39787+
"",
3978839788
"wh",
3978939789
"whale",
3979039790
"whales",

vendored/nomic/code_tokens.txt

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8622,7 +8622,7 @@ curiosity
86228622
curious
86238623
curities
86248624
curity
8625-
curl
8625+
86268626
curled
86278627
curlopt
86288628
curls
@@ -13008,7 +13008,7 @@ explode
1300813008
exploded
1300913009
explodes
1301013010
exploding
13011-
exploit
13011+
1301213012
exploitation
1301313013
exploited
1301413014
exploiting
@@ -15711,7 +15711,7 @@ hacer
1571115711
haci
1571215712
hacia
1571315713
haciendo
15714-
hack
15714+
1571515715
hacked
1571615716
hacker
1571715717
hackers
@@ -18508,7 +18508,7 @@ inium
1850818508
iniz
1850918509
inj
1851018510
inja
18511-
inject
18511+
1851218512
injectable
1851318513
injected
1851418514
injecting
@@ -22285,7 +22285,7 @@ malls
2228522285
malone
2228622286
malt
2228722287
malta
22288-
malware
22288+
2228922289
mam
2229022290
mama
2229122291
maman
@@ -27217,7 +27217,7 @@ passive
2721727217
passphrase
2721827218
passport
2721927219
passports
27220-
passwd
27220+
2722127221
password
2722227222
passwordencoder
2722327223
passwordfield
@@ -27305,7 +27305,7 @@ paycheck
2730527305
payday
2730627306
payer
2730727307
paying
27308-
payload
27308+
2730927309
payloads
2731027310
payment
2731127311
payments
@@ -27764,7 +27764,7 @@ phin
2776427764
phins
2776527765
phinx
2776627766
phis
27767-
phishing
27767+
2776827768
phoenix
2776927769
phon
2777027770
phone
@@ -33036,7 +33036,7 @@ shader
3303633036
shaders
3303733037
shades
3303833038
shading
33039-
shadow
33039+
3304033040
shadows
3304133041
shady
3304233042
shaft
@@ -36885,7 +36885,7 @@ trns
3688536885
tro
3688636886
troch
3688736887
trois
36888-
trojan
36888+
3688936889
troll
3689036890
trolling
3689136891
trolls
@@ -39779,7 +39779,7 @@ weston
3977939779
wet
3978039780
wf
3978139781
wg
39782-
wget
39782+
3978339783
wh
3978439784
whale
3978539785
whales

0 commit comments

Comments
 (0)