Fix binary string audit: exclude bare token vocabulary matches

DeusData · DeusData · commit 3f7f7eeab7fc · 2026-04-06T15:40:11.000+02:00
The embedded Nomic code token vocabulary (40K tokens) contains bare words
such as "wget", which the dangerous-command check in the binary string
audit matches. Filter out lines that consist solely of a single lowercase
word (2-10 characters), since real dangerous strings appear in command
context, not as standalone vocabulary entries.
diff --git a/scripts/security-strings.sh b/scripts/security-strings.sh
@@ -97,7 +97,10 @@ echo ""
 echo "--- Dangerous command detection ---"
 
 DANGEROUS_CMDS='wget|netcat|ncat|/dev/tcp|telnet'
-if grep -wE "$DANGEROUS_CMDS" "$STRINGS_FILE" > "$SEC_CMDS" 2>/dev/null; then
+# Filter out bare single-word matches from the embedded token vocabulary
+# (vendored/nomic/code_tokens.h contains 40K code tokens including "wget").
+# Real dangerous strings would appear in a command context, not as standalone words.
+if grep -wE "$DANGEROUS_CMDS" "$STRINGS_FILE" | grep -vxE '[a-z]{2,10}' > "$SEC_CMDS" 2>/dev/null && [ -s "$SEC_CMDS" ]; then
     echo "BLOCKED: Dangerous commands found in binary:"
     cat "$SEC_CMDS"
     FAIL=1