|
#!/usr/bin/env bash
# /usr/libexec/mios/ai-sanitize
# Sanitize AI artifacts to OpenAI API-compliant minimal form.
# Implements rules from /usr/share/mios/ai/system.md §6.
#
# Usage:
#   mios-ai-sanitize check <path>   # Report findings, exit 1 if dirty
#   mios-ai-sanitize clean <path>   # In-place sanitize (creates .bak)
#   mios-ai-sanitize diff <path>    # Show what would change
#
# Operates on: .md, .txt, .json, .yaml, .yml files.
# Skips: .sh, .py, source code (only sanitizes prose).

set -euo pipefail

# Name printed in the usage text. Parameter expansion is used instead of
# "$(basename "$0")": combining `readonly` with a command substitution hides
# the substitution's exit status, and no external process is needed to strip
# a directory prefix.
readonly SCRIPT_NAME="${0##*/}"
# First positional argument: the mode (check, clean or diff).
readonly MODE="${1:-}"
# Second positional argument: the file or directory to operate on.
readonly TARGET="${2:-}"

# Both arguments are mandatory; print the usage text to stderr and fail
# when either one is missing or empty.
if [[ -z "$MODE" || -z "$TARGET" ]]; then
  cat >&2 <<EOF
Usage: $SCRIPT_NAME <check|clean|diff> <path>

Sanitizes AI artifacts per /usr/share/mios/ai/system.md §6.
EOF
  exit 1
fi
| 28 | + |
# ── Forbidden corporate entities (case-insensitive prose match) ──
# These are stripped from prose. Source code and protocol references survive
# (see system.md §6.1 exceptions).
# Each entry is an extended regular expression; scan_file wraps it in
# \b...\b and matches it case-insensitively.
declare -ar FORBIDDEN_BRANDS=(
  'Anthropic'
  'Anthropic, Inc\.'
  'Claude\.ai'
  'Claude Sonnet'
  'Claude Opus'
  'Claude Haiku'
  'OpenAI(, Inc\.)?'
  'ChatGPT'
  'GPT-4o?'
  'GPT-3\.5'
  'Google AI'
  'Google DeepMind'
  'DeepMind'
  'Bard'
  'Gemini (Pro|Advanced|Ultra)'
  'Microsoft Copilot'
  'GitHub Copilot'
  'Bing AI'
  'Mistral AI'
  'Cohere'
  'xAI'
  'Grok'
  'Perplexity AI'
)

# ── Forbidden conversational metadata patterns ──
# Extended regular expressions, matched case-sensitively against each line.
declare -ar FORBIDDEN_META=(
  '^Human:'
  '^Assistant:'
  '^User:'
  '^AI:'
  '<thinking>'
  '</thinking>'
  # Namespaced markup tags, opening or closing. This is the same expression
  # clean_file strips; the bare '<' and '</' entries that used to sit here
  # flagged every line containing an angle bracket.
  '</?antml:[^>]+>'
  '<scratchpad>'
  '</scratchpad>'
  '<reasoning>'
  '</reasoning>'
  'I.d be happy to help'
  'Great question!'
  'Let me think about this step by step'
  '\[doc-[0-9]+-[0-9]+\]'
)

# ── Forbidden sandbox path traces ──
# Literal substrings (not regular expressions).
declare -ar FORBIDDEN_PATHS=(
  '/mnt/user-data/uploads'
  '/mnt/user-data/outputs'
  '/mnt/skills/'
  '/home/claude'
  '/home/ubuntu'
  '/repo/'
  '/workspace/'
)

# ── Whitelist (protocol references that survive) ──
# These look like brand names but are actually protocol/spec references.
# Extended regular expressions, matched case-sensitively.
declare -ar WHITELIST_PATTERNS=(
  'OpenAI[ -]compatible'
  'OpenAI v1'
  'OpenAI API (spec|standard|protocol|format)'
  '/v1/(chat|completions|embeddings|models|responses|mcp)'
)
| 97 | + |
#######################################
# Report whether a line contains a whitelisted protocol/spec reference.
# Globals:   WHITELIST_PATTERNS (read) - extended regular expressions
# Arguments: $1 - the line of text to test
# Returns:   0 if at least one pattern matches, 1 otherwise
#######################################
is_whitelisted() {
  local line="$1"
  local pat
  for pat in "${WHITELIST_PATTERNS[@]}"; do
    # Builtin regex match: avoids one echo|grep process pair per pattern per
    # input line. $pat is left unquoted on purpose so it is used as a regex.
    if [[ "$line" =~ $pat ]]; then
      return 0
    fi
  done
  return 1
}
| 108 | + |
| 109 | +# ── Scanning ── |
| 110 | + |
#######################################
# Scan one file for forbidden brands, conversational metadata and sandbox
# paths. One record is printed per (line, matching pattern) pair, in the
# form "<path>:<lineno>: <BRAND|META|PATH> <line text>".
# Globals:   FORBIDDEN_BRANDS, FORBIDDEN_META, FORBIDDEN_PATHS (read)
# Arguments: $1 - path of the file to scan
# Outputs:   findings on stdout, diagnostics on stderr
# Returns:   the number of findings, capped at 255 (an exit status is taken
#            modulo 256, so an uncapped count of 256 would read as "clean");
#            0 for clean or skipped files; 1 if the file cannot be read
#######################################
scan_file() {
  local path="$1"
  local findings=0
  local lineno=0
  local line pat brand_exempt
  local any_brand=''

  # Skip non-prose files
  case "$path" in
    *.sh|*.py|*.rs|*.go|*.c|*.cpp|*.h|*.hpp|*.toml|*.ini|*.conf)
      return 0 # Source code — skip
      ;;
  esac

  # A failed input redirection used to fall through to "return 0", which
  # reported a missing or unreadable file as clean.
  if [[ -d "$path" || ! -r "$path" ]]; then
    printf '%s: cannot read file\n' "$path" >&2
    return 1
  fi

  # A single alternation of all brand patterns lets one grep rule out the
  # common brand-free line before the per-pattern loop runs.
  for pat in "${FORBIDDEN_BRANDS[@]}"; do
    any_brand+="${any_brand:+|}(${pat})"
  done

  # "|| [[ -n ... ]]" keeps a final line that has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    lineno=$((lineno + 1))

    # The whitelist covers protocol references that merely look like brand
    # names, so it exempts the line from the BRAND check only; metadata and
    # sandbox paths on the same line are still reported.
    brand_exempt=0
    if is_whitelisted "$line"; then
      brand_exempt=1
    fi

    if [[ "$brand_exempt" -eq 0 && -n "$any_brand" ]] \
      && grep -qiE -- "\\b(${any_brand})\\b" <<<"$line"; then
      for pat in "${FORBIDDEN_BRANDS[@]}"; do
        if grep -qiE -- "\\b${pat}\\b" <<<"$line"; then
          printf '%s:%d: BRAND %s\n' "$path" "$lineno" "$line"
          findings=$((findings + 1))
        fi
      done
    fi

    for pat in "${FORBIDDEN_META[@]}"; do
      # Unquoted on purpose: $pat is an extended regular expression.
      if [[ "$line" =~ $pat ]]; then
        printf '%s:%d: META %s\n' "$path" "$lineno" "$line"
        findings=$((findings + 1))
      fi
    done

    for pat in "${FORBIDDEN_PATHS[@]}"; do
      # Quoted on purpose: $pat is a literal substring.
      if [[ "$line" == *"$pat"* ]]; then
        printf '%s:%d: PATH %s\n' "$path" "$lineno" "$line"
        findings=$((findings + 1))
      fi
    done
  done < "$path"

  if ((findings > 255)); then
    findings=255
  fi
  return "$findings"
}
| 152 | + |
| 153 | +# ── Cleaning ── |
| 154 | + |
#######################################
# Sanitize one file in place, keeping the previous content in "<path>.bak".
# Steps, applied to every line in this order:
#   1. sandbox paths are rewritten to their FHS equivalents,
#   2. brand names are removed or replaced with neutral terms,
#   3. conversational metadata is deleted,
# and finally runs of blank lines are collapsed to a single blank line.
# Arguments: $1 - path of the file to sanitize
# Outputs:   a "Cleaned: ..." line on stdout; diagnostics on stderr
# Returns:   0 on success, the failing command's status otherwise
# Requires GNU sed (the I flag, \b and \s).
#######################################
clean_file() {
  local path="$1"
  local workdir staged compacted
  local status=0

  workdir="$(mktemp -d)" || return 1
  staged="${workdir}/staged"
  compacted="${workdir}/compacted"

  {
    # -p keeps mode and timestamps on the backup so that restoring it
    # reproduces the original file.
    cp -p -- "$path" "${path}.bak" \
      && sed -E \
        `# Paths go first: the case-insensitive bare "Claude" rule below` \
        `# would otherwise turn /home/claude into "/home/the assistant".` \
        -e 's|/mnt/user-data/uploads|/var/lib/mios/ai/scratch|g' \
        -e 's|/mnt/user-data/outputs|/var/lib/mios/ai/scratch|g' \
        -e 's|/mnt/skills/|/usr/share/mios/skills/|g' \
        -e 's|/home/claude|/var/lib/mios/ai|g' \
        -e 's|/home/ubuntu|/var/lib/mios/ai|g' \
        `# Brands. The "Inc." forms get their own rules without a trailing` \
        `# \b: a word boundary never follows the final "." in prose, which` \
        `# left ", Inc." behind and kept the OpenAI rule from matching.` \
        -e 's/\bAnthropic,? Inc\.//gI' \
        -e 's/\bAnthropic\b//gI' \
        -e 's/\b(Claude\.ai|Claude Sonnet|Claude Opus|Claude Haiku|Claude)\b/the assistant/gI' \
        -e 's/\bOpenAI,? Inc\.//g' \
        -e 's/\b(ChatGPT|GPT-4o?|GPT-3\.5|GPT)\b/the model/gI' \
        -e 's/\b(Google AI|Google DeepMind|DeepMind|Bard)\b/the assistant/gI' \
        -e 's/\bGemini (Pro|Advanced|Ultra)\b/the assistant/gI' \
        -e 's/\b(Microsoft Copilot|GitHub Copilot|Bing AI)\b/the tool/gI' \
        -e 's/\b(Mistral AI|Cohere|xAI|Grok|Perplexity AI)\b//gI' \
        `# Conversational metadata.` \
        -e '/^Human:/d' \
        -e '/^Assistant:/d' \
        -e '/^User:/d' \
        -e '/^AI:/d' \
        -e 's|</?thinking>||g' \
        -e 's|</?antml:[^>]+>||g' \
        -e 's|</?scratchpad>||g' \
        -e 's|</?reasoning>||g' \
        -e 's/I.d be happy to help[^.]*\.//g' \
        -e 's/Great question!\s*//g' \
        -e 's/Let me think about this step by step[^.]*\.//g' \
        -e 's/\[doc-[0-9]+-[0-9]+\]//g' \
        -- "$path" > "$staged" \
      && awk 'BEGIN{blank=0} /^$/{blank++; if(blank<=1) print; next} {blank=0; print}' \
        "$staged" > "$compacted" \
      && cat -- "$compacted" > "$path"
    # Writing through the existing file (instead of moving a temp file over
    # it) keeps the target's permissions and ownership.
  } || status=$?

  # Always remove the scratch directory, on success and on failure alike.
  rm -rf -- "$workdir"

  if ((status != 0)); then
    printf 'Failed to clean: %s\n' "$path" >&2
    return "$status"
  fi

  echo "Cleaned: $path (backup: ${path}.bak)"
}
| 207 | + |
| 208 | +# ── Dispatch ── |
| 209 | + |
# Print, NUL-delimited, every prose file below directory $1. One definition
# keeps the check, clean and diff modes in agreement about what counts as
# prose.
list_prose_files() {
  find "$1" -type f \
    \( -name '*.md' -o -name '*.txt' -o -name '*.json' \
    -o -name '*.yaml' -o -name '*.yml' \) -print0
}

# Show, as a unified diff, what clean_file would change in file $1.
# The file itself is never modified: a copy inside scratch directory $2 is
# sanitized and compared against the original, so no ".bak" next to the
# original is created or overwritten and an interrupted run leaves the
# original untouched.
preview_file() {
  local src="$1"
  local scratch="$2"
  local copy="${scratch}/${src##*/}"
  local rc=0

  cp -p -- "$src" "$copy"
  clean_file "$copy" >/dev/null
  # diff exits 1 when the files differ, which is the expected outcome here;
  # only a status above 1 signals real trouble.
  diff -u --label "$src" --label "${src} (sanitized)" -- "$src" "$copy" \
    || rc=$?
  if ((rc > 1)); then
    return "$rc"
  fi
  return 0
}

# Reject unknown modes before touching the filesystem.
case "$MODE" in
  check | clean | diff) ;;
  *)
    echo "Unknown mode: $MODE" >&2
    exit 1
    ;;
esac

if [[ ! -e "$TARGET" ]]; then
  printf '%s: no such file or directory\n' "$TARGET" >&2
  exit 1
fi

case "$MODE" in
  check)
    if [[ -d "$TARGET" ]]; then
      findings=0
      while IFS= read -r -d '' f; do
        # scan_file returns its finding count as the exit status.
        rc=0
        scan_file "$f" || rc=$?
        findings=$((findings + rc))
      done < <(list_prose_files "$TARGET")
      echo "---"
      echo "Total findings: $findings"
      [[ "$findings" -eq 0 ]] || exit 1
    else
      scan_file "$TARGET" || exit 1
    fi
    ;;
  clean)
    if [[ -d "$TARGET" ]]; then
      while IFS= read -r -d '' f; do
        clean_file "$f"
      done < <(list_prose_files "$TARGET")
    else
      clean_file "$TARGET"
    fi
    ;;
  diff)
    preview_dir="$(mktemp -d)"
    # Remove the scratch copies on every exit path, including failures.
    trap 'rm -rf -- "$preview_dir"' EXIT
    if [[ -d "$TARGET" ]]; then
      while IFS= read -r -d '' f; do
        preview_file "$f" "$preview_dir"
      done < <(list_prose_files "$TARGET")
    else
      preview_file "$TARGET" "$preview_dir"
    fi
    ;;
esac
0 commit comments