|
#!/usr/bin/env bash
# validate-input.sh
#
# Central script for validating untrusted user input in GitHub Actions workflows.
# Detects hidden Unicode characters, invisible text, and HTML comment injection
# that could be used for prompt injection attacks against AI/LLM systems.
#
# Usage: source this script or call it directly after setting the required
# environment variables listed below. The script enables `set -euo pipefail`
# and calls `exit 1` when validation fails; when sourced, both of these affect
# the calling shell.
#
# Required environment variables:
#   INPUT_TEXT    - The untrusted text to validate (e.g. issue body, PR body)
#   ITEM_NUMBER   - Issue or PR number used to post a warning comment
#   REPO          - Repository in "owner/repo" format
#   GH_TOKEN      - GitHub token with permission to write comments
#
# Optional environment variables:
#   CONTEXT_TYPE  - "issue" or "pr" (default: "issue")
#   RUN_ID        - Workflow run ID for linking back to this run
#   SERVER_URL    - GitHub server URL (default: https://github.com)
#   FINDINGS_FILE - File the findings are written to, one per line
#                   (default: /tmp/validation-findings.txt)

set -euo pipefail

INPUT_TEXT="${INPUT_TEXT:-}"
ITEM_NUMBER="${ITEM_NUMBER:-}"
REPO="${REPO:-}"
CONTEXT_TYPE="${CONTEXT_TYPE:-issue}"
RUN_ID="${RUN_ID:-}"
SERVER_URL="${SERVER_URL:-https://github.com}"
FINDINGS_FILE="${FINDINGS_FILE:-/tmp/validation-findings.txt}"

# The Python validator reads INPUT_TEXT and FINDINGS_FILE from its process
# environment. Export them explicitly: when this script is sourced with plain
# (non-exported) shell variables, the validator would otherwise see an empty
# INPUT_TEXT and report the input as clean.
export INPUT_TEXT FINDINGS_FILE

# Start from a clean slate so findings from a previous run are never reused.
rm -f -- "${FINDINGS_FILE}"

echo "=== Validating untrusted user input for security threats ==="

#######################################
# Scan the untrusted text for hidden or invisible content.
# The analysis runs in Python, which handles Unicode categories reliably
# across all platforms.
# Globals:   INPUT_TEXT (read from the environment by the Python code)
#            FINDINGS_FILE (read from the environment by the Python code;
#            falls back to /tmp/validation-findings.txt)
# Outputs:   writes one finding per line to the findings file and a status
#            line to stderr
# Returns:   0 when nothing suspicious was found, 1 when findings were
#            recorded; any other failure of python3 is also non-zero
#######################################
detect_hidden_content() {
  python3 - << 'PYEOF'
import os
import sys
import unicodedata

input_text = os.environ.get("INPUT_TEXT", "")
findings = []

MAX_INPUT_CHARS = 200_000  # guard against extremely large payloads (~200 KB of ASCII)
if len(input_text) > MAX_INPUT_CHARS:
    # NOTE(review): anything past this limit is not inspected.
    input_text = input_text[:MAX_INPUT_CHARS]
    print("Warning: input was truncated to 200,000 characters for validation", file=sys.stderr)

# ── 1. Bidirectional text control characters ─────────────────────────────────
# These are used in the "Trojan Source" class of attacks (CVE-2021-42574).
# They make rendered text appear different from the actual bytes, hiding
# malicious instructions from human reviewers while LLMs still process them.
BIDI_CHARS = {
    0x200E: "LEFT-TO-RIGHT MARK",
    0x200F: "RIGHT-TO-LEFT MARK",
    0x202A: "LEFT-TO-RIGHT EMBEDDING",
    0x202B: "RIGHT-TO-LEFT EMBEDDING",
    0x202C: "POP DIRECTIONAL FORMATTING",
    0x202D: "LEFT-TO-RIGHT OVERRIDE",
    0x202E: "RIGHT-TO-LEFT OVERRIDE",
    0x2066: "LEFT-TO-RIGHT ISOLATE",
    0x2067: "RIGHT-TO-LEFT ISOLATE",
    0x2068: "FIRST STRONG ISOLATE",
    0x2069: "POP DIRECTIONAL ISOLATE",
}
found_bidi = [name for cp, name in BIDI_CHARS.items() if chr(cp) in input_text]
if found_bidi:
    # Only the first three names are listed to keep the comment short.
    findings.append(
        "Bidirectional Unicode control characters detected "
        f"({', '.join(found_bidi[:3])}{'...' if len(found_bidi) > 3 else ''}) — "
        "these can make content appear different to humans than to AI systems "
        "(Trojan Source / CVE-2021-42574)"
    )

# ── 2. Zero-width and invisible characters ────────────────────────────────────
# Invisible to human readers but processed by AI models — ideal for hiding
# secret instructions inside otherwise normal-looking text.
INVISIBLE_CHARS = {
    0x00AD: "SOFT HYPHEN",
    0x200B: "ZERO WIDTH SPACE",
    0x200C: "ZERO WIDTH NON-JOINER",
    0x200D: "ZERO WIDTH JOINER",
    0x2060: "WORD JOINER",
    0xFEFF: "ZERO WIDTH NO-BREAK SPACE (BOM)",
}
found_invisible = [name for cp, name in INVISIBLE_CHARS.items() if chr(cp) in input_text]
if found_invisible:
    findings.append(
        "Invisible/zero-width Unicode characters detected "
        f"({', '.join(found_invisible[:3])}{'...' if len(found_invisible) > 3 else ''}) — "
        "these are not visible to human reviewers but are processed by AI systems"
    )

# ── 3. Unicode tag characters (U+E0000–U+E007F) ───────────────────────────────
# A block of characters originally reserved for language tags. Completely
# invisible in most renderers but can encode arbitrary ASCII text.
tag_chars = [c for c in input_text if 0xE0000 <= ord(c) <= 0xE007F]
if tag_chars:
    findings.append(
        f"Unicode tag characters detected ({len(tag_chars)} character(s) in U+E0000–E007F range) — "
        "these are fully invisible and can encode hidden ASCII messages"
    )

# ── 4. Variation selectors ────────────────────────────────────────────────────
# Variation selectors modify the appearance of the preceding character but can
# also be abused to encode hidden information steganographically.
variation_selectors = [
    c for c in input_text
    if (0xFE00 <= ord(c) <= 0xFE0F) or (0xE0100 <= ord(c) <= 0xE01EF)
]
if variation_selectors:
    findings.append(
        f"Unicode variation selectors detected ({len(variation_selectors)} character(s)) — "
        "these can be used to steganographically encode hidden data"
    )

# ── 5. HTML comments ──────────────────────────────────────────────────────────
# HTML comments are stripped by GitHub's Markdown renderer, making them
# invisible to human readers, but an LLM processing the raw source will see
# and potentially act on any instructions hidden inside them.
# Any "<!--" opener is flagged, closed or not: a comment that is never
# terminated still hides everything after it from the rendered view, so
# requiring the closing "-->" would let an attacker bypass this check simply
# by omitting it.
if "<!--" in input_text:
    findings.append(
        "HTML comment block(s) detected (<!-- ... -->) — "
        "these are hidden from the rendered view but visible to AI systems "
        "processing the raw source, making them a common prompt injection vector"
    )

# ── 6. Non-printable control characters ──────────────────────────────────────
# Excludes ordinary whitespace (tab, LF, CR) which are expected in text.
ALLOWED_CONTROL = {0x09, 0x0A, 0x0D}  # HT, LF, CR
control_chars = [
    c for c in input_text
    if unicodedata.category(c) == "Cc" and ord(c) not in ALLOWED_CONTROL
]
if control_chars:
    findings.append(
        f"Non-printable control characters detected ({len(control_chars)} character(s)) — "
        "unexpected control characters may indicate an attempt to confuse parsers or renderers"
    )

# Write findings to a temp file so the calling shell script can build the comment.
# The findings contain non-ASCII punctuation, so the encoding is pinned instead
# of depending on the runner's locale.
findings_file = os.environ.get("FINDINGS_FILE", "/tmp/validation-findings.txt")
with open(findings_file, "w", encoding="utf-8") as fh:
    for f in findings:
        fh.write(f + "\n")

if findings:
    print(f"⚠️ Found {len(findings)} security concern(s) in input", file=sys.stderr)
    sys.exit(1)
else:
    print("✅ No suspicious content detected", file=sys.stderr)
    sys.exit(0)
PYEOF
}

# Capture the status with `||`: under `set -e` a bare failing command would
# abort the script right here, before the warning comment could be posted.
VALIDATION_EXIT=0
detect_hidden_content || VALIDATION_EXIT=$?

#######################################
# Render the Markdown warning comment for a failed validation.
# Globals:   CONTEXT_TYPE, RUN_ID, SERVER_URL, REPO (read)
# Arguments: $1 - file containing one finding per line
# Outputs:   the comment body on stdout
#######################################
render_security_comment() {
  local findings_path=$1
  local finding

  printf '%s\n' \
    "## ⚠️ Security Warning: Suspicious Input Detected" \
    "" \
    "This ${CONTEXT_TYPE} contains content that may be used for **prompt injection** — an attack that hides instructions inside text to manipulate AI/LLM systems processing it." \
    "" \
    "### Findings" \
    ""

  # `|| [[ -n ... ]]` keeps a final line that lacks a trailing newline.
  while IFS= read -r finding || [[ -n "${finding}" ]]; do
    printf '%s\n' "- ${finding}"
  done < "${findings_path}"

  printf '%s\n' \
    "" \
    "### What this means" \
    "" \
    "Hidden Unicode characters or HTML comments can be invisible to human reviewers while still being read and acted upon by AI models. This is a known technique for injecting malicious instructions into AI-assisted workflows." \
    "" \
    "**Action required:** Please review and edit the ${CONTEXT_TYPE} to remove any hidden characters before this workflow can proceed. If you believe this is a false positive, please contact a repository maintainer." \
    ""

  # Link back to the workflow run only when its ID is known.
  if [[ -n "${RUN_ID}" ]]; then
    local workflow_url="${SERVER_URL}/${REPO}/actions/runs/${RUN_ID}"
    printf '%s\n' "_Detected by [workflow run #${RUN_ID}](${workflow_url})_"
  else
    printf '%s\n' "_Detected by an automated security validation step._"
  fi
}

if [[ "${VALIDATION_EXIT}" -ne 0 ]]; then
  # A non-zero status without recorded findings means the validator itself
  # failed (for example python3 is missing or crashed). Fail the step, but do
  # not post a comment that blames the author for suspicious input.
  if [[ ! -s "${FINDINGS_FILE}" ]]; then
    echo "::error::Input validation could not be completed: the validator exited with status ${VALIDATION_EXIT} without reporting any findings."
    exit 1
  fi

  # Build the warning comment in a private temporary file rather than a
  # predictable path under /tmp.
  comment_file="$(mktemp)"
  render_security_comment "${FINDINGS_FILE}" > "${comment_file}"

  echo "=== Posting security warning comment to ${CONTEXT_TYPE} #${ITEM_NUMBER} ==="
  # Capture the status so the temporary file is removed and an annotation is
  # emitted even when posting fails under `set -e`.
  post_status=0
  if [[ "${CONTEXT_TYPE}" == "pr" ]]; then
    gh pr comment "${ITEM_NUMBER}" --repo "${REPO}" --body-file "${comment_file}" || post_status=$?
  else
    gh issue comment "${ITEM_NUMBER}" --repo "${REPO}" --body-file "${comment_file}" || post_status=$?
  fi
  rm -f -- "${comment_file}"

  if [[ "${post_status}" -ne 0 ]]; then
    echo "::error::Input validation failed: suspicious content detected, but the warning comment could not be posted to ${CONTEXT_TYPE} #${ITEM_NUMBER}."
    exit 1
  fi

  echo "::error::Input validation failed: suspicious content detected. See comment on ${CONTEXT_TYPE} #${ITEM_NUMBER} for details."
  exit 1
fi

echo "✅ Input validation passed — no suspicious content detected"
0 commit comments