|
| 1 | +#!/usr/bin/env bash |
| 2 | +# Preview Forge — Layer-1 input-path size cap for /pf:new. |
| 3 | +# |
| 4 | +# WHY (umbrella #95 follow-up, deferred from PR #83) |
| 5 | +# --------------------------------------------------- |
| 6 | +# `plugins/preview-forge/schemas/idea-spec.schema.json` caps `idea_summary` |
| 7 | +# at 5000 chars. That cap fires at the **S-3 schema validation layer**, |
| 8 | +# i.e. AFTER the seed idea has already been: |
| 9 | +# - copied into runs/<id>/idea.json |
| 10 | +# - inflated into the I1 Socratic interview prompt (system prompt + 3 |
| 11 | +# AskUserQuestion modals) |
| 12 | +# - keyed through `scripts/preview-cache.sh key` (which itself hashes |
| 13 | +# the raw idea string into the cache key — a 10MB idea would happily |
| 14 | +# stream through sha256) |
| 15 | +# |
| 16 | +# A 10MB seed idea today would walk through all of that BEFORE the |
| 17 | +# schema layer rejects it. This script is the layer-1 gate cited from |
| 18 | +# `plugins/preview-forge/commands/new.md`: callers (CLI helper or |
| 19 | +# orchestrator) invoke it with the raw seed text and reject early if it |
| 20 | +# exceeds the 5000-char cap, mirroring the schema's `maxLength`. |
| 21 | +# |
| 22 | +# DEFAULT IS REJECT, not silent-truncate |
| 23 | +# -------------------------------------- |
| 24 | +# Truncation would silently lose user intent — half the idea disappears |
| 25 | +# and the Socratic interview proceeds against a corrupted seed. Explicit |
| 26 | +# reject + a clear error message lets the user decide whether to trim. |
| 27 | +# `--truncate` is provided for callers that opt in (e.g. an automation |
| 28 | +# pipeline that re-emits the trimmed payload back to a file). |
| 29 | +# |
| 30 | +# USAGE |
| 31 | +# validate-idea-input.sh "<idea text>" # argv form |
| 32 | +# validate-idea-input.sh - < idea.txt # stdin form (- sentinel, |
| 33 | +# # parallels preview-cache.sh |
| 34 | +# # T-9.3 convention) |
| 35 | +# validate-idea-input.sh --truncate "<idea text>" # emit first 5000 chars |
| 36 | +# validate-idea-input.sh --truncate - # truncate from stdin |
| 37 | +# |
| 38 | +# EXIT CODES |
| 39 | +# 0 → success. Length ≤ 5000 chars: idea echoed to stdout unchanged |
| 40 | +# (both modes). Length > 5000 chars in --truncate mode: |
| 41 | +# first 5000 chars echoed to stdout, warning on stderr. |
| 42 | +# 2 → rejected, message on stderr, nothing on stdout. Either the |
| 43 | +# idea text is empty (both modes), or length > 5000 chars |
| 44 | +# in default mode. |
| 45 | +# 64 → usage error |
| 46 | +# |
| 47 | +# CHARACTER vs BYTE COUNTING |
| 48 | +# -------------------------- |
| 49 | +# The schema's `maxLength` is JSON Schema's `maxLength` keyword, which |
| 50 | +# per the spec counts **Unicode code points** (not UTF-8 bytes). We use |
| 51 | +# python3's `len(str)` which is exactly that — keeping this gate aligned |
| 52 | +# with the schema gate so a Korean idea that passes here doesn't get |
| 53 | +# rejected at S-3 (or vice versa). Zero-third-party-dep policy preserved |
| 54 | +# (LESSON 0.4): python3 is already a hard dependency for the rest of |
| 55 | +# the plugin (preview-cache helpers, hooks). |
| 56 | + |
# Strict mode: abort on unhandled errors (-e), on unset variables (-u),
# and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Maximum accepted idea length, in Unicode code points. Declared readonly
# so no later statement can silently change the cap.
readonly MAX_LEN=5000

# Over-length handling: "reject" (default) or "truncate" (set when the
# first argument is --truncate).
mode="reject"
| 61 | + |
#######################################
# Print the command-line synopsis to stderr and terminate the script.
# Globals:   MAX_LEN (read) - length cap shown in the help text; falls
#            back to 5000 when unset so the text never renders blank.
# Arguments: none
# Outputs:   usage text on stderr; nothing on stdout
# Returns:   never returns - exits the shell with status 64
#######################################
usage() {
  local cap="${MAX_LEN:-5000}"
  {
    printf '%s\n' 'usage: validate-idea-input.sh [--truncate] {<idea-text> | -}'
    # Single-quoted so the backticks around the `-` sentinel stay literal.
    printf '%s\n' ' - reads idea text from argv (one positional arg) or stdin (when arg is `-`)'
    printf ' - default mode: exit 2 + stderr message if len > %s code points\n' "$cap"
    printf ' - --truncate: emit first %s code points + stderr warn, exit 0\n' "$cap"
  } >&2
  exit 64
}
| 71 | + |
# ---------------------------------------------------------------------
# Main flow: parse arguments, load the idea text, measure it, then pass
# it through, truncate it, or reject it.
# ---------------------------------------------------------------------

# Argument parsing: an optional leading --truncate, then exactly one
# positional argument (the idea text itself, or `-` for stdin).
if [[ $# -lt 1 ]]; then
  usage
fi

if [[ "$1" == "--truncate" ]]; then
  mode="truncate"
  shift
fi

if [[ $# -ne 1 ]]; then
  usage
fi

idea_arg="$1"

# Load the idea text. `-` means "read stdin", for callers whose payload
# may exceed ARG_MAX.
if [[ "$idea_arg" == "-" ]]; then
  # Command substitution strips trailing newlines; append a sentinel `_`
  # and remove exactly that one character so the input is preserved
  # verbatim.
  idea=$(cat; echo _)
  idea="${idea%_}"
else
  idea="$idea_arg"
fi

# Empty input is a hard reject in both modes.
if [[ -z "$idea" ]]; then
  printf '%s\n' "validate-idea-input.sh: idea text is empty — refusing" >&2
  exit 2
fi

#######################################
# Run python3 with stdin/stdout pinned to UTF-8.
# Without this, python decodes stdin using the ambient locale, so under
# a non-UTF-8 locale the "code point" count degrades to a byte count and
# a multi-byte idea well under the cap can be rejected. The
# surrogateescape handler lets undecodable bytes pass through (one code
# point each, re-emitted unchanged) instead of raising.
# Arguments: forwarded verbatim to python3
#######################################
py_utf8() {
  PYTHONIOENCODING='utf-8:surrogateescape' python3 "$@"
}

# Length check in Unicode code points. The python source is single-quoted
# and MAX_LEN travels via argv, so nothing is shell-interpolated into it.
#
# The read is bounded at MAX_LEN+1 code points: that is enough to decide
# "over the cap or not", so `length` is at most MAX_LEN+1 and is NOT the
# true length of an oversized idea.
#
# The remainder of stdin is drained in 1 MiB chunks. If python exited
# without draining, the upstream printf would get SIGPIPE and, under
# `set -o pipefail`, the whole pipeline would fail with rc=141.
length=$(printf '%s' "$idea" | py_utf8 -c '
import sys
limit = int(sys.argv[1])
n = len(sys.stdin.read(limit + 1))
# Drain the rest so the writer does not die of SIGPIPE; chunked reads
# keep peak memory bounded.
while sys.stdin.read(1 << 20):
    pass
print(n)
' "$MAX_LEN")

# Within the cap: echo the idea unchanged (identical in both modes).
if [[ "$length" -le "$MAX_LEN" ]]; then
  printf '%s' "$idea"
  exit 0
fi

# Over the cap, opt-in truncation: emit the first MAX_LEN code points,
# warn on stderr, and succeed.
if [[ "$mode" == "truncate" ]]; then
  printf '%s\n' "validate-idea-input.sh: idea length>${MAX_LEN} — truncating to first $MAX_LEN code points" >&2
  # Same contract as the length check: bounded read, MAX_LEN via argv,
  # and a drain loop so the upstream printf cannot SIGPIPE.
  printf '%s' "$idea" | py_utf8 -c '
import sys
limit = int(sys.argv[1])
sys.stdout.write(sys.stdin.read(limit))
while sys.stdin.read(1 << 20):
    pass
' "$MAX_LEN"
  exit 0
fi

# Over the cap, default mode: hard reject. `length` is capped at
# MAX_LEN+1 by the bounded read, so the message reports it as a lower
# bound rather than as an exact size.
cat >&2 <<EOF
validate-idea-input.sh: idea length>${MAX_LEN} (at least ${length} code points) exceeds $MAX_LEN-character cap.

The /pf:new seed idea is bounded at $MAX_LEN Unicode code points to match
the idea-spec schema's idea_summary maxLength. Please shorten the idea, or
re-invoke with --truncate to trim to the first $MAX_LEN chars (a warning is
printed to stderr).
EOF
exit 2
0 commit comments