sirreal
diff --git a/‎tools/html-api-fuzz/README.md‎
Lines changed: 150 additions & 29 deletions b/‎tools/html-api-fuzz/README.md‎
Lines changed: 150 additions & 29 deletions
diff --git a/‎tools/html-api-fuzz/launcher.php‎
Lines changed: 6 additions & 4 deletions b/‎tools/html-api-fuzz/launcher.php‎
Lines changed: 6 additions & 4 deletions
diff --git a/‎tools/html-api-fuzz/lib/Corpus.php‎
Lines changed: 78 additions & 0 deletions b/‎tools/html-api-fuzz/lib/Corpus.php‎
Lines changed: 78 additions & 0 deletions
@@ -2,9 +2,8 @@
 
 Pure PHP fuzzer for the WordPress HTML API under `src/wp-includes/html-api`.
 It compares `WP_HTML_Processor` against PHP's `Dom\HTMLDocument` oracle using an
-html5lib-style textual tree, and separately checks `WP_HTML_Tag_Processor`
-invariants, simple mutation behavior, and `WP_HTML_Processor::normalize()`
-idempotence.
+html5lib-style textual tree, and separately checks a set of API invariants
+described under “Invariants” below.
 
 No browser, Playwright, Node, or `wp-env` is involved.
 
@@ -27,7 +26,8 @@ Run one generated seed with a specific terminal payload policy:
 php tools/html-api-fuzz/worker.php --seed 1 --payload-policy valid-utf8 --output-dir artifacts/html-api-fuzz/seed-1
 ```
 
-Run a batch in isolated worker subprocesses:
+Run a batch in worker subprocesses (seeds are batched into shared worker
+processes, 25 per process by default; see `--batch-size`):
 
 ```sh
 php tools/html-api-fuzz/runner.php --max-seeds 100 --duration-seconds 60
@@ -53,12 +53,6 @@ php tools/html-api-fuzz/launcher.php --lanes 4 --max-seeds 1000 --watcher
 
 For continuous fuzzing, run the launcher with `--duration-seconds 0 --max-seeds 0`
 and run `watcher.php` in a second shell against the same output directory.
-The tmux helper, `tools/html-api-fuzz/start-continuous-run-tmux.sh`, starts this
-indefinite launcher/watcher/orchestrator setup. It does not currently support a
-graceful shutdown signal. A good follow-up feature would be a stop signal that
-lets each process finish its current unit of work (seed, watcher scan/minimize,
-or Codex turn) and then exit cleanly. For now, stop it by attaching to the tmux
-session and interrupting the panes, or by killing the tmux session.
 
 Replay a failure:
 
@@ -85,16 +79,50 @@ ceilings (`tag-token-limit-exceeded`, `mutation-token-limit-exceeded`,
 (`dom-node-limit-exceeded`). Process timeouts, PHP fatal errors, and memory
 failures are separate failures and are also in scope for triage.
 
+## Execution Model
+
+The runner batches consecutive seeds into one worker process
+(`worker.php --batch-count N`, default `--batch-size 25` on the runner) so the
+WordPress bootstrap and process spawn are paid once per batch rather than once
+per seed. Each seed still writes its own `seed-N/primary` artifacts. If a batch
+process dies or times out mid-way, seeds left without a `result.json` are
+re-run individually in isolation, so a crash on one input cannot take
+neighboring seeds' results with it.
+
+## Input Stages
+
+Seeds are deterministically split between two input stages:
+
+- **Generated** (default ~80%): the structural grammar described under
+  “Generator Profiles”.
+- **Corpus-mutated** (default ~20%, `--corpus-mutate-percent N` on
+  `worker.php`/`runner.php`): a `#data` section from the html5lib-tests
+  tree-construction corpus (`tests/phpunit/data/html5lib-tests`), passed
+  through 1–4 deterministic mutations (byte insert/replace, chunk
+  delete/duplicate, tag-name swap, case toggle, corpus splice). The stage,
+  corpus file, entry index, and operations are recorded in result metadata,
+  and the mutated input itself is in the replay manifest, so replays are
+  standalone. Inputs report `inputSource: "corpus-mutated"` and
+  `profile: "corpus-mutated"`.
+
+Both stages derive entirely from the seed, so seed N always produces the same
+input for the same fuzzer version and corpus.
+
 ## Artifact Layout
 
 The runner writes:
 
 - `summary.ndjson`: one line per seed, suitable for tailing and watcher scans.
-- `events.ndjson`: runner lifecycle events.
+- `events.ndjson`: runner lifecycle events, including batch boundaries.
 - `state.json`: aggregate counters, stop reason, and compact Git metadata.
+  Oracle losses are counted per class: `oracleParseErrors` (inputs the DOM
+  oracle rejects receive no differential coverage), `oracleUnsupported`
+  (template content the oracle cannot represent), and `oracleTolerated`
+  (comparisons that passed only under the documented scalar tolerance).
 - `seed-N/primary/input.bin`: raw generated bytes.
 - `seed-N/primary/replay.json`: base64 replay manifest, including the commit
-  hash and tracked-file dirty state needed to interpret a standalone replay.
+  hash, tracked-file dirty state, and fragment context needed to interpret a
+  standalone replay.
 - `seed-N/primary/result.json`: full worker result.
 - `seed-N/primary/wordpress-tree.txt` and `dom-tree.txt`: rendered trees when available.
 
@@ -110,36 +138,99 @@ and preserve discovery provenance in `sourceReplay`.
 
 The watcher writes triage state under `.triage-watcher` by default, or under
 `--state-dir` when provided. Each signature gets a stable directory containing
-`failure.json`, minimizer logs, and minimized replay/result artifacts.
+`failure.json`, minimizer logs, and minimized replay/result artifacts. Failed
+minimizations are retried on later scans, up to `--max-minimize-retries`
+(default 3) attempts per signature.
 
-## Modes
+## Modes and Fragment Contexts
 
-- `fragment-body`: parse as body children. The DOM oracle wraps the input in a
-  minimal document and renders only body children.
+- `fragment-body`: parse as a fragment. The DOM oracle uses real fragment
+  parsing (the `innerHTML` setter on a context element of an empty document),
+  not a document-wrapping approximation.
 - `full-document`: parse as a full HTML document.
 - `auto`: weighted choice.
 
+In fragment mode a context element is selected per seed
+(`--fragment-context TAG` on `worker.php` for replays). `<body>` dominates;
+the other contexts (`div`, `p`, `td`, `tr`, `table`, `caption`, `colgroup`,
+`select`, `option`, `template`, `title`, `textarea`, `script`, `style`,
+`svg`, `math`) receive a small probe weight. `WP_HTML_Processor::create_fragment()`
+currently supports only `<body>`, so non-body contexts are recorded as
+`status: "unsupported"` today; when `create_fragment()` gains context support,
+the fuzzer picks up the new coverage with no changes. The DOM oracle already
+parses every context correctly.
+
 Unsupported `WP_HTML_Processor` cases are expected by default and are recorded
 as successful attempts with `status: "unsupported"`. Use `--fail-unsupported`
 when you want unsupported cases to become failures.
 
+## Invariants
+
+Each seed checks, in order, stopping at the first failing class:
+
+1. **Tag Processor invariants** (`tag-invariant-failed`): token loop
+   termination under the token ceiling; non-null token type/name/tag;
+   attribute getters and `class_list()` iteration do not throw;
+   `get_updated_html()` with no queued edits returns the input unchanged; a
+   simple `set_attribute()` mutation is visible to a re-scan; and
+   **seek consistency** — a bookmark set at a seed-chosen token, after
+   scanning to the end and seeking back, must reproduce the identical token
+   stream (`seek-token-stream-mismatch`).
+2. **Differential tree comparison** (`tree-mismatch` / `encoding-mismatch`):
+   the WordPress tree must equal the DOM oracle tree (see “Tree Comparison”).
+3. **Breadcrumb consistency** (`breadcrumb-mismatch`): at every tag token,
+   `get_breadcrumbs()` must agree with the element stack derived from token
+   order and `expects_closer()`.
+4. **Mutation differential** (`mutation-tree-mismatch` /
+   `mutation-delta-mismatch`), only on a clean baseline: after setting
+   `data-fuzz="1"` on the first tag, the mutated document must parse
+   identically in WordPress and the DOM oracle, and the WordPress tree must
+   change by exactly the one attribute line (unless formatting-element
+   reconstruction clones the attribute, or tree construction legitimately
+   drops the mutated element, in which case the differential comparison alone
+   applies).
+5. **Normalize tree preservation** (`normalize-tree-changed`), only on a
+   clean baseline: parsing `normalize()` output must produce the same tree as
+   the original input, modulo the documented scalar substitutions. This is
+   stricter than idempotence, which a consistently wrong serializer can pass.
+6. **Normalize idempotence** (`normalize-invariant-failed`):
+   `normalize()` / `serialize()` run twice must be a fixed point, with no
+   PHP native errors or throwables. Full documents use
+   `create_full_parser()->serialize()`; non-body fragment contexts use
+   `create_fragment(<context>)->serialize()`.
+
 ## Generator Profiles
 
 The generator uses a structural HTML grammar with weighted profiles:
 
 - `balanced`
-- `full-document`
+- `full-document` (includes occasional frameset documents, quirks-mode
+  doctypes, and content after `</html>`)
 - `body-fragment`
 - `tables`
 - `template`
-- `foreign-content`
-- `rawtext-rcdata`
-- `formatting-adoption`
+- `select` (option/optgroup nesting, select-ending elements such as `input`,
+  `textarea` and `button`, nested selects, select-in-table)
+- `foreign-content` (MathML/SVG integration points, HTML breakout tags,
+  `<font>` with and without breakout attributes, `annotation-xml` encoding
+  variants, CDATA sections in foreign content, case-mangled `foreignObject`)
+- `rawtext-rcdata` (script/style/iframe/noembed/noframes/xmp/noscript,
+  title/textarea, occasional `plaintext`)
+- `formatting-adoption` (random formatting elements plus explicit
+  adoption-agency shapes: misnested closers, block-boundary formatting,
+  reconstruction across siblings, nested anchors, Noah's Ark overflow,
+  repeated closers)
 - `attributes-entities`
 - `comments-doctype-bogus`
 - `deep-nesting`
 - `resource-stress`
-- `incomplete-malformed`
+- `incomplete-malformed` (includes spec-special closers such as `</br>` and
+  `</p>`, stray closers, and `<image>`)
+
+All profiles can emit duplicate attribute names (first-wins coverage),
+auto-closing chains (`li`, `dd`/`dt`, headings, `p`), and named character
+references with longest-prefix-match ambiguity (`&notit;`, `&copyright;`,
+`&ngE`, ...).
 
 Terminal payloads are selected by a separate policy:
 
@@ -165,27 +256,57 @@ metadata for direct inputs and are not selectable for generated runs.
 Replayed and minimized manifests keep immediate `inputSource` metadata separate
 from `originalGenerator` metadata.
 
-The `attributes-entities` profile biases toward tokenizer boundary coverage:
-valid and invalid decimal and hexadecimal character references, variable leading
-zeros, mixed casing, known named character references with and without trailing
-semicolons, unusual attribute spacing, malformed attribute names, and unusual or
-invalid tag-name syntax.
-
 ## Tree Comparison
 
 The tree renderer follows the html5lib test style used by
 `tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php`:
 
-- sorted attributes
+- attributes sorted by their spec-scrubbed names (so a raw-NUL name on the
+  WordPress side and its U+FFFD substitution on the DOM side sort identically),
+  rendered raw
 - boolean attributes rendered as `=""`
 - namespace-qualified element and attribute names
 - template `content` marker
 - only the narrow auto-generated `html/head/body` wrapper tolerance
 
-Invalid bytes are not normalized away. If WordPress and `Dom\HTMLDocument`
+Template content is rendered through a self-contained serialization
+round-trip: PHP hides template child nodes, so the oracle re-parses the
+template's `innerHTML` serialization in a body context and accepts the result
+only when re-serializing reproduces the source byte-for-byte. Content that
+cannot round-trip (table parts, foreign fragments) is quarantined as
+`oracle-unsupported`. This check never consults the WordPress HTML API, which
+is the system under test.
+
+Raw bytes are rendered without normalization. The WordPress HTML API
+deliberately preserves NUL and CR bytes where spec-following parsers
+substitute U+FFFD and normalize newlines during input preprocessing, so the
+comparison tolerates a differing line only when that exact substitution
+explains the entire difference. Tolerated lines are reported per seed
+(`comparison.scalarToleratedLines`) and per run (`oracleTolerated`), and the
+result is classified `oracle-tolerated` rather than silently passed. Any
+difference beyond the substitution fails as usual, and the first-difference
+record points at the first *unexplained* line.
+
+One known oracle bug is tolerated with a runtime probe: PHP's Lexbor parser
+fails to treat U+000C FORM FEED as ignorable whitespace in the pre-body
+insertion modes. When a full-document comparison fails, the input contains a
+form feed, and re-parsing with form feeds substituted by spaces makes the DOM
+oracle reproduce the WordPress tree exactly, the case is classified
+`oracle-tolerated` with `comparison.formFeedQuirk: true`. The probe disables
+the tolerance automatically when PHP fixes the bug.
+
+Invalid bytes are never normalized away. If WordPress and `Dom\HTMLDocument`
 surface different byte sequences, the first-difference record includes bounded
 line previews, byte lengths, line hashes, the first differing byte offset, and
 hex previews, including a diff-window hex preview around the differing byte, so
 the mismatch remains inspectable even when JSON display substitutes replacement
 characters. Full comparison lines are kept out of `result.json` to avoid large
 artifacts from stress inputs.
+
+## Minimization
+
+`minimize.php` reduces in three phases under a shared attempt budget
+(`--max-attempts`, default 600): markup-aligned segment deletion, binary
+byte-chunk deletion, then per-byte deletion and canonicalization (replacements
+never grow the input). Every candidate re-runs the worker and is accepted only
+if it reproduces the original signature hash (or any failure with `--any-failure`).
@@ -250,10 +250,12 @@ function html_api_fuzz_launcher_close_lane( array &$lane ): array {
 }
 
 $aggregate = array(
-	'successes'    => 0,
-	'failures'     => 0,
-	'unsupported'  => 0,
-	'oracleErrors' => 0,
+	'successes'         => 0,
+	'failures'          => 0,
+	'unsupported'       => 0,
+	'oracleParseErrors' => 0,
+	'oracleUnsupported' => 0,
+	'oracleTolerated'   => 0,
 );
 foreach ( $state['laneResults'] as $lane ) {
 	$runner_state = $lane['runnerState'] ?? array();
 
@@ -0,0 +1,78 @@
+<?php
+namespace HtmlApiFuzz;
+
+/**
+ * Seed corpus drawn from the html5lib-tests tree-construction suite. These
+ * inputs encode decades of parser edge cases; mutating them explores
+ * neighborhoods that the structural generator's grammar never reaches.
+ */
+class Corpus {
+	/**
+	 * Per-process cache of the default-directory corpus; null until first load.
+	 *
+	 * @var array[]|null
+	 */
+	private static $entries = null;
+
+	/**
+	 * Returns the path of the html5lib tree-construction test data, located
+	 * relative to the repository root.
+	 */
+	public static function default_directory(): string {
+		return repo_root() . '/tests/phpunit/data/html5lib-tests/tree-construction';
+	}
+
+	/**
+	 * Returns the corpus entries: every #data section from every .dat file,
+	 * in sorted file order and, within a file, in order of appearance.
+	 *
+	 * Only the default-directory corpus is cached (per process). Passing an
+	 * explicit directory always re-reads from disk and neither consults nor
+	 * populates the cache. A missing directory yields an empty list, and a
+	 * file that cannot be read is skipped.
+	 *
+	 * @param string|null $directory Directory to scan, or null for the default.
+	 * @return array[] List of `array( 'file' => <basename>, 'data' => <section> )`.
+	 */
+	public static function entries( ?string $directory = null ): array {
+		if ( null === $directory && null !== self::$entries ) {
+			return self::$entries;
+		}
+
+		$dir     = $directory ?? self::default_directory();
+		$entries = array();
+		$files   = is_dir( $dir ) ? glob( $dir . '/*.dat' ) : false;
+		$files   = false === $files ? array() : $files;
+		// glob() ordering is not guaranteed; sort so entry indices are stable.
+		sort( $files );
+		foreach ( $files as $file ) {
+			$contents = file_get_contents( $file );
+			if ( false === $contents ) {
+				continue;
+			}
+			foreach ( self::parse_dat_data_sections( $contents ) as $data ) {
+				$entries[] = array(
+					'file' => basename( $file ),
+					'data' => $data,
+				);
+			}
+		}
+
+		if ( null === $directory ) {
+			self::$entries = $entries;
+		}
+		return $entries;
+	}
+
+	/**
+	 * Extracts #data sections from html5lib .dat content. A section runs from
+	 * the line after `#data` to the line before the next `#` directive (or the
+	 * end of the content), with the trailing newline removed.
+	 *
+	 * @param string $contents Raw .dat file contents, split on "\n".
+	 * @return string[] Section bodies in order of appearance.
+	 */
+	private static function parse_dat_data_sections( string $contents ): array {
+		$sections = array();
+		$lines    = explode( "\n", $contents );
+		$current  = null;
+		foreach ( $lines as $line ) {
+			if ( '' !== $line && '#' === $line[0] ) {
+				// Any directive closes an open section, including a second
+				// `#data`, so an unterminated section is kept, not dropped.
+				if ( null !== $current ) {
+					$sections[] = implode( "\n", $current );
+				}
+				$current = '#data' === $line ? array() : null;
+				continue;
+			}
+			if ( null !== $current ) {
+				$current[] = $line;
+			}
+		}
+		if ( null !== $current && array() !== $current ) {
+			// explode() yields a final '' for the content's terminating
+			// newline. That is a line terminator, not section content, so
+			// drop it to match directive-terminated sections.
+			if ( '' === end( $current ) ) {
+				array_pop( $current );
+			}
+			$sections[] = implode( "\n", $current );
+		}
+
+		return $sections;
+	}
+}