From 38b14ff401ec2388ef2a110873b06615108ee55c Mon Sep 17 00:00:00 2001
From: Denis Valeev <denis.valeev@gmail.com>
Date: Sat, 27 Sep 2025 09:15:19 -0400
Subject: [PATCH] docs: add external onboarding guide

---
 docs/docs/data-maintenance.md            | 13 +++--
 docs/docs/external-content-onboarding.md | 73 ++++++++++++++++++++++++
 docs/index.html                          |  1 +
 3 files changed, 82 insertions(+), 5 deletions(-)
 create mode 100644 docs/docs/external-content-onboarding.md
diff --git a/docs/docs/data-maintenance.md b/docs/docs/data-maintenance.md
index 0e45a48..cf46d82 100644
--- a/docs/docs/data-maintenance.md
+++ b/docs/docs/data-maintenance.md
@@ -15,16 +15,19 @@ Content-heavy apps rely on shared datasets stored under `data/` plus machine-gen
 ## Updating deck datasets
 
 1. Edit the dataset files under `apps/jokes/jokes.js`, `apps/quotes/quotes-data.js`, or `apps/slang/slang.js`.
-2. Run the syntax and shape checks documented in each app’s `AGENTS.md` file (`node --check ...` and the `node -e` globals check).
+2. When evaluating a batch of external jokes, run `node tools/onboard-external-jokes.js --input=path/to/candidates.json` to
+   surface high-similarity overlaps before touching the curated deck. See
+   [External content onboarding](external-content-onboarding.md) for a complete walkthrough and output interpretation.
+3. Run the syntax and shape checks documented in each app’s `AGENTS.md` file (`node --check ...` and the `node -e` globals check).
    - For jokes specifically, ensure every record includes a `sourceId` that matches one of the curated assets surfaced in `apps/asset-observatory/asset-data.js`. New sources should land under `data/` with capture notes so provenance survives future refreshes.
-3. Regenerate metadata and manifests:
+4. Regenerate metadata and manifests:
    ```bash
    node tools/update-content-metadata.js --dataset=jokes
    node tools/update-content-metadata.js --dataset=quotes
    node tools/update-content-metadata.js --dataset=slang
    ```
    Pass `--dataset=<name>` to target a single collection when needed.
-4. Refresh embeddings for duplicate detection. Typical commands:
+5. Refresh embeddings for duplicate detection. Typical commands:
    ```bash
    node tools/review-content-similarity.js --dataset=jokes --provider=synthetic --write --update-manifest
    node tools/review-content-similarity.js --dataset=jokes --provider=hfspace --model=bienkieu/sentence-embedding --batch-size=8 --report=data/similarity-report-jokes.json
@@ -32,8 +35,8 @@ Content-heavy apps rely on shared datasets stored under `data/` plus machine-gen
    node tools/review-content-similarity.js --dataset=slang --provider=synthetic --threshold-slang=0.8 --write --update-manifest
    ```
    Adjust providers and options according to your API access and the thresholds documented in `apps/slang/AGENTS.md` and the repository README.
-5. Review high-similarity pairs and move intentional overlaps into `data/similarity-overrides.json` so the similarity lab highlights them as protected.
-6. Commit refreshed datasets, manifests, embeddings, and similarity reports together to keep the bundle consistent.
+6. Review high-similarity pairs and move intentional overlaps into `data/similarity-overrides.json` so the similarity lab highlights them as protected.
+7. Commit refreshed datasets, manifests, embeddings, and similarity reports together to keep the bundle consistent.
 
 ## Regenerating similarity reports
 
diff --git a/docs/docs/external-content-onboarding.md b/docs/docs/external-content-onboarding.md
new file mode 100644
index 0000000..4f8f9a6
--- /dev/null
+++ b/docs/docs/external-content-onboarding.md
@@ -0,0 +1,73 @@
+# External content onboarding
+
+The toolbox uses deterministic embeddings and cosine comparisons to reject
+near-duplicate jokes before they ever land in `apps/jokes/jokes.js`. The
+`tools/onboard-external-jokes.js` helper ingests a candidate JSON file,
+embeds each entry across one or more providers, and reports which jokes can
+join the curated deck.
+
+## Candidate file shape
+
+Provide an array of objects where every joke includes at least a setup (stored
+as `joke` or `setup`) and optionally a punchline (`punchline` or `answer`).
+`id` and `label` fields are optional—missing values are replaced with
+`candidate-###` identifiers so the summary stays readable.
+
+```json
+[
+  {
+    "id": "joke-001",
+    "label": "Unexpected semicolons",
+    "joke": "Why did the build fail?",
+    "punchline": "The compiler thought the semicolon was sus."
+  }
+]
+```
+
+Save the array to disk (for example `data/new-jokes.json`) and feed the path to
+`--input` or its alias `--candidates`.
+
+## Running the triage script
+
+Evaluate a batch with the default providers and a cosine threshold of `0.8`:
+
+```bash
+node tools/onboard-external-jokes.js --input=data/new-jokes.json \
+  --output=reports/new-jokes-summary.json \
+  --accepted-output=reports/new-jokes-accepted.json
+```
+
+The command:
+
+- Loads the active jokes deck and its stored embeddings.
+- Generates deterministic embeddings for any provider whose store is missing from disk.
+- Compares each candidate against the curated deck using cosine similarity.
+- Writes a machine-readable summary plus an optional accepted-only export.
+
+Review the terminal output to spot high-similarity overlaps. The summary JSON
+captures metadata such as the candidate file, evaluated providers, rejection
+reasons, and the strongest match per joke.
+
+## Customising the evaluation
+
+- `--threshold=<value>` changes the cosine similarity cutoff (defaults to
+  `0.8`). Lower values flag more candidates as near-duplicates (stricter
+  deduplication); higher values admit more jokes.
+- `--providers=synthetic,openai,cohere` restricts which embedding stores to use.
+  Deterministic fallback vectors kick in automatically when a provider’s store
+  is missing or incompatible.
+- `--existing-embeddings=provider:path` lets you point at non-standard stores
+  (for example, a freshly generated HF Space batch).
+- `--candidate-embeddings=provider:path` reuses pre-computed vectors for the
+  candidates, skipping deterministic generation when dimensions match.
+
+## After accepting new jokes
+
+1. Inspect `reports/new-jokes-accepted.json` and manually fold the approved
+   entries into `apps/jokes/jokes.js`.
+2. Run the usual dataset maintenance scripts (`node tools/update-content-metadata.js`,
+   `node tools/review-content-similarity.js`, and
+   `node tools/generate-asset-report.js`) so manifests, embeddings, and the asset
+   observatory stay aligned.
+3. Commit the refreshed datasets, manifests, similarity reports, and summary
+   artifacts alongside the onboarding report for traceability.
diff --git a/docs/index.html b/docs/index.html
index 08a8e4e..8eb61a4 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -375,6 +375,7 @@ <h2 data-doc-title>Loading…</h2>
         { id: 'wiki-authoring', title: 'Wiki authoring', file: 'docs/wiki-authoring.md', summary: 'Front matter & commit logs' },
         { id: 'deploy-logs', title: 'Deploy logs', file: 'docs/deploy-logs.md', summary: 'Service worker navigation + Pages deploy log' },
         { id: 'data-maintenance', title: 'Data maintenance', file: 'docs/data-maintenance.md', summary: 'Datasets, manifests, and embeddings' },
+        { id: 'external-onboarding', title: 'External content onboarding', file: 'docs/external-content-onboarding.md', summary: 'Triage outside joke decks before import' },
         { id: 'testing-automation', title: 'Testing & automation', file: 'docs/testing-and-automation.md', summary: 'Playwright suite and CI workflows' },
         { id: 'docs-maintenance', title: 'Documentation maintenance', file: 'docs/docs-maintenance.md', summary: 'Keep the knowledge base accurate and linked' },
         { id: 'development-guide', title: 'Development guide', file: 'docs/development-guide.md', summary: 'Coding conventions and contributor tasks' }