xuio
diff --git a/‎.github/workflows/ci.yml‎
Lines changed: 21 additions & 0 deletions b/‎.github/workflows/ci.yml‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 26 additions & 0 deletions b/‎.gitignore‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎LICENSE.md‎
Lines changed: 21 additions & 0 deletions b/‎LICENSE.md‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎MANIFEST.in‎
Lines changed: 7 additions & 0 deletions b/‎MANIFEST.in‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎Makefile‎
Lines changed: 25 additions & 0 deletions b/‎Makefile‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 180 additions & 0 deletions b/‎README.md‎
Lines changed: 180 additions & 0 deletions
diff --git a/‎config/authenticated.example.yml‎
Lines changed: 46 additions & 0 deletions b/‎config/authenticated.example.yml‎
Lines changed: 46 additions & 0 deletions
diff --git a/‎config/public.example.yml‎
Lines changed: 44 additions & 0 deletions b/‎config/public.example.yml‎
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,21 @@
+# Continuous integration: lint, test, validate the JSON schemas, and build.
+name: CI
+
+on:
+  push:
+  pull_request:
+
+# Every step only reads the checked-out repository, so restrict the default
+# GITHUB_TOKEN to read-only access.
+permissions:
+  contents: read
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v6
+      - uses: actions/setup-python@v6
+        with:
+          # Quoted so YAML keeps the version a string rather than a float.
+          python-version: "3.12"
+      - run: uv sync --dev
+      - run: uv run ruff check src tests schemas
+      - run: uv run pytest
+      # json.tool exits non-zero on malformed JSON; its output is discarded.
+      - run: uv run python -m json.tool schemas/source_manifest.schema.json >/dev/null
+      - run: uv run python -m json.tool schemas/source_relationship.schema.json >/dev/null
+      - run: uv run python -m json.tool schemas/scrape_manifest.schema.json >/dev/null
+      # Mirrors the `build` prerequisite of `make ci` so packaging errors fail CI.
+      - run: uv build
@@ -0,0 +1,26 @@
+.DS_Store
+.env
+.env.*
+!.env.example
+
+# Runtime scrape output; the wiki ingest consumes these after a run,
+# but the repository should not store paid/private content or credentials.
+outputs/
+scrape_logs/
+raw/
+
+# Local scrape configs copied from config/*.example.yml; they can hold
+# operator contact details and local paths. Only the templates stay visible.
+config/*.yml
+!config/*.example.yml
+
+# Local auth/session material.
+secrets/
+*.cookies.json
+*.storage-state.json
+*.har
+
+# Python/tooling caches.
+__pycache__/
+.pytest_cache/
+.ruff_cache/
+.mypy_cache/
+.venv/
+venv/
+build/
+dist/
+*.egg-info/
@@ -0,0 +1,21 @@
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE
+LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+CONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <https://unlicense.org/>
@@ -0,0 +1,7 @@
+# Top-level project files.
+include LICENSE.md README.md Makefile
+
+# Example configs, documentation, and JSON schemas.
+recursive-include config *.example.yml
+recursive-include docs *.md
+recursive-include schemas *.json
+recursive-include src/substack_ingest/schemas *.json
@@ -0,0 +1,25 @@
+# Developer shortcuts; every recipe runs through uv.
+.PHONY: install check test schemas build clean ci
+
+# Shared command for parsing one schema file with the stdlib JSON tool.
+JSON_TOOL := uv run python -m json.tool
+
+# Sync dev dependencies and install the Chromium browser for Playwright.
+install:
+	uv sync --dev
+	uv run playwright install chromium
+
+# Lint with ruff.
+check:
+	uv run ruff check src tests schemas
+
+# Run the test suite.
+test:
+	uv run pytest
+
+# Parse each schema file; the formatted output is discarded.
+schemas:
+	$(JSON_TOOL) schemas/source_manifest.schema.json >/dev/null
+	$(JSON_TOOL) schemas/source_relationship.schema.json >/dev/null
+	$(JSON_TOOL) schemas/scrape_manifest.schema.json >/dev/null
+
+# Build the distribution with uv.
+build:
+	uv build
+
+# Remove build artifacts and tool caches.
+clean:
+	rm -rf build dist .pytest_cache .ruff_cache src/*.egg-info
+	find src tests -type d -name __pycache__ -prune -exec rm -rf {} +
+
+# Aggregate target: lint, test, schema checks, then build.
+ci: check test schemas build
@@ -0,0 +1,180 @@
+# Substack Archive Scraper
+
+A Substack scraper and archive exporter that turns single-author Substack
+publications into Markdown source files for wiki ingestion.
+
+It is intended as a local Substack-to-Markdown downloader for researchers,
+operators, and publication owners who need reproducible source archives.
+
+The tool is built around a strict source contract: every article, author reply,
+accepted PDF, and accepted transcript gets a manifest row, a deterministic source
+file, provenance metadata, and validation logs. It preserves source text; it does
+not summarize, paraphrase, atomize, or build the wiki itself.
+
+## Safety Model
+
+- Use this only for publications and paid content you are allowed to access.
+- Paid Substacks are handled through a local browser login that exports a
+  Playwright storage-state file outside the repo.
+- The scraper does not bypass paywalls, evade bot detection, or hide what it is.
+- The HTTP client uses a configured contact in its `User-Agent`, respects
+  `robots.txt`, rate-limits requests, and logs access caveats.
+- Generated output can contain paid/private text. Keep output roots and session
+  files outside the repository.
+
+## Install
+
+```bash
+uv sync --dev
+uv run playwright install chromium
+```
+
+Without `uv`:
+
+```bash
+python3 -m venv .venv
+.venv/bin/python -m pip install -e ".[dev]"
+.venv/bin/python -m playwright install chromium
+```
+
+## Quickstart
+
+Create a config from the generic template:
+
+```bash
+cp config/public.example.yml config/my-publication.yml
+```
+
+Edit `config/my-publication.yml`:
+
+- `target.base_url`
+- `target.publication_name`
+- `target.author.canonical_name`
+- `target.author.stable_id`
+- `output.root`
+- `operator.user_agent_contact`
+
+The author stable ID is required for comment disambiguation. Display-name
+matching is not safe enough for author replies.
+
+Run preflight and discovery:
+
+```bash
+uv run substack-archive-scraper preflight --config config/my-publication.yml
+uv run substack-archive-scraper discover --config config/my-publication.yml --limit 10
+```
+
+Scrape and validate:
+
+```bash
+uv run substack-archive-scraper scrape --config config/my-publication.yml
+uv run substack-archive-scraper validate \
+  --config config/my-publication.yml \
+  --output-root /absolute/path/to/output
+```
+
+## Paid Substacks
+
+Start from the authenticated template:
+
+```bash
+cp config/authenticated.example.yml config/my-paid-publication.yml
+```
+
+Set `auth.cookie_file` to a path outside the repo, then capture a session:
+
+```bash
+uv run substack-archive-scraper login --config config/my-paid-publication.yml
+```
+
+A headed Chromium window opens. Log in normally, return to the terminal, and
+press Enter. The scraper stores Playwright storage state at `auth.cookie_file`.
+
+Credentialed scrapes require `auth.known_paid_post_url` so the scraper can prove
+that authenticated article hydration is working before it captures paid content.
+
+## Output Contract
+
+```text
+<output-root>/
+  raw/
+    articles/<YYYY>/YYYY-MM-DD-<slug>.md
+    pdfs/<descriptive-slug>.md
+    comments/YYYY-MM-DD-<article-slug>-<reply-seq>.md
+    transcripts/YYYY-MM-DD-<episode-slug>.md
+    _manifests/
+      source_manifest.jsonl
+      source_relationships.jsonl
+      content_duplicates.jsonl
+      scrape_report.md
+      voice_candidates.md
+      scrape_manifest.yml
+  scrape_logs/<run-id>/
+    *.log
+```
+
+The manifest is canonical. Source-file frontmatter is a recovery mirror only.
+
+## Completeness Policy
+
+By default the scraper is completeness-first:
+
+- Keep every discovered single-author article.
+- Keep every confirmed author reply the authenticated session can see.
+- Include partially paywalled articles with `paywall_truncation` warnings.
+- Include confirmed author replies whose parent comment is hidden by the API
+  with `comment_parent_context_unavailable`.
+- Log comment-access gaps instead of silently omitting them.
+
+Use `--exclude-partial-paywalled` only when you intentionally want a stricter
+complete-body corpus.
+
+## Cache And Progress
+
+Scrape runs use a persistent HTTP cache by default at:
+
+```text
+~/.cache/substack-archive-scraper/
+```
+
+Useful flags:
+
+```bash
+uv run substack-archive-scraper scrape --config config/my-publication.yml --progress-every 300
+uv run substack-archive-scraper scrape --config config/my-publication.yml --refresh-cache
+uv run substack-archive-scraper scrape --config config/my-publication.yml --no-cache
+```
+
+Progress output includes post counts, elapsed time, ETA, source counts, and cache
+hit/miss/write counts.
+
+## Developer Commands
+
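+```bash
+uv sync --dev
+uv run ruff check src tests schemas
+uv run pytest
+uv run python -m json.tool schemas/source_manifest.schema.json >/dev/null
+uv run python -m json.tool schemas/source_relationship.schema.json >/dev/null
+uv run python -m json.tool schemas/scrape_manifest.schema.json >/dev/null
+```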
+
+Equivalent shortcuts are available through `make`:
+
+```bash
+make install
+make check
+make test
+make schemas
+make build
+make ci    # check, test, schemas, and build in one step
+```
+
+The shorter `substack-ingest` command is kept as a compatibility alias.
+
+## Documentation
+
+- [Pipeline design](docs/pipeline-design.md)
+- [Implementation plan](docs/implementation-plan.md)
+- [Development guide](docs/development.md)
+- [Security and publishing notes](docs/security.md)
+- [Release checklist](docs/release-checklist.md)
+
+## Publishing Status
+
+This repository is prepared for later publication under [The Unlicense](LICENSE.md).
@@ -0,0 +1,46 @@
+# Template for authenticated scrapes of paid Substacks.
+#
+# Only use it with accounts and paid content you are allowed to access. The
+# storage-state file is written by `substack-archive-scraper login` and must
+# be kept outside this repository.
+
+target:
+  base_url: 'https://example.substack.com/'
+  publication_name: 'Example Publication'
+  author:
+    canonical_name: 'Example Author'
+    # Placeholder: replace with the author's stable ID before running.
+    stable_id: 'REQUIRED_AUTHOR_STABLE_ID'
+    confirmed_display_aliases: []
+
+output:
+  root: '/tmp/substack-archive-scraper-output/example-publication'
+
+operator:
+  # Contact the HTTP client places in its User-Agent.
+  user_agent_contact: 'mailto:operator@example.com'
+  max_requests_per_second: 2
+
+auth:
+  mode: 'cookie_file'
+  cookie_file: '/tmp/substack-sessions/example-publication.storage-state.json'
+  # Choose a single paid post this account may lawfully access; the scraper
+  # runs its authenticated-vs-public hydration self-test against it.
+  known_paid_post_url: 'https://example.substack.com/p/paid-post-slug'
+  known_subscriber_comments_article_url: null
+  debug_cache_raw_payloads: false
+
+date_range:
+  start: null
+  end: null
+
+resume:
+  resume_token: null
+
+validation:
+  recipe_compatibility_target: 'wiki-recipe-v6'
+  fail_on_missing_author_stable_id: true
+  fail_on_hydration_self_test: true
+  fail_on_comment_stable_id_missing: true
+  fail_on_wrong_speaker_attribution: true
+  fail_on_uncontrolled_quality_warning: true
+  fail_on_type_specific_field_leakage: true
+  fail_on_unconfirmed_author_display_name_variance: true
@@ -0,0 +1,44 @@
+# Template for public-only scrapes (no login).
+#
+# Copy it to a private path such as config/my-publication.yml, then fill in
+# the target metadata before running. If comments are enabled, the author
+# stable ID is still required.
+
+target:
+  base_url: 'https://example.substack.com/'
+  publication_name: 'Example Publication'
+  author:
+    canonical_name: 'Example Author'
+    # Placeholder: replace with the author's stable ID before running.
+    stable_id: 'REQUIRED_AUTHOR_STABLE_ID'
+    confirmed_display_aliases: []
+
+output:
+  root: '/tmp/substack-archive-scraper-output/example-publication'
+
+operator:
+  # Contact the HTTP client places in its User-Agent.
+  user_agent_contact: 'mailto:operator@example.com'
+  max_requests_per_second: 2
+
+# No session is used in this template, so every auth path/URL stays null.
+auth:
+  mode: 'none'
+  cookie_file: null
+  known_paid_post_url: null
+  known_subscriber_comments_article_url: null
+  debug_cache_raw_payloads: false
+
+date_range:
+  start: null
+  end: null
+
+resume:
+  resume_token: null
+
+validation:
+  recipe_compatibility_target: 'wiki-recipe-v6'
+  fail_on_missing_author_stable_id: true
+  fail_on_hydration_self_test: true
+  fail_on_comment_stable_id_missing: true
+  fail_on_wrong_speaker_attribution: true
+  fail_on_uncontrolled_quality_warning: true
+  fail_on_type_specific_field_leakage: true
+  fail_on_unconfirmed_author_display_name_variance: true