diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b644de1..b7ab0d5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -36,10 +36,10 @@ jobs: - name: Build run: bun run build - - name: Type check (web) - run: bun run --cwd apps/web typecheck + - name: Type check (site) + run: bun run --cwd apps/site typecheck - - name: Build (web) + - name: Build (site) env: DATABASE_URL: ${{ secrets.DATABASE_URL }} - run: bun run --cwd apps/web build + run: bun run --cwd apps/site build diff --git a/.github/workflows/deploy-site.yml b/.github/workflows/deploy-site.yml index 28fd1e5..6bf6004 100644 --- a/.github/workflows/deploy-site.yml +++ b/.github/workflows/deploy-site.yml @@ -6,8 +6,7 @@ on: branches: [main] paths: - ".github/workflows/deploy-site.yml" - - "apps/web/**" - - "!apps/web/worker/**" + - "apps/site/**" jobs: deploy: @@ -28,7 +27,7 @@ jobs: - name: Build site env: DATABASE_URL: ${{ secrets.DATABASE_URL }} - run: bun run --cwd apps/web build + run: bun run --cwd apps/site build - name: Upload site to R2 env: @@ -45,15 +44,15 @@ jobs: # with stale-while-revalidate for 1 hour. A Cloudflare Cache Rule on # the zone enables caching for these specific paths (extensionless # HTML isn't cached by Cloudflare default); see infrastructure notes - # in apps/web/README.md. Static assets (favicons, og-image, logo) + # in apps/site/README.md. Static assets (favicons, og-image, logo) # keep Cloudflare's default static-asset cache behavior - no header. CACHE_HTML="public, max-age=300, stale-while-revalidate=3600" # Non-HTML assets: copy the finite Astro output directly. Do not use # `aws s3 sync` at the bucket root because the same bucket also stores # the million-plus scraper-managed /documents/ and /extracted/ objects. - find apps/web/dist -type f ! -name "*.html" -print0 | while IFS= read -r -d '' f; do - rel="${f#apps/web/dist/}" + find apps/site/dist -type f ! -name "*.html" -print0 | while IFS= read -r -d '' f; do + rel="${f#apps/site/dist/}" case "$rel" in sitemap.xml|robots.txt|llms.txt) aws s3 cp "$f" "$R2_BUCKET/$rel" \ @@ -70,8 +69,8 @@ jobs: # HTML files: upload to extensionless keys to match canonical URLs. # /classification.html -> r2://classification, /types/legal.html -> r2://types/legal. # index.html is the one exception; it stays as a static homepage at /. - find apps/web/dist -name "*.html" -print0 | while IFS= read -r -d '' f; do - rel="${f#apps/web/dist/}" + find apps/site/dist -name "*.html" -print0 | while IFS= read -r -d '' f; do + rel="${f#apps/site/dist/}" if [ "$rel" = "index.html" ]; then key="index.html" else diff --git a/.github/workflows/deploy-worker.yml b/.github/workflows/deploy-worker.yml index 8d983d8..b4522b3 100644 --- a/.github/workflows/deploy-worker.yml +++ b/.github/workflows/deploy-worker.yml @@ -5,7 +5,7 @@ on: push: branches: [main] paths: - - "apps/web/worker/**" + - "apps/api/**" jobs: deploy: @@ -16,14 +16,14 @@ jobs: - uses: oven-sh/setup-bun@v2 - - run: bun install --cwd apps/web/worker + - run: bun install --cwd apps/api - name: Deploy to Cloudflare Workers uses: cloudflare/wrangler-action@v3 with: apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }} accountId: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }} - workingDirectory: apps/web/worker + workingDirectory: apps/api secrets: | DATABASE_URL env: diff --git a/.gitignore b/.gitignore index 4c23510..ff2024c 100644 --- a/.gitignore +++ b/.gitignore @@ -17,11 +17,11 @@ dist/ corpus/ dev/ -# Generated web data (local only) -apps/web/data/ +# Generated site data (local only) +apps/site/data/ # Astro -apps/web/.astro/ +apps/site/.astro/ # Tests coverage/ diff --git a/README.md b/README.md index 9245a0b..7cdea6d 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,8 @@ Run `corpus --help` for detailed options. apps/ cli/ # Unified CLI — corpus cdx-filter/ # AWS Lambda — filters CDX indexes for .docx URLs - web/ # Landing page (docxcorp.us) + Cloudflare Worker API + site/ # Static Astro site for docxcorp.us + api/ # Cloudflare Worker API for api.docxcorp.us packages/ shared/ # DB client, storage abstraction, formatting scraper/ # Downloads WARC, validates .docx, deduplicates @@ -84,7 +85,8 @@ db/ |-------|------|---------| | **cli** | `corpus` command — orchestrates everything | Bun | | **cdx-filter** | Filter Common Crawl CDX indexes (Lambda) | Node.js | -| **web** | docxcorp.us landing page + API worker | Static + CF Worker | +| **site** | docxcorp.us landing page and dataset pages | Static Astro | +| **api** | api.docxcorp.us `/stats`, `/documents`, `/manifest` | Cloudflare Worker | | **scraper** | Download, validate, deduplicate .docx files | Bun | | **extractor** | Extract text + detect language (Docling) | Bun + Python | | **embedder** | Generate embeddings (Gemini) | Bun | @@ -244,8 +246,8 @@ docker compose up -d DATABASE_URL=postgres://postgres:postgres@localhost:5432/docx_corpus \ bun run corpus status -# Run web API locally -cd apps/web/worker +# Run API locally +cd apps/api npx wrangler dev ``` diff --git a/apps/web/worker/bun.lock b/apps/api/bun.lock similarity index 100% rename from apps/web/worker/bun.lock rename to apps/api/bun.lock diff --git a/apps/web/worker/package.json b/apps/api/package.json similarity index 100% rename from apps/web/worker/package.json rename to apps/api/package.json diff --git a/apps/web/worker/src/index.ts b/apps/api/src/index.ts similarity index 100% rename from apps/web/worker/src/index.ts rename to apps/api/src/index.ts diff --git a/apps/web/worker/tsconfig.json b/apps/api/tsconfig.json similarity index 100% rename from apps/web/worker/tsconfig.json rename to apps/api/tsconfig.json diff --git a/apps/web/worker/wrangler.toml b/apps/api/wrangler.toml similarity index 100% rename from apps/web/worker/wrangler.toml rename to apps/api/wrangler.toml diff --git a/apps/web/README.md b/apps/site/README.md similarity index 99% rename from apps/web/README.md rename to apps/site/README.md index 9ecb994..db6db5e 100644 --- a/apps/web/README.md +++ b/apps/site/README.md @@ -1,4 +1,4 @@ -# docxcorp.us web +# docxcorp.us site Static site assets are deployed to Cloudflare R2 by `.github/workflows/deploy-site.yml`. diff --git a/apps/web/astro.config.mjs b/apps/site/astro.config.mjs similarity index 100% rename from apps/web/astro.config.mjs rename to apps/site/astro.config.mjs diff --git a/apps/web/content/classification.md b/apps/site/content/classification.md similarity index 100% rename from apps/web/content/classification.md rename to apps/site/content/classification.md diff --git a/apps/web/content/dataset.md b/apps/site/content/dataset.md similarity index 100% rename from apps/web/content/dataset.md rename to apps/site/content/dataset.md diff --git a/apps/web/content/download.md b/apps/site/content/download.md similarity index 100% rename from apps/web/content/download.md rename to apps/site/content/download.md diff --git a/apps/web/content/quality.md b/apps/site/content/quality.md similarity index 100% rename from apps/web/content/quality.md rename to apps/site/content/quality.md diff --git a/apps/web/package.json b/apps/site/package.json similarity index 92% rename from apps/web/package.json rename to apps/site/package.json index 3751611..47f3d06 100644 --- a/apps/web/package.json +++ b/apps/site/package.json @@ -1,5 +1,5 @@ { - "name": "@docx-corpus/web", + "name": "@docx-corpus/site", "private": true, "version": "0.0.0", "type": "module", diff --git a/apps/web/public/apple-touch-icon.png b/apps/site/public/apple-touch-icon.png similarity index 100% rename from apps/web/public/apple-touch-icon.png rename to apps/site/public/apple-touch-icon.png diff --git a/apps/web/public/favicon-16x16.png b/apps/site/public/favicon-16x16.png similarity index 100% rename from apps/web/public/favicon-16x16.png rename to apps/site/public/favicon-16x16.png diff --git a/apps/web/public/favicon-32x32.png b/apps/site/public/favicon-32x32.png similarity index 100% rename from apps/web/public/favicon-32x32.png rename to apps/site/public/favicon-32x32.png diff --git a/apps/web/public/favicon.ico b/apps/site/public/favicon.ico similarity index 100% rename from apps/web/public/favicon.ico rename to apps/site/public/favicon.ico diff --git a/apps/web/public/logo.png b/apps/site/public/logo.png similarity index 100% rename from apps/web/public/logo.png rename to apps/site/public/logo.png diff --git a/apps/web/public/og-image.png b/apps/site/public/og-image.png similarity index 100% rename from apps/web/public/og-image.png rename to apps/site/public/og-image.png diff --git a/apps/web/src/env.d.ts b/apps/site/src/env.d.ts similarity index 100% rename from apps/web/src/env.d.ts rename to apps/site/src/env.d.ts diff --git a/apps/web/src/layouts/Layout.astro b/apps/site/src/layouts/Layout.astro similarity index 100% rename from apps/web/src/layouts/Layout.astro rename to apps/site/src/layouts/Layout.astro diff --git a/apps/web/src/lib/content.ts b/apps/site/src/lib/content.ts similarity index 89% rename from apps/web/src/lib/content.ts rename to apps/site/src/lib/content.ts index d8728cb..5d272c9 100644 --- a/apps/web/src/lib/content.ts +++ b/apps/site/src/lib/content.ts @@ -1,5 +1,5 @@ /** - * Content loader for markdown drafts in apps/web/content/. + * Content loader for markdown drafts in apps/site/content/. * * Each .md file has YAML frontmatter parsed by Astro's import.meta.glob. * The body is rendered to HTML at build time. diff --git a/apps/web/src/lib/data.ts b/apps/site/src/lib/data.ts similarity index 100% rename from apps/web/src/lib/data.ts rename to apps/site/src/lib/data.ts diff --git a/apps/web/src/lib/routes.ts b/apps/site/src/lib/routes.ts similarity index 100% rename from apps/web/src/lib/routes.ts rename to apps/site/src/lib/routes.ts diff --git a/apps/web/src/lib/seo.ts b/apps/site/src/lib/seo.ts similarity index 100% rename from apps/web/src/lib/seo.ts rename to apps/site/src/lib/seo.ts diff --git a/apps/web/src/pages/classification.astro b/apps/site/src/pages/classification.astro similarity index 100% rename from apps/web/src/pages/classification.astro rename to apps/site/src/pages/classification.astro diff --git a/apps/web/src/pages/dataset.astro b/apps/site/src/pages/dataset.astro similarity index 100% rename from apps/web/src/pages/dataset.astro rename to apps/site/src/pages/dataset.astro diff --git a/apps/web/src/pages/download.astro b/apps/site/src/pages/download.astro similarity index 100% rename from apps/web/src/pages/download.astro rename to apps/site/src/pages/download.astro diff --git a/apps/web/src/pages/index.astro b/apps/site/src/pages/index.astro similarity index 99% rename from apps/web/src/pages/index.astro rename to apps/site/src/pages/index.astro index c5e9a81..0dc06fc 100644 --- a/apps/web/src/pages/index.astro +++ b/apps/site/src/pages/index.astro @@ -1,6 +1,6 @@ --- -// Homepage. Ported from the legacy static apps/web/public/index.html so it -// participates in the Astro build pipeline. Inline CSS and JS are preserved +// Homepage. Ported from the legacy static file so it participates in the +// Astro build pipeline. Inline CSS and JS are preserved // verbatim under `is:global is:inline` and `is:inline` respectively; the // explorer JS still fetches live data from api.docxcorp.us at runtime, same // as before. @@ -1564,4 +1564,4 @@ curl "https://api.docxcorp.us/manifest?type=legal&lang=en&min_confidence - \ No newline at end of file + diff --git a/apps/web/src/pages/llms.txt.ts b/apps/site/src/pages/llms.txt.ts similarity index 100% rename from apps/web/src/pages/llms.txt.ts rename to apps/site/src/pages/llms.txt.ts diff --git a/apps/web/src/pages/quality.astro b/apps/site/src/pages/quality.astro similarity index 100% rename from apps/web/src/pages/quality.astro rename to apps/site/src/pages/quality.astro diff --git a/apps/web/src/pages/robots.txt.ts b/apps/site/src/pages/robots.txt.ts similarity index 100% rename from apps/web/src/pages/robots.txt.ts rename to apps/site/src/pages/robots.txt.ts diff --git a/apps/web/src/pages/sitemap.xml.ts b/apps/site/src/pages/sitemap.xml.ts similarity index 100% rename from apps/web/src/pages/sitemap.xml.ts rename to apps/site/src/pages/sitemap.xml.ts diff --git a/apps/web/src/pages/topics/[topic].astro b/apps/site/src/pages/topics/[topic].astro similarity index 100% rename from apps/web/src/pages/topics/[topic].astro rename to apps/site/src/pages/topics/[topic].astro diff --git a/apps/web/src/pages/topics/index.astro b/apps/site/src/pages/topics/index.astro similarity index 100% rename from apps/web/src/pages/topics/index.astro rename to apps/site/src/pages/topics/index.astro diff --git a/apps/web/src/pages/types/[type].astro b/apps/site/src/pages/types/[type].astro similarity index 100% rename from apps/web/src/pages/types/[type].astro rename to apps/site/src/pages/types/[type].astro diff --git a/apps/web/src/pages/types/index.astro b/apps/site/src/pages/types/index.astro similarity index 100% rename from apps/web/src/pages/types/index.astro rename to apps/site/src/pages/types/index.astro diff --git a/apps/web/tsconfig.json b/apps/site/tsconfig.json similarity index 100% rename from apps/web/tsconfig.json rename to apps/site/tsconfig.json diff --git a/bun.lock b/bun.lock index 2136023..2ca1026 100644 --- a/bun.lock +++ b/bun.lock @@ -47,8 +47,8 @@ "typescript": "^5.9.3", }, }, - "apps/web": { - "name": "@docx-corpus/web", + "apps/site": { + "name": "@docx-corpus/site", "version": "0.0.0", "dependencies": { "@neondatabase/serverless": "^1.0.0", @@ -265,7 +265,7 @@ "@docx-corpus/shared": ["@docx-corpus/shared@workspace:packages/shared"], - "@docx-corpus/web": ["@docx-corpus/web@workspace:apps/web"], + "@docx-corpus/site": ["@docx-corpus/site@workspace:apps/site"], "@emmetio/abbreviation": ["@emmetio/abbreviation@2.3.3", "", { "dependencies": { "@emmetio/scanner": "^1.0.4" } }, "sha512-mgv58UrU3rh4YgbE/TzgLQwJ3pFsHHhCLqY20aJq+9comytTXUDNGG/SMtSeMJdkpxgXSXunBGLD8Boka3JyVA=="], diff --git a/package.json b/package.json index 88641fb..e8cd9d8 100644 --- a/package.json +++ b/package.json @@ -2,7 +2,9 @@ "name": "docx-corpus", "private": true, "workspaces": [ - "apps/*", + "apps/cdx-filter", + "apps/cli", + "apps/site", "packages/*" ], "scripts": { @@ -16,11 +18,13 @@ "release:cli": "bun run --cwd apps/cli release", "setup:extractor": "bun run --cwd packages/extractor setup", "prepare": "lefthook install", - "dev:web": "bun run --cwd apps/web dev", - "build:web": "bun run --cwd apps/web build" + "dev:site": "bun run --cwd apps/site dev", + "build:site": "bun run --cwd apps/site build", + "dev:web": "bun run dev:site", + "build:web": "bun run build:site" }, "devDependencies": { "@biomejs/biome": "^2.4.6", "lefthook": "^1.11.13" } -} \ No newline at end of file +}