Skip to content

Commit b114247

Browse files
authored
Merge pull request #422 from SetuHQ/docs-ingestion
feat: add docs-ingestion and docs-embeddings pipelines
2 parents 75154c3 + f9bb103 commit b114247

76 files changed

Lines changed: 19134 additions & 918 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
Lines changed: 327 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,327 @@
1+
# CI workflow for the docs-ingestion and docs-embeddings pipelines.
name: Docs Ingestion CI

# Triggered by pull requests targeting, and pushes to, main or staging,
# but only when a file under one of the listed paths changes.
on:
  pull_request:
    branches: [main, staging]
    paths:
      - 'docs-ingestion/**'
      - 'docs-embeddings/**'
      - 'api-references/**'
      - 'content/**'
  push:
    branches: [main, staging]
    paths:
      - 'docs-ingestion/**'
      - 'docs-embeddings/**'
      - 'api-references/**'
      - 'content/**'

# Least-privilege default for GITHUB_TOKEN: the CI jobs only need to read the
# repository. A job that needs more declares its own `permissions` block,
# which replaces this default for that job.
permissions:
  contents: read
18+
19+
jobs:
20+
build-and-test:
  # Installs, audits, builds, and tests the docs-ingestion package.
  name: Build & Test
  runs-on: ubuntu-latest
  defaults:
    run:
      # Every `run` step in this job executes from docs-ingestion/.
      working-directory: docs-ingestion

  steps:
    - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4

    - name: Enable Corepack
      run: corepack enable

    # Node 20 with the yarn cache keyed on the ingestion lockfile.
    - uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020  # v4
      with:
        node-version: "20"
        cache: "yarn"
        cache-dependency-path: docs-ingestion/yarn.lock

    # --immutable: fail instead of modifying the lockfile.
    - name: Install dependencies
      run: yarn install --immutable

    # Advisory only: continue-on-error keeps a failing audit from failing
    # the job.
    - name: Security audit
      continue-on-error: true
      run: yarn npm audit --severity moderate

    - name: Build
      run: yarn build

    - name: Normalize MDX (required by integration tests)
      run: yarn normalize-mdx

    # Test suite
    - name: Run tests
      run: yarn test
55+
56+
normalize-api-specs:
  # Runs API spec normalization and validates its output: existence, minimum
  # file count, determinism across two runs, token limits, and that the
  # generated directories are both untracked and ignored by git.
  name: API Spec Normalization
  runs-on: ubuntu-latest
  defaults:
    run:
      # `run` steps execute from docs-ingestion/, so the repository root
      # is `..` in the scripts below.
      working-directory: docs-ingestion

  steps:
    - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4

    - name: Enable Corepack
      run: corepack enable

    - uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020  # v4
      with:
        node-version: '20'
        cache: 'yarn'
        cache-dependency-path: docs-ingestion/yarn.lock

    - name: Install dependencies
      run: yarn install --immutable

    # ── A. Normalization run ──
    - name: Run API spec normalization
      run: yarn normalize-api-specs

    - name: Verify output directory exists
      run: |
        if [ ! -d "../.api-reference-normalized" ]; then
          echo "FAIL: .api-reference-normalized/ directory does not exist"
          exit 1
        fi
        echo "PASS: Directory exists"

    - name: Verify file count
      run: |
        count=$(find ../.api-reference-normalized -name '*.md' | wc -l | tr -d ' ')
        echo "Found $count normalized files"
        if [ "$count" -lt 200 ]; then
          echo "FAIL: Expected at least 200 files, got $count"
          exit 1
        fi
        echo "PASS: File count ($count) >= 200"

    # ── B. Determinism check ──
    # Snapshot the first run, run again, then diff the two trees.
    - name: Copy first run output
      run: cp -r ../.api-reference-normalized /tmp/api-ref-norm-run1

    - name: Run normalization again
      run: yarn normalize-api-specs

    - name: Verify determinism
      run: |
        # `|| true` keeps diff's non-zero exit (differences found) from
        # aborting the script before the failure message is printed.
        diff_output=$(diff -r ../.api-reference-normalized /tmp/api-ref-norm-run1 2>&1) || true
        if [ -n "$diff_output" ]; then
          echo "FAIL: Normalization is not deterministic:"
          echo "$diff_output" | head -20
          exit 1
        fi
        echo "PASS: Output is deterministic"

    # ── C. Token limit compliance ──
    - name: Check token limits
      run: yarn check-token-limits

    # ── F. Git ignored state check ──
    # Two assertions per directory:
    #   1. nothing under it is tracked (git ls-files --error-unmatch), and
    #   2. an ignore rule actually matches it (git check-ignore).
    # The first alone would still pass if the .gitignore entry were removed.
    - name: Verify .api-reference-normalized is gitignored
      run: |
        cd ..
        target=".api-reference-normalized"
        if git ls-files --error-unmatch "$target/" 2>/dev/null; then
          echo "FAIL: $target/ is tracked by git"
          exit 1
        fi
        # Directory-only ignore patterns (trailing slash) only match an
        # existing directory, so make sure it exists before asking git.
        mkdir -p "$target"
        if ! git check-ignore -q "$target"; then
          echo "FAIL: $target/ is not matched by any gitignore rule"
          exit 1
        fi
        echo "PASS: $target/ is ignored and not tracked"

    - name: Verify .docs-normalized is gitignored
      run: |
        cd ..
        target=".docs-normalized"
        if git ls-files --error-unmatch "$target/" 2>/dev/null; then
          echo "FAIL: $target/ is tracked by git"
          exit 1
        fi
        # This job never runs normalize-mdx, so the directory may not exist
        # yet; create it so a directory-only ignore pattern can match.
        mkdir -p "$target"
        if ! git check-ignore -q "$target"; then
          echo "FAIL: $target/ is not matched by any gitignore rule"
          exit 1
        fi
        echo "PASS: $target/ is ignored and not tracked"
139+
140+
ingestion-smoke-test:
  # Builds docs-ingestion, regenerates the normalized inputs, then runs the
  # ingestion smoke test. Starts only after both upstream jobs succeed.
  name: Ingestion Smoke Test
  runs-on: ubuntu-latest
  needs:
    - build-and-test
    - normalize-api-specs
  defaults:
    run:
      # All `run` steps execute from docs-ingestion/.
      working-directory: docs-ingestion

  steps:
    - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4

    - name: Enable Corepack
      run: corepack enable

    - uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020  # v4
      with:
        node-version: "20"
        cache: "yarn"
        cache-dependency-path: docs-ingestion/yarn.lock

    - name: Install dependencies
      run: yarn install --immutable

    - name: Build
      run: yarn build

    # The smoke test needs the normalized API specs to be present.
    - name: Normalize API specs
      run: yarn normalize-api-specs

    # MDX normalization is skipped when the repository has no content/ dir.
    - name: Normalize MDX (if content exists)
      run: |
        if [ -d "../content" ]; then
          yarn normalize-mdx || exit 1
        else
          echo "No content/ directory — skipping MDX normalization"
        fi

    # Ingestion smoke test
    - name: Run ingestion smoke test
      run: yarn smoke-test-ingestion
182+
183+
embedding-dry-run:
  # Produces chunks.json with the ingestion pipeline, then runs the
  # embeddings pipeline against it in dry-run mode.
  name: Embedding Dry Run
  runs-on: ubuntu-latest
  needs:
    - ingestion-smoke-test

  # No job-level default working directory: each step names its own.
  steps:
    - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4

    - name: Enable Corepack
      run: corepack enable

    # The yarn cache is keyed on both packages' lockfiles.
    - uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020  # v4
      with:
        node-version: "20"
        cache: "yarn"
        cache-dependency-path: |
          docs-ingestion/yarn.lock
          docs-embeddings/yarn.lock

    # Ingestion: install, build, normalize inputs, run.
    - name: Install ingestion dependencies
      run: yarn install --immutable
      working-directory: docs-ingestion

    - name: Build ingestion
      run: yarn build
      working-directory: docs-ingestion

    - name: Normalize API specs
      run: yarn normalize-api-specs
      working-directory: docs-ingestion

    - name: Normalize MDX (if content exists)
      working-directory: docs-ingestion
      run: |
        if [ -d "../content" ]; then
          yarn normalize-mdx || exit 1
        else
          echo "No content/ directory — skipping MDX normalization"
        fi

    - name: Run ingestion pipeline
      run: node dist/index.js
      working-directory: docs-ingestion

    # Runs from the workspace root, hence the docs-ingestion/ prefix.
    - name: Verify chunks.json exists
      run: |
        if [ ! -f "docs-ingestion/output/chunks.json" ]; then
          echo "FAIL: chunks.json not produced"
          exit 1
        fi
        echo "PASS: chunks.json exists"

    # Embeddings: install, build, dry run.
    - name: Install embedding dependencies
      run: yarn install --immutable
      working-directory: docs-embeddings

    - name: Build embeddings
      run: yarn build
      working-directory: docs-embeddings

    # Dry run is requested through both the DRY_RUN variable and the
    # --dry-run flag; INGESTION_OUTPUT_PATH is the absolute chunks.json path.
    - name: Run embedding dry run
      working-directory: docs-embeddings
      env:
        DRY_RUN: "true"
        INGESTION_OUTPUT_PATH: ${{ github.workspace }}/docs-ingestion/output/chunks.json
      run: node dist/index.js --dry-run
252+
253+
# ── Deploy: update knowledge base (main only) ──
# Rebuilds the ingestion output, uploads content to S3, and syncs embeddings.
# Runs only for pushes to main, after the embedding dry run has passed.
deploy-knowledge-base:
  name: Deploy Knowledge Base
  runs-on: ubuntu-latest
  needs: [embedding-dry-run]
  if: github.event_name == 'push' && github.ref == 'refs/heads/main'
  # Serialize deployments: two pushes to main in quick succession must not
  # run overlapping uploads/syncs. An in-progress deploy is allowed to
  # finish rather than being cancelled part-way through.
  concurrency:
    group: deploy-knowledge-base
    cancel-in-progress: false
  permissions:
    # id-token: write lets the job request an OIDC token, which
    # configure-aws-credentials uses for the role-to-assume flow below.
    id-token: write
    contents: read

  steps:
    - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4

    - name: Enable Corepack
      run: corepack enable

    - uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020  # v4
      with:
        node-version: '20'
        cache: 'yarn'
        cache-dependency-path: |
          docs-ingestion/yarn.lock
          docs-embeddings/yarn.lock

    # ── Build ingestion pipeline ──
    - name: Install ingestion dependencies
      working-directory: docs-ingestion
      run: yarn install --immutable

    - name: Build ingestion
      working-directory: docs-ingestion
      run: yarn build

    - name: Normalize API specs
      working-directory: docs-ingestion
      run: yarn normalize-api-specs

    - name: Normalize MDX
      working-directory: docs-ingestion
      run: yarn normalize-mdx

    - name: Run ingestion pipeline
      working-directory: docs-ingestion
      run: node dist/index.js

    # ── Build embeddings pipeline ──
    # Built before any credentials are configured, so a build failure
    # aborts the deploy before anything is uploaded.
    - name: Install embedding dependencies
      working-directory: docs-embeddings
      run: yarn install --immutable

    - name: Build embeddings
      working-directory: docs-embeddings
      run: yarn build

    # ── Acquire AWS credentials ──
    # Configured only now, immediately before the first step that needs
    # them, so dependency installs and builds never run with the assumed
    # role in their environment.
    - name: Configure AWS Credentials
      uses: aws-actions/configure-aws-credentials@ff717079ee2060e4bcee96c4779b553acc87447c  # v4
      with:
        role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
        aws-region: ap-south-1

    # ── Upload content to S3 ──
    - name: Upload content to S3
      working-directory: docs-ingestion
      env:
        CONTENT_BUCKET_NAME: ${{ secrets.CONTENT_BUCKET_NAME }}
      run: node dist/upload-content.js

    # ── Run embedding sync ──
    - name: Run embedding sync
      working-directory: docs-embeddings
      env:
        PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }}
        PINECONE_INDEX: ${{ secrets.PINECONE_INDEX }}
        CONTENT_BUCKET_NAME: ${{ secrets.CONTENT_BUCKET_NAME }}
        INGESTION_OUTPUT_PATH: ${{ github.workspace }}/docs-ingestion/output/chunks.json
      run: node dist/index.js

.gitignore

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,14 @@
1+
# Generated: normalized MDX written by docs-ingestion/normalize-mdx
.docs-normalized/

# Generated: normalized API specs written by docs-ingestion/normalize-api-specs
.api-reference-normalized/

# Ruflo
.ruflo/

# Claude Code
.claude-flow/
.mcp.json
.claude
.swarm

0 commit comments

Comments
 (0)