Skip to content

Commit bb0d7ca

Browse files
authored
Merge pull request #423 from SetuHQ/docs-ingestion
fix: review fixes, knowledge gap reduction, and env var hardening
2 parents b114247 + d3a7c1d commit bb0d7ca

6 files changed

Lines changed: 69 additions & 37 deletions

File tree

.github/workflows/docs-ingestion-ci.yml

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,11 @@ jobs:
247247
working-directory: docs-embeddings
248248
env:
249249
DRY_RUN: 'true'
250+
AWS_REGION: ${{ vars.AWS_REGION }}
251+
BEDROCK_MODEL_ID: ${{ vars.BEDROCK_MODEL_ID }}
252+
PINECONE_INDEX: ${{ vars.PINECONE_INDEX }}
253+
BATCH_SIZE: ${{ vars.BATCH_SIZE }}
254+
EMBEDDING_CONCURRENCY: ${{ vars.EMBEDDING_CONCURRENCY }}
250255
INGESTION_OUTPUT_PATH: ${{ github.workspace }}/docs-ingestion/output/chunks.json
251256
run: node dist/index.js --dry-run
252257

@@ -278,7 +283,7 @@ jobs:
278283
uses: aws-actions/configure-aws-credentials@ff717079ee2060e4bcee96c4779b553acc87447c # v4
279284
with:
280285
role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
281-
aws-region: ap-south-1
286+
aws-region: ${{ vars.AWS_REGION }}
282287

283288
# ── Build ingestion pipeline ──
284289
- name: Install ingestion dependencies
@@ -305,7 +310,8 @@ jobs:
305310
- name: Upload content to S3
306311
working-directory: docs-ingestion
307312
env:
308-
CONTENT_BUCKET_NAME: ${{ secrets.CONTENT_BUCKET_NAME }}
313+
AWS_REGION: ${{ vars.AWS_REGION }}
314+
CONTENT_BUCKET_NAME: ${{ vars.CONTENT_BUCKET_NAME }}
309315
run: node dist/upload-content.js
310316

311317
# ── Build and run embedding sync ──
@@ -320,8 +326,12 @@ jobs:
320326
- name: Run embedding sync
321327
working-directory: docs-embeddings
322328
env:
329+
AWS_REGION: ${{ vars.AWS_REGION }}
330+
BEDROCK_MODEL_ID: ${{ vars.BEDROCK_MODEL_ID }}
323331
PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }}
324-
PINECONE_INDEX: ${{ secrets.PINECONE_INDEX }}
325-
CONTENT_BUCKET_NAME: ${{ secrets.CONTENT_BUCKET_NAME }}
332+
PINECONE_INDEX: ${{ vars.PINECONE_INDEX }}
333+
CONTENT_BUCKET_NAME: ${{ vars.CONTENT_BUCKET_NAME }}
334+
BATCH_SIZE: ${{ vars.BATCH_SIZE }}
335+
EMBEDDING_CONCURRENCY: ${{ vars.EMBEDDING_CONCURRENCY }}
326336
INGESTION_OUTPUT_PATH: ${{ github.workspace }}/docs-ingestion/output/chunks.json
327337
run: node dist/index.js

docs-embeddings/src/embed-all.ts

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,12 @@ function fail(msg: string): never {
5252
process.exit(1);
5353
}
5454

55+
/**
 * Returns the value of the environment variable `name`.
 *
 * An unset variable and an empty string are both treated as missing (`!value`).
 * In that case `fail` is called with "<name> environment variable is required";
 * `fail` is declared to return `never`, so the final `return` is only reached
 * with a non-empty string.
 *
 * @param name - Name of the environment variable to read from `process.env`.
 * @returns The non-empty value of the variable.
 */
function requireEnv(name: string): string {
56+
const value = process.env[name];
57+
if (!value) fail(`${name} environment variable is required`);
58+
return value;
59+
}
60+
5561
function runStep(label: string, cmd: string, cwd: string): void {
5662
console.log(`\n── ${label} ──`);
5763
try {
@@ -144,7 +150,10 @@ async function main(): Promise<void> {
144150
console.log("\n── Pinecone namespace check ──");
145151

146152
const pineconeApiKey = process.env.PINECONE_API_KEY;
147-
const pineconeIndex = process.env.PINECONE_INDEX || "docs-embeddings";
153+
const pineconeIndex = process.env.PINECONE_INDEX;
154+
if (!pineconeIndex) {
155+
fail("PINECONE_INDEX environment variable is required");
156+
}
148157

149158
if (!pineconeApiKey) {
150159
fail("PINECONE_API_KEY environment variable is required");
@@ -241,18 +250,14 @@ async function main(): Promise<void> {
241250
const config: EmbeddingConfig = {
242251
ingestionOutputPath: chunksPath,
243252
stateFilePath: path.join(process.cwd(), "state", "indexed-hashes.json"),
244-
awsRegion: process.env.AWS_REGION || "ap-south-1",
245-
bedrockModelId:
246-
process.env.BEDROCK_MODEL_ID || "amazon.titan-embed-text-v2:0",
253+
awsRegion: requireEnv("AWS_REGION"),
254+
bedrockModelId: requireEnv("BEDROCK_MODEL_ID"),
247255
pineconeApiKey: pineconeApiKey!,
248-
pineconeIndex,
249-
batchSize: parseInt(process.env.BATCH_SIZE || "25", 10),
256+
pineconeIndex: pineconeIndex!,
257+
batchSize: parseInt(requireEnv("BATCH_SIZE"), 10),
250258
s3ContentBucket: process.env.CONTENT_BUCKET_NAME || undefined,
251259
dryRun: false,
252-
embeddingConcurrency: parseInt(
253-
process.env.EMBEDDING_CONCURRENCY || "5",
254-
10,
255-
),
260+
embeddingConcurrency: parseInt(requireEnv("EMBEDDING_CONCURRENCY"), 10),
256261
};
257262

258263
try {

docs-embeddings/src/embedder.ts

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,15 @@
55
import {
66
BedrockRuntimeClient,
77
InvokeModelCommand,
8-
InvokeModelCommandInput
9-
} from '@aws-sdk/client-bedrock-runtime';
8+
InvokeModelCommandInput,
9+
} from "@aws-sdk/client-bedrock-runtime";
1010

1111
export class BedrockEmbedder {
1212
private client: BedrockRuntimeClient;
1313
private modelId: string;
1414
private callCount: number = 0;
1515

16-
constructor(region: string = 'ap-south-1', modelId: string = 'amazon.titan-embed-text-v2:0') {
16+
constructor(region: string, modelId: string) {
1717
this.client = new BedrockRuntimeClient({ region });
1818
this.modelId = modelId;
1919
}
@@ -33,42 +33,47 @@ export class BedrockEmbedder {
3333
try {
3434
const input: InvokeModelCommandInput = {
3535
modelId: this.modelId,
36-
contentType: 'application/json',
37-
accept: 'application/json',
36+
contentType: "application/json",
37+
accept: "application/json",
3838
body: JSON.stringify({
3939
inputText: content,
40-
normalize: true
41-
})
40+
normalize: true,
41+
}),
4242
};
4343

4444
const command = new InvokeModelCommand(input);
4545
const response = await this.client.send(command);
4646

4747
const responseBody = JSON.parse(
48-
new TextDecoder().decode(response.body)
48+
new TextDecoder().decode(response.body),
4949
);
5050

5151
this.callCount++;
5252
return responseBody.embedding;
5353
} catch (error: any) {
5454
lastError = error;
5555
const isRetryable =
56-
error?.name === 'ThrottlingException' ||
57-
error?.name === 'ServiceUnavailableException' ||
58-
error?.name === 'ModelTimeoutException' ||
56+
error?.name === "ThrottlingException" ||
57+
error?.name === "ServiceUnavailableException" ||
58+
error?.name === "ModelTimeoutException" ||
5959
error?.$metadata?.httpStatusCode === 429 ||
6060
error?.$metadata?.httpStatusCode >= 500;
6161

6262
if (!isRetryable || attempt === BedrockEmbedder.MAX_RETRIES) {
63-
console.error(`Bedrock embedding failed (attempt ${attempt + 1}/${BedrockEmbedder.MAX_RETRIES + 1}):`, error);
63+
console.error(
64+
`Bedrock embedding failed (attempt ${attempt + 1}/${BedrockEmbedder.MAX_RETRIES + 1}):`,
65+
error,
66+
);
6467
throw error;
6568
}
6669

6770
// Equal jitter: randomize within [50%, 100%] of exponential delay
6871
// to prevent thundering herd when multiple concurrent calls retry
6972
const maxDelay = Math.min(1000 * Math.pow(2, attempt), 16000);
70-
const delay = Math.floor(maxDelay / 2 + Math.random() * maxDelay / 2);
71-
console.warn(` Bedrock throttled (attempt ${attempt + 1}), retrying in ${delay}ms...`);
73+
const delay = Math.floor(maxDelay / 2 + (Math.random() * maxDelay) / 2);
74+
console.warn(
75+
` Bedrock throttled (attempt ${attempt + 1}), retrying in ${delay}ms...`,
76+
);
7277
await this.sleep(delay);
7378
}
7479
}
@@ -113,6 +118,6 @@ export class BedrockEmbedder {
113118
}
114119

115120
private sleep(ms: number): Promise<void> {
116-
return new Promise(resolve => setTimeout(resolve, ms));
121+
return new Promise((resolve) => setTimeout(resolve, ms));
117122
}
118123
}

docs-embeddings/src/index.ts

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,12 @@ import type { EmbeddingConfig } from './types.js';
1111
dotenv.config({ path: '.env.local' });
1212
dotenv.config({ path: '.env' });
1313

14+
/**
 * Returns the value of the environment variable `name`.
 *
 * An unset variable and an empty string are both treated as missing (`!value`).
 *
 * @param name - Name of the environment variable to read from `process.env`.
 * @returns The non-empty value of the variable.
 * @throws Error with message "<name> environment variable is required" when
 *   the variable is missing or empty.
 */
function requireEnv(name: string): string {
15+
const value = process.env[name];
16+
if (!value) throw new Error(`${name} environment variable is required`);
17+
return value;
18+
}
19+
1420
async function main() {
1521
try {
1622
const dryRun = process.argv.includes('--dry-run') || process.env.DRY_RUN === 'true';
@@ -21,14 +27,14 @@ async function main() {
2127
path.join(process.cwd(), '..', 'docs-ingestion', 'output', 'chunks.json'),
2228
stateFilePath: process.env.STATE_FILE_PATH ||
2329
path.join(process.cwd(), 'state', 'indexed-hashes.json'),
24-
awsRegion: process.env.AWS_REGION || 'ap-south-1',
25-
bedrockModelId: process.env.BEDROCK_MODEL_ID || 'amazon.titan-embed-text-v2:0',
30+
awsRegion: requireEnv('AWS_REGION'),
31+
bedrockModelId: requireEnv('BEDROCK_MODEL_ID'),
2632
pineconeApiKey: process.env.PINECONE_API_KEY || '',
27-
pineconeIndex: process.env.PINECONE_INDEX || 'docs-embeddings',
28-
batchSize: parseInt(process.env.BATCH_SIZE || '25', 10),
33+
pineconeIndex: requireEnv('PINECONE_INDEX'),
34+
batchSize: parseInt(requireEnv('BATCH_SIZE'), 10),
2935
s3ContentBucket: process.env.CONTENT_BUCKET_NAME || undefined,
3036
dryRun,
31-
embeddingConcurrency: parseInt(process.env.EMBEDDING_CONCURRENCY || '3', 10),
37+
embeddingConcurrency: parseInt(requireEnv('EMBEDDING_CONCURRENCY'), 10),
3238
};
3339

3440
// Validate (Pinecone key not required in dry-run)

docs-embeddings/src/verify-embed.ts

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,10 @@ async function main(): Promise<void> {
7373

7474
// ── Connect to Pinecone ──────────────────────────────────────
7575
const pineconeApiKey = process.env.PINECONE_API_KEY;
76-
const pineconeIndex = process.env.PINECONE_INDEX || "docs-embeddings";
76+
const pineconeIndex = process.env.PINECONE_INDEX;
77+
if (!pineconeIndex) {
78+
fail("PINECONE_INDEX environment variable is required");
79+
}
7780

7881
if (!pineconeApiKey) {
7982
fail("PINECONE_API_KEY environment variable is required");
@@ -105,7 +108,7 @@ async function main(): Promise<void> {
105108
console.log(" CONTENT_BUCKET_NAME not set — skipping S3 check\n");
106109
} else {
107110
const s3 = new S3Client({
108-
region: process.env.AWS_REGION || "ap-south-1",
111+
region: (() => { const r = process.env.AWS_REGION; if (!r) fail("AWS_REGION environment variable is required"); return r; })(),
109112
});
110113

111114
// Deterministic sample: pick every Nth chunk instead of random

docs-ingestion/src/upload-content.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,10 @@ async function main() {
1919
try {
2020
// Load config from environment
2121
const bucketName = process.env.CONTENT_BUCKET_NAME;
22-
const awsRegion = process.env.AWS_REGION || "ap-south-1";
22+
const awsRegion = process.env.AWS_REGION;
23+
if (!awsRegion) {
24+
throw new Error("AWS_REGION environment variable is required");
25+
}
2326
const outputPath =
2427
process.env.OUTPUT_PATH ||
2528
path.join(process.cwd(), "output", "chunks.json");

0 commit comments

Comments
 (0)