Skip to content

Commit a941e99

Browse files
committed
fix: remove code separated chunks
1 parent 6532de2 commit a941e99

2 files changed

Lines changed: 38 additions & 79 deletions

File tree

scripts/ingest/chunk.ts

Lines changed: 35 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@ interface Chunk {
2525
sectionId: string;
2626
sectionTitle: string;
2727
content: string;
28-
contentType: "text" | "xml_example" | "table";
28+
embeddingText: string;
29+
contentType: "text";
2930
pageNumber: number;
3031
chunkIndex: number;
3132
}
@@ -43,43 +44,24 @@ const XML_PATTERN = /<[^>]+>[\s\S]*?<\/[^>]+>/g;
4344
// Table pattern (markdown tables with | separators)
4445
const TABLE_PATTERN = /\|[^\n]+\|(?:\n\|[^\n]+\|)+/g;
4546

46-
function extractCodeBlocks(content: string): {
47-
codeBlocks: string[];
48-
tables: string[];
49-
textContent: string;
50-
} {
51-
const codeBlocks: string[] = [];
52-
const tables: string[] = [];
53-
let textContent = content;
54-
55-
// Extract markdown code fences first (pymupdf4llm format)
56-
const fenceMatches = textContent.match(CODE_FENCE_PATTERN) || [];
57-
for (const match of fenceMatches) {
58-
if (match.length > 50) {
59-
codeBlocks.push(match);
60-
textContent = textContent.replace(match, "\n[CODE_BLOCK]\n");
61-
}
62-
}
47+
/**
48+
* Strip code blocks, XML, and tables from content for embedding generation.
49+
* Returns text-only version suitable for semantic search embeddings.
50+
*/
51+
function stripForEmbedding(content: string): string {
52+
let text = content;
6353

64-
// Extract any remaining raw XML blocks
65-
const xmlMatches = textContent.match(XML_PATTERN) || [];
66-
for (const match of xmlMatches) {
67-
if (match.length > 50) {
68-
codeBlocks.push(match);
69-
textContent = textContent.replace(match, "\n[CODE_BLOCK]\n");
70-
}
71-
}
54+
// Strip markdown code fences
55+
text = text.replace(CODE_FENCE_PATTERN, " ");
7256

73-
// Extract markdown tables
74-
const tableMatches = textContent.match(TABLE_PATTERN) || [];
75-
for (const match of tableMatches) {
76-
if (match.length > 50) {
77-
tables.push(match);
78-
textContent = textContent.replace(match, "\n[TABLE]\n");
79-
}
80-
}
57+
// Strip raw XML blocks
58+
text = text.replace(XML_PATTERN, " ");
59+
60+
// Strip markdown tables
61+
text = text.replace(TABLE_PATTERN, " ");
8162

82-
return { codeBlocks, tables, textContent };
63+
// Collapse whitespace
64+
return text.replace(/\n{3,}/g, "\n\n").trim();
8365
}
8466

8567
function splitIntoChunks(
@@ -90,40 +72,12 @@ function splitIntoChunks(
9072
): Chunk[] {
9173
const chunks: Chunk[] = [];
9274

93-
// Extract code blocks and tables first
94-
const { codeBlocks, tables, textContent } = extractCodeBlocks(text);
95-
96-
// Add code blocks as separate chunks
97-
for (const code of codeBlocks) {
98-
chunks.push({
99-
sectionId,
100-
sectionTitle,
101-
content: code,
102-
contentType: "xml_example",
103-
pageNumber: pageStart,
104-
chunkIndex: chunks.length,
105-
});
106-
}
107-
108-
// Add tables as separate chunks
109-
for (const table of tables) {
110-
chunks.push({
111-
sectionId,
112-
sectionTitle,
113-
content: table,
114-
contentType: "table",
115-
pageNumber: pageStart,
116-
chunkIndex: chunks.length,
117-
});
118-
}
119-
120-
// Split remaining text into chunks
121-
if (textContent.trim().length === 0) {
75+
if (text.trim().length === 0) {
12276
return chunks;
12377
}
12478

125-
// Split by paragraphs first
126-
const paragraphs = textContent.split(/\n\n+/);
79+
// Split full content (with code blocks and tables inline) by paragraphs
80+
const paragraphs = text.split(/\n\n+/);
12781
let currentChunk = "";
12882
const currentPage = pageStart;
12983

@@ -135,10 +89,12 @@ function splitIntoChunks(
13589
if (currentChunk.length + trimmedPara.length > CHUNK_SIZE) {
13690
// Save current chunk if it has content
13791
if (currentChunk.trim()) {
92+
const content = currentChunk.trim();
13893
chunks.push({
13994
sectionId,
14095
sectionTitle,
141-
content: currentChunk.trim(),
96+
content,
97+
embeddingText: stripForEmbedding(content),
14298
contentType: "text",
14399
pageNumber: currentPage,
144100
chunkIndex: chunks.length,
@@ -155,10 +111,12 @@ function splitIntoChunks(
155111

156112
// Don't forget the last chunk
157113
if (currentChunk.trim()) {
114+
const content = currentChunk.trim();
158115
chunks.push({
159116
sectionId,
160117
sectionTitle,
161-
content: currentChunk.trim(),
118+
content,
119+
embeddingText: stripForEmbedding(content),
162120
contentType: "text",
163121
pageNumber: currentPage,
164122
chunkIndex: chunks.length,
@@ -211,17 +169,17 @@ async function main() {
211169
console.log(`\nSaved ${chunks.length} chunks to ${outputFile}`);
212170

213171
// Print stats
214-
const textChunks = chunks.filter((c) => c.contentType === "text");
215-
const xmlChunks = chunks.filter((c) => c.contentType === "xml_example");
216-
const tableChunks = chunks.filter((c) => c.contentType === "table");
172+
const avgContent = Math.round(
173+
chunks.reduce((sum, c) => sum + c.content.length, 0) / chunks.length,
174+
);
175+
const avgEmbedding = Math.round(
176+
chunks.reduce((sum, c) => sum + c.embeddingText.length, 0) / chunks.length,
177+
);
217178

218179
console.log("\nChunk statistics:");
219-
console.log(` Text chunks: ${textChunks.length}`);
220-
console.log(` XML example chunks: ${xmlChunks.length}`);
221-
console.log(` Table chunks: ${tableChunks.length}`);
222-
console.log(
223-
` Average text chunk size: ${Math.round(textChunks.reduce((sum, c) => sum + c.content.length, 0) / textChunks.length)} chars`,
224-
);
180+
console.log(` Total chunks: ${chunks.length}`);
181+
console.log(` Average content size: ${avgContent} chars`);
182+
console.log(` Average embedding text size: ${avgEmbedding} chars`);
225183
} catch (error) {
226184
console.error("Chunking failed:", error);
227185
process.exit(1);

scripts/ingest/embed.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ interface Chunk {
2323
sectionId: string;
2424
sectionTitle: string;
2525
content: string;
26-
contentType: "text" | "xml_example" | "table";
26+
embeddingText?: string;
27+
contentType: "text";
2728
pageNumber: number;
2829
chunkIndex: number;
2930
}
@@ -59,7 +60,7 @@ async function embedChunks(chunks: Chunk[], provider: EmbeddingProvider): Promis
5960

6061
for (let i = 0; i < chunks.length; i += batchSize) {
6162
const batch = chunks.slice(i, i + batchSize);
62-
const texts = batch.map((c) => c.content);
63+
const texts = batch.map((c) => c.embeddingText ?? c.content);
6364

6465
try {
6566
const embeddings = await client.embedBatch(texts);

0 commit comments

Comments
 (0)