@@ -25,7 +25,8 @@ interface Chunk {
2525 sectionId : string ;
2626 sectionTitle : string ;
2727 content : string ;
28- contentType : "text" | "xml_example" | "table" ;
28+ embeddingText : string ;
29+ contentType : "text" ;
2930 pageNumber : number ;
3031 chunkIndex : number ;
3132}
@@ -43,43 +44,24 @@ const XML_PATTERN = /<[^>]+>[\s\S]*?<\/[^>]+>/g;
4344// Table pattern (markdown tables with | separators)
4445const TABLE_PATTERN = /\|[^\n]+\|(?:\n\|[^\n]+\|)+/g;
4546
46- function extractCodeBlocks ( content : string ) : {
47- codeBlocks : string [ ] ;
48- tables : string [ ] ;
49- textContent : string ;
50- } {
51- const codeBlocks : string [ ] = [ ] ;
52- const tables : string [ ] = [ ] ;
53- let textContent = content ;
54-
55- // Extract markdown code fences first (pymupdf4llm format)
56- const fenceMatches = textContent . match ( CODE_FENCE_PATTERN ) || [ ] ;
57- for ( const match of fenceMatches ) {
58- if ( match . length > 50 ) {
59- codeBlocks . push ( match ) ;
60- textContent = textContent . replace ( match , "\n[CODE_BLOCK]\n" ) ;
61- }
62- }
47+ /**
48+ * Strip code blocks, XML, and tables from content for embedding generation.
49+ * Returns text-only version suitable for semantic search embeddings.
50+ */
51+ function stripForEmbedding ( content : string ) : string {
52+ let text = content ;
6353
64- // Extract any remaining raw XML blocks
65- const xmlMatches = textContent . match ( XML_PATTERN ) || [ ] ;
66- for ( const match of xmlMatches ) {
67- if ( match . length > 50 ) {
68- codeBlocks . push ( match ) ;
69- textContent = textContent . replace ( match , "\n[CODE_BLOCK]\n" ) ;
70- }
71- }
54+ // Strip markdown code fences
55+ text = text . replace ( CODE_FENCE_PATTERN , " " ) ;
7256
73- // Extract markdown tables
74- const tableMatches = textContent . match ( TABLE_PATTERN ) || [ ] ;
75- for ( const match of tableMatches ) {
76- if ( match . length > 50 ) {
77- tables . push ( match ) ;
78- textContent = textContent . replace ( match , "\n[TABLE]\n" ) ;
79- }
80- }
57+ // Strip raw XML blocks
58+ text = text . replace ( XML_PATTERN , " " ) ;
59+
60+ // Strip markdown tables
61+ text = text . replace ( TABLE_PATTERN , " " ) ;
8162
82- return { codeBlocks, tables, textContent } ;
63+ // Collapse whitespace
64+ return text . replace ( /\n{3,}/g , "\n\n" ) . trim ( ) ;
8365}
8466
8567function splitIntoChunks (
@@ -90,40 +72,12 @@ function splitIntoChunks(
9072) : Chunk [ ] {
9173 const chunks : Chunk [ ] = [ ] ;
9274
93- // Extract code blocks and tables first
94- const { codeBlocks, tables, textContent } = extractCodeBlocks ( text ) ;
95-
96- // Add code blocks as separate chunks
97- for ( const code of codeBlocks ) {
98- chunks . push ( {
99- sectionId,
100- sectionTitle,
101- content : code ,
102- contentType : "xml_example" ,
103- pageNumber : pageStart ,
104- chunkIndex : chunks . length ,
105- } ) ;
106- }
107-
108- // Add tables as separate chunks
109- for ( const table of tables ) {
110- chunks . push ( {
111- sectionId,
112- sectionTitle,
113- content : table ,
114- contentType : "table" ,
115- pageNumber : pageStart ,
116- chunkIndex : chunks . length ,
117- } ) ;
118- }
119-
120- // Split remaining text into chunks
121- if ( textContent . trim ( ) . length === 0 ) {
75+ if ( text . trim ( ) . length === 0 ) {
12276 return chunks ;
12377 }
12478
125- // Split by paragraphs first
126- const paragraphs = textContent . split ( /\n\n+/ ) ;
79+ // Split full content (with code blocks and tables inline) by paragraphs
80+ const paragraphs = text . split ( /\n\n+/ ) ;
12781 let currentChunk = "" ;
12882 const currentPage = pageStart ;
12983
@@ -135,10 +89,12 @@ function splitIntoChunks(
13589 if ( currentChunk . length + trimmedPara . length > CHUNK_SIZE ) {
13690 // Save current chunk if it has content
13791 if ( currentChunk . trim ( ) ) {
92+ const content = currentChunk . trim ( ) ;
13893 chunks . push ( {
13994 sectionId,
14095 sectionTitle,
141- content : currentChunk . trim ( ) ,
96+ content,
97+ embeddingText : stripForEmbedding ( content ) ,
14298 contentType : "text" ,
14399 pageNumber : currentPage ,
144100 chunkIndex : chunks . length ,
@@ -155,10 +111,12 @@ function splitIntoChunks(
155111
156112 // Don't forget the last chunk
157113 if ( currentChunk . trim ( ) ) {
114+ const content = currentChunk . trim ( ) ;
158115 chunks . push ( {
159116 sectionId,
160117 sectionTitle,
161- content : currentChunk . trim ( ) ,
118+ content,
119+ embeddingText : stripForEmbedding ( content ) ,
162120 contentType : "text" ,
163121 pageNumber : currentPage ,
164122 chunkIndex : chunks . length ,
@@ -211,17 +169,17 @@ async function main() {
211169 console . log ( `\nSaved ${ chunks . length } chunks to ${ outputFile } ` ) ;
212170
213171 // Print stats
214- const textChunks = chunks . filter ( ( c ) => c . contentType === "text" ) ;
215- const xmlChunks = chunks . filter ( ( c ) => c . contentType === "xml_example" ) ;
216- const tableChunks = chunks . filter ( ( c ) => c . contentType === "table" ) ;
172+ const avgContent = Math . round (
173+ chunks . reduce ( ( sum , c ) => sum + c . content . length , 0 ) / chunks . length ,
174+ ) ;
175+ const avgEmbedding = Math . round (
176+ chunks . reduce ( ( sum , c ) => sum + c . embeddingText . length , 0 ) / chunks . length ,
177+ ) ;
217178
218179 console . log ( "\nChunk statistics:" ) ;
219- console . log ( ` Text chunks: ${ textChunks . length } ` ) ;
220- console . log ( ` XML example chunks: ${ xmlChunks . length } ` ) ;
221- console . log ( ` Table chunks: ${ tableChunks . length } ` ) ;
222- console . log (
223- ` Average text chunk size: ${ Math . round ( textChunks . reduce ( ( sum , c ) => sum + c . content . length , 0 ) / textChunks . length ) } chars` ,
224- ) ;
180+ console . log ( ` Total chunks: ${ chunks . length } ` ) ;
181+ console . log ( ` Average content size: ${ avgContent } chars` ) ;
182+ console . log ( ` Average embedding text size: ${ avgEmbedding } chars` ) ;
225183 } catch ( error ) {
226184 console . error ( "Chunking failed:" , error ) ;
227185 process . exit ( 1 ) ;
0 commit comments