@@ -10,7 +10,7 @@ import { filterSiteSpacesByLocale, getSiteStructureSections } from '@/lib/sites'
1010import type { RevisionPageDocument , SiteSection , SiteSpace } from '@gitbook/api' ;
1111import assertNever from 'assert-never' ;
1212import type { Paragraph } from 'mdast' ;
13- import { pMapIterable } from 'p-map' ;
13+ import pMap , { pMapIterable } from 'p-map' ;
1414
// We cap concurrency at 100 to avoid hitting limits on concurrent requests
// or on open file descriptors.
@@ -19,6 +19,8 @@ const MAX_CONCURRENCY = 100;
1919// Default limit for pages per batch
2020const DEFAULT_PAGE_LIMIT = 100 ;
2121
/**
 * A document page paired with the site context it was read from.
 */
type MarkdownPageEntry = {
    context: GitBookSiteContext;
    page: RevisionPageDocument;
};
23+
2224/**
2325 * Generate a llms-full.txt file for the site.
2426 * As the result can be large, we stream it as we generate it.
@@ -29,11 +31,16 @@ export async function serveLLMsFullTxt(context: GitBookSiteContext, page = 0) {
2931 }
3032
3133 const offset = page * DEFAULT_PAGE_LIMIT ;
34+ const allPages = await getMarkdownPageEntriesFromSiteStructure ( context ) ;
35+
36+ if ( allPages . length <= offset ) {
37+ return new Response ( 'No content found' , { status : 404 } ) ;
38+ }
3239
3340 return new Response (
3441 new ReadableStream < Uint8Array > ( {
3542 async pull ( controller ) {
36- await streamMarkdownFromSiteStructure ( context , controller , offset ) ;
43+ await streamMarkdownPageEntries ( context , controller , allPages , offset ) ;
3744 controller . close ( ) ;
3845 } ,
3946 } ) ,
@@ -46,100 +53,100 @@ export async function serveLLMsFullTxt(context: GitBookSiteContext, page = 0) {
4653}
4754
/**
 * Collect the document pages to include in the full llms.txt output,
 * dispatching on the kind of site structure held by the context.
 *
 * @param context - Site context whose `structure` decides how pages are gathered.
 * @returns The page entries produced by the matching section / site-space collector.
 */
async function getMarkdownPageEntriesFromSiteStructure(
    context: GitBookSiteContext
): Promise<MarkdownPageEntry[]> {
    const { structure } = context;

    switch (structure.type) {
        case 'sections': {
            // Sections are resolved with `ignoreGroups: true` before collecting pages.
            const siteSections = getSiteStructureSections(structure, { ignoreGroups: true });
            return getMarkdownPageEntriesFromSections(context, siteSections);
        }
        case 'siteSpaces':
            return getMarkdownPageEntriesFromSiteSpaces(context, structure.structure);
        default:
            // Exhaustiveness guard: fails to type-check if a new structure type is added.
            return assertNever(structure);
    }
}
7673
/**
 * Collect the document pages to include in the full llms.txt output for a list of site sections.
 *
 * Each section's site spaces are filtered by the context locale, then the
 * per-section results are concatenated in section order.
 *
 * @param context - Site context providing the locale used for filtering.
 * @param siteSections - Sections whose site spaces should be scanned.
 * @returns The page entries gathered from all locale-matching site spaces.
 */
async function getMarkdownPageEntriesFromSections(
    context: GitBookSiteContext,
    siteSections: SiteSection[]
): Promise<MarkdownPageEntry[]> {
    const localizedSiteSpaces = siteSections
        .map((section) => filterSiteSpacesByLocale(section.siteSpaces, context.locale))
        .flat();

    return getMarkdownPageEntriesFromFilteredSiteSpaces(localizedSiteSpaces, context);
}
10388
/**
 * Collect the document pages to include in the full llms.txt output for a list of site spaces.
 *
 * @param context - Site context providing the locale used for filtering.
 * @param siteSpaces - Candidate site spaces; they are filtered by locale before scanning.
 * @returns The page entries gathered from the locale-matching site spaces.
 */
async function getMarkdownPageEntriesFromSiteSpaces(
    context: GitBookSiteContext,
    siteSpaces: SiteSpace[]
): Promise<MarkdownPageEntry[]> {
    const localizedSiteSpaces = filterSiteSpacesByLocale(siteSpaces, context.locale);

    return getMarkdownPageEntriesFromFilteredSiteSpaces(localizedSiteSpaces, context);
}
120101
121- for ( const siteSpace of filteredSiteSpaces ) {
122- const siteSpaceUrl = siteSpace . urls . published ;
123- if ( ! siteSpaceUrl ) {
124- continue ;
125- }
126- const siteSpaceContext = await fetchSiteContextForSiteSpace ( context , siteSpace ) ;
127- const pages = getIndexablePages ( siteSpaceContext . revision . pages ) ;
128-
129- // Add document pages to our collection
130- for ( const { page } of pages ) {
131- if ( page . type === 'document' ) {
132- allPages . push ( {
133- context : siteSpaceContext ,
134- page,
135- } ) ;
136- }
102+ /**
103+ * Get markdown page entries from already locale-filtered site spaces.
104+ */
105+ async function getMarkdownPageEntriesFromFilteredSiteSpaces (
106+ siteSpaces : SiteSpace [ ] ,
107+ context : GitBookSiteContext
108+ ) : Promise < MarkdownPageEntry [ ] > {
109+ const publishedSiteSpaces = siteSpaces . filter ( ( siteSpace ) => siteSpace . urls . published ) ;
110+
111+ const allPages = await pMap (
112+ publishedSiteSpaces ,
113+ async ( siteSpace ) : Promise < MarkdownPageEntry [ ] > => {
114+ const siteSpaceContext = await fetchSiteContextForSiteSpace ( context , siteSpace ) ;
115+ const pages = getIndexablePages ( siteSpaceContext . revision . pages ) ;
116+
117+ return pages . flatMap ( ( { page } ) => {
118+ if ( page . type !== 'document' ) {
119+ return [ ] ;
120+ }
121+
122+ return [
123+ {
124+ context : siteSpaceContext ,
125+ page,
126+ } ,
127+ ] ;
128+ } ) ;
129+ } ,
130+ {
131+ concurrency : MAX_CONCURRENCY ,
137132 }
138- }
133+ ) ;
134+
135+ return allPages . flat ( ) ;
136+ }
139137
138+ /**
139+ * Stream a single paginated window of markdown page entries.
140+ */
141+ async function streamMarkdownPageEntries (
142+ context : GitBookSiteContext ,
143+ stream : ReadableStreamDefaultController < Uint8Array > ,
144+ allPages : MarkdownPageEntry [ ] ,
145+ offset : number
146+ ) : Promise < { currentPageIndex : number ; reachedLimit : boolean } > {
140147 // Apply pagination - skip pages before offset
141148 const pagesToProcess = allPages . slice ( offset , offset + DEFAULT_PAGE_LIMIT ) ;
142- totalPagesProcessed = offset ;
149+ let totalPagesProcessed = offset ;
143150
144151 // Process the pages
145152 for await ( const markdown of pMapIterable (
0 commit comments