Skip to content

Commit b70b8fd

Browse files
authored
Fix llms.txt not including all content (#4260)
1 parent a9ba996 commit b70b8fd

3 files changed

Lines changed: 96 additions & 72 deletions

File tree

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"gitbook": patch
3+
---
4+
5+
Fix llms-full.txt pagination to include pages from all site sections.

packages/gitbook/src/routes/llms-full.ts

Lines changed: 79 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import { filterSiteSpacesByLocale, getSiteStructureSections } from '@/lib/sites'
1010
import type { RevisionPageDocument, SiteSection, SiteSpace } from '@gitbook/api';
1111
import assertNever from 'assert-never';
1212
import type { Paragraph } from 'mdast';
13-
import { pMapIterable } from 'p-map';
13+
import pMap, { pMapIterable } from 'p-map';
1414

1515
// We cap concurrency at 100 to avoid exceeding concurrent-request limits
1616
// or file descriptor limits.
@@ -19,6 +19,8 @@ const MAX_CONCURRENCY = 100;
1919
// Default limit for pages per batch
2020
const DEFAULT_PAGE_LIMIT = 100;
2121

22+
type MarkdownPageEntry = { context: GitBookSiteContext; page: RevisionPageDocument };
23+
2224
/**
2325
* Generate a llms-full.txt file for the site.
2426
* As the result can be large, we stream it as we generate it.
@@ -29,11 +31,16 @@ export async function serveLLMsFullTxt(context: GitBookSiteContext, page = 0) {
2931
}
3032

3133
const offset = page * DEFAULT_PAGE_LIMIT;
34+
const allPages = await getMarkdownPageEntriesFromSiteStructure(context);
35+
36+
if (allPages.length <= offset) {
37+
return new Response('No content found', { status: 404 });
38+
}
3239

3340
return new Response(
3441
new ReadableStream<Uint8Array>({
3542
async pull(controller) {
36-
await streamMarkdownFromSiteStructure(context, controller, offset);
43+
await streamMarkdownPageEntries(context, controller, allPages, offset);
3744
controller.close();
3845
},
3946
}),
@@ -46,100 +53,100 @@ export async function serveLLMsFullTxt(context: GitBookSiteContext, page = 0) {
4653
}
4754

4855
/**
49-
* Stream markdown from site structure.
56+
* Get the document pages that should be included in the full llms.txt output for a site structure.
5057
*/
51-
async function streamMarkdownFromSiteStructure(
52-
context: GitBookSiteContext,
53-
stream: ReadableStreamDefaultController<Uint8Array>,
54-
offset: number
55-
): Promise<void> {
58+
async function getMarkdownPageEntriesFromSiteStructure(
59+
context: GitBookSiteContext
60+
): Promise<MarkdownPageEntry[]> {
5661
switch (context.structure.type) {
5762
case 'sections':
58-
return streamMarkdownFromSections(
63+
return getMarkdownPageEntriesFromSections(
5964
context,
60-
stream,
61-
getSiteStructureSections(context.structure, { ignoreGroups: true }),
62-
offset
65+
getSiteStructureSections(context.structure, { ignoreGroups: true })
6366
);
6467
case 'siteSpaces':
65-
await streamMarkdownFromSiteSpaces(
66-
context,
67-
stream,
68-
context.structure.structure,
69-
offset
70-
);
71-
return;
68+
return getMarkdownPageEntriesFromSiteSpaces(context, context.structure.structure);
7269
default:
7370
assertNever(context.structure);
7471
}
7572
}
7673

7774
/**
78-
* Stream markdown from site sections.
75+
* Get the document pages that should be included in the full llms.txt output for site sections.
7976
*/
80-
async function streamMarkdownFromSections(
77+
async function getMarkdownPageEntriesFromSections(
8178
context: GitBookSiteContext,
82-
stream: ReadableStreamDefaultController<Uint8Array>,
83-
siteSections: SiteSection[],
84-
offset: number
85-
): Promise<void> {
86-
let currentPageIndex = 0;
87-
88-
for (const siteSection of siteSections) {
89-
const result = await streamMarkdownFromSiteSpaces(
90-
context,
91-
stream,
92-
siteSection.siteSpaces,
93-
offset,
94-
currentPageIndex
95-
);
96-
currentPageIndex = result.currentPageIndex;
97-
98-
if (result.reachedLimit) {
99-
break;
100-
}
101-
}
79+
siteSections: SiteSection[]
80+
): Promise<MarkdownPageEntry[]> {
81+
return getMarkdownPageEntriesFromFilteredSiteSpaces(
82+
siteSections.flatMap((siteSection) =>
83+
filterSiteSpacesByLocale(siteSection.siteSpaces, context.locale)
84+
),
85+
context
86+
);
10287
}
10388

10489
/**
105-
* Stream markdown from site spaces.
90+
* Get the document pages that should be included in the full llms.txt output for site spaces.
10691
*/
107-
export async function streamMarkdownFromSiteSpaces(
92+
async function getMarkdownPageEntriesFromSiteSpaces(
10893
context: GitBookSiteContext,
109-
stream: ReadableStreamDefaultController<Uint8Array>,
110-
siteSpaces: SiteSpace[],
111-
offset = 0,
112-
initialPageIndex = 0
113-
): Promise<{ currentPageIndex: number; reachedLimit: boolean }> {
114-
let totalPagesProcessed = initialPageIndex;
115-
116-
// Collect all pages first
117-
const allPages: Array<{ context: GitBookSiteContext; page: RevisionPageDocument }> = [];
118-
119-
const filteredSiteSpaces = filterSiteSpacesByLocale(siteSpaces, context.locale);
94+
siteSpaces: SiteSpace[]
95+
): Promise<MarkdownPageEntry[]> {
96+
return getMarkdownPageEntriesFromFilteredSiteSpaces(
97+
filterSiteSpacesByLocale(siteSpaces, context.locale),
98+
context
99+
);
100+
}
120101

121-
for (const siteSpace of filteredSiteSpaces) {
122-
const siteSpaceUrl = siteSpace.urls.published;
123-
if (!siteSpaceUrl) {
124-
continue;
125-
}
126-
const siteSpaceContext = await fetchSiteContextForSiteSpace(context, siteSpace);
127-
const pages = getIndexablePages(siteSpaceContext.revision.pages);
128-
129-
// Add document pages to our collection
130-
for (const { page } of pages) {
131-
if (page.type === 'document') {
132-
allPages.push({
133-
context: siteSpaceContext,
134-
page,
135-
});
136-
}
102+
/**
103+
* Get markdown page entries from already locale-filtered site spaces.
104+
*/
105+
async function getMarkdownPageEntriesFromFilteredSiteSpaces(
106+
siteSpaces: SiteSpace[],
107+
context: GitBookSiteContext
108+
): Promise<MarkdownPageEntry[]> {
109+
const publishedSiteSpaces = siteSpaces.filter((siteSpace) => siteSpace.urls.published);
110+
111+
const allPages = await pMap(
112+
publishedSiteSpaces,
113+
async (siteSpace): Promise<MarkdownPageEntry[]> => {
114+
const siteSpaceContext = await fetchSiteContextForSiteSpace(context, siteSpace);
115+
const pages = getIndexablePages(siteSpaceContext.revision.pages);
116+
117+
return pages.flatMap(({ page }) => {
118+
if (page.type !== 'document') {
119+
return [];
120+
}
121+
122+
return [
123+
{
124+
context: siteSpaceContext,
125+
page,
126+
},
127+
];
128+
});
129+
},
130+
{
131+
concurrency: MAX_CONCURRENCY,
137132
}
138-
}
133+
);
134+
135+
return allPages.flat();
136+
}
139137

138+
/**
139+
* Stream a single paginated window of markdown page entries.
140+
*/
141+
async function streamMarkdownPageEntries(
142+
context: GitBookSiteContext,
143+
stream: ReadableStreamDefaultController<Uint8Array>,
144+
allPages: MarkdownPageEntry[],
145+
offset: number
146+
): Promise<{ currentPageIndex: number; reachedLimit: boolean }> {
140147
// Apply pagination - keep only the window of up to DEFAULT_PAGE_LIMIT pages starting at offset
141148
const pagesToProcess = allPages.slice(offset, offset + DEFAULT_PAGE_LIMIT);
142-
totalPagesProcessed = offset;
149+
let totalPagesProcessed = offset;
143150

144151
// Process the pages
145152
for await (const markdown of pMapIterable(

packages/gitbook/tests/llms.test.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,4 +116,16 @@ describe('llms-full.txt', () => {
116116
},
117117
{ timeout: 30_000 }
118118
);
119+
120+
it(
121+
'should return 404 when a llms-full.txt page has no content',
122+
async () => {
123+
const response = await fetch(
124+
getContentTestURL('https://gitbook.gitbook.io/test-gitbook-open/llms-full.txt/999')
125+
);
126+
127+
expect(response.status).toBe(404);
128+
},
129+
{ timeout: 30_000 }
130+
);
119131
});

0 commit comments

Comments
 (0)