From 4af8e5c1b976559aa9a94f81b5c5f342be57ff3a Mon Sep 17 00:00:00 2001 From: Ali Tariq Date: Tue, 31 Mar 2026 13:34:10 +0500 Subject: [PATCH] Fix misdetection of single-page documents as table of contents --- pageindex/page_index.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pageindex/page_index.py b/pageindex/page_index.py index 719255463..46adc6afa 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -114,7 +114,8 @@ def toc_detector_single_page(content, model=None): }} Directly return the final JSON structure. Do not output anything else. - Please note: abstract,summary, notation list, figure list, table list, etc. are not table of contents.""" + Please note: abstract, summary, notation list, figure list, table list, etc. are not table of contents. + Also note: a page containing actual document content (e.g. policy text, regulations, descriptions, rules) with numbered sections is NOT a table of contents, even if the sections are numbered. A true table of contents only lists section titles as references to content found elsewhere in the document, typically accompanied by page numbers or dotted leaders.""" response = llm_completion(model=model, prompt=prompt) # print('response', response)