Skip to content

Commit d77d967

Browse files
committed
refactor: Pydantic IndexConfig, bool flags, early API key validation
- Replace ConfigLoader + config.yaml with Pydantic IndexConfig
- Use bool for config flags (if_add_node_summary etc.) instead of "yes"/"no"
- Enable doc_description by default for better agent QA
- Early API key validation on LocalClient init via litellm provider detection
- Expose index_config parameter on LocalClient for advanced users
- Remove config.yaml dependency from pip package
1 parent f66658f commit d77d967

File tree

8 files changed

+168
-151
lines changed

8 files changed

+168
-151
lines changed

pageindex/client.py

Lines changed: 66 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
# pageindex/client.py
22
from __future__ import annotations
3-
import os
43
from pathlib import Path
54
from .collection import Collection
6-
from .config import ConfigLoader
5+
from .config import IndexConfig
76
from .parser.protocol import DocumentParser
87

98

@@ -42,17 +41,51 @@ def _init_cloud(self, api_key: str):
4241
from .backend.cloud import CloudBackend
4342
self._backend = CloudBackend(api_key=api_key)
4443

45-
def _init_local(self, model: str = None, retrieve_model: str = None,
46-
storage_path: str = None, storage=None):
47-
if not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"):
48-
os.environ["OPENAI_API_KEY"] = os.getenv("CHATGPT_API_KEY")
44+
@staticmethod
45+
def _check_llm_api_key(model: str) -> None:
46+
"""Verify that the LLM provider's API key is configured."""
47+
import os
48+
try:
49+
import litellm
50+
_, provider, _, _ = litellm.get_llm_provider(model=model)
51+
except Exception:
52+
return # Can't resolve provider — let litellm fail later with details
53+
54+
provider_env = {
55+
"openai": "OPENAI_API_KEY",
56+
"anthropic": "ANTHROPIC_API_KEY",
57+
"azure": "AZURE_API_KEY",
58+
"cohere": "COHERE_API_KEY",
59+
"replicate": "REPLICATE_API_KEY",
60+
"huggingface": "HUGGINGFACE_API_KEY",
61+
}
62+
env_var = provider_env.get(provider)
63+
if env_var and not os.getenv(env_var):
64+
from .errors import PageIndexError
65+
raise PageIndexError(
66+
f"API key not found. Set the {env_var} environment variable "
67+
f"for provider '{provider}' (model: {model})."
68+
)
4969

70+
def _init_local(self, model: str = None, retrieve_model: str = None,
71+
storage_path: str = None, storage=None,
72+
index_config: IndexConfig | dict = None):
73+
# Build IndexConfig: merge model/retrieve_model with index_config
5074
overrides = {}
5175
if model:
5276
overrides["model"] = model
5377
if retrieve_model:
5478
overrides["retrieve_model"] = retrieve_model
55-
opt = ConfigLoader().load(overrides or None)
79+
if isinstance(index_config, IndexConfig):
80+
opt = index_config.model_copy(update=overrides)
81+
elif isinstance(index_config, dict):
82+
overrides.update(index_config)
83+
opt = IndexConfig(**overrides)
84+
else:
85+
opt = IndexConfig(**overrides) if overrides else IndexConfig()
86+
87+
# Early validation: check API key before any expensive operations
88+
self._check_llm_api_key(opt.model)
5689

5790
storage_path = Path(storage_path or "~/.pageindex").expanduser()
5891
storage_path.mkdir(parents=True, exist_ok=True)
@@ -65,6 +98,7 @@ def _init_local(self, model: str = None, retrieve_model: str = None,
6598
files_dir=str(storage_path / "files"),
6699
model=opt.model,
67100
retrieve_model=_normalize_retrieve_model(opt.retrieve_model or opt.model),
101+
index_config=opt,
68102
)
69103

70104
def collection(self, name: str = "default") -> Collection:
@@ -87,11 +121,33 @@ def register_parser(self, parser: DocumentParser) -> None:
87121

88122

89123
class LocalClient(PageIndexClient):
90-
"""Local mode — indexes and queries documents on your machine."""
124+
"""Local mode — indexes and queries documents on your machine.
125+
126+
Args:
127+
model: LLM model for indexing (default: gpt-4o-2024-11-20)
128+
retrieve_model: LLM model for agent QA (default: same as model)
129+
storage_path: Directory for SQLite DB and files (default: ~/.pageindex)
130+
storage: Custom StorageEngine instance (default: SQLiteStorage)
131+
index_config: Advanced indexing parameters. Pass an IndexConfig instance
132+
or a dict. All fields have sensible defaults — most users don't need this.
133+
134+
Example::
135+
136+
# Simple — defaults are fine
137+
client = LocalClient(model="gpt-5.4")
138+
139+
# Advanced — tune indexing parameters
140+
from pageindex.config import IndexConfig
141+
client = LocalClient(
142+
model="gpt-5.4",
143+
index_config=IndexConfig(toc_check_page_num=30),
144+
)
145+
"""
91146

92147
def __init__(self, model: str = None, retrieve_model: str = None,
93-
storage_path: str = None, storage=None):
94-
self._init_local(model, retrieve_model, storage_path, storage)
148+
storage_path: str = None, storage=None,
149+
index_config: IndexConfig | dict = None):
150+
self._init_local(model, retrieve_model, storage_path, storage, index_config)
95151

96152

97153
class CloudClient(PageIndexClient):

pageindex/config.py

Lines changed: 17 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,22 @@
11
# pageindex/config.py
2-
import yaml
3-
from pathlib import Path
4-
from types import SimpleNamespace
2+
from __future__ import annotations
3+
from pydantic import BaseModel
54

65

7-
_DEFAULT_CONFIG_PATH = Path(__file__).parent / "config.yaml"
6+
class IndexConfig(BaseModel):
7+
"""Configuration for the PageIndex indexing pipeline.
88
9+
All fields have sensible defaults. Advanced users can override
10+
via LocalClient(index_config=IndexConfig(...)) or a dict.
11+
"""
12+
model_config = {"extra": "forbid"}
913

10-
class ConfigLoader:
11-
def __init__(self, default_path: str = None):
12-
if default_path is None:
13-
default_path = _DEFAULT_CONFIG_PATH
14-
self._default_dict = self._load_yaml(default_path)
15-
16-
@staticmethod
17-
def _load_yaml(path):
18-
with open(path, "r", encoding="utf-8") as f:
19-
return yaml.safe_load(f) or {}
20-
21-
def _validate_keys(self, user_dict):
22-
unknown_keys = set(user_dict) - set(self._default_dict)
23-
if unknown_keys:
24-
raise ValueError(f"Unknown config keys: {unknown_keys}")
25-
26-
def load(self, user_opt=None) -> SimpleNamespace:
27-
if user_opt is None:
28-
user_dict = {}
29-
elif isinstance(user_opt, SimpleNamespace):
30-
user_dict = vars(user_opt)
31-
elif isinstance(user_opt, dict):
32-
user_dict = user_opt
33-
else:
34-
raise TypeError("user_opt must be dict, SimpleNamespace or None")
35-
self._validate_keys(user_dict)
36-
merged = {**self._default_dict, **user_dict}
37-
return SimpleNamespace(**merged)
14+
model: str = "gpt-4o-2024-11-20"
15+
retrieve_model: str | None = None
16+
toc_check_page_num: int = 20
17+
max_page_num_each_node: int = 10
18+
max_token_num_each_node: int = 20000
19+
if_add_node_id: bool = True
20+
if_add_node_summary: bool = True
21+
if_add_doc_description: bool = True
22+
if_add_node_text: bool = False

pageindex/config.yaml

Lines changed: 0 additions & 10 deletions
This file was deleted.

pageindex/index/page_index.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1081,17 +1081,17 @@ def page_index_main(doc, opt=None):
10811081

10821082
async def page_index_builder():
10831083
structure = await tree_parser(page_list, opt, doc=doc, logger=logger)
1084-
if opt.if_add_node_id == 'yes':
1085-
write_node_id(structure)
1086-
if opt.if_add_node_text == 'yes':
1084+
if opt.if_add_node_id:
1085+
write_node_id(structure)
1086+
if opt.if_add_node_text:
10871087
add_node_text(structure, page_list)
1088-
if opt.if_add_node_summary == 'yes':
1089-
if opt.if_add_node_text == 'no':
1088+
if opt.if_add_node_summary:
1089+
if not opt.if_add_node_text:
10901090
add_node_text(structure, page_list)
10911091
await generate_summaries_for_structure(structure, model=opt.model)
1092-
if opt.if_add_node_text == 'no':
1092+
if not opt.if_add_node_text:
10931093
remove_structure_text(structure)
1094-
if opt.if_add_doc_description == 'yes':
1094+
if opt.if_add_doc_description:
10951095
# Create a clean structure without unnecessary fields for description generation
10961096
clean_structure = create_clean_structure_for_description(structure)
10971097
doc_description = generate_doc_description(clean_structure, model=opt.model)

pageindex/index/page_index_md.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ def clean_tree_for_output(tree_nodes):
240240
return cleaned_nodes
241241

242242

243-
async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, if_add_doc_description='no', if_add_node_text='no', if_add_node_id='yes'):
243+
async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_add_node_summary=False, summary_token_threshold=None, model=None, if_add_doc_description=False, if_add_node_text=False, if_add_node_id=True):
244244
with open(md_path, 'r', encoding='utf-8') as f:
245245
markdown_content = f.read()
246246
line_count = markdown_content.count('\n') + 1
@@ -259,25 +259,24 @@ async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_ad
259259
print(f"Building tree from nodes...")
260260
tree_structure = build_tree_from_nodes(nodes_with_content)
261261

262-
if if_add_node_id == 'yes':
262+
if if_add_node_id:
263263
write_node_id(tree_structure)
264264

265265
print(f"Formatting tree structure...")
266-
267-
if if_add_node_summary == 'yes':
266+
267+
if if_add_node_summary:
268268
# Always include text for summary generation
269269
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'text', 'nodes'])
270-
270+
271271
print(f"Generating summaries for each node...")
272272
tree_structure = await generate_summaries_for_structure_md(tree_structure, summary_token_threshold=summary_token_threshold, model=model)
273-
274-
if if_add_node_text == 'no':
273+
274+
if not if_add_node_text:
275275
# Remove text after summary generation if not requested
276276
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'nodes'])
277-
278-
if if_add_doc_description == 'yes':
277+
278+
if if_add_doc_description:
279279
print(f"Generating document description...")
280-
# Create a clean structure without unnecessary fields for description generation
281280
clean_structure = create_clean_structure_for_description(tree_structure)
282281
doc_description = generate_doc_description(clean_structure, model=model)
283282
return {
@@ -288,7 +287,7 @@ async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_ad
288287
}
289288
else:
290289
# No summaries needed, format based on text preference
291-
if if_add_node_text == 'yes':
290+
if if_add_node_text:
292291
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'text', 'nodes'])
293292
else:
294293
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'nodes'])

pageindex/index/pipeline.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -58,10 +58,10 @@ def build_index(parsed: ParsedDocument, model: str = None, opt=None) -> dict:
5858
from .utils import (write_node_id, add_node_text, remove_structure_text,
5959
generate_summaries_for_structure, generate_doc_description,
6060
create_clean_structure_for_description)
61-
from ..config import ConfigLoader
61+
from ..config import IndexConfig
6262

6363
if opt is None:
64-
opt = ConfigLoader().load({"model": model} if model else None)
64+
opt = IndexConfig(model=model) if model else IndexConfig()
6565

6666
nodes = parsed.nodes
6767
strategy = detect_strategy(nodes)
@@ -75,26 +75,25 @@ def build_index(parsed: ParsedDocument, model: str = None, opt=None) -> dict:
7575
structure = _run_async(_content_based_pipeline(page_list, opt))
7676

7777
# Unified enhancement
78-
if opt.if_add_node_id == "yes":
78+
if opt.if_add_node_id:
7979
write_node_id(structure)
8080

8181
if strategy != "level_based":
82-
# For content-based, add text from page_list (reuse already-computed page_list)
83-
if opt.if_add_node_text == "yes" or opt.if_add_node_summary == "yes":
82+
if opt.if_add_node_text or opt.if_add_node_summary:
8483
add_node_text(structure, page_list)
8584

86-
if opt.if_add_node_summary == "yes":
85+
if opt.if_add_node_summary:
8786
_run_async(generate_summaries_for_structure(structure, model=opt.model))
8887

89-
if opt.if_add_node_text == "no" and strategy != "level_based":
88+
if not opt.if_add_node_text and strategy != "level_based":
9089
remove_structure_text(structure)
9190

9291
result = {
9392
"doc_name": parsed.doc_name,
9493
"structure": structure,
9594
}
9695

97-
if opt.if_add_doc_description == "yes":
96+
if opt.if_add_doc_description:
9897
clean_structure = create_clean_structure_for_description(structure)
9998
result["doc_description"] = generate_doc_description(
10099
clean_structure, model=opt.model

0 commit comments

Comments
 (0)