This repository was archived by the owner on Feb 15, 2026. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathbuild_index.py
More file actions
212 lines (160 loc) · 6.89 KB
/
build_index.py
File metadata and controls
212 lines (160 loc) · 6.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
#!/usr/bin/env python3
"""
Build search index for Python dependency manager documentation.
Creates a Tantivy full-text search index from all Markdown (.md) and
reStructuredText (.rst) files in the assets directory, using metadata from the
auto-update workflow to tag documents by their source package manager.
"""
import logging
from pathlib import Path
from typing import Dict, Any
import json
import tantivy
# Log at INFO level with a timestamp so each stage of the indexing run is visible
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Input (documentation assets) and output (search index) directories,
# both given relative to the current working directory
ASSETS_DIR = Path("src", "assets")
INDEX_DIR = Path("src", "index")
def load_metadata(metadata_path: Path) -> Dict[str, Any]:
    """Load package metadata from a ``_metadata.json`` file.

    Loading is best-effort: any problem is logged as a warning and an empty
    dict is returned instead of raising.

    Args:
        metadata_path: Location of the JSON metadata file.

    Returns:
        The parsed JSON object. An empty dict is returned when the file is
        missing or unreadable, is not valid JSON, or holds anything other
        than a JSON object (callers use ``dict.get`` on the result).
    """
    try:
        with open(metadata_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses
        logger.warning(f"Failed to load metadata from {metadata_path}: {e}")
        return {}

    if isinstance(data, dict):
        return data

    # A JSON ``null`` is treated as "no metadata"; any other non-object
    # payload (list, string, number, ...) is reported because it would break
    # the ``.get()`` lookups performed on the returned value.
    if data is not None:
        logger.warning(
            f"Failed to load metadata from {metadata_path}: "
            f"expected a JSON object, got {type(data).__name__}"
        )
    return {}
def create_schema() -> tantivy.Schema:
    """
    Build the Tantivy schema used for the documentation index.

    All fields are registered the same way, as text fields with
    ``stored=True``, in this order:

    - content: full text of the documentation file
    - path: file path for result retrieval
    - package: source package manager (pip, conda, poetry, uv, pixi, pdm) for filtering
    - title: document title extracted from the file path
    - source_repo: original repository from metadata
    - docs_path: documentation directory path from metadata (for GitHub links)
    """
    field_names = (
        "content",
        "path",
        "package",
        "title",
        "source_repo",
        "docs_path",
    )
    builder = tantivy.SchemaBuilder()
    for field_name in field_names:
        # stored=True keeps the original value retrievable from search results
        builder.add_text_field(field_name, stored=True)
    return builder.build()
def extract_title_from_path(file_path: Path) -> str:
    """Return a slash-separated title built from the path below the assets root.

    The first two path components (expected to be "src/assets") are dropped
    and any component named "index.md" is left out, for example:

    - "src/assets/pip/cli/pip_install.rst" -> "pip/cli/pip_install.rst"
    - "src/assets/uv/guides/index.md" -> "uv/guides"
    """
    below_assets = file_path.parts[2:]  # everything after "src/assets"
    return "/".join(
        component for component in below_assets if component != "index.md"
    )
def find_markdown_files(assets_dir: Path) -> list[tuple[Path, str, Dict[str, Any]]]:
    """
    Find all documentation files (Markdown and reStructuredText) in the assets directory.

    Each immediate subdirectory of ``assets_dir`` is treated as one package:
    its name becomes the package name, and its ``_metadata.json`` is loaded
    once and attached to every file found beneath it. Directories and files
    are visited in sorted order so repeated runs over the same tree return
    the same list.

    Args:
        assets_dir: Directory whose subdirectories each hold one package's docs.

    Returns:
        List of tuples (file_path, package_name, metadata)

    Raises:
        OSError: If ``assets_dir`` cannot be listed (for example, it does not exist).
    """
    doc_files = []

    # iterdir() and glob() yield entries in arbitrary order, so sort both
    # to keep the resulting index reproducible across runs and machines.
    for package_dir in sorted(assets_dir.iterdir()):
        if not package_dir.is_dir():
            continue

        package_name = package_dir.name
        metadata = load_metadata(package_dir / "_metadata.json")

        logger.info(f"Scanning {package_name} documentation...")

        # Markdown files first, then reStructuredText, each group sorted by path
        for pattern in ("**/*.md", "**/*.rst"):
            doc_files.extend(
                (file_path, package_name, metadata)
                for file_path in sorted(package_dir.glob(pattern))
                if file_path.is_file()
            )

    return doc_files
def read_file_content(file_path: Path) -> str:
    """
    Read a documentation file as text, trying several encodings in turn.

    The path must be a regular file whose resolved location lies inside
    ASSETS_DIR; anything else is rejected. Decoding is attempted with
    "utf-8-sig" (UTF-8 that also drops a leading byte-order mark), then
    "cp1252", then "latin1".

    Note: within this file the function is only called with paths produced by
    find_markdown_files(), i.e. paths obtained by filesystem iteration rather
    than from user input.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded file content, or "" when the path fails validation or the
        file cannot be read.
    """
    try:
        # is_file() is already False for a path that does not exist
        if not file_path.is_file():
            logger.warning(f"File does not exist or is not a regular file: {file_path}")
            return ""

        # resolve() follows symlinks, so a link that points outside the
        # assets tree is rejected here
        if not file_path.resolve().is_relative_to(ASSETS_DIR.resolve()):
            logger.error(f"File path outside assets directory: {file_path}")
            return ""
    except (OSError, ValueError) as e:
        logger.warning(f"Path validation failed for {file_path}: {e}")
        return ""

    # Order matters:
    # - "utf-8-sig" reads plain UTF-8 too, and strips a BOM that plain
    #   "utf-8" would leave in the text as U+FEFF.
    # - "cp1252" must precede "latin1": latin1 accepts every byte sequence,
    #   so anything listed after it would never be tried.
    encodings = ("utf-8-sig", "cp1252", "latin1")

    for encoding in encodings:
        try:
            with open(file_path, "r", encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError:
            continue
        except OSError as e:
            # An I/O failure will not be cured by another encoding
            logger.warning(f"Error reading {file_path}: {e}")
            break

    logger.error(f"Failed to read {file_path} with any encoding")
    return ""
def build_index():
    """Build the Tantivy search index from all documentation files.

    The assets directory is scanned first; only then is any existing index in
    INDEX_DIR removed and rebuilt from scratch. Files with empty content are
    skipped, and a file that raises while being read or added is logged
    without aborting the build. A summary of indexed, skipped and failed
    files is logged at the end.

    Raises:
        OSError: If ASSETS_DIR cannot be scanned, or INDEX_DIR cannot be
            removed or created.
    """
    import shutil

    logger.info("Starting index build process...")

    # Discover the input before touching the old index, so a missing or
    # unreadable assets directory fails without destroying a working index.
    markdown_files = find_markdown_files(ASSETS_DIR)
    logger.info(f"Found {len(markdown_files)} documentation files")

    # Remove existing index to prevent schema conflicts
    if INDEX_DIR.exists():
        logger.info(f"Removing existing index at {INDEX_DIR}")
        shutil.rmtree(INDEX_DIR)

    # parents=True: create missing parent directories instead of failing
    INDEX_DIR.mkdir(parents=True, exist_ok=True)

    # Create schema and index
    schema = create_schema()
    index = tantivy.Index(schema, path=str(INDEX_DIR))
    writer = index.writer(heap_size=100_000_000)  # 100MB writer heap

    indexed_count = 0
    skipped_count = 0
    failed_count = 0

    for file_path, package_name, metadata in markdown_files:
        try:
            content = read_file_content(file_path)
            if not content.strip():
                # Empty, whitespace-only, or unreadable file
                skipped_count += 1
                continue

            doc = tantivy.Document(
                content=content,
                path=str(file_path.relative_to(ASSETS_DIR)),
                package=package_name,
                title=extract_title_from_path(file_path),
                source_repo=metadata.get("source_repo", "unknown"),
                docs_path=metadata.get("docs_path", "docs"),
            )
            writer.add_document(doc)
        except Exception as e:
            # One bad file must not abort the whole build
            failed_count += 1
            logger.error(f"Failed to index {file_path}: {e}")
            continue

        indexed_count += 1
        if indexed_count % 100 == 0:
            logger.info(f"Indexed {indexed_count} documents...")

    # Commit the index to disk
    logger.info("Committing index to disk...")
    writer.commit()

    logger.info(f"Index build complete! Indexed {indexed_count} documents")
    if skipped_count or failed_count:
        logger.warning(
            f"Skipped {skipped_count} empty or unreadable files; "
            f"{failed_count} files failed to index"
        )
    logger.info(f"Index stored in: {INDEX_DIR.absolute()}")
# Script entry point: build the index only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    build_index()