Skip to content

Commit d6b7cdd

Browse files
committed
feat: add heavy-load CPU/memory profiling support
1 parent 959452d commit d6b7cdd

File tree

4 files changed

+210
-92
lines changed

4 files changed

+210
-92
lines changed

README.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,9 +174,33 @@ You can customize the processing with additional optional arguments:
174174
--if-add-node-id Add node ID (yes/no, default: yes)
175175
--if-add-node-summary Add node summary (yes/no, default: yes)
176176
--if-add-doc-description Add doc description (yes/no, default: yes)
177+
--enable-profile Enable runtime CPU/memory profiling
178+
--profile-output Output path for profile report JSON (default: ./results/profile_report.json)
177179
```
178180
</details>
179181

182+
<details>
183+
<summary><strong>CPU and memory profiling under heavy load</strong></summary>
184+
<br>
185+
Use profiling mode to measure end-to-end runtime and peak memory when processing larger PDFs or running repeated load tests.
186+
187+
```bash
188+
python3 run_pageindex.py \
189+
--pdf_path /path/to/your/large-document.pdf \
190+
--max-pages-per-node 20 \
191+
--max-tokens-per-node 30000 \
192+
--enable-profile \
193+
--profile-output ./results/heavy-load-profile.json
194+
```
195+
196+
The generated JSON report includes:
197+
- `elapsed_seconds`: total wall-clock runtime
198+
- `peak_memory_mb`: peak Python-heap memory allocated during the run, as measured by `tracemalloc`
199+
- `rss_mb`: maximum resident set size of the process in MB (`null` on platforms without `resource` support, e.g. Windows)
200+
201+
Use this output to compare tuning changes (e.g., `--max-pages-per-node`, model choice, or input-size buckets) and identify memory pressure regressions.
202+
</details>
203+
180204
<details>
181205
<summary><strong>Markdown support</strong></summary>
182206
<br>

pageindex/profiling.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
from __future__ import annotations
2+
3+
import contextlib
4+
import json
5+
import os
6+
import time
7+
import tracemalloc
8+
from dataclasses import asdict, dataclass
9+
from pathlib import Path
10+
11+
12+
@dataclass
class ProfileReport:
    """Summary of one profiled run, produced by ``profile_run``."""

    # Total wall-clock runtime of the profiled region, in seconds.
    elapsed_seconds: float
    # Peak Python-heap usage during the run (tracemalloc), in MiB.
    peak_memory_mb: float
    # Maximum resident set size of the process in MiB, or None when the
    # platform does not provide the `resource` module (e.g. Windows).
    rss_mb: float | None


def _max_rss_mb() -> float | None:
    """Return the process's max RSS in MiB, or None when unavailable.

    `resource.getrusage` reports ``ru_maxrss`` in kilobytes on Linux but
    in bytes on macOS, so the divisor is chosen by platform rather than
    by a magnitude heuristic. (A threshold-based guess misreads small
    macOS processes — KiB shown as MB — and very large Linux ones.)
    """
    import sys

    try:
        import resource  # POSIX-only; absent on Windows.
    except ImportError:
        return None
    try:
        ru_maxrss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    except Exception:
        return None
    divisor = 1024.0 * 1024.0 if sys.platform == "darwin" else 1024.0
    return ru_maxrss / divisor


@contextlib.contextmanager
def profile_run(enabled: bool = False):
    """Measure wall-clock time and memory for the enclosed code.

    When *enabled* is False this is a no-op (yields None). When True, a
    ProfileReport is stashed on ``profile_run.last_report`` after the
    block exits — even if the enclosed code raises, since the report is
    assembled in the ``finally`` clause.
    """
    if not enabled:
        yield None
        return

    # Don't clobber a tracemalloc session the caller may already have
    # running; only stop tracing if we started it here.
    started_here = not tracemalloc.is_tracing()
    if started_here:
        tracemalloc.start()
    start = time.perf_counter()
    try:
        yield
    finally:
        elapsed = time.perf_counter() - start
        _current, peak = tracemalloc.get_traced_memory()
        if started_here:
            tracemalloc.stop()

        rss_mb = _max_rss_mb()
        report = ProfileReport(
            elapsed_seconds=round(elapsed, 3),
            peak_memory_mb=round(peak / (1024.0 * 1024.0), 3),
            rss_mb=round(rss_mb, 3) if rss_mb is not None else None,
        )

        # Stash the report on the function object for caller retrieval.
        profile_run.last_report = report
54+
55+
56+
def write_profile_report(output_path: str | os.PathLike[str], report: ProfileReport) -> Path:
    """Serialize *report* as pretty-printed JSON at *output_path*.

    Missing parent directories are created. Returns the path written.
    """
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(asdict(report), indent=2)
    target.write_text(payload, encoding="utf-8")
    return target
61+
62+
63+
# Module-level sentinel: no report collected yet. profile_run() overwrites
# this attribute after every enabled run; callers read it after the
# context manager exits.
profile_run.last_report = None

run_pageindex.py

Lines changed: 94 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import json
44
from pageindex import *
55
from pageindex.page_index_md import md_to_tree
6+
from pageindex.profiling import profile_run, write_profile_report
67

78
if __name__ == "__main__":
89
# Set up argument parser
@@ -12,7 +13,7 @@
1213

1314
parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20', help='Model to use')
1415

15-
parser.add_argument('--toc-check-pages', type=int, default=20,
16+
parser.add_argument('--toc-check-pages', type=int, default=20,
1617
help='Number of pages to check for table of contents (PDF only)')
1718
parser.add_argument('--max-pages-per-node', type=int, default=10,
1819
help='Maximum number of pages per node (PDF only)')
@@ -27,107 +28,108 @@
2728
help='Whether to add doc description to the doc')
2829
parser.add_argument('--if-add-node-text', type=str, default='no',
2930
help='Whether to add text to the node')
30-
31+
3132
# Markdown specific arguments
3233
parser.add_argument('--if-thinning', type=str, default='no',
3334
help='Whether to apply tree thinning for markdown (markdown only)')
3435
parser.add_argument('--thinning-threshold', type=int, default=5000,
3536
help='Minimum token threshold for thinning (markdown only)')
3637
parser.add_argument('--summary-token-threshold', type=int, default=200,
3738
help='Token threshold for generating summaries (markdown only)')
39+
40+
# Profiling arguments
41+
parser.add_argument('--enable-profile', action='store_true',
42+
help='Enable CPU time and memory profiling for the end-to-end run')
43+
parser.add_argument('--profile-output', type=str, default='./results/profile_report.json',
44+
help='Path to write profile report JSON when --enable-profile is set')
45+
3846
args = parser.parse_args()
39-
47+
4048
# Validate that exactly one file type is specified
4149
if not args.pdf_path and not args.md_path:
4250
raise ValueError("Either --pdf_path or --md_path must be specified")
4351
if args.pdf_path and args.md_path:
4452
raise ValueError("Only one of --pdf_path or --md_path can be specified")
45-
46-
if args.pdf_path:
47-
# Validate PDF file
48-
if not args.pdf_path.lower().endswith('.pdf'):
49-
raise ValueError("PDF file must have .pdf extension")
50-
if not os.path.isfile(args.pdf_path):
51-
raise ValueError(f"PDF file not found: {args.pdf_path}")
52-
53-
# Process PDF file
54-
# Configure options
55-
opt = config(
56-
model=args.model,
57-
toc_check_page_num=args.toc_check_pages,
58-
max_page_num_each_node=args.max_pages_per_node,
59-
max_token_num_each_node=args.max_tokens_per_node,
60-
if_add_node_id=args.if_add_node_id,
61-
if_add_node_summary=args.if_add_node_summary,
62-
if_add_doc_description=args.if_add_doc_description,
63-
if_add_node_text=args.if_add_node_text
64-
)
65-
66-
# Process the PDF
67-
toc_with_page_number = page_index_main(args.pdf_path, opt)
68-
print('Parsing done, saving to file...')
69-
70-
# Save results
71-
pdf_name = os.path.splitext(os.path.basename(args.pdf_path))[0]
72-
output_dir = './results'
73-
output_file = f'{output_dir}/{pdf_name}_structure.json'
74-
os.makedirs(output_dir, exist_ok=True)
75-
76-
with open(output_file, 'w', encoding='utf-8') as f:
77-
json.dump(toc_with_page_number, f, indent=2)
78-
79-
print(f'Tree structure saved to: {output_file}')
80-
81-
elif args.md_path:
82-
# Validate Markdown file
83-
if not args.md_path.lower().endswith(('.md', '.markdown')):
84-
raise ValueError("Markdown file must have .md or .markdown extension")
85-
if not os.path.isfile(args.md_path):
86-
raise ValueError(f"Markdown file not found: {args.md_path}")
87-
88-
# Process markdown file
89-
print('Processing markdown file...')
90-
91-
# Process the markdown
92-
import asyncio
93-
94-
# Use ConfigLoader to get consistent defaults (matching PDF behavior)
95-
from pageindex.utils import ConfigLoader
96-
config_loader = ConfigLoader()
97-
98-
# Create options dict with user args
99-
user_opt = {
100-
'model': args.model,
101-
'if_add_node_summary': args.if_add_node_summary,
102-
'if_add_doc_description': args.if_add_doc_description,
103-
'if_add_node_text': args.if_add_node_text,
104-
'if_add_node_id': args.if_add_node_id
105-
}
106-
107-
# Load config with defaults from config.yaml
108-
opt = config_loader.load(user_opt)
109-
110-
toc_with_page_number = asyncio.run(md_to_tree(
111-
md_path=args.md_path,
112-
if_thinning=args.if_thinning.lower() == 'yes',
113-
min_token_threshold=args.thinning_threshold,
114-
if_add_node_summary=opt.if_add_node_summary,
115-
summary_token_threshold=args.summary_token_threshold,
116-
model=opt.model,
117-
if_add_doc_description=opt.if_add_doc_description,
118-
if_add_node_text=opt.if_add_node_text,
119-
if_add_node_id=opt.if_add_node_id
120-
))
121-
122-
print('Parsing done, saving to file...')
123-
124-
# Save results
125-
md_name = os.path.splitext(os.path.basename(args.md_path))[0]
126-
output_dir = './results'
127-
output_file = f'{output_dir}/{md_name}_structure.json'
128-
os.makedirs(output_dir, exist_ok=True)
129-
130-
with open(output_file, 'w', encoding='utf-8') as f:
131-
json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)
132-
133-
print(f'Tree structure saved to: {output_file}')
53+
54+
    # Entire end-to-end run (validation + parsing + save) is profiled so the
    # report covers the full workload; the finally clause in profile_run
    # records the report even if validation raises.
    with profile_run(args.enable_profile):
        if args.pdf_path:
            # Validate PDF file
            if not args.pdf_path.lower().endswith('.pdf'):
                raise ValueError("PDF file must have .pdf extension")
            if not os.path.isfile(args.pdf_path):
                raise ValueError(f"PDF file not found: {args.pdf_path}")

            # Process PDF file
            opt = config(
                model=args.model,
                toc_check_page_num=args.toc_check_pages,
                max_page_num_each_node=args.max_pages_per_node,
                max_token_num_each_node=args.max_tokens_per_node,
                if_add_node_id=args.if_add_node_id,
                if_add_node_summary=args.if_add_node_summary,
                if_add_doc_description=args.if_add_doc_description,
                if_add_node_text=args.if_add_node_text
            )

            toc_with_page_number = page_index_main(args.pdf_path, opt)
            print('Parsing done, saving to file...')

            # Save the tree next to other results, named after the input PDF.
            pdf_name = os.path.splitext(os.path.basename(args.pdf_path))[0]
            output_dir = './results'
            output_file = f'{output_dir}/{pdf_name}_structure.json'
            os.makedirs(output_dir, exist_ok=True)

            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(toc_with_page_number, f, indent=2)

            print(f'Tree structure saved to: {output_file}')

        elif args.md_path:
            if not args.md_path.lower().endswith(('.md', '.markdown')):
                raise ValueError("Markdown file must have .md or .markdown extension")
            if not os.path.isfile(args.md_path):
                raise ValueError(f"Markdown file not found: {args.md_path}")

            print('Processing markdown file...')

            # Markdown-only dependencies are imported lazily here.
            import asyncio
            from pageindex.utils import ConfigLoader

            # ConfigLoader merges user args with config.yaml defaults so the
            # markdown path behaves consistently with the PDF path.
            config_loader = ConfigLoader()
            user_opt = {
                'model': args.model,
                'if_add_node_summary': args.if_add_node_summary,
                'if_add_doc_description': args.if_add_doc_description,
                'if_add_node_text': args.if_add_node_text,
                'if_add_node_id': args.if_add_node_id
            }
            opt = config_loader.load(user_opt)

            toc_with_page_number = asyncio.run(md_to_tree(
                md_path=args.md_path,
                if_thinning=args.if_thinning.lower() == 'yes',
                min_token_threshold=args.thinning_threshold,
                if_add_node_summary=opt.if_add_node_summary,
                summary_token_threshold=args.summary_token_threshold,
                model=opt.model,
                if_add_doc_description=opt.if_add_doc_description,
                if_add_node_text=opt.if_add_node_text,
                if_add_node_id=opt.if_add_node_id
            ))

            print('Parsing done, saving to file...')

            md_name = os.path.splitext(os.path.basename(args.md_path))[0]
            output_dir = './results'
            output_file = f'{output_dir}/{md_name}_structure.json'
            os.makedirs(output_dir, exist_ok=True)

            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)

            print(f'Tree structure saved to: {output_file}')

    # Outside the `with`: profile_run only publishes last_report from its
    # finally clause, so the report is readable only after the context exits.
    if args.enable_profile and profile_run.last_report is not None:
        profile_path = write_profile_report(args.profile_output, profile_run.last_report)
        print(f'Profile report saved to: {profile_path}')
        print(f'Profile summary: {profile_run.last_report}')

tests/test_profiling.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import json
2+
from pathlib import Path
3+
4+
from pageindex.profiling import profile_run, write_profile_report
5+
6+
7+
def test_profile_run_collects_report():
    """An enabled profile_run publishes a populated report on the function object."""
    # Reset the module-level slot so we know the report comes from this run.
    profile_run.last_report = None
    with profile_run(True):
        _ = sum(i for i in range(10_000))

    report = profile_run.last_report
    assert report is not None
    # Timing/memory values are machine-dependent; only sanity-check signs.
    assert report.elapsed_seconds >= 0
    assert report.peak_memory_mb >= 0
16+
17+
18+
def test_write_profile_report(tmp_path: Path):
    """write_profile_report emits JSON containing all three report fields."""
    profile_run.last_report = None
    with profile_run(True):
        _ = [str(i) for i in range(1000)]

    out = tmp_path / "profile.json"
    write_profile_report(out, profile_run.last_report)

    payload = json.loads(out.read_text(encoding="utf-8"))
    assert "elapsed_seconds" in payload
    assert "peak_memory_mb" in payload
    # rss_mb may be null on platforms without `resource`, but the key exists.
    assert "rss_mb" in payload

0 commit comments

Comments
 (0)