Skip to content

Commit e81e90b

Browse files
committed
feat: add heavy-load CPU/memory profiling support
1 parent 2403be8 commit e81e90b

File tree

4 files changed

+209
-91
lines changed

4 files changed

+209
-91
lines changed

README.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,9 +174,33 @@ You can customize the processing with additional optional arguments:
174174
--if-add-node-id Add node ID (yes/no, default: yes)
175175
--if-add-node-summary Add node summary (yes/no, default: yes)
176176
--if-add-doc-description Add doc description (yes/no, default: yes)
177+
--enable-profile Enable runtime profiling (wall-clock time and memory)
178+
--profile-output Output path for profile report JSON (default: ./results/profile_report.json)
177179
```
178180
</details>
179181

182+
<details>
183+
<summary><strong>CPU and memory profiling under heavy load</strong></summary>
184+
<br>
185+
Use profiling mode to measure end-to-end runtime and peak memory when processing larger PDFs or running repeated load tests.
186+
187+
```bash
188+
python3 run_pageindex.py \
189+
--pdf_path /path/to/your/large-document.pdf \
190+
--max-pages-per-node 20 \
191+
--max-tokens-per-node 30000 \
192+
--enable-profile \
193+
--profile-output ./results/heavy-load-profile.json
194+
```
195+
196+
The generated JSON report includes:
197+
- `elapsed_seconds`: total wall-clock runtime
198+
- `peak_memory_mb`: peak memory allocated by Python during the run, in MB, as traced by `tracemalloc`
199+
- `rss_mb`: peak resident set size (RSS) of the whole process, in MB; `null` when the OS does not expose it (e.g., no `resource` module)
200+
201+
Use this output to compare tuning changes (e.g., `--max-pages-per-node`, model choice, or input-size buckets) and identify memory pressure regressions.
202+
</details>
203+
180204
<details>
181205
<summary><strong>Markdown support</strong></summary>
182206
<br>

pageindex/profiling.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
from __future__ import annotations
2+
3+
import contextlib
4+
import json
5+
import os
6+
import time
7+
import tracemalloc
8+
from dataclasses import asdict, dataclass
9+
from pathlib import Path
10+
11+
12+
@dataclass
class ProfileReport:
    """Metrics collected for one profiled run, serialized to the JSON report."""

    # Wall-clock duration of the profiled block, in seconds (rounded to ms).
    elapsed_seconds: float
    # Peak memory traced by tracemalloc, in megabytes (1024 * 1024 bytes).
    peak_memory_mb: float
    # Peak resident set size of the process in megabytes; None if unavailable.
    rss_mb: float | None


def _max_rss_mb() -> float | None:
    """Return the process's peak resident set size in MB, or None if unavailable."""
    try:
        import resource  # POSIX-only module; not importable on Windows
    except ImportError:
        return None
    import sys

    try:
        ru_maxrss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    except (OSError, ValueError):
        return None

    # ru_maxrss is reported in bytes on macOS and in kilobytes on Linux/BSD.
    # Pick the unit from the platform rather than guessing from the magnitude,
    # which misclassifies very large (Linux) or very small (macOS) processes.
    divisor = 1024.0 * 1024.0 if sys.platform == "darwin" else 1024.0
    return ru_maxrss / divisor


@contextlib.contextmanager
def profile_run(enabled: bool = False):
    """Measure wall-clock time and memory for the enclosed block.

    When ``enabled`` is false the block runs untouched and nothing is
    recorded. When true, a :class:`ProfileReport` is stored on
    ``profile_run.last_report`` once the block exits, even if it raised.

    Args:
        enabled: Whether to collect profiling data for this run.

    Yields:
        None. Retrieve the result from ``profile_run.last_report``.
    """
    if not enabled:
        yield None
        return

    # Only start (and later stop) tracing if nobody else is already tracing,
    # so an outer tracemalloc session is not torn down by this profiler.
    # NOTE: when tracing was already active, the reported peak covers that
    # whole pre-existing trace, not just this block.
    owns_tracing = not tracemalloc.is_tracing()
    if owns_tracing:
        tracemalloc.start()
    start = time.perf_counter()
    try:
        yield None
    finally:
        elapsed = time.perf_counter() - start
        _current, peak = tracemalloc.get_traced_memory()
        if owns_tracing:
            tracemalloc.stop()

        # RSS is the process-lifetime peak, not limited to the profiled block.
        rss_mb = _max_rss_mb()

        # Stash the report on the function object for caller retrieval.
        profile_run.last_report = ProfileReport(
            elapsed_seconds=round(elapsed, 3),
            peak_memory_mb=round(peak / (1024.0 * 1024.0), 3),
            rss_mb=round(rss_mb, 3) if rss_mb is not None else None,
        )
54+
55+
56+
def write_profile_report(output_path: str | os.PathLike[str], report: ProfileReport) -> Path:
    """Write ``report`` as indented JSON to ``output_path``.

    Missing parent directories are created first.

    Args:
        output_path: Destination file for the JSON report.
        report: Dataclass instance to serialize via ``dataclasses.asdict``.

    Returns:
        The destination as a ``Path``.
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    serialized = json.dumps(asdict(report), indent=2)
    destination.write_text(serialized, encoding="utf-8")
    return destination
61+
62+
63+
profile_run.last_report = None

run_pageindex.py

Lines changed: 93 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import json
44
from pageindex import *
55
from pageindex.page_index_md import md_to_tree
6+
from pageindex.profiling import profile_run, write_profile_report
67
from pageindex.utils import ConfigLoader
78

89
if __name__ == "__main__":
@@ -28,107 +29,108 @@
2829
help='Whether to add doc description to the doc')
2930
parser.add_argument('--if-add-node-text', type=str, default=None,
3031
help='Whether to add text to the node')
31-
32+
3233
# Markdown specific arguments
3334
parser.add_argument('--if-thinning', type=str, default='no',
3435
help='Whether to apply tree thinning for markdown (markdown only)')
3536
parser.add_argument('--thinning-threshold', type=int, default=5000,
3637
help='Minimum token threshold for thinning (markdown only)')
3738
parser.add_argument('--summary-token-threshold', type=int, default=200,
3839
help='Token threshold for generating summaries (markdown only)')
40+
41+
# Profiling arguments
42+
parser.add_argument('--enable-profile', action='store_true',
43+
help='Enable CPU time and memory profiling for the end-to-end run')
44+
parser.add_argument('--profile-output', type=str, default='./results/profile_report.json',
45+
help='Path to write profile report JSON when --enable-profile is set')
46+
3947
args = parser.parse_args()
40-
48+
4149
# Validate that exactly one file type is specified
4250
if not args.pdf_path and not args.md_path:
4351
raise ValueError("Either --pdf_path or --md_path must be specified")
4452
if args.pdf_path and args.md_path:
4553
raise ValueError("Only one of --pdf_path or --md_path can be specified")
46-
47-
if args.pdf_path:
48-
# Validate PDF file
49-
if not args.pdf_path.lower().endswith('.pdf'):
50-
raise ValueError("PDF file must have .pdf extension")
51-
if not os.path.isfile(args.pdf_path):
52-
raise ValueError(f"PDF file not found: {args.pdf_path}")
53-
54-
# Process PDF file
55-
user_opt = {
56-
'model': args.model,
57-
'toc_check_page_num': args.toc_check_pages,
58-
'max_page_num_each_node': args.max_pages_per_node,
59-
'max_token_num_each_node': args.max_tokens_per_node,
60-
'if_add_node_id': args.if_add_node_id,
61-
'if_add_node_summary': args.if_add_node_summary,
62-
'if_add_doc_description': args.if_add_doc_description,
63-
'if_add_node_text': args.if_add_node_text,
64-
}
65-
opt = ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None})
66-
67-
# Process the PDF
68-
toc_with_page_number = page_index_main(args.pdf_path, opt)
69-
print('Parsing done, saving to file...')
70-
71-
# Save results
72-
pdf_name = os.path.splitext(os.path.basename(args.pdf_path))[0]
73-
output_dir = './results'
74-
output_file = f'{output_dir}/{pdf_name}_structure.json'
75-
os.makedirs(output_dir, exist_ok=True)
76-
77-
with open(output_file, 'w', encoding='utf-8') as f:
78-
json.dump(toc_with_page_number, f, indent=2)
79-
80-
print(f'Tree structure saved to: {output_file}')
81-
82-
elif args.md_path:
83-
# Validate Markdown file
84-
if not args.md_path.lower().endswith(('.md', '.markdown')):
85-
raise ValueError("Markdown file must have .md or .markdown extension")
86-
if not os.path.isfile(args.md_path):
87-
raise ValueError(f"Markdown file not found: {args.md_path}")
88-
89-
# Process markdown file
90-
print('Processing markdown file...')
91-
92-
# Process the markdown
93-
import asyncio
94-
95-
# Use ConfigLoader to get consistent defaults (matching PDF behavior)
96-
from pageindex.utils import ConfigLoader
97-
config_loader = ConfigLoader()
98-
99-
# Create options dict with user args
100-
user_opt = {
101-
'model': args.model,
102-
'if_add_node_summary': args.if_add_node_summary,
103-
'if_add_doc_description': args.if_add_doc_description,
104-
'if_add_node_text': args.if_add_node_text,
105-
'if_add_node_id': args.if_add_node_id
106-
}
107-
108-
# Load config with defaults from config.yaml
109-
opt = config_loader.load(user_opt)
110-
111-
toc_with_page_number = asyncio.run(md_to_tree(
112-
md_path=args.md_path,
113-
if_thinning=args.if_thinning.lower() == 'yes',
114-
min_token_threshold=args.thinning_threshold,
115-
if_add_node_summary=opt.if_add_node_summary,
116-
summary_token_threshold=args.summary_token_threshold,
117-
model=opt.model,
118-
if_add_doc_description=opt.if_add_doc_description,
119-
if_add_node_text=opt.if_add_node_text,
120-
if_add_node_id=opt.if_add_node_id
121-
))
122-
123-
print('Parsing done, saving to file...')
124-
125-
# Save results
126-
md_name = os.path.splitext(os.path.basename(args.md_path))[0]
127-
output_dir = './results'
128-
output_file = f'{output_dir}/{md_name}_structure.json'
129-
os.makedirs(output_dir, exist_ok=True)
130-
131-
with open(output_file, 'w', encoding='utf-8') as f:
132-
json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)
133-
134-
print(f'Tree structure saved to: {output_file}')
54+
55+
with profile_run(args.enable_profile):
56+
if args.pdf_path:
57+
# Validate PDF file
58+
if not args.pdf_path.lower().endswith('.pdf'):
59+
raise ValueError("PDF file must have .pdf extension")
60+
if not os.path.isfile(args.pdf_path):
61+
raise ValueError(f"PDF file not found: {args.pdf_path}")
62+
63+
# Process PDF file
64+
user_opt = {
65+
'model': args.model,
66+
'toc_check_page_num': args.toc_check_pages,
67+
'max_page_num_each_node': args.max_pages_per_node,
68+
'max_token_num_each_node': args.max_tokens_per_node,
69+
'if_add_node_id': args.if_add_node_id,
70+
'if_add_node_summary': args.if_add_node_summary,
71+
'if_add_doc_description': args.if_add_doc_description,
72+
'if_add_node_text': args.if_add_node_text,
73+
}
74+
opt = ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None})
75+
76+
toc_with_page_number = page_index_main(args.pdf_path, opt)
77+
print('Parsing done, saving to file...')
78+
79+
pdf_name = os.path.splitext(os.path.basename(args.pdf_path))[0]
80+
output_dir = './results'
81+
output_file = f'{output_dir}/{pdf_name}_structure.json'
82+
os.makedirs(output_dir, exist_ok=True)
83+
84+
with open(output_file, 'w', encoding='utf-8') as f:
85+
json.dump(toc_with_page_number, f, indent=2)
86+
87+
print(f'Tree structure saved to: {output_file}')
88+
89+
elif args.md_path:
90+
if not args.md_path.lower().endswith(('.md', '.markdown')):
91+
raise ValueError("Markdown file must have .md or .markdown extension")
92+
if not os.path.isfile(args.md_path):
93+
raise ValueError(f"Markdown file not found: {args.md_path}")
94+
95+
print('Processing markdown file...')
96+
97+
import asyncio
98+
99+
config_loader = ConfigLoader()
100+
user_opt = {
101+
'model': args.model,
102+
'if_add_node_summary': args.if_add_node_summary,
103+
'if_add_doc_description': args.if_add_doc_description,
104+
'if_add_node_text': args.if_add_node_text,
105+
'if_add_node_id': args.if_add_node_id
106+
}
107+
opt = config_loader.load({k: v for k, v in user_opt.items() if v is not None})
108+
109+
toc_with_page_number = asyncio.run(md_to_tree(
110+
md_path=args.md_path,
111+
if_thinning=args.if_thinning.lower() == 'yes',
112+
min_token_threshold=args.thinning_threshold,
113+
if_add_node_summary=opt.if_add_node_summary,
114+
summary_token_threshold=args.summary_token_threshold,
115+
model=opt.model,
116+
if_add_doc_description=opt.if_add_doc_description,
117+
if_add_node_text=opt.if_add_node_text,
118+
if_add_node_id=opt.if_add_node_id
119+
))
120+
121+
print('Parsing done, saving to file...')
122+
123+
md_name = os.path.splitext(os.path.basename(args.md_path))[0]
124+
output_dir = './results'
125+
output_file = f'{output_dir}/{md_name}_structure.json'
126+
os.makedirs(output_dir, exist_ok=True)
127+
128+
with open(output_file, 'w', encoding='utf-8') as f:
129+
json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)
130+
131+
print(f'Tree structure saved to: {output_file}')
132+
133+
if args.enable_profile and profile_run.last_report is not None:
134+
profile_path = write_profile_report(args.profile_output, profile_run.last_report)
135+
print(f'Profile report saved to: {profile_path}')
136+
print(f'Profile summary: {profile_run.last_report}')

tests/test_profiling.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import json
2+
from pathlib import Path
3+
4+
from pageindex.profiling import profile_run, write_profile_report
5+
6+
7+
def test_profile_run_collects_report():
    """An enabled profile_run stores a report with non-negative metrics."""
    # Clear any report left behind by an earlier test.
    profile_run.last_report = None

    with profile_run(True):
        _ = sum(i for i in range(10_000))

    collected = profile_run.last_report
    assert collected is not None

    elapsed, peak = collected.elapsed_seconds, collected.peak_memory_mb
    assert elapsed >= 0
    assert peak >= 0
16+
17+
18+
def test_write_profile_report(tmp_path: Path):
    """A collected report is written as JSON containing all three metric keys."""
    # Clear any report left behind by an earlier test.
    profile_run.last_report = None

    with profile_run(True):
        _ = [str(i) for i in range(1000)]

    destination = tmp_path / "profile.json"
    write_profile_report(destination, profile_run.last_report)

    written = json.loads(destination.read_text(encoding="utf-8"))
    for expected_key in ("elapsed_seconds", "peak_memory_mb", "rss_mb"):
        assert expected_key in written

0 commit comments

Comments
 (0)