33import json
44from pageindex import *
55from pageindex .page_index_md import md_to_tree
6+ from pageindex .profiling import profile_run , write_profile_report
67
78if __name__ == "__main__" :
89 # Set up argument parser
1213
# Register the model option and the first two PDF-only options from a table.
# Registration order is kept because it determines the order shown in --help.
for flag, value_type, fallback, description in (
    ('--model', str, 'gpt-4o-2024-11-20', 'Model to use'),
    ('--toc-check-pages', int, 20,
     'Number of pages to check for table of contents (PDF only)'),
    ('--max-pages-per-node', int, 10,
     'Maximum number of pages per node (PDF only)'),
):
    parser.add_argument(flag, type=value_type, default=fallback, help=description)
2728 help = 'Whether to add doc description to the doc' )
# Remaining command-line options, declared as (flag, keyword-arguments) pairs
# and registered in the order listed.
for option_flag, option_kwargs in (
    ('--if-add-node-text',
     dict(type=str, default='no', help='Whether to add text to the node')),

    # Markdown specific arguments
    ('--if-thinning',
     dict(type=str, default='no',
          help='Whether to apply tree thinning for markdown (markdown only)')),
    ('--thinning-threshold',
     dict(type=int, default=5000,
          help='Minimum token threshold for thinning (markdown only)')),
    ('--summary-token-threshold',
     dict(type=int, default=200,
          help='Token threshold for generating summaries (markdown only)')),

    # Profiling arguments
    ('--enable-profile',
     dict(action='store_true',
          help='Enable CPU time and memory profiling for the end-to-end run')),
    ('--profile-output',
     dict(type=str, default='./results/profile_report.json',
          help='Path to write profile report JSON when --enable-profile is set')),
):
    parser.add_argument(option_flag, **option_kwargs)

args = parser.parse_args()

# Exactly one input document (PDF or Markdown) must be supplied.
pdf_given = bool(args.pdf_path)
md_given = bool(args.md_path)
if not (pdf_given or md_given):
    raise ValueError("Either --pdf_path or --md_path must be specified")
if pdf_given and md_given:
    raise ValueError("Only one of --pdf_path or --md_path can be specified")
45-
def _save_structure(tree, source_path, output_dir='./results'):
    """Write ``tree`` as JSON to ``<output_dir>/<stem>_structure.json``.

    Args:
        tree: The structure returned by the indexing step; it is handed
            straight to ``json.dump``.
        source_path: Path of the input document. Its base name without the
            extension becomes the stem of the output file name.
        output_dir: Directory to write into; created if it does not exist.

    Returns:
        The path of the file that was written.
    """
    stem = os.path.splitext(os.path.basename(source_path))[0]
    output_file = f'{output_dir}/{stem}_structure.json'
    os.makedirs(output_dir, exist_ok=True)

    # The file is opened as UTF-8, so non-ASCII text is written as-is rather
    # than as \uXXXX escapes. Previously only the markdown branch did this;
    # the PDF branch now behaves the same way.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(tree, f, indent=2, ensure_ascii=False)
    return output_file


def _index_pdf(args):
    """Validate ``args.pdf_path`` and return the result of ``page_index_main``.

    Raises:
        ValueError: If the path does not end in ``.pdf`` (case-insensitive)
            or does not point to an existing file.
    """
    if not args.pdf_path.lower().endswith('.pdf'):
        raise ValueError("PDF file must have .pdf extension")
    if not os.path.isfile(args.pdf_path):
        raise ValueError(f"PDF file not found: {args.pdf_path}")

    # Forward the PDF-related command-line options to the project's config().
    opt = config(
        model=args.model,
        toc_check_page_num=args.toc_check_pages,
        max_page_num_each_node=args.max_pages_per_node,
        max_token_num_each_node=args.max_tokens_per_node,
        if_add_node_id=args.if_add_node_id,
        if_add_node_summary=args.if_add_node_summary,
        if_add_doc_description=args.if_add_doc_description,
        if_add_node_text=args.if_add_node_text
    )
    return page_index_main(args.pdf_path, opt)


def _index_markdown(args):
    """Validate ``args.md_path`` and return the result of ``md_to_tree``.

    Raises:
        ValueError: If the path does not end in ``.md`` or ``.markdown``
            (case-insensitive) or does not point to an existing file.
    """
    if not args.md_path.lower().endswith(('.md', '.markdown')):
        raise ValueError("Markdown file must have .md or .markdown extension")
    if not os.path.isfile(args.md_path):
        raise ValueError(f"Markdown file not found: {args.md_path}")

    print('Processing markdown file...')

    # Imported here so they are only loaded when a markdown file is processed.
    import asyncio
    from pageindex.utils import ConfigLoader

    # Pass the user's options through ConfigLoader; the original comments
    # state that this fills in defaults from config.yaml so the markdown path
    # matches the PDF path.
    opt = ConfigLoader().load({
        'model': args.model,
        'if_add_node_summary': args.if_add_node_summary,
        'if_add_doc_description': args.if_add_doc_description,
        'if_add_node_text': args.if_add_node_text,
        'if_add_node_id': args.if_add_node_id
    })

    # md_to_tree is awaited via asyncio.run, so it is a coroutine function.
    return asyncio.run(md_to_tree(
        md_path=args.md_path,
        if_thinning=args.if_thinning.lower() == 'yes',
        min_token_threshold=args.thinning_threshold,
        if_add_node_summary=opt.if_add_node_summary,
        summary_token_threshold=args.summary_token_threshold,
        model=opt.model,
        if_add_doc_description=opt.if_add_doc_description,
        if_add_node_text=opt.if_add_node_text,
        if_add_node_id=opt.if_add_node_id
    ))


def _announce_and_save(tree, source_path):
    """Print the progress messages and save ``tree`` next to them."""
    print('Parsing done, saving to file...')
    output_file = _save_structure(tree, source_path)
    print(f'Tree structure saved to: {output_file}')


# Validation, indexing and saving all run inside the profiling context, so
# the report covers the whole end-to-end run.
with profile_run(args.enable_profile):
    if args.pdf_path:
        _announce_and_save(_index_pdf(args), args.pdf_path)
    elif args.md_path:
        _announce_and_save(_index_markdown(args), args.md_path)

# Emit the profile report only when profiling was requested and a report
# was actually recorded on profile_run.
if args.enable_profile and profile_run.last_report is not None:
    profile_path = write_profile_report(args.profile_output, profile_run.last_report)
    print(f'Profile report saved to: {profile_path}')
    print(f'Profile summary: {profile_run.last_report}')
0 commit comments