33import json
44from pageindex import *
55from pageindex .page_index_md import md_to_tree
6+ from pageindex .profiling import profile_run , write_profile_report
67from pageindex .utils import ConfigLoader
78
89if __name__ == "__main__" :
2829 help = 'Whether to add doc description to the doc' )
# Remaining command-line options, registered in their original order.
# Each entry is (flag, keyword arguments forwarded to parser.add_argument).
_trailing_options = (
    ('--if-add-node-text', {
        'type': str,
        'default': None,
        'help': 'Whether to add text to the node',
    }),

    # Markdown specific arguments
    ('--if-thinning', {
        'type': str,
        'default': 'no',
        'help': 'Whether to apply tree thinning for markdown (markdown only)',
    }),
    ('--thinning-threshold', {
        'type': int,
        'default': 5000,
        'help': 'Minimum token threshold for thinning (markdown only)',
    }),
    ('--summary-token-threshold', {
        'type': int,
        'default': 200,
        'help': 'Token threshold for generating summaries (markdown only)',
    }),

    # Profiling arguments
    ('--enable-profile', {
        'action': 'store_true',
        'help': 'Enable CPU time and memory profiling for the end-to-end run',
    }),
    ('--profile-output', {
        'type': str,
        'default': './results/profile_report.json',
        'help': 'Path to write profile report JSON when --enable-profile is set',
    }),
)
for _flag, _spec in _trailing_options:
    parser.add_argument(_flag, **_spec)

args = parser.parse_args()
40-
48+
# Validate inputs, build the tree for the PDF or Markdown file, save it as
# JSON under ./results, and optionally write a profiling report.
# This section belongs to the script body under ``if __name__ == "__main__":``
# and relies on ``args`` produced by ``parser.parse_args()``.

# Exactly one of the two input paths must be given.
if not args.pdf_path and not args.md_path:
    raise ValueError("Either --pdf_path or --md_path must be specified")
if args.pdf_path and args.md_path:
    raise ValueError("Only one of --pdf_path or --md_path can be specified")


def _save_structure(tree, source_path):
    """Write *tree* as JSON to ./results/<source stem>_structure.json.

    Args:
        tree: JSON-serialisable structure returned by the indexing call.
        source_path: Path of the input document; its base name without
            extension becomes the output file stem.

    Returns:
        The path of the file that was written.
    """
    stem = os.path.splitext(os.path.basename(source_path))[0]
    output_dir = './results'
    output_file = f'{output_dir}/{stem}_structure.json'
    os.makedirs(output_dir, exist_ok=True)

    # ensure_ascii=False keeps non-ASCII headings readable in the UTF-8 file;
    # both input formats now produce output the same way.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(tree, f, indent=2, ensure_ascii=False)
    return output_file


# Validation of the chosen file, processing and saving all happen inside the
# profiling context, so they are part of what profile_run wraps.
with profile_run(args.enable_profile):
    if args.pdf_path:
        # Validate PDF file
        if not args.pdf_path.lower().endswith('.pdf'):
            raise ValueError("PDF file must have .pdf extension")
        if not os.path.isfile(args.pdf_path):
            raise ValueError(f"PDF file not found: {args.pdf_path}")

        # Options the user left unset (None) are dropped before loading.
        user_opt = {
            'model': args.model,
            'toc_check_page_num': args.toc_check_pages,
            'max_page_num_each_node': args.max_pages_per_node,
            'max_token_num_each_node': args.max_tokens_per_node,
            'if_add_node_id': args.if_add_node_id,
            'if_add_node_summary': args.if_add_node_summary,
            'if_add_doc_description': args.if_add_doc_description,
            'if_add_node_text': args.if_add_node_text,
        }
        opt = ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None})

        toc_with_page_number = page_index_main(args.pdf_path, opt)
        print('Parsing done, saving to file...')

        output_file = _save_structure(toc_with_page_number, args.pdf_path)
        print(f'Tree structure saved to: {output_file}')

    elif args.md_path:
        # Validate Markdown file
        if not args.md_path.lower().endswith(('.md', '.markdown')):
            raise ValueError("Markdown file must have .md or .markdown extension")
        if not os.path.isfile(args.md_path):
            raise ValueError(f"Markdown file not found: {args.md_path}")

        print('Processing markdown file...')

        import asyncio

        # Same None-filtering as the PDF branch before handing options to ConfigLoader.
        config_loader = ConfigLoader()
        user_opt = {
            'model': args.model,
            'if_add_node_summary': args.if_add_node_summary,
            'if_add_doc_description': args.if_add_doc_description,
            'if_add_node_text': args.if_add_node_text,
            'if_add_node_id': args.if_add_node_id,
        }
        opt = config_loader.load({k: v for k, v in user_opt.items() if v is not None})

        # md_to_tree is a coroutine function; run it to completion here.
        toc_with_page_number = asyncio.run(md_to_tree(
            md_path=args.md_path,
            if_thinning=args.if_thinning.lower() == 'yes',
            min_token_threshold=args.thinning_threshold,
            if_add_node_summary=opt.if_add_node_summary,
            summary_token_threshold=args.summary_token_threshold,
            model=opt.model,
            if_add_doc_description=opt.if_add_doc_description,
            if_add_node_text=opt.if_add_node_text,
            if_add_node_id=opt.if_add_node_id,
        ))

        print('Parsing done, saving to file...')

        output_file = _save_structure(toc_with_page_number, args.md_path)
        print(f'Tree structure saved to: {output_file}')

# NOTE(review): presumably profile_run.last_report is populated when the
# context above exits - confirm in pageindex.profiling. If the run raises,
# this point is never reached and no report is written.
if args.enable_profile and profile_run.last_report is not None:
    profile_path = write_profile_report(args.profile_output, profile_run.last_report)
    print(f'Profile report saved to: {profile_path}')
    print(f'Profile summary: {profile_run.last_report}')
0 commit comments