@@ -712,6 +712,54 @@ def enable(self):
712712 def disable (self ):
713713 self .__do_disable = True
714714
715+ def timing_trace_is_enabled (self ):
716+ token = os .environ .get ("DAQINTERFACE_TIMING_TRACE" , "true" )
717+ return token .lower () not in ["0" , "false" , "no" , "off" ]
718+
719+ def timing_trace_filename (self ):
720+ if "DAQINTERFACE_TIMING_TRACE_FILE" in os .environ :
721+ return os .environ ["DAQINTERFACE_TIMING_TRACE_FILE" ]
722+ return "/tmp/daqinterface_timing_%s_partition%s.log" % (
723+ os .environ .get ("USER" , "unknown" ),
724+ self .partition_number ,
725+ )
726+
727+ def timing_trace (self , event , stage , elapsed_s = None , extra_fields = None ):
728+ if not self .timing_trace_is_enabled ():
729+ return
730+
731+ now = datetime .datetime .now ().strftime ("%Y-%m-%dT%H:%M:%S.%f" )
732+ fields = [
733+ "ts=%s" % (now ),
734+ "event=%s" % (event ),
735+ "stage=%s" % (stage ),
736+ "partition=%s" % (self .partition_number ),
737+ "pid=%s" % (os .getpid ()),
738+ ]
739+
740+ if self .run_number is not None :
741+ fields .append ("run=%s" % (self .run_number ))
742+
743+ if elapsed_s is not None :
744+ fields .append ("elapsed_s=%.6f" % (elapsed_s ))
745+
746+ if extra_fields is not None :
747+ for key in sorted (extra_fields .keys ()):
748+ value = str (extra_fields [key ]).replace (" " , "_" )
749+ fields .append ("%s=%s" % (key , value ))
750+
751+ with open (self .timing_trace_filename (), "a" ) as outf :
752+ outf .write (" " .join (fields ) + "\n " )
753+
754+ def timing_trace_start (self , stage , extra_fields = None ):
755+ self .timing_trace ("begin" , stage , extra_fields = extra_fields )
756+ return time ()
757+
758+ def timing_trace_end (self , stage , start_time , extra_fields = None ):
759+ self .timing_trace (
760+ "end" , stage , elapsed_s = (time () - start_time ), extra_fields = extra_fields
761+ )
762+
715763 # JCF, Jan-2-2020
716764
717765 # See Issue #23792 for more on trace_get and trace_set
@@ -1202,6 +1250,10 @@ def read_settings(self):
12021250
12031251 def check_proc_transition (self , target_state ):
12041252
1253+ transition_check_start = self .timing_trace_start (
1254+ "check_proc_transition" , {"target_state" : target_state }
1255+ )
1256+
12051257 is_all_ok = True
12061258
12071259 # The following code will give artdaq processes max_retries
@@ -1239,6 +1291,18 @@ def check_proc_transition(self, target_state):
12391291 redeemed = True
12401292 procinfo .state = target_state
12411293
1294+ if retry_counter > 0 :
1295+ self .timing_trace (
1296+ "point" ,
1297+ "check_proc_transition_retries" ,
1298+ extra_fields = {
1299+ "label" : procinfo .label ,
1300+ "retries" : retry_counter ,
1301+ "target_state" : target_state ,
1302+ "lastreturned" : procinfo .lastreturned ,
1303+ },
1304+ )
1305+
12421306 if redeemed :
12431307 successmsg = (
12441308 "After "
@@ -1290,8 +1354,19 @@ def check_proc_transition(self, target_state):
12901354 is_all_ok = False
12911355
12921356 if not is_all_ok :
1357+ self .timing_trace_end (
1358+ "check_proc_transition" ,
1359+ transition_check_start ,
1360+ {"target_state" : target_state , "result" : "failure" },
1361+ )
12931362 raise Exception ("At least one artdaq process failed a transition" )
12941363
1364+ self .timing_trace_end (
1365+ "check_proc_transition" ,
1366+ transition_check_start ,
1367+ {"target_state" : target_state , "result" : "success" },
1368+ )
1369+
12951370 def have_artdaq_mfextensions (self ):
12961371
12971372 try :
@@ -2263,6 +2338,8 @@ def execute_trace_script(self, transition):
22632338
22642339 def do_command (self , command ):
22652340
2341+ do_command_start = self .timing_trace_start ("do_command" , {"command" : command })
2342+
22662343 if command != "Start" and command != "Init" and command != "Stop" :
22672344 self .print_log (
22682345 "i" , "\n %s: %s transition underway" % (date_and_time (), command .upper ())
@@ -2486,6 +2563,8 @@ def process_command(self, procinfo_index, command):
24862563 for subsystem in subsystems_in_order :
24872564 for proctype in proctypes_in_order :
24882565
2566+ bucket_start = time ()
2567+
24892568 priorities_used = {}
24902569
24912570 for procinfo in self .procinfos :
@@ -2513,7 +2592,26 @@ def process_command(self, procinfo_index, command):
25132592 proc_threads [label ].join ()
25142593 proc_endtimes [label ] = time ()
25152594
2595+ if len (proc_threads ) > 0 :
2596+ self .timing_trace (
2597+ "point" ,
2598+ "do_command_bucket" ,
2599+ elapsed_s = (time () - bucket_start ),
2600+ extra_fields = {
2601+ "command" : command ,
2602+ "subsystem" : subsystem ,
2603+ "proctype" : proctype ,
2604+ "priority" : priority ,
2605+ "nprocs" : len (proc_threads ),
2606+ },
2607+ )
2608+
25162609 if self .exception :
2610+ self .timing_trace_end (
2611+ "do_command" ,
2612+ do_command_start ,
2613+ {"command" : command , "result" : "exception" },
2614+ )
25172615 raise Exception (
25182616 make_paragraph (
25192617 "An exception was thrown during the %s transition." % (command )
@@ -2570,13 +2668,22 @@ def process_command(self, procinfo_index, command):
25702668 try :
25712669 self .check_proc_transition (self .target_states [command ])
25722670 except Exception :
2671+ self .timing_trace_end (
2672+ "do_command" ,
2673+ do_command_start ,
2674+ {"command" : command , "result" : "transition_failure" },
2675+ )
25732676 raise Exception (
25742677 make_paragraph (
25752678 "An exception was thrown during the %s transition as at least one of the artdaq processes didn't achieve its desired state."
25762679 % (command )
25772680 )
25782681 )
25792682
2683+ self .timing_trace_end (
2684+ "do_command" , do_command_start , {"command" : command , "result" : "success" }
2685+ )
2686+
25802687 if command != "Init" and command != "Start" and command != "Stop" :
25812688
25822689 verbing = ""
@@ -3494,6 +3601,8 @@ def revert_failed_boot(failed_action):
34943601
34953602 def do_config (self , subconfigs_for_run = []):
34963603
3604+ do_config_start = self .timing_trace_start ("do_config_total" )
3605+
34973606 self .print_log ("i" , "\n %s: CONFIG transition underway" % (date_and_time ()))
34983607
34993608 os .chdir (self .daqinterface_base_dir )
@@ -3509,17 +3618,29 @@ def do_config(self, subconfigs_for_run=[]):
35093618
35103619 starttime = time ()
35113620 self .print_log ("i" , "\n Obtaining FHiCL documents..." , 1 , False )
3621+ config_info_start = self .timing_trace_start ("do_config_get_config_info" )
35123622
35133623 try :
35143624 tmpdir_for_fhicl , self .fhicl_file_path = self .get_config_info ()
35153625 assert "/tmp" == tmpdir_for_fhicl [:4 ]
35163626 except :
3627+ self .timing_trace_end (
3628+ "do_config_get_config_info" , config_info_start , {"result" : "failure" }
3629+ )
3630+ self .timing_trace_end (
3631+ "do_config_total" , do_config_start , {"result" : "failure" }
3632+ )
35173633 self .revert_failed_transition ("calling get_config_info()" )
35183634 return
35193635
3636+ self .timing_trace_end (
3637+ "do_config_get_config_info" , config_info_start , {"result" : "success" }
3638+ )
3639+
35203640 rootfile_cntr = 0
35213641
35223642 filename_dictionary = {} # If we find a repeated *.fcl file, that's an error
3643+ resolve_fhicl_start = self .timing_trace_start ("do_config_resolve_fhicl" )
35233644
35243645 for dummy , dummy , filenames in os .walk (tmpdir_for_fhicl ):
35253646 for filename in filenames :
@@ -3580,6 +3701,14 @@ def do_config(self, subconfigs_for_run=[]):
35803701 )
35813702 ),
35823703 )
3704+ self .timing_trace_end (
3705+ "do_config_resolve_fhicl" ,
3706+ resolve_fhicl_start ,
3707+ {"result" : "missing_fhicl" , "label" : self .procinfos [i_proc ].label },
3708+ )
3709+ self .timing_trace_end (
3710+ "do_config_total" , do_config_start , {"result" : "failure" }
3711+ )
35833712 self .revert_failed_transition ("looking for all needed FHiCL documents" )
35843713 return
35853714
@@ -3588,6 +3717,14 @@ def do_config(self, subconfigs_for_run=[]):
35883717 self .procinfos [i_proc ].update_fhicl (fcl )
35893718 except Exception :
35903719 self .print_log ("e" , traceback .format_exc ())
3720+ self .timing_trace_end (
3721+ "do_config_resolve_fhicl" ,
3722+ resolve_fhicl_start ,
3723+ {"result" : "update_fhicl_exception" , "label" : self .procinfos [i_proc ].label },
3724+ )
3725+ self .timing_trace_end (
3726+ "do_config_total" , do_config_start , {"result" : "failure" }
3727+ )
35913728 self .alert_and_recover (
35923729 "An exception was thrown when creating the process FHiCL documents; see traceback above for more info"
35933730 )
@@ -3612,6 +3749,11 @@ def do_config(self, subconfigs_for_run=[]):
36123749
36133750 endtime = time ()
36143751 self .print_log ("i" , "done (%.1f seconds)." % (endtime - starttime ))
3752+ self .timing_trace_end (
3753+ "do_config_resolve_fhicl" ,
3754+ resolve_fhicl_start ,
3755+ {"result" : "success" , "nprocs" : len (self .procinfos )},
3756+ )
36153757
36163758 for procinfo in self .procinfos :
36173759 assert not procinfo .fhicl is None and not procinfo .fhicl_used is None
@@ -3621,6 +3763,7 @@ def do_config(self, subconfigs_for_run=[]):
36213763
36223764 starttime = time ()
36233765 self .print_log ("i" , "Reformatting the FHiCL documents..." , 1 , False )
3766+ reformat_start = self .timing_trace_start ("do_config_reformat_fhicl" )
36243767
36253768 try :
36263769 self .create_setup_fhiclcpp_if_needed ()
@@ -3638,20 +3781,33 @@ def do_config(self, subconfigs_for_run=[]):
36383781
36393782 endtime = time ()
36403783 self .print_log ("i" , "done (%.1f seconds)." % (endtime - starttime ))
3784+ self .timing_trace_end (
3785+ "do_config_reformat_fhicl" , reformat_start , {"result" : "success" }
3786+ )
36413787
36423788 starttime = time ()
36433789 self .print_log ("i" , "Bookkeeping the FHiCL documents..." , 1 , False )
3790+ bookkeeping_start = self .timing_trace_start ("do_config_bookkeeping" )
36443791
36453792 try :
36463793 self .bookkeeping_for_fhicl_documents ()
36473794 except Exception :
36483795 self .print_log ("e" , traceback .format_exc ())
3796+ self .timing_trace_end (
3797+ "do_config_bookkeeping" , bookkeeping_start , {"result" : "failure" }
3798+ )
3799+ self .timing_trace_end (
3800+ "do_config_total" , do_config_start , {"result" : "failure" }
3801+ )
36493802 self .alert_and_recover (
36503803 "An exception was thrown when performing bookkeeping on the process FHiCL documents; see traceback above for more info"
36513804 )
36523805 return
36533806 endtime = time ()
36543807 self .print_log ("i" , "done (%.1f seconds)." % (endtime - starttime ))
3808+ self .timing_trace_end (
3809+ "do_config_bookkeeping" , bookkeeping_start , {"result" : "success" }
3810+ )
36553811
36563812 self .tmp_run_record = "/tmp/run_record_attempted_%s/%s" % (
36573813 os .environ ["USER" ],
@@ -3674,6 +3830,7 @@ def do_config(self, subconfigs_for_run=[]):
36743830
36753831 starttime = time ()
36763832 self .print_log ("i" , "Saving the run record..." , 1 , False )
3833+ save_run_record_start = self .timing_trace_start ("do_config_save_run_record" )
36773834 # self.print_log("d", "\n", 2)
36783835
36793836 try :
@@ -3689,30 +3846,54 @@ def do_config(self, subconfigs_for_run=[]):
36893846
36903847 endtime = time ()
36913848 self .print_log ("i" , "done (%.1f seconds)." % (endtime - starttime ))
3849+ self .timing_trace_end (
3850+ "do_config_save_run_record" , save_run_record_start , {"result" : "done" }
3851+ )
36923852
3853+ check_config_start = self .timing_trace_start ("do_config_check_config" )
36933854 try :
36943855 self .check_config ()
36953856 except Exception :
36963857 self .print_log ("w" , traceback .format_exc ())
3858+ self .timing_trace_end (
3859+ "do_config_check_config" , check_config_start , {"result" : "failure" }
3860+ )
3861+ self .timing_trace_end (
3862+ "do_config_total" , do_config_start , {"result" : "failure" }
3863+ )
36973864 self .revert_failed_transition (
36983865 "calling experiment-defined function check_config()"
36993866 )
37003867 return
3868+ self .timing_trace_end (
3869+ "do_config_check_config" , check_config_start , {"result" : "success" }
3870+ )
37013871
37023872 if self .manage_processes :
37033873
37043874 self .readjust_process_priorities (self .boardreader_priorities_on_config )
37053875
3876+ init_start = self .timing_trace_start ("do_config_init_transition" )
37063877 try :
37073878 self .do_command ("Init" )
37083879 except Exception :
37093880 self .print_log ("d" , traceback .format_exc (), 2 )
3881+ self .timing_trace_end (
3882+ "do_config_init_transition" , init_start , {"result" : "failure" }
3883+ )
3884+ self .timing_trace_end (
3885+ "do_config_total" , do_config_start , {"result" : "failure" }
3886+ )
37103887 self .alert_and_recover (
37113888 'An exception was thrown when attempting to send the "init" transition to the artdaq processes; see messages above for more info'
37123889 )
37133890 return
3891+ self .timing_trace_end (
3892+ "do_config_init_transition" , init_start , {"result" : "success" }
3893+ )
37143894
37153895 starttime = time ()
3896+ archive_start = self .timing_trace_start ("do_config_archive_documents" )
37163897
37173898 self .print_log (
37183899 "i" ,
@@ -3745,6 +3926,9 @@ def do_config(self, subconfigs_for_run=[]):
37453926
37463927 endtime = time ()
37473928 self .print_log ("i" , "done (%.1f seconds)." % (endtime - starttime ))
3929+ self .timing_trace_end (
3930+ "do_config_archive_documents" , archive_start , {"result" : "success" }
3931+ )
37483932
37493933 self .complete_state_change (self .name , "configuring" )
37503934
@@ -3758,6 +3942,9 @@ def do_config(self, subconfigs_for_run=[]):
37583942 )
37593943
37603944 self .print_log ("i" , "\n %s: CONFIG transition complete" % (date_and_time ()))
3945+ self .timing_trace_end (
3946+ "do_config_total" , do_config_start , {"result" : "success" }
3947+ )
37613948 return "done"
37623949
37633950 def do_start_running (self , run_number = None ):
0 commit comments