Skip to content

Commit c325fbe

Browse files
committed
adding time profiling in preparation for speedup work
1 parent 7e276c3 commit c325fbe

1 file changed

Lines changed: 187 additions & 0 deletions

File tree

rc/control/daqinterface.py

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -712,6 +712,54 @@ def enable(self):
712712
def disable(self):
713713
self.__do_disable = True
714714

715+
def timing_trace_is_enabled(self):
716+
token = os.environ.get("DAQINTERFACE_TIMING_TRACE", "true")
717+
return token.lower() not in ["0", "false", "no", "off"]
718+
719+
def timing_trace_filename(self):
720+
if "DAQINTERFACE_TIMING_TRACE_FILE" in os.environ:
721+
return os.environ["DAQINTERFACE_TIMING_TRACE_FILE"]
722+
return "/tmp/daqinterface_timing_%s_partition%s.log" % (
723+
os.environ.get("USER", "unknown"),
724+
self.partition_number,
725+
)
726+
727+
def timing_trace(self, event, stage, elapsed_s=None, extra_fields=None):
728+
if not self.timing_trace_is_enabled():
729+
return
730+
731+
now = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")
732+
fields = [
733+
"ts=%s" % (now),
734+
"event=%s" % (event),
735+
"stage=%s" % (stage),
736+
"partition=%s" % (self.partition_number),
737+
"pid=%s" % (os.getpid()),
738+
]
739+
740+
if self.run_number is not None:
741+
fields.append("run=%s" % (self.run_number))
742+
743+
if elapsed_s is not None:
744+
fields.append("elapsed_s=%.6f" % (elapsed_s))
745+
746+
if extra_fields is not None:
747+
for key in sorted(extra_fields.keys()):
748+
value = str(extra_fields[key]).replace(" ", "_")
749+
fields.append("%s=%s" % (key, value))
750+
751+
with open(self.timing_trace_filename(), "a") as outf:
752+
outf.write(" ".join(fields) + "\n")
753+
754+
def timing_trace_start(self, stage, extra_fields=None):
755+
self.timing_trace("begin", stage, extra_fields=extra_fields)
756+
return time()
757+
758+
def timing_trace_end(self, stage, start_time, extra_fields=None):
759+
self.timing_trace(
760+
"end", stage, elapsed_s=(time() - start_time), extra_fields=extra_fields
761+
)
762+
715763
# JCF, Jan-2-2020
716764

717765
# See Issue #23792 for more on trace_get and trace_set
@@ -1202,6 +1250,10 @@ def read_settings(self):
12021250

12031251
def check_proc_transition(self, target_state):
12041252

1253+
transition_check_start = self.timing_trace_start(
1254+
"check_proc_transition", {"target_state": target_state}
1255+
)
1256+
12051257
is_all_ok = True
12061258

12071259
# The following code will give artdaq processes max_retries
@@ -1239,6 +1291,18 @@ def check_proc_transition(self, target_state):
12391291
redeemed = True
12401292
procinfo.state = target_state
12411293

1294+
if retry_counter > 0:
1295+
self.timing_trace(
1296+
"point",
1297+
"check_proc_transition_retries",
1298+
extra_fields={
1299+
"label": procinfo.label,
1300+
"retries": retry_counter,
1301+
"target_state": target_state,
1302+
"lastreturned": procinfo.lastreturned,
1303+
},
1304+
)
1305+
12421306
if redeemed:
12431307
successmsg = (
12441308
"After "
@@ -1290,8 +1354,19 @@ def check_proc_transition(self, target_state):
12901354
is_all_ok = False
12911355

12921356
if not is_all_ok:
1357+
self.timing_trace_end(
1358+
"check_proc_transition",
1359+
transition_check_start,
1360+
{"target_state": target_state, "result": "failure"},
1361+
)
12931362
raise Exception("At least one artdaq process failed a transition")
12941363

1364+
self.timing_trace_end(
1365+
"check_proc_transition",
1366+
transition_check_start,
1367+
{"target_state": target_state, "result": "success"},
1368+
)
1369+
12951370
def have_artdaq_mfextensions(self):
12961371

12971372
try:
@@ -2263,6 +2338,8 @@ def execute_trace_script(self, transition):
22632338

22642339
def do_command(self, command):
22652340

2341+
do_command_start = self.timing_trace_start("do_command", {"command": command})
2342+
22662343
if command != "Start" and command != "Init" and command != "Stop":
22672344
self.print_log(
22682345
"i", "\n%s: %s transition underway" % (date_and_time(), command.upper())
@@ -2486,6 +2563,8 @@ def process_command(self, procinfo_index, command):
24862563
for subsystem in subsystems_in_order:
24872564
for proctype in proctypes_in_order:
24882565

2566+
bucket_start = time()
2567+
24892568
priorities_used = {}
24902569

24912570
for procinfo in self.procinfos:
@@ -2513,7 +2592,26 @@ def process_command(self, procinfo_index, command):
25132592
proc_threads[label].join()
25142593
proc_endtimes[label] = time()
25152594

2595+
if len(proc_threads) > 0:
2596+
self.timing_trace(
2597+
"point",
2598+
"do_command_bucket",
2599+
elapsed_s=(time() - bucket_start),
2600+
extra_fields={
2601+
"command": command,
2602+
"subsystem": subsystem,
2603+
"proctype": proctype,
2604+
"priority": priority,
2605+
"nprocs": len(proc_threads),
2606+
},
2607+
)
2608+
25162609
if self.exception:
2610+
self.timing_trace_end(
2611+
"do_command",
2612+
do_command_start,
2613+
{"command": command, "result": "exception"},
2614+
)
25172615
raise Exception(
25182616
make_paragraph(
25192617
"An exception was thrown during the %s transition." % (command)
@@ -2570,13 +2668,22 @@ def process_command(self, procinfo_index, command):
25702668
try:
25712669
self.check_proc_transition(self.target_states[command])
25722670
except Exception:
2671+
self.timing_trace_end(
2672+
"do_command",
2673+
do_command_start,
2674+
{"command": command, "result": "transition_failure"},
2675+
)
25732676
raise Exception(
25742677
make_paragraph(
25752678
"An exception was thrown during the %s transition as at least one of the artdaq processes didn't achieve its desired state."
25762679
% (command)
25772680
)
25782681
)
25792682

2683+
self.timing_trace_end(
2684+
"do_command", do_command_start, {"command": command, "result": "success"}
2685+
)
2686+
25802687
if command != "Init" and command != "Start" and command != "Stop":
25812688

25822689
verbing = ""
@@ -3494,6 +3601,8 @@ def revert_failed_boot(failed_action):
34943601

34953602
def do_config(self, subconfigs_for_run=[]):
34963603

3604+
do_config_start = self.timing_trace_start("do_config_total")
3605+
34973606
self.print_log("i", "\n%s: CONFIG transition underway" % (date_and_time()))
34983607

34993608
os.chdir(self.daqinterface_base_dir)
@@ -3509,17 +3618,29 @@ def do_config(self, subconfigs_for_run=[]):
35093618

35103619
starttime = time()
35113620
self.print_log("i", "\nObtaining FHiCL documents...", 1, False)
3621+
config_info_start = self.timing_trace_start("do_config_get_config_info")
35123622

35133623
try:
35143624
tmpdir_for_fhicl, self.fhicl_file_path = self.get_config_info()
35153625
assert "/tmp" == tmpdir_for_fhicl[:4]
35163626
except:
3627+
self.timing_trace_end(
3628+
"do_config_get_config_info", config_info_start, {"result": "failure"}
3629+
)
3630+
self.timing_trace_end(
3631+
"do_config_total", do_config_start, {"result": "failure"}
3632+
)
35173633
self.revert_failed_transition("calling get_config_info()")
35183634
return
35193635

3636+
self.timing_trace_end(
3637+
"do_config_get_config_info", config_info_start, {"result": "success"}
3638+
)
3639+
35203640
rootfile_cntr = 0
35213641

35223642
filename_dictionary = {} # If we find a repeated *.fcl file, that's an error
3643+
resolve_fhicl_start = self.timing_trace_start("do_config_resolve_fhicl")
35233644

35243645
for dummy, dummy, filenames in os.walk(tmpdir_for_fhicl):
35253646
for filename in filenames:
@@ -3580,6 +3701,14 @@ def do_config(self, subconfigs_for_run=[]):
35803701
)
35813702
),
35823703
)
3704+
self.timing_trace_end(
3705+
"do_config_resolve_fhicl",
3706+
resolve_fhicl_start,
3707+
{"result": "missing_fhicl", "label": self.procinfos[i_proc].label},
3708+
)
3709+
self.timing_trace_end(
3710+
"do_config_total", do_config_start, {"result": "failure"}
3711+
)
35833712
self.revert_failed_transition("looking for all needed FHiCL documents")
35843713
return
35853714

@@ -3588,6 +3717,14 @@ def do_config(self, subconfigs_for_run=[]):
35883717
self.procinfos[i_proc].update_fhicl(fcl)
35893718
except Exception:
35903719
self.print_log("e", traceback.format_exc())
3720+
self.timing_trace_end(
3721+
"do_config_resolve_fhicl",
3722+
resolve_fhicl_start,
3723+
{"result": "update_fhicl_exception", "label": self.procinfos[i_proc].label},
3724+
)
3725+
self.timing_trace_end(
3726+
"do_config_total", do_config_start, {"result": "failure"}
3727+
)
35913728
self.alert_and_recover(
35923729
"An exception was thrown when creating the process FHiCL documents; see traceback above for more info"
35933730
)
@@ -3612,6 +3749,11 @@ def do_config(self, subconfigs_for_run=[]):
36123749

36133750
endtime = time()
36143751
self.print_log("i", "done (%.1f seconds)." % (endtime - starttime))
3752+
self.timing_trace_end(
3753+
"do_config_resolve_fhicl",
3754+
resolve_fhicl_start,
3755+
{"result": "success", "nprocs": len(self.procinfos)},
3756+
)
36153757

36163758
for procinfo in self.procinfos:
36173759
assert not procinfo.fhicl is None and not procinfo.fhicl_used is None
@@ -3621,6 +3763,7 @@ def do_config(self, subconfigs_for_run=[]):
36213763

36223764
starttime = time()
36233765
self.print_log("i", "Reformatting the FHiCL documents...", 1, False)
3766+
reformat_start = self.timing_trace_start("do_config_reformat_fhicl")
36243767

36253768
try:
36263769
self.create_setup_fhiclcpp_if_needed()
@@ -3638,20 +3781,33 @@ def do_config(self, subconfigs_for_run=[]):
36383781

36393782
endtime = time()
36403783
self.print_log("i", "done (%.1f seconds)." % (endtime - starttime))
3784+
self.timing_trace_end(
3785+
"do_config_reformat_fhicl", reformat_start, {"result": "success"}
3786+
)
36413787

36423788
starttime = time()
36433789
self.print_log("i", "Bookkeeping the FHiCL documents...", 1, False)
3790+
bookkeeping_start = self.timing_trace_start("do_config_bookkeeping")
36443791

36453792
try:
36463793
self.bookkeeping_for_fhicl_documents()
36473794
except Exception:
36483795
self.print_log("e", traceback.format_exc())
3796+
self.timing_trace_end(
3797+
"do_config_bookkeeping", bookkeeping_start, {"result": "failure"}
3798+
)
3799+
self.timing_trace_end(
3800+
"do_config_total", do_config_start, {"result": "failure"}
3801+
)
36493802
self.alert_and_recover(
36503803
"An exception was thrown when performing bookkeeping on the process FHiCL documents; see traceback above for more info"
36513804
)
36523805
return
36533806
endtime = time()
36543807
self.print_log("i", "done (%.1f seconds)." % (endtime - starttime))
3808+
self.timing_trace_end(
3809+
"do_config_bookkeeping", bookkeeping_start, {"result": "success"}
3810+
)
36553811

36563812
self.tmp_run_record = "/tmp/run_record_attempted_%s/%s" % (
36573813
os.environ["USER"],
@@ -3674,6 +3830,7 @@ def do_config(self, subconfigs_for_run=[]):
36743830

36753831
starttime = time()
36763832
self.print_log("i", "Saving the run record...", 1, False)
3833+
save_run_record_start = self.timing_trace_start("do_config_save_run_record")
36773834
# self.print_log("d", "\n", 2)
36783835

36793836
try:
@@ -3689,30 +3846,54 @@ def do_config(self, subconfigs_for_run=[]):
36893846

36903847
endtime = time()
36913848
self.print_log("i", "done (%.1f seconds)." % (endtime - starttime))
3849+
self.timing_trace_end(
3850+
"do_config_save_run_record", save_run_record_start, {"result": "done"}
3851+
)
36923852

3853+
check_config_start = self.timing_trace_start("do_config_check_config")
36933854
try:
36943855
self.check_config()
36953856
except Exception:
36963857
self.print_log("w", traceback.format_exc())
3858+
self.timing_trace_end(
3859+
"do_config_check_config", check_config_start, {"result": "failure"}
3860+
)
3861+
self.timing_trace_end(
3862+
"do_config_total", do_config_start, {"result": "failure"}
3863+
)
36973864
self.revert_failed_transition(
36983865
"calling experiment-defined function check_config()"
36993866
)
37003867
return
3868+
self.timing_trace_end(
3869+
"do_config_check_config", check_config_start, {"result": "success"}
3870+
)
37013871

37023872
if self.manage_processes:
37033873

37043874
self.readjust_process_priorities(self.boardreader_priorities_on_config)
37053875

3876+
init_start = self.timing_trace_start("do_config_init_transition")
37063877
try:
37073878
self.do_command("Init")
37083879
except Exception:
37093880
self.print_log("d", traceback.format_exc(), 2)
3881+
self.timing_trace_end(
3882+
"do_config_init_transition", init_start, {"result": "failure"}
3883+
)
3884+
self.timing_trace_end(
3885+
"do_config_total", do_config_start, {"result": "failure"}
3886+
)
37103887
self.alert_and_recover(
37113888
'An exception was thrown when attempting to send the "init" transition to the artdaq processes; see messages above for more info'
37123889
)
37133890
return
3891+
self.timing_trace_end(
3892+
"do_config_init_transition", init_start, {"result": "success"}
3893+
)
37143894

37153895
starttime = time()
3896+
archive_start = self.timing_trace_start("do_config_archive_documents")
37163897

37173898
self.print_log(
37183899
"i",
@@ -3745,6 +3926,9 @@ def do_config(self, subconfigs_for_run=[]):
37453926

37463927
endtime = time()
37473928
self.print_log("i", "done (%.1f seconds)." % (endtime - starttime))
3929+
self.timing_trace_end(
3930+
"do_config_archive_documents", archive_start, {"result": "success"}
3931+
)
37483932

37493933
self.complete_state_change(self.name, "configuring")
37503934

@@ -3758,6 +3942,9 @@ def do_config(self, subconfigs_for_run=[]):
37583942
)
37593943

37603944
self.print_log("i", "\n%s: CONFIG transition complete" % (date_and_time()))
3945+
self.timing_trace_end(
3946+
"do_config_total", do_config_start, {"result": "success"}
3947+
)
37613948
return "done"
37623949

37633950
def do_start_running(self, run_number=None):

0 commit comments

Comments
 (0)