Skip to content

Commit c3a67d5

Browse files
author
Caspar van Leeuwen
committed
Made many improvements to the initial script. The key ones are that it now creates files in an actual easystack format, and that it writes each file only once instead of doing one write per easyconfig
1 parent 3e7a5f4 commit c3a67d5

1 file changed

Lines changed: 162 additions & 23 deletions

File tree

eessi_software_reproduce_stack.py

Lines changed: 162 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,64 +1,203 @@
1+
import bz2
2+
import glob
13
import os
4+
import pathlib
25
import re
36
from datetime import datetime
7+
from multiprocessing import Pool
8+
9+
# EasyBuild version to record for installations of EasyBuild itself, instead of the version found in their build logs
10+
eb_override_version = "5.2.0"
411

512
# Define the directory to crawl
613
root_dir = "/cvmfs/software.eessi.io/versions/2025.06/software/linux/x86_64/amd/zen2/reprod"
714

815
# Define the maximum total build time (in seconds) per easystack file
9-
max_build_time = 1000
16+
max_build_time = 14400
1017

1118
# Initialize the dict that stores the software information, keyed by "<name>-<version>"
1219
software_info = {}
1320

14-
# Crawl the directory
15-
for software_name in os.listdir(root_dir):
21+
def get_build_duration(file: pathlib.Path, encoding: str = "utf-8") -> float:
    """
    Returns the total build duration (in seconds) by comparing the first and last timestamps from an EasyBuild log file

    :param file: path to a bz2-compressed EasyBuild log file
    :param encoding: text encoding used to decode the log; bytes that cannot be decoded are replaced
    :return: number of seconds between the timestamp on the first line and the timestamp on the last line
    :raises ValueError: if the log file is empty, or if its first or last line does not contain a timestamp
    """
    # First, get the first and last line of the EB log
    # Since this is a compressed file, we cannot seek, and have to read line-by-line to find the first and last line
    first_line = None
    last_line = None
    with bz2.open(file, mode="rt", encoding=encoding, errors="replace") as f:
        for line in f:
            line = line.rstrip("\n")
            # Get the first line
            if first_line is None:
                first_line = line
            # Continuously overwrite the last line
            last_line = line

    # An empty log leaves both lines unset; fail with a clear error instead of passing None to re.search
    if first_line is None:
        raise ValueError(f"Cannot determine build duration: log file {file} is empty")

    # Get the build duration by comparing the timestamp for the first and last lines in the log file
    # re_pattern matches a line like == 2025-10-30 12:59:09,573 easyblock.py:371...
    re_pattern = r"==\s+([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]+)"

    start_time = re.search(re_pattern, first_line)
    if start_time is None:
        raise ValueError(f"Failed to find pattern {re_pattern} in line {first_line}")

    end_time = re.search(re_pattern, last_line)
    if end_time is None:
        raise ValueError(f"Failed to find pattern {re_pattern} in line {last_line}")

    # Get actual duration by doing datetime math
    format_str = "%Y-%m-%d %H:%M:%S,%f"
    duration = datetime.strptime(end_time.group(1), format_str) - datetime.strptime(start_time.group(1), format_str)

    return duration.total_seconds()
55+
56+
def get_easybuild_version(file: pathlib.Path, encoding: str = "utf-8") -> str:
    """
    Returns the EasyBuild version that was used to build this software, obtained from the first line of the
    EasyBuild logfile

    :param file: path to a bz2-compressed EasyBuild log file
    :param encoding: text encoding used to decode the log; bytes that cannot be decoded are replaced
    :return: the version as a "major.minor.patch" string
    :raises ValueError: if the first line of the log (empty for an empty file) does not state the EasyBuild version
    """
    # Only the first line is needed, so do not decompress more than that
    with bz2.open(file, mode="rt", encoding=encoding, errors="replace") as f:
        first_line = f.readline()

    # Get the EasyBuild version
    re_pattern = r"This is EasyBuild ([0-9]+\.[0-9]+\.[0-9]+)"
    version_match = re.search(re_pattern, first_line)
    if version_match is None:
        # Raise an explicit error instead of an AttributeError on None
        raise ValueError(f"Failed to find pattern {re_pattern} in line {first_line}")

    return version_match.group(1)
70+
71+
def _find_single_build_log(build_dir, software_name):
    """
    Returns the path of the one compressed EasyBuild log for software_name below build_dir/easybuild

    :raises ValueError: if zero or more than one file matches
    """
    build_log_path_glob = os.path.join(build_dir, "easybuild", f"easybuild-{software_name}-*.log.bz2")
    # We use a wildcard, but check only one file matches
    matching_files = glob.glob(build_log_path_glob)
    if len(matching_files) != 1:
        raise ValueError(f"Expected only one file to match {build_log_path_glob}. Instead got: {matching_files}")
    return matching_files[0]


def inner_loop(software_name):
    """
    Collects the build information for every version of one software package found below root_dir

    :param software_name: name of an entry in root_dir; entries that are not directories yield an empty result
    :return: dict keyed by "<software_name>-<software_version>", each value being a dict with the keys
             "initial_build_time", "build_duration", "easybuild_version", "easyblock_path" and "easyconfig_path"
    :raises ValueError: if a version directory contains no build directories, or if a build directory does not
                        contain exactly one matching EasyBuild log
    """
    software_info = {}
    software_dir = os.path.join(root_dir, software_name)
    if not os.path.isdir(software_dir):
        return software_info

    # Determine if this is about EasyBuild itself, and if its EasyBuild version should be overridden
    override_easybuild_version = bool(software_name == "EasyBuild" and eb_override_version)

    for software_version in os.listdir(software_dir):
        software_version_dir = os.path.join(software_dir, software_version)
        if not os.path.isdir(software_version_dir):
            continue

        # os.listdir returns entries in arbitrary order. The datestamp directory names (%Y%m%d_%H%M%SUTC)
        # sort chronologically as plain strings, so sort them to reliably find the first and last build
        build_dirs = sorted(os.listdir(software_version_dir))
        if not build_dirs:
            raise ValueError(f"No build directories found in {software_version_dir}")

        # Extract the date/time of the initial software build
        datestamp_dir_first_build = os.path.join(software_version_dir, build_dirs[0])
        datestamp = os.path.basename(datestamp_dir_first_build)
        initial_build_time = datetime.strptime(datestamp, "%Y%m%d_%H%M%SUTC")

        # Extract the total build time from the build log of the first build
        build_duration = get_build_duration(_find_single_build_log(datestamp_dir_first_build, software_name))

        if override_easybuild_version:
            # If we're overriding the version of EasyBuild to build EasyBuild, set the original build time
            # such that it appears first in the easystack files, and simply define the version here
            initial_build_time = datetime(1970, 1, 1)
            easybuild_version = eb_override_version
        else:
            # Extract the EasyBuild version from the build log of the last build
            datestamp_dir_last_build = os.path.join(software_version_dir, build_dirs[-1])
            easybuild_version = get_easybuild_version(
                _find_single_build_log(datestamp_dir_last_build, software_name)
            )

        # Extract the paths to the easyblock and easyconfig files used for the last installation
        easyblock_path = os.path.join(software_version_dir, "easybuild", "reprod", "easyblocks", "*.py")
        easyconfig_path = os.path.join(software_version_dir, "easybuild", f"{software_name}-{software_version}.eb")

        # Store the software information
        software_info[software_name + "-" + software_version] = {
            "initial_build_time": initial_build_time,
            "build_duration": build_duration,
            "easybuild_version": easybuild_version,
            "easyblock_path": easyblock_path,
            "easyconfig_path": easyconfig_path,
        }

    return software_info
128+
129+
# Only start the worker pool when this file is run as a script. With the "spawn" and "forkserver" start
# methods, worker processes import the main module again and must not try to create a pool of their own.
if __name__ == "__main__":
    # Use as many workers as we have cores in our cgroup
    n_workers = len(os.sched_getaffinity(0))

    # Parallelize work over each dir present in the root_dir
    # (the debugging leftover that limited this list to its first 10 entries has been removed)
    software_list = os.listdir(root_dir)
    print(f"software list: {software_list}")
    with Pool(processes=n_workers) as pool:
        software_info_list = pool.map(inner_loop, software_list)

    # Each worker in the pool creates its own software info dict. The result of the map function is a list of
    # these dicts. Here, we merge all these dicts into one. Note that we know the keys to be unique, so no risk
    # of clashes (if a key did occur more than once, the value from the last dict would win)
    software_info = {k: v for d in software_info_list if d for k, v in d.items()}
    print(f"Located {len(software_info)} software installations in {root_dir}")
    import pprint
    pprint.pprint(software_info)

    # Order the list of software chronologically
    software_info = dict(sorted(software_info.items(), key=lambda item: item[1]["initial_build_time"]))
56156

157+
def write_software_info(local_software_info, easystack_file, build_duration):
    """
    Writes one easystack file listing the given software

    Every easyconfig becomes an entry under the top-level "easyconfigs" key, with its own "options" mapping
    that sets "include-easyblocks" for that entry.

    :param local_software_info: dict mapping a software label to its info dict; the keys "easyconfig_path",
                                "easyblock_path" and "build_duration" of each info dict are used
    :param easystack_file: path of the easystack file to write; an existing file is overwritten
    :param build_duration: total build duration (in seconds) of all listed software, recorded in a header comment
    """
    # Open in write mode: appending to a file left over from a previous run would result in a file with
    # two "easyconfigs" keys
    with open(easystack_file, "w", encoding="utf-8") as easystack_file_handle:
        easystack_file_handle.write(f"# {easystack_file}: total build duration = {build_duration:.0f} seconds\n")
        easystack_file_handle.write("easyconfigs:\n")
        for software_name, info in local_software_info.items():
            print(f'Adding {software_name} with build duration {info["build_duration"]} to easystack {easystack_file}.')
            # The trailing colon makes the easyconfig a mapping key, so that its options can be nested below it
            easystack_file_handle.write(f'  - {info["easyconfig_path"]}:\n')
            easystack_file_handle.write('      options:\n')
            easystack_file_handle.write(f'        include-easyblocks: {info["easyblock_path"]}\n')
166+
57167
# Write the software to easystack files, in chronological order. A new easystack file is started whenever
# the EasyBuild version changes, or when adding the next software would push the accumulated build duration
# of the current file over max_build_time.
# Only do this when run as a script, so that merely importing this module (as worker processes may do for
# the main module) never writes any files.
if __name__ == "__main__":
    sequence_number = 1
    previous_eb_ver = None
    total_build_duration = 0
    build_duration_current_easystack = 0
    local_software_info = {}
    for software_name, info in software_info.items():
        if (
            len(local_software_info) > 0 and  # Skip first iteration, there's nothing to flush to disk yet
            (
                info["easybuild_version"] != previous_eb_ver or  # Different EB version from last iteration
                (build_duration_current_easystack + info["build_duration"]) > max_build_time
            )
        ):
            # Write previous local_software_info to an easystack. All of its entries were built with
            # previous_eb_ver, since a change of EasyBuild version always triggers a flush
            easystack_file = f'easystack-{sequence_number}-eb-{previous_eb_ver}.yml'
            write_software_info(local_software_info, easystack_file, build_duration_current_easystack)
            build_duration_current_easystack = 0
            local_software_info = {}
            sequence_number += 1

        # Add the current software to the local_software_info
        local_software_info[software_name] = info
        build_duration_current_easystack = build_duration_current_easystack + info["build_duration"]
        total_build_duration = total_build_duration + info["build_duration"]
        previous_eb_ver = info["easybuild_version"]

    # Flush the last local_software_info to disk. If no software was found at all there is nothing to
    # flush, and no (empty) easystack file should be created
    if local_software_info:
        easystack_file = f'easystack-{sequence_number}-eb-{previous_eb_ver}.yml'
        write_software_info(local_software_info, easystack_file, build_duration_current_easystack)
        n_easystacks = sequence_number
    else:
        n_easystacks = sequence_number - 1

    print(f"Total of {n_easystacks} easystacks with a total build time of {total_build_duration} seconds")

0 commit comments

Comments
 (0)