Skip to content

Commit 9e9b5f8

Browse files
authored
Harmonise metadata creation (#140)
* Use sys.argv for metadata and standardise input files metadata
1 parent bf56da8 commit 9e9b5f8

24 files changed

Lines changed: 108 additions & 333 deletions

chlorophyll/chl_climatology_and_fill.py

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -70,18 +70,6 @@ def main():
7070
input_directory = os.path.abspath(args.input_directory)
7171
output_filename = os.path.abspath(args.output_filename)
7272

73-
this_file = os.path.normpath(__file__)
74-
75-
# Add some info about how the file was generated
76-
runcmd = (
77-
f"python3 {os.path.basename(this_file)} --input-directory={input_directory} "
78-
f"--output-filename={output_filename}"
79-
)
80-
81-
history_attrs = {
82-
"history": get_provenance_metadata(this_file, runcmd),
83-
}
84-
8573
# Load the input data and compute the monthly climatology
8674
input_files = sorted(glob.glob(f"{input_directory}/*.nc"))
8775

@@ -160,7 +148,7 @@ def main():
160148
chl.time.attrs["long_name"] = "Time"
161149
chl.time.attrs["standard_name"] = "time"
162150
chl.time.attrs["axis"] = "T"
163-
chl.attrs |= history_attrs
151+
chl.attrs |= get_provenance_metadata(input_files)
164152
comp = dict(zlib=True, complevel=4)
165153
encoding = {var: comp for var in chl.data_vars}
166154
# Time coords should be double type according to CF conventions

data_stream_xml_generation/generate_xml_datm_era5.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -85,17 +85,15 @@
8585
root = Element("file", id="stream", version="2.0")
8686

8787
# Obtain metadata
88-
this_file = sys.argv[0]
89-
runcmd = " ".join(sys.argv)
90-
metadata_info = get_provenance_metadata(this_file, runcmd)
88+
metadata_info = get_provenance_metadata()
9189

9290
# Add metadata
9391
metadata = SubElement(root, "metadata")
9492
SubElement(metadata, "File_type").text = "DATM xml file provides forcing data"
9593
SubElement(metadata, "date_generated").text = datetime.now().strftime(
9694
"%Y-%m-%d %H:%M:%S"
9795
)
98-
SubElement(metadata, "history").text = metadata_info
96+
SubElement(metadata, "history").text = metadata_info["history"]
9997

10098
# Generate stream info elements with changing years
10199
for stream_name, era5_prefix, datavar_pairs, mapalgo, offset_seconds in STREAM_SPECS:

data_stream_xml_generation/generate_xml_datm_jra55.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,17 +61,15 @@
6161
root = Element("file", id="stream", version="2.0")
6262

6363
# Obtain metadata
64-
this_file = sys.argv[0]
65-
runcmd = " ".join(sys.argv)
66-
metadata_info = get_provenance_metadata(this_file, runcmd)
64+
metadata_info = get_provenance_metadata()
6765

6866
# Add metadata
6967
metadata = SubElement(root, "metadata")
7068
SubElement(metadata, "File_type").text = "DATM xml file provides forcing data"
7169
SubElement(metadata, "date_generated").text = datetime.now().strftime(
7270
"%Y-%m-%d %H:%M:%S"
7371
)
74-
SubElement(metadata, "history").text = metadata_info
72+
SubElement(metadata, "history").text = metadata_info["history"]
7573

7674
# Define the stream info names and corresponding var names
7775
stream_info_names = [*STREAMS_AVE, *STREAMS_PT]

data_stream_xml_generation/generate_xml_drof_jra55.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,17 +42,15 @@
4242
root = Element("file", id="stream", version="2.0")
4343

4444
# Obtain metadata
45-
this_file = sys.argv[0]
46-
runcmd = " ".join(sys.argv)
47-
metadata_info = get_provenance_metadata(this_file, runcmd)
45+
metadata_info = get_provenance_metadata()
4846

4947
# Add metadata
5048
metadata = SubElement(root, "metadata")
5149
SubElement(metadata, "File_type").text = "DROF xml file provides river runoff data"
5250
SubElement(metadata, "date_generated").text = datetime.now().strftime(
5351
"%Y-%m-%d %H:%M:%S"
5452
)
55-
SubElement(metadata, "history").text = metadata_info
53+
SubElement(metadata, "history").text = metadata_info["history"]
5654

5755
# Define the stream info names and corresponding var names
5856
stream_info_data = [

external_tidal_generation/generate_bottom_roughness_intermediate_woa.py

Lines changed: 8 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@
135135

136136
path_root = Path(__file__).parents[1]
137137
sys.path.append(str(path_root))
138-
from scripts_common import get_provenance_metadata, md5sum
138+
from scripts_common import get_provenance_metadata
139139

140140

141141
def coriolis_f(lat: xr.DataArray) -> xr.DataArray:
@@ -838,30 +838,14 @@ def main():
838838
)
839839

840840
# Add provenance metadata and MD5 hashes for input files.
841-
this_file = os.path.normpath(__file__)
842-
runcmd = (
843-
f"mpirun -n $PBS_NCPUS python3 {os.path.basename(this_file)} "
844-
f"--woa_temp_file={args.woa_temp_file} "
845-
f"--woa_salt_file={args.woa_salt_file} "
846-
f"--synbath_file={args.synbath_file} "
847-
f"--chunk-lat={args.chunk_lat} "
848-
f"--chunk-lon={args.chunk_lon} "
849-
f"--nradial={args.nradial} "
850-
f"--ntheta={args.ntheta} "
851-
f"--earth-radius={args.earth_radius} "
852-
f"--omega={args.omega} "
853-
f"--print-every={args.print_every} "
854-
f"--woa-intermediate-file={args.woa_intermediate_file} "
855-
)
856-
857-
history = get_provenance_metadata(this_file, runcmd)
858-
global_attrs = {"history": history}
859-
file_hashes = [
860-
f"{args.woa_temp_file} (md5 hash: {md5sum(args.woa_temp_file)})",
861-
f"{args.woa_salt_file} (md5 hash: {md5sum(args.woa_salt_file)})",
862-
f"{args.synbath_file} (md5 hash: {md5sum(args.synbath_file)})",
841+
runcmd = f"mpirun -n $PBS_NCPUS python3 {' '.join(sys.argv)} "
842+
input_files = [
843+
args.woa_temp_file,
844+
args.woa_salt_file,
845+
args.synbath_file,
863846
]
864-
global_attrs["inputFile"] = ", ".join(file_hashes)
847+
global_attrs = get_provenance_metadata(input_files, runcmd)
848+
865849
ds_woa_output.attrs.update(global_attrs)
866850

867851
ds_woa_output.to_netcdf(args.woa_intermediate_file)

external_tidal_generation/generate_bottom_roughness_polyfit.py

Lines changed: 4 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
path_root = Path(__file__).parents[1]
4040
sys.path.append(str(path_root))
4141

42-
from scripts_common import get_provenance_metadata, md5sum
42+
from scripts_common import get_provenance_metadata
4343
from mesh_generation.generate_mesh import mom6_mask_detection
4444

4545

@@ -383,25 +383,10 @@ def main():
383383
)
384384

385385
# Add provenance metadata and MD5 hashes for input files.
386-
this_file = os.path.normpath(__file__)
387-
runcmd = (
388-
f"mpirun -n $PBS_NCPUS python3 {os.path.basename(this_file)} "
389-
f"--high-res-topo-file={args.high_res_topo_file} "
390-
f"--hgrid-file={args.hgrid_file} "
391-
f"--topog-file={args.topog_file} "
392-
f"--chunk-lat={args.chunk_lat} "
393-
f"--chunk-lon={args.chunk_lon} "
394-
f"--output={args.output}"
395-
)
386+
runcmd = f"mpirun -n $PBS_NCPUS python3 {' '.join(sys.argv)} "
387+
input_files = [args.high_res_topo_file, args.hgrid_file, args.topog_file]
388+
global_attrs = get_provenance_metadata(input_files, runcmd)
396389

397-
history = get_provenance_metadata(this_file, runcmd)
398-
global_attrs = {"history": history}
399-
file_hashes = [
400-
f"{args.high_res_topo_file} (md5 hash: {md5sum(args.high_res_topo_file)})",
401-
f"{args.hgrid_file} (md5 hash: {md5sum(args.hgrid_file)})",
402-
f"{args.topog_file} (md5 hash: {md5sum(args.topog_file)})",
403-
]
404-
global_attrs["inputFile"] = ", ".join(file_hashes)
405390
h2_out.attrs.update(global_attrs)
406391

407392
h2_out.to_netcdf(args.output)

external_tidal_generation/generate_bottom_roughness_regrid.py

Lines changed: 3 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@
5353

5454
path_root = Path(__file__).parents[1]
5555
sys.path.append(str(path_root))
56-
from scripts_common import get_provenance_metadata, md5sum
56+
from scripts_common import get_provenance_metadata
5757
from mesh_generation.generate_mesh import mom6_mask_detection
5858

5959

@@ -396,26 +396,8 @@ def main():
396396
print("Regridding done!")
397397

398398
# Add provenance metadata and MD5 hashes for input files.
399-
this_file = os.path.normpath(__file__)
400-
runcmd = (
401-
f"python3 {os.path.basename(this_file)} "
402-
f"--topog_file={args.topog_file} "
403-
f"--hgrid_file={args.hgrid_file} "
404-
f"--woa_intermediate_file={args.woa_intermediate_file} "
405-
f"--output_file={args.output_file} "
406-
f"--method={args.method} "
407-
f"--periodic_regrid={args.periodic_regrid} "
408-
f"--periodic_lon_laplace={args.periodic_lon_laplace}"
409-
)
410-
411-
history = get_provenance_metadata(this_file, runcmd)
412-
global_attrs = {"history": history}
413-
file_hashes = [
414-
f"{args.hgrid_file} (md5 hash: {md5sum(args.hgrid_file)})",
415-
f"{args.topog_file} (md5 hash: {md5sum(args.topog_file)})",
416-
f"{args.woa_intermediate_file} (md5 hash: {md5sum(args.woa_intermediate_file)})",
417-
]
418-
global_attrs["inputFile"] = ", ".join(file_hashes)
399+
input_files = [args.hgrid_file, args.topog_file, args.woa_intermediate_file]
400+
global_attrs = get_provenance_metadata(input_files)
419401
regrid_depth_var.attrs.update(global_attrs)
420402

421403
output_path = Path(args.output_file)

external_tidal_generation/generate_tide_amplitude.py

Lines changed: 3 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
path_root = Path(__file__).parents[1]
3939
sys.path.append(str(path_root))
4040

41-
from scripts_common import get_provenance_metadata, md5sum
41+
from scripts_common import get_provenance_metadata
4242
from mesh_generation.generate_mesh import mom6_mask_detection
4343

4444
PRIMARY_CONSTITUENTS = ["m2", "s2", "n2", "k2", "k1", "o1", "p1", "q1"]
@@ -277,25 +277,8 @@ def main():
277277
tideamp = tideamp.rename({"x": "xh", "y": "yh"})
278278

279279
# Add provenance metadata and MD5 hashes for input files.
280-
this_file = os.path.normpath(__file__)
281-
runcmd = (
282-
f"python3 {os.path.basename(this_file)} "
283-
f"--hgrid-file={args.hgrid_file} "
284-
f"--topog-file={args.topog_file} "
285-
f"--method={args.method} "
286-
f"--data-path={args.data_path} "
287-
f"--output={args.output} "
288-
)
289-
290-
history = get_provenance_metadata(this_file, runcmd)
291-
global_attrs = {"history": history}
292-
293-
# add md5 hashes for input files
294-
file_hashes = [
295-
f"{args.hgrid_file} (md5 hash: {md5sum(args.hgrid_file)})",
296-
f"{args.topog_file} (md5 hash: {md5sum(args.topog_file)})",
297-
]
298-
global_attrs["inputFile"] = ", ".join(file_hashes)
280+
input_files = [args.hgrid_file, args.topog_file]
281+
global_attrs = get_provenance_metadata(input_files)
299282
tideamp.attrs.update(global_attrs)
300283

301284
tideamp.to_netcdf(args.output, unlimited_dims=["time"])

grid_generation/generate_vertical_grid.py

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
path_root = Path(__file__).parents[1]
3939
sys.path.append(str(path_root))
4040

41-
from scripts_common import get_provenance_metadata, md5sum
41+
from scripts_common import get_provenance_metadata
4242

4343
# Define a small constant to initialize the iteration and prevent numerical issues
4444
epsilon = 0.001
@@ -91,18 +91,8 @@ def generate_vertical_grid(H, dzd, min_dz, depfac, output_filename):
9191
: len(real_prop_z)
9292
] # Trim the spacing values to match the adjusted depth levels
9393

94-
this_file = os.path.normpath(__file__)
95-
96-
# Add some info about how the file was generated
97-
runcmd = (
98-
f"python3 {os.path.basename(this_file)} --H={H} --depfac={depfac} "
99-
f"--dzd={dzd} "
100-
f"--min_dz={min_dz} "
101-
f"--output={output_filename} "
102-
)
103-
10494
# Write to NetCDF file
105-
write_netcdf_file(output_filename, real_prop_z, this_file, runcmd)
95+
write_netcdf_file(output_filename, real_prop_z)
10696

10797
print(
10898
f"SUCCESS! A vertical grid with {len(real_prop_z) - 1} levels has been generated. "
@@ -111,7 +101,7 @@ def generate_vertical_grid(H, dzd, min_dz, depfac, output_filename):
111101
)
112102

113103

114-
def write_netcdf_file(output_filename, real_prop_z, this_file, runcmd):
104+
def write_netcdf_file(output_filename, real_prop_z):
115105
"""Function to write vertical grid data to a NetCDF file."""
116106
# Convert to float32 (single precision) to ensure values are exactly representable in single precision,
117107
# then convert back to float64 (double precision) for storage in NetCDF.
@@ -125,7 +115,7 @@ def write_netcdf_file(output_filename, real_prop_z, this_file, runcmd):
125115
zeta.standard_name = "depth"
126116
zeta.long_name = "vertical grid depth at top and bottom of each cell"
127117
eddyfile.variables["zeta"][:] = real_prop_z_float64
128-
eddyfile.setncatts({"history": get_provenance_metadata(this_file, runcmd)})
118+
eddyfile.setncatts(get_provenance_metadata()) # Obtain metadata
129119
eddyfile.close()
130120

131121

make_ryf/make_ryf.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
path_root = Path(__file__).parents[1]
3939
sys.path.append(str(path_root))
4040

41-
from scripts_common import get_provenance_metadata, md5sum
41+
from scripts_common import get_provenance_metadata
4242

4343
FILLVALUE = 1e20
4444
# compression settings to use
@@ -113,7 +113,7 @@
113113
ds = {}
114114

115115
for var in variables:
116-
ryf_files = str()
116+
input_files = []
117117
print(var)
118118
for y in (year1, year2):
119119
if source_data == "jra55v1p4" or source_data == "jra55v1p6":
@@ -147,7 +147,7 @@
147147
print("Loading {} for {}".format(files[0], y))
148148
ds[y] = xarray.open_dataset(files[0], decode_coords=False)
149149
# save info for metadata
150-
ryf_files += f"{files[0]} (md5 hash: {md5sum(files[0])}, )"
150+
input_files.append(files[0])
151151
# Make a copy of the second year without time_bnds
152152
ryf = ds[baseyear].drop_vars("time_bnds")
153153
ryf.encoding = ds[baseyear].encoding
@@ -205,10 +205,7 @@
205205
)
206206

207207
# Add some info about how the file was generated
208-
this_file = os.path.normpath(__file__)
209-
runcmd = f"python3 {os.path.basename(this_file)}"
210-
ryf.attrs |= {"RYF_creation": get_provenance_metadata(this_file, runcmd)}
211-
ryf.attrs |= {"RYF_inputFiles": ryf_files}
208+
ryf.attrs |= get_provenance_metadata(input_files)
212209

213210
outfile = "RYF.{}.{}_{}.nc".format(var, year1, year2)
214211
print("Writing ", outfile)

0 commit comments

Comments
 (0)