Skip to content

Commit 51ae74b

Browse files
authored
Add logic to allow for headers to be dropped/rearranged in mdio_to_segy (#810)
* Add logic to allow for headers to be dropped/rearranged in `mdio_to_segy` * precommit
1 parent b026d60 commit 51ae74b

4 files changed

Lines changed: 366 additions & 1 deletion

File tree

src/mdio/converters/mdio.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from mdio.segy.blocked_io import to_segy
1515
from mdio.segy.creation import concat_files
1616
from mdio.segy.creation import mdio_spec_to_segy
17+
from mdio.segy.utilities import project_headers_to_segy_spec
1718
from mdio.segy.utilities import segy_export_rechunker
1819

1920
try:
@@ -132,11 +133,13 @@ def mdio_to_segy( # noqa: PLR0912, PLR0913, PLR0915
132133
out_dir = output_path.parent
133134
tmp_dir = TemporaryDirectory(dir=out_dir)
134135

136+
projected_headers = project_headers_to_segy_spec(dataset["headers"].data, segy_spec)
137+
135138
with tmp_dir:
136139
with TqdmCallback(desc="Unwrapping MDIO Blocks"):
137140
block_records = to_segy(
138141
samples=dataset[default_variable_name].data,
139-
headers=dataset["headers"].data,
142+
headers=projected_headers,
140143
live_mask=dataset["trace_mask"].data,
141144
segy_factory=segy_factory,
142145
file_root=tmp_dir.name,

src/mdio/segy/utilities.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,11 @@
1515
from mdio.segy.parsers import parse_headers
1616

1717
if TYPE_CHECKING:
18+
from dask.array import Array as DaskArray
1819
from numpy.typing import DTypeLike
20+
from numpy.typing import NDArray
1921
from segy.arrays import HeaderArray
22+
from segy.schema import SegySpec
2023

2124
from mdio.builder.templates.base import AbstractDatasetTemplate
2225
from mdio.segy.file import SegyFileArguments
@@ -183,6 +186,51 @@ def get_grid_plan( # noqa: C901, PLR0912, PLR0913, PLR0915
183186
return dimensions, chunksize
184187

185188

189+
def project_headers_to_segy_spec(headers: DaskArray, segy_spec: SegySpec) -> DaskArray:
190+
"""Project stored MDIO trace headers onto the SegySpec trace header layout.
191+
192+
``SegyFactory.create_traces`` assigns headers by numpy structured-array slot position,
193+
not by field name, so the input must expose exactly the SegySpec fields in SegySpec
194+
order. A packed (no-padding), native-byte-order dtype is used to avoid numpy byteswap
195+
artifacts over padding bytes.
196+
197+
Args:
198+
headers: Dask array holding MDIO trace headers with structured dtype.
199+
segy_spec: Target SegySpec describing the output SEG-Y trace header layout.
200+
201+
Returns:
202+
Dask array with a packed, native-byte-order dtype ordered like the SegySpec.
203+
204+
Raises:
205+
ValueError: If SegySpec requests header fields that do not exist in MDIO headers.
206+
"""
207+
spec_header_dtype = segy_spec.trace.header.dtype
208+
target_names = list(spec_header_dtype.names)
209+
210+
source_names = headers.dtype.names
211+
missing = [name for name in target_names if name not in source_names]
212+
if missing:
213+
msg = (
214+
f"SegySpec requires trace header fields not present in MDIO: {missing}. "
215+
f"Available MDIO header fields: {sorted(source_names)}."
216+
)
217+
raise ValueError(msg)
218+
219+
target_dtype = np.dtype([(name, spec_header_dtype.fields[name][0].newbyteorder("=")) for name in target_names])
220+
221+
# Don't actually project if the dtype is already the same as the target dtype.
222+
if headers.dtype == target_dtype:
223+
return headers
224+
225+
def _project_block(block: NDArray) -> NDArray:
226+
out = np.empty(block.shape, dtype=target_dtype)
227+
for name in target_names:
228+
out[name] = block[name]
229+
return out
230+
231+
return headers.map_blocks(_project_block, dtype=target_dtype)
232+
233+
186234
def find_trailing_ones_index(dim_blocks: tuple[int, ...]) -> int:
187235
"""Finds the index where trailing '1's begin in a tuple of dimension block sizes.
188236

tests/integration/test_segy_import_export_masked.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@
1818
from numpy.testing import assert_array_equal
1919
from segy.factory import SegyFactory
2020
from segy.schema import HeaderField
21+
from segy.schema import HeaderSpec
2122
from segy.schema import SegySpec
23+
from segy.schema import TraceSpec
2224
from segy.standards import get_segy_standard
2325

2426
from mdio import mdio_to_segy
@@ -560,3 +562,125 @@ def read_segy_trace_header(trace_index: int) -> bytes:
560562
assert_array_equal(mdio_header_bytes, segy_header_bytes)
561563

562564
segy_trace_idx += 1
565+
566+
567+
def _spec_with_trace_header_fields(base_spec: SegySpec, fields: list[HeaderField]) -> SegySpec:
568+
"""Return a copy of base_spec whose trace header contains exactly the given fields.
569+
570+
Unlike ``SegySpec.customize``, which merges into the base field set, this builds a
571+
fresh ``HeaderSpec`` so we can produce true subsets and arbitrary orderings used in
572+
the header projection tests.
573+
"""
574+
trace = TraceSpec(
575+
header=HeaderSpec(
576+
fields=fields,
577+
item_size=base_spec.trace.header.item_size,
578+
endianness=base_spec.trace.header.endianness,
579+
),
580+
data=base_spec.trace.data,
581+
)
582+
return SegySpec(
583+
segy_standard=base_spec.segy_standard,
584+
text_header=base_spec.text_header,
585+
binary_header=base_spec.binary_header,
586+
trace=trace,
587+
endianness=base_spec.endianness,
588+
)
589+
590+
591+
@dataclass
592+
class ProjectionFixture:
593+
"""Pre-ingested MDIO and baseline SegySpec shared by header projection tests."""
594+
595+
root: Path
596+
mdio_path: Path
597+
baseline_spec: SegySpec
598+
599+
600+
@pytest.fixture(scope="module")
601+
def projection_mdio(tmp_path_factory: pytest.TempPathFactory) -> ProjectionFixture:
602+
"""Ingest a small 3D-stack SEG-Y once for the header projection tests."""
603+
grid_conf, factory_conf, to_mdio_conf, _ = STACK_3D_CONF
604+
root = tmp_path_factory.mktemp("header_projection")
605+
segy_path = root / f"{grid_conf.name}.sgy"
606+
mdio_path = root / f"{grid_conf.name}.mdio"
607+
608+
mock_nd_segy(segy_path, grid_conf, factory_conf)
609+
baseline_spec = _segy_spec_mock_nd_segy(grid_conf, factory_conf)
610+
segy_to_mdio(
611+
segy_spec=baseline_spec,
612+
mdio_template=TemplateRegistry().get(to_mdio_conf.template),
613+
input_path=segy_path,
614+
output_path=mdio_path,
615+
overwrite=True,
616+
)
617+
return ProjectionFixture(root=root, mdio_path=mdio_path, baseline_spec=baseline_spec)
618+
619+
620+
class TestExportSegySpecHeaderProjection:
621+
"""Verify mdio_to_segy supports subset and reordered SegySpecs (issue #769)."""
622+
623+
def test_reordered_spec_produces_identical_bytes(self, projection_mdio: ProjectionFixture) -> None:
624+
"""Reordering trace header fields must not change the serialized SEG-Y."""
625+
baseline_spec = projection_mdio.baseline_spec
626+
reordered_spec = _spec_with_trace_header_fields(
627+
baseline_spec,
628+
list(reversed(baseline_spec.trace.header.fields)),
629+
)
630+
631+
baseline_out = projection_mdio.root / "baseline.sgy"
632+
reordered_out = projection_mdio.root / "reordered.sgy"
633+
634+
mdio_to_segy(segy_spec=baseline_spec, input_path=projection_mdio.mdio_path, output_path=baseline_out)
635+
mdio_to_segy(segy_spec=reordered_spec, input_path=projection_mdio.mdio_path, output_path=reordered_out)
636+
637+
assert baseline_out.read_bytes() == reordered_out.read_bytes()
638+
639+
def test_subset_spec_preserves_selected_fields(self, projection_mdio: ProjectionFixture) -> None:
640+
"""Subsetting trace header fields writes those fields at their declared byte locations."""
641+
baseline_spec = projection_mdio.baseline_spec
642+
643+
subset_fields = [
644+
HeaderField(name="crossline", byte=193, format="int32"),
645+
HeaderField(name="inline", byte=189, format="int32"),
646+
HeaderField(name="samples_per_trace", byte=115, format="int16"),
647+
HeaderField(name="sample_interval", byte=117, format="int16"),
648+
]
649+
subset_spec = _spec_with_trace_header_fields(baseline_spec, subset_fields)
650+
651+
baseline_out = projection_mdio.root / "baseline_for_subset.sgy"
652+
subset_out = projection_mdio.root / "subset.sgy"
653+
654+
mdio_to_segy(segy_spec=baseline_spec, input_path=projection_mdio.mdio_path, output_path=baseline_out)
655+
mdio_to_segy(segy_spec=subset_spec, input_path=projection_mdio.mdio_path, output_path=subset_out)
656+
657+
baseline_sgy = SegyFileWrapper(baseline_out, spec=baseline_spec)
658+
subset_sgy = SegyFileWrapper(subset_out, spec=subset_spec)
659+
660+
assert baseline_sgy.num_traces == subset_sgy.num_traces
661+
662+
baseline_traces = baseline_sgy.trace[:]
663+
subset_traces = subset_sgy.trace[:]
664+
665+
for name in ("inline", "crossline", "samples_per_trace", "sample_interval"):
666+
assert_array_equal(subset_traces.header[name], baseline_traces.header[name])
667+
assert_array_equal(subset_traces.sample, baseline_traces.sample)
668+
669+
def test_missing_field_in_mdio_raises(self, projection_mdio: ProjectionFixture) -> None:
670+
"""SegySpec with a field absent from MDIO headers must raise ValueError."""
671+
# 'offset' is not ingested for STACK_3D_CONF; requesting it must fail.
672+
bad_fields = [
673+
HeaderField(name="inline", byte=189, format="int32"),
674+
HeaderField(name="crossline", byte=193, format="int32"),
675+
HeaderField(name="offset", byte=37, format="int32"),
676+
HeaderField(name="samples_per_trace", byte=115, format="int16"),
677+
HeaderField(name="sample_interval", byte=117, format="int16"),
678+
]
679+
bad_spec = _spec_with_trace_header_fields(projection_mdio.baseline_spec, bad_fields)
680+
681+
with pytest.raises(ValueError, match="offset"):
682+
mdio_to_segy(
683+
segy_spec=bad_spec,
684+
input_path=projection_mdio.mdio_path,
685+
output_path=projection_mdio.root / "should_not_exist.sgy",
686+
)

0 commit comments

Comments
 (0)