Skip to content

Commit b026d60

Browse files
authored
814 text header enh (#817)
* Implement better text header parsing and in-flow correction logic * Lint and tighten up code
1 parent b43fd63 commit b026d60

10 files changed

Lines changed: 675 additions & 27 deletions

File tree

docs/configuration.md

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ You can find a summary of the available variables and their defaults below.
1818
| `MDIO__EXPORT__CPU_COUNT` | `int` | Number of logical CPUs available |
1919
| `MDIO__GRID__SPARSITY_RATIO_WARN` | `float` | `2.0` |
2020
| `MDIO__GRID__SPARSITY_RATIO_LIMIT` | `float` | `10.0` |
21-
| `MDIO__IMPORT__SAVE_SEGY_FILE_HEADER` | `bool` | `False` |
21+
| `MDIO__IMPORT__SAVE_SEGY_FILE_HEADER` | `int` | `0` |
2222
| `MDIO__IMPORT__CLOUD_NATIVE` | `bool` | `False` |
2323
| `MDIO__IMPORT__RAW_HEADERS` | `bool` | `False` |
2424
| `MDIO_IGNORE_CHECKS` | `bool` | `False` |
@@ -71,13 +71,29 @@ $ export MDIO__GRID__SPARSITY_RATIO_LIMIT=15.0
7171

7272
### `MDIO__IMPORT__SAVE_SEGY_FILE_HEADER`
7373

74-
**Accepted values:** `true`, `false`, `1`, `0`, `yes`, `no`, `on`, `off`
74+
**Accepted values:** `0`, `1`, `2`, `true`, `false`, `yes`, `no`, `on`, `off`
75+
76+
Controls preservation of the original SEG-Y textual file header during import.
77+
The textual file header must be 40 lines of 80 printable characters per the
78+
SEG-Y standard; lossy EBCDIC decoding can produce headers that violate this
79+
layout. The variable selects how MDIO reacts:
80+
81+
| Value | Behavior |
82+
| ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
83+
| `0` / `false` | Do not save SEG-Y file headers (default). |
84+
| `1` / `true` | Save SEG-Y file headers and raise `ValueError` if the text header is not exactly 40x80 ASCII-printable characters (rejects e.g. `U+FFFD` from a lossy EBCDIC decode). |
85+
| `2`           | Save SEG-Y file headers; if the text header is malformed, log a warning and correct it (non-ASCII or non-printable characters become spaces, and the header is padded or truncated to 40 rows of 80 columns). |
7586

76-
When enabled, preserves the original SEG-Y textual file header during import.
77-
This is useful for maintaining full SEG-Y standard compliance and preserving survey metadata.
87+
```{note}
88+
On export, `mdio_to_segy` always defensively validates the stored text header
89+
and, if it cannot be re-encoded as ASCII (for example because the store was
90+
written by an older version of MDIO that accepted lossy EBCDIC decodes),
91+
repairs it on the fly and emits a warning. Re-ingest the source SEG-Y with
92+
mode `1` or `2` to silence the warning permanently.
93+
```
7894

7995
```shell
80-
$ export MDIO__IMPORT__SAVE_SEGY_FILE_HEADER=true
96+
$ export MDIO__IMPORT__SAVE_SEGY_FILE_HEADER=1
8197
$ mdio segy import input.segy output.mdio --header-locations 189,193
8298
```
8399

src/mdio/core/config.py

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,26 @@
11
"""Environment variable management for MDIO operations."""
22

3+
from typing import Literal
4+
35
from psutil import cpu_count
46
from pydantic import Field
7+
from pydantic import field_validator
58
from pydantic_settings import BaseSettings
69
from pydantic_settings import SettingsConfigDict
710

11+
# Modes accepted by ``MDIO__IMPORT__SAVE_SEGY_FILE_HEADER``.
SAVE_SEGY_FILE_HEADER_OFF = 0  # skip saving the SEG-Y file headers
SAVE_SEGY_FILE_HEADER_STRICT = 1  # save; raise on a malformed text header
SAVE_SEGY_FILE_HEADER_LENIENT = 2  # save; correct a malformed text header

# Static type checkers only accept literal values (not variable names) inside
# ``Literal[...]``, so the members are spelled out here. Keep them in sync with
# the three constants above; the runtime value is identical either way.
SaveSegyFileHeaderMode = Literal[0, 1, 2]

# Case-insensitive string aliases kept for the legacy boolean form of the setting.
_SAVE_HEADER_TRUE_STRINGS = frozenset({"true", "yes", "on"})
_SAVE_HEADER_FALSE_STRINGS = frozenset({"false", "no", "off"})
23+
824

925
class MDIOSettings(BaseSettings):
1026
"""MDIO environment configuration settings."""
@@ -34,9 +50,12 @@ class MDIOSettings(BaseSettings):
3450
)
3551

3652
# Import configuration
37-
save_segy_file_header: bool = Field(
38-
default=False,
39-
description="Whether to save SEG-Y file headers",
53+
save_segy_file_header: SaveSegyFileHeaderMode = Field(
54+
default=0,
55+
description=(
56+
"How to save SEG-Y file headers: 0 (or False) skips, 1 (or True) saves "
57+
"and raises on malformed text header, 2 saves and corrects malformed text header."
58+
),
4059
alias="MDIO__IMPORT__SAVE_SEGY_FILE_HEADER",
4160
)
4261
raw_headers: bool = Field(
@@ -58,3 +77,21 @@ class MDIOSettings(BaseSettings):
5877
)
5978

6079
model_config = SettingsConfigDict(case_sensitive=True)
80+
81+
@field_validator("save_segy_file_header", mode="before")
@classmethod
def _coerce_save_segy_file_header(cls, value: object) -> object:
    """Normalize legacy bools and string aliases to an integer mode.

    Args:
        value: Raw value supplied for ``save_segy_file_header``.

    Returns:
        The integer mode for bools, recognized aliases, and integer-like
        strings; any other value is returned unchanged.
    """
    if isinstance(value, bool):
        return int(value)
    if not isinstance(value, str):
        return value

    alias = value.strip().lower()
    if alias in _SAVE_HEADER_FALSE_STRINGS:
        return SAVE_SEGY_FILE_HEADER_OFF
    if alias in _SAVE_HEADER_TRUE_STRINGS:
        return SAVE_SEGY_FILE_HEADER_STRICT

    try:
        return int(value)
    except ValueError:
        # Hand the unparsed string back so the field's own validation reports it.
        return value

src/mdio/ingestion/segy/file_headers.py

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,41 +3,48 @@
33
from __future__ import annotations
44

55
import base64
6+
import logging
67
from typing import TYPE_CHECKING
78

9+
from mdio.core.config import SAVE_SEGY_FILE_HEADER_LENIENT
10+
from mdio.core.config import SAVE_SEGY_FILE_HEADER_OFF
11+
from mdio.core.config import SAVE_SEGY_FILE_HEADER_STRICT
812
from mdio.core.config import MDIOSettings
13+
from mdio.segy.text_header import sanitize_text_header
14+
from mdio.segy.text_header import validate_text_header
915

1016
if TYPE_CHECKING:
1117
from xarray import Dataset as xr_Dataset
1218

1319
from mdio.segy.file import SegyFileInfo
1420

1521

22+
logger = logging.getLogger(__name__)
23+
24+
1625
def _add_segy_file_headers(xr_dataset: xr_Dataset, segy_file_info: SegyFileInfo) -> xr_Dataset:
1726
"""Attach the SEG-Y text and binary file headers as attrs on a scalar variable."""
1827
settings = MDIOSettings()
28+
mode = settings.save_segy_file_header
1929

20-
if not settings.save_segy_file_header:
30+
if mode == SAVE_SEGY_FILE_HEADER_OFF:
2131
return xr_dataset
2232

23-
expected_rows = 40
24-
expected_cols = 80
25-
26-
text_header_rows = segy_file_info.text_header.splitlines()
27-
text_header_cols_bad = [len(row) != expected_cols for row in text_header_rows]
28-
29-
if len(text_header_rows) != expected_rows:
30-
err = f"Invalid text header count: expected {expected_rows}, got {len(segy_file_info.text_header)}"
31-
raise ValueError(err)
33+
text_header = segy_file_info.text_header
3234

33-
if any(text_header_cols_bad):
34-
err = f"Invalid text header columns: expected {expected_cols} per line."
35-
raise ValueError(err)
35+
if mode == SAVE_SEGY_FILE_HEADER_LENIENT:
36+
try:
37+
validate_text_header(text_header)
38+
except ValueError as exc:
39+
logger.warning("Correcting malformed SEG-Y text header on import: %s", exc)
40+
text_header = sanitize_text_header(text_header)
41+
elif mode == SAVE_SEGY_FILE_HEADER_STRICT:
42+
validate_text_header(text_header)
3643

3744
xr_dataset["segy_file_header"] = ((), "")
3845
xr_dataset["segy_file_header"].attrs.update(
3946
{
40-
"textHeader": segy_file_info.text_header,
47+
"textHeader": text_header,
4148
"binaryHeader": segy_file_info.binary_header_dict,
4249
}
4350
)

src/mdio/segy/creation.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
from mdio.api.io import open_mdio
1818
from mdio.exceptions import MDIOMissingVariableError
1919
from mdio.segy.compat import encode_segy_revision
20+
from mdio.segy.text_header import sanitize_text_header
21+
from mdio.segy.text_header import validate_text_header
2022

2123
if TYPE_CHECKING:
2224
import xarray as xr
@@ -28,6 +30,23 @@
2830
logger = logging.getLogger(__name__)
2931

3032

33+
def _ensure_exportable_text_header(text_header: str) -> str:
    """Return a text header that passes validation, repairing it if necessary.

    The stored header is returned untouched when it validates. Otherwise a
    warning carrying the validation error is logged and the sanitized header
    is returned instead.

    Args:
        text_header: The ``textHeader`` attribute as stored on the MDIO dataset.

    Returns:
        A text header string that satisfies :func:`validate_text_header`.
    """
    try:
        validate_text_header(text_header)
    except ValueError as exc:
        logger.warning("Stored MDIO text header is not exportable as-is and will be repaired: %s", exc)
        exportable = sanitize_text_header(text_header)
    else:
        exportable = text_header
    return exportable
48+
49+
3150
def make_segy_factory(spec: SegySpec, binary_header: dict[str, int]) -> SegyFactory:
3251
"""Generate SEG-Y factory from MDIO metadata."""
3352
sample_interval = binary_header["sample_interval"]
@@ -88,6 +107,7 @@ def mdio_spec_to_segy(
88107

89108
factory = make_segy_factory(spec=segy_spec, binary_header=binary_header)
90109

110+
text_header = _ensure_exportable_text_header(text_header)
91111
text_header_bytes = factory.create_textual_header(text_header)
92112

93113
# During MDIO SEGY import, TGSAI/segy always creates revision major/minor fields

src/mdio/segy/text_header.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
"""SEG-Y textual file header validation and sanitization helpers."""
2+
3+
from __future__ import annotations
4+
5+
import re
6+
7+
EXPECTED_ROWS = 40
8+
EXPECTED_COLS = 80
9+
ASCII_MAX_ORD = 127
10+
11+
_REPORT_LIMIT = 5
12+
_NEWLINE_RUN = re.compile(r"\n{2,}")
13+
14+
15+
def _is_safe_char(char: str) -> bool:
    """Tell whether ``char`` is a printable character within the 7-bit ASCII range."""
    if ord(char) > ASCII_MAX_ORD:
        return False
    return char.isprintable()
18+
19+
20+
def _summarize(mapping: dict[int, list[int]], limit: int = _REPORT_LIMIT) -> str:
    """Render ``{row: [positions]}`` as a short, human-readable fragment.

    At most ``limit`` rows are listed, each with at most ``limit`` positions.
    When rows are left out, their count is appended to the text.

    Args:
        mapping: Row index mapped to the offending column positions in that row.
        limit: Cap applied to both the listed rows and the positions per row.

    Returns:
        The formatted summary, or ``"{}"`` for an empty mapping.
    """
    if not mapping:
        return "{}"

    entries = list(mapping.items())
    shown = entries[:limit]
    fragments = [f"row {row}: positions {positions[:limit]}" for row, positions in shown]

    hidden = len(entries) - len(shown)
    suffix = f" (+{hidden} more rows)" if hidden > 0 else ""
    return ", ".join(fragments) + suffix
33+
34+
35+
def validate_text_header(text_header: str) -> None:
    r"""Check that a textual file header is 40 rows of 80 ASCII-printable characters.

    The checks run in order (row count, row widths, character safety) and the
    first failing check raises.

    Args:
        text_header: Decoded text header in wrapped form (40 rows of 80 chars joined by ``\n``).

    Raises:
        ValueError: If row count, row width, or any character fails the SEG-Y ASCII contract.
    """
    rows = text_header.split("\n")
    row_count = len(rows)

    if row_count != EXPECTED_ROWS:
        err = f"Invalid text header line count: expected {EXPECTED_ROWS}, got {row_count}"
        raise ValueError(err)

    wrong_widths = [(index, len(row)) for index, row in enumerate(rows) if len(row) != EXPECTED_COLS]
    if wrong_widths:
        shown = wrong_widths[:_REPORT_LIMIT]
        hidden = len(wrong_widths) - len(shown)
        more = f" (+{hidden} more)" if hidden > 0 else ""
        err = f"Invalid text header line widths: expected {EXPECTED_COLS} columns; offending rows: {shown}{more}"
        raise ValueError(err)

    # Collect the unsafe columns of every row, then keep only rows that have any.
    unsafe_by_row = {
        index: [col for col, char in enumerate(row) if not _is_safe_char(char)]
        for index, row in enumerate(rows)
    }
    offenders = {index: cols for index, cols in unsafe_by_row.items() if cols}

    if offenders:
        err = f"Invalid text header characters: non-ASCII or non-printable at {_summarize(offenders)}"
        raise ValueError(err)
66+
67+
68+
def sanitize_text_header(text_header: str) -> str:
    r"""Force a textual file header into the 40x80 ASCII-printable card layout.

    Consecutive ``\n`` characters are first collapsed into a single one. Only
    the first 40 resulting rows are kept; in each, unsafe characters become
    spaces and the row is cut or space-padded to exactly 80 characters. Missing
    rows are filled with blank cards so the result always has 40 rows.

    Args:
        text_header: Decoded textual file header string.

    Returns:
        Sanitized header that satisfies :func:`validate_text_header`.
    """
    collapsed = _NEWLINE_RUN.sub("\n", text_header)
    blank_card = " " * EXPECTED_COLS

    cards = [
        "".join(char if _is_safe_char(char) else " " for char in raw)[:EXPECTED_COLS].ljust(EXPECTED_COLS)
        for raw in collapsed.split("\n")[:EXPECTED_ROWS]
    ]
    # Top up with blank cards when the input had fewer than the expected rows.
    cards.extend([blank_card] * (EXPECTED_ROWS - len(cards)))

    return "\n".join(cards)

tests/unit/ingestion/test_segy_file_headers.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ def test_invalid_row_count_raises(self) -> None:
8181
ds = _empty_dataset()
8282
with (
8383
patch.dict(os.environ, {"MDIO__IMPORT__SAVE_SEGY_FILE_HEADER": "true"}),
84-
pytest.raises(ValueError, match="Invalid text header count"),
84+
pytest.raises(ValueError, match="Invalid text header line count"),
8585
):
8686
_add_segy_file_headers(ds, info)
8787

@@ -93,6 +93,6 @@ def test_invalid_column_count_raises(self) -> None:
9393
ds = _empty_dataset()
9494
with (
9595
patch.dict(os.environ, {"MDIO__IMPORT__SAVE_SEGY_FILE_HEADER": "true"}),
96-
pytest.raises(ValueError, match="Invalid text header columns"),
96+
pytest.raises(ValueError, match="Invalid text header line widths"),
9797
):
9898
_add_segy_file_headers(ds, info)

tests/unit/test_environment.py

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from unittest.mock import patch
55

66
import pytest
7+
from pydantic import ValidationError
78

89
from mdio.core.config import MDIOSettings
910

@@ -32,7 +33,7 @@ def test_environment_isolation(self) -> None:
3233
original_values = {
3334
"cpus": MDIOSettings().export_cpus,
3435
"ratio": MDIOSettings().grid_sparsity_ratio_warn,
35-
"bool": MDIOSettings().save_segy_file_header,
36+
"save_header": MDIOSettings().save_segy_file_header,
3637
}
3738

3839
with patch.dict(
@@ -45,9 +46,51 @@ def test_environment_isolation(self) -> None:
4546
):
4647
assert MDIOSettings().export_cpus == 99
4748
assert MDIOSettings().grid_sparsity_ratio_warn == 99.9
48-
assert MDIOSettings().save_segy_file_header is True
49+
assert MDIOSettings().save_segy_file_header == 1
4950

5051
# Values should be restored after context
5152
assert MDIOSettings().export_cpus == original_values["cpus"]
5253
assert MDIOSettings().grid_sparsity_ratio_warn == original_values["ratio"]
53-
assert MDIOSettings().save_segy_file_header == original_values["bool"]
54+
assert MDIOSettings().save_segy_file_header == original_values["save_header"]
55+
56+
57+
class TestSaveSegyFileHeaderMode:
    """Coercion rules for ``MDIO__IMPORT__SAVE_SEGY_FILE_HEADER``."""

    @pytest.mark.parametrize(
        ("env_value", "expected"),
        [
            # Numeric modes map to themselves.
            *((digit, int(digit)) for digit in ("0", "1", "2")),
            # Legacy "false" aliases map to mode 0.
            *((alias, 0) for alias in ("false", "False", "FALSE", "no", "off")),
            # Legacy "true" aliases map to mode 1.
            *((alias, 1) for alias in ("true", "True", "TRUE", "yes", "on")),
        ],
    )
    def test_string_coercion(self, env_value: str, expected: int) -> None:
        """Environment strings, including legacy bool aliases, resolve to 0, 1, or 2."""
        overrides = {"MDIO__IMPORT__SAVE_SEGY_FILE_HEADER": env_value}
        with patch.dict(os.environ, overrides):
            resolved = MDIOSettings().save_segy_file_header
        assert resolved == expected

    @pytest.mark.parametrize("python_value", [False, True, 0, 1, 2])
    def test_native_python_values(self, python_value: bool | int) -> None:
        """Bools and ints passed straight to the constructor remain supported."""
        resolved = MDIOSettings(MDIO__IMPORT__SAVE_SEGY_FILE_HEADER=python_value).save_segy_file_header
        assert resolved == int(python_value)

    @pytest.mark.parametrize("bad_value", ["3", "-1", "maybe", "tru"])
    def test_rejects_invalid_strings(self, bad_value: str) -> None:
        """Values outside 0/1/2 and the bool aliases fail validation."""
        overrides = {"MDIO__IMPORT__SAVE_SEGY_FILE_HEADER": bad_value}
        with patch.dict(os.environ, overrides):
            with pytest.raises(ValidationError):
                MDIOSettings()

0 commit comments

Comments
 (0)