|
| 1 | +"""SEG-Y textual file header validation and sanitization helpers.""" |
| 2 | + |
| 3 | +from __future__ import annotations |
| 4 | + |
| 5 | +import re |
| 6 | + |
# Card layout enforced by validate_text_header and produced by sanitize_text_header:
# EXPECTED_ROWS rows, each exactly EXPECTED_COLS characters wide.
EXPECTED_ROWS = 40
EXPECTED_COLS = 80
# Upper bound on the code point _is_safe_char accepts (top of the 7-bit ASCII range).
ASCII_MAX_ORD = 127

# Maximum number of offending rows / positions spelled out in a validation error message.
_REPORT_LIMIT = 5
# Matches two or more consecutive newlines; sanitize_text_header collapses each run to one.
_NEWLINE_RUN = re.compile(r"\n{2,}")
| 13 | + |
| 14 | + |
def _is_safe_char(char: str) -> bool:
    """Tell whether ``char`` is a printable character inside the 7-bit ASCII range.

    Args:
        char: A single-character string.

    Returns:
        True only when the code point is at most ``ASCII_MAX_ORD`` and the character is printable.
    """
    # Reject anything beyond ASCII first; str.isprintable() alone accepts printable Unicode.
    if ord(char) > ASCII_MAX_ORD:
        return False
    return char.isprintable()
| 18 | + |
| 19 | + |
def _summarize(mapping: dict[int, list[int]], limit: int = _REPORT_LIMIT) -> str:
    """Format ``{row: [positions]}`` for an error message, capped for readability.

    At most ``limit`` rows are listed, and at most ``limit`` positions per listed row.
    Everything left out is counted in a ``(+N more ...)`` note, so a truncated report
    cannot be mistaken for a complete one.

    Args:
        mapping: Row index mapped to the offending column positions within that row.
        limit: Maximum number of rows, and of positions per row, to spell out.

    Returns:
        ``"{}"`` for an empty mapping, otherwise a comma-separated summary string.
    """
    if not mapping:
        return "{}"

    parts: list[str] = []
    for row, positions in list(mapping.items())[:limit]:
        shown = positions[:limit]
        part = f"row {row}: positions {shown}"
        # Flag per-row truncation the same way omitted rows are flagged below.
        hidden = len(positions) - len(shown)
        if hidden > 0:
            part += f" (+{hidden} more)"
        parts.append(part)

    body = ", ".join(parts)
    extra_rows = len(mapping) - len(parts)
    if extra_rows > 0:
        body += f" (+{extra_rows} more rows)"
    return body
| 33 | + |
| 34 | + |
def validate_text_header(text_header: str) -> None:
    r"""Check that a SEG-Y textual file header is 40 rows of 80 ASCII-printable characters.

    The checks run in order (row count, row widths, character safety) and the first
    failing check raises; later checks are not attempted.

    Args:
        text_header: Decoded text header in wrapped form (40 rows of 80 chars joined by ``\n``).

    Raises:
        ValueError: If the row count, any row width, or any character violates the layout.
    """
    lines = text_header.split("\n")

    # 1) Row count.
    line_count = len(lines)
    if line_count != EXPECTED_ROWS:
        err = f"Invalid text header line count: expected {EXPECTED_ROWS}, got {line_count}"
        raise ValueError(err)

    # 2) Row widths: collect (row index, actual width) for every row that is off.
    widths = [len(line) for line in lines]
    wrong_width = [(index, width) for index, width in enumerate(widths) if width != EXPECTED_COLS]
    if wrong_width:
        shown = wrong_width[:_REPORT_LIMIT]
        hidden = len(wrong_width) - len(shown)
        suffix = f" (+{hidden} more)" if hidden > 0 else ""
        err = f"Invalid text header line widths: expected {EXPECTED_COLS} columns; offending rows: {shown}{suffix}"
        raise ValueError(err)

    # 3) Character safety: map each offending row index to its unsafe column positions.
    per_row = (
        (index, [col for col, char in enumerate(line) if not _is_safe_char(char)])
        for index, line in enumerate(lines)
    )
    unsafe: dict[int, list[int]] = {index: cols for index, cols in per_row if cols}
    if unsafe:
        err = f"Invalid text header characters: non-ASCII or non-printable at {_summarize(unsafe)}"
        raise ValueError(err)
| 66 | + |
| 67 | + |
def sanitize_text_header(text_header: str) -> str:
    r"""Force a SEG-Y textual file header into the 40x80 ASCII-printable card layout.

    Steps, in order:

    1. Every run of two or more ``\n`` is collapsed to a single ``\n``.
    2. The text is split on ``\n`` and only the first 40 rows are kept.
    3. Each kept row is cut to 80 characters, unsafe characters become spaces,
       and the row is right-padded with spaces to 80 characters.
    4. Blank rows are appended until there are exactly 40.

    Args:
        text_header: Decoded textual file header string.

    Returns:
        The 40 rows joined by ``\n``; the result passes :func:`validate_text_header`.
    """
    collapsed = _NEWLINE_RUN.sub("\n", text_header)
    blank_card = " " * EXPECTED_COLS

    # Replacement is one-for-one per character, so cutting to width before cleaning
    # gives the same row as cleaning first and cutting afterwards.
    cards = [
        "".join(char if _is_safe_char(char) else " " for char in line[:EXPECTED_COLS]).ljust(EXPECTED_COLS)
        for line in collapsed.split("\n")[:EXPECTED_ROWS]
    ]

    # Top up short headers with blank cards; never negative because of the slice above.
    cards.extend([blank_card] * (EXPECTED_ROWS - len(cards)))
    return "\n".join(cards)
0 commit comments