|
| 1 | +"""Minimal OOXML helpers for .xlsx files. |
| 2 | +
|
| 3 | +The pipeline avoids third-party dependencies, so it edits the workbook as XML |
| 4 | +inside the .xlsx zip. Higher-level workbook code should own row/column meaning. |
| 5 | +""" |
| 6 | +import zipfile |
| 7 | +import xml.etree.ElementTree as ET |
| 8 | + |
# SpreadsheetML main namespace. The workbook, worksheet and shared-string
# elements that this module looks up all live in it.
M_NS = "http://schemas.openxmlformats.org/spreadsheetml/2006/main"

# Package relationships namespace. The <Relationship> entries of
# xl/_rels/workbook.xml.rels are matched under it.
REL_NS = "http://schemas.openxmlformats.org/package/2006/relationships"

# Office-document relationships namespace. It qualifies the "id" attribute
# read from each <sheet> element of workbook.xml.
OFFICE_REL = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"

# Prefix map passed to ElementTree find()/findall() so paths can use "m:".
NS = {"m": M_NS}

# Register SpreadsheetML as the default (prefix-less) namespace so that
# ElementTree writes <c>, <row>, ... instead of generated "ns0:" prefixes.
# NOTE: this runs at import time and the registration is process-wide.
ET.register_namespace("", M_NS)
| 23 | + |
| 24 | + |
def sheet_paths(z: zipfile.ZipFile) -> dict[str, str]:
    """Return worksheet display name -> XML path inside the workbook zip.

    Args:
        z: An open workbook archive that contains ``xl/workbook.xml`` and
            ``xl/_rels/workbook.xml.rels``.

    Returns:
        A mapping from each sheet's ``name`` attribute to the zip member
        path of its worksheet part, without a leading slash.

    Raises:
        KeyError: If one of the two parts is missing from the archive, or a
            sheet refers to a relationship id that the rels part lacks.
    """
    import posixpath

    # workbook.xml lists sheets by name and relationship id.
    workbook = ET.fromstring(z.read("xl/workbook.xml"))

    # workbook.xml.rels maps relationship ids to worksheet part targets.
    rels = ET.fromstring(z.read("xl/_rels/workbook.xml.rels"))
    rel_targets = {
        rel.attrib["Id"]: rel.attrib["Target"]
        for rel in rels.findall(f"{{{REL_NS}}}Relationship")
    }

    paths = {}
    for sheet in workbook.findall("m:sheets/m:sheet", NS):
        target = rel_targets[sheet.attrib[f"{{{OFFICE_REL}}}id"]]

        if target.startswith("/"):
            # Absolute targets are rooted at the package; zip member names
            # carry no leading slash.
            member = posixpath.normpath(target.lstrip("/"))
        elif target.startswith("xl/"):
            # Already spelled as a member path; kept as-is for callers that
            # relied on this historical leniency.
            member = target
        else:
            # Relative targets resolve against xl/, the folder that holds
            # workbook.xml. normpath collapses "./" and "../" segments.
            member = posixpath.normpath(posixpath.join("xl", target))

        paths[sheet.attrib["name"]] = member
    return paths
| 45 | + |
| 46 | + |
def shared_strings(z: zipfile.ZipFile) -> list[str]:
    """Read Excel's shared string table.

    Args:
        z: An open workbook archive.

    Returns:
        The visible text of every ``<si>`` entry, in table order, so a
        cell's shared-string index can be used directly as a list index.
        An empty list is returned when the workbook has no
        ``xl/sharedStrings.xml`` part.
    """
    # Workbooks with only inline strings may not have sharedStrings.xml.
    if "xl/sharedStrings.xml" not in z.namelist():
        return []

    root = ET.fromstring(z.read("xl/sharedStrings.xml"))

    strings = []
    for item in root.findall("m:si", NS):
        # An entry is either one plain <t> or a series of rich-text runs
        # <r><t>. Phonetic runs (<rPh><t>) hold reading hints that are not
        # part of the displayed text, so a blanket ".//m:t" search would
        # append them to the value.
        text_nodes = item.findall("m:t", NS) + item.findall("m:r/m:t", NS)
        strings.append("".join(node.text or "" for node in text_nodes))
    return strings
| 59 | + |
| 60 | + |
def cell_text(cell: ET.Element, strings: list[str]) -> str:
    """Return a cell's displayed text value.

    Args:
        cell: A worksheet ``<c>`` element.
        strings: The shared string table, indexed by the number stored in
            ``<v>`` for cells of type ``s``.

    Returns:
        The shared or inline string for text cells, otherwise the raw
        content of ``<v>``. Empty cells yield ``""``.

    Raises:
        ValueError: If a shared-string cell holds a non-integer index.
        IndexError: If a shared-string index is outside ``strings``.
    """
    cell_type = cell.attrib.get("t")

    # Normal cells store their value under <v>.
    value = cell.find("m:v", NS)
    raw = value.text if value is not None else None

    # t="s" means <v> is an index into sharedStrings.xml.
    if cell_type == "s" and raw is not None:
        return strings[int(raw)]

    # t="inlineStr" means the text lives directly inside the cell.
    if cell_type == "inlineStr":
        inline = cell.find("m:is", NS)
        if inline is None:
            return ""
        # Take the plain <t> and the rich-text runs <r><t> only. Phonetic
        # runs (<rPh><t>) are reading hints, not displayed text.
        text_nodes = inline.findall("m:t", NS) + inline.findall("m:r/m:t", NS)
        return "".join(node.text or "" for node in text_nodes)

    # Numeric/plain cells can be returned directly from <v>.
    return raw if raw is not None else ""
| 77 | + |
| 78 | + |
| 79 | +def split_ref(ref: str) -> tuple[int | None, int | None]: |
| 80 | + """Convert an Excel cell reference like C12 into (3, 12).""" |
| 81 | + # Separate column letters from row digits. |
| 82 | + letters = "".join(ch for ch in ref if ch.isalpha()) |
| 83 | + digits = "".join(ch for ch in ref if ch.isdigit()) |
| 84 | + if not letters or not digits: |
| 85 | + return None, None |
| 86 | + |
| 87 | + # Convert base-26 letters into a 1-based column number. |
| 88 | + col = 0 |
| 89 | + for ch in letters: |
| 90 | + col = col * 26 + ord(ch.upper()) - 64 |
| 91 | + return col, int(digits) |
| 92 | + |
| 93 | + |
def col_name(idx: int) -> str:
    """Convert a 1-based column number into Excel letters.

    Args:
        idx: Column number, where 1 is ``A``, 26 is ``Z`` and 27 is ``AA``.

    Returns:
        The column letters. ``0`` yields ``""``, which keeps the historical
        behaviour for callers that pass it.

    Raises:
        ValueError: If ``idx`` is negative. Without this guard the loop
            below never terminates, because floor division keeps a
            negative index at -1.
    """
    if idx < 0:
        raise ValueError(f"column index must not be negative: {idx}")

    name = ""

    # Excel columns are base-26 but without a zero digit, hence idx - 1.
    while idx:
        idx, rem = divmod(idx - 1, 26)
        name = chr(ord("A") + rem) + name
    return name
| 103 | + |
| 104 | + |
def ensure_cell(row: ET.Element, col: int) -> ET.Element:
    """Fetch the cell at ``col`` in ``row``, creating it in place if absent.

    Args:
        row: A worksheet ``<row>`` element; its ``r`` attribute supplies
            the row part of a new cell's reference.
        col: 1-based column number of the wanted cell.

    Returns:
        The first existing ``<c>`` child whose reference is in ``col``,
        otherwise a new empty ``<c>`` that has been added to ``row``.

    Raises:
        KeyError: If ``row`` has no ``r`` attribute.
    """

    def column_of(cell: ET.Element) -> int | None:
        # Cells without a usable reference report no column.
        return split_ref(cell.attrib.get("r", ""))[0]

    def is_right_of_target(cell: ET.Element) -> bool:
        number = column_of(cell)
        return bool(number and number > col)

    # Reference of the wanted cell, e.g. column 14 of row 2 is "N2".
    wanted_ref = f"{col_name(col)}{row.attrib['r']}"
    siblings = row.findall("m:c", NS)

    # An existing cell in that column wins.
    found = next((cell for cell in siblings if column_of(cell) == col), None)
    if found is not None:
        return found

    created = ET.Element(f"{{{M_NS}}}c", {"r": wanted_ref})

    # Keep column order: the new cell goes just before the first cell that
    # sits further right, or at the end of the row when none does.
    slot = next(
        (index for index, cell in enumerate(siblings) if is_right_of_target(cell)),
        None,
    )
    if slot is None:
        row.append(created)
    else:
        row.insert(slot, created)
    return created
| 130 | + |
| 131 | + |
def set_text(cell: ET.Element, value: str) -> None:
    """Replace a cell's contents with inline text.

    Every existing child of ``cell`` (value, inline string, formula) and
    its ``t`` attribute are removed. Other attributes are left untouched.

    Args:
        cell: The worksheet ``<c>`` element to modify in place.
        value: Text to store. A falsy value leaves the cell blank.
    """
    # Remove any old <v>, <is>, or formula children.
    for child in list(cell):
        cell.remove(child)

    # Remove the previous cell type before setting the new representation.
    cell.attrib.pop("t", None)

    # Empty string means leave the cell blank.
    if not value:
        return

    # Inline strings keep this writer simple and avoid editing sharedStrings.xml.
    cell.attrib["t"] = "inlineStr"
    inline = ET.SubElement(cell, f"{{{M_NS}}}is")
    text = ET.SubElement(inline, f"{{{M_NS}}}t")
    text.text = value

    # Without xml:space="preserve", consumers may drop leading or trailing
    # whitespace from <t>. ElementTree serialises this key as "xml:space".
    if value != value.strip():
        text.set("{http://www.w3.org/XML/1998/namespace}space", "preserve")
0 commit comments