Skip to content

Commit 9ad27ba

Browse files
Matthew HoroszowskiMatthew Horoszowski
authored and committed
feat(oxml): add custom_properties and custom_xml element classes
Phase 1 of customXml support per Plans/customxml-implementation-plan.md. Adds the leaf-layer xmlchemy element classes for two OOXML mechanisms: - /docProps/custom.xml — <op:Properties>, <op:property>, and the five <vt:*> typed value elements (lpwstr, i4, r8, bool, filetime). CT_Property.value dispatches on the Python type to choose the vt:* child; bool is checked before int because bool is a subclass of int in Python. - /customXml/itemPropsN.xml — <ds:datastoreItem>, <ds:schemaRefs>, <ds:schemaRef>. CT_DatastoreItem.add_schema_ref / remove_schema_ref manage the optional schema_refs envelope idempotently and drop the empty parent on last removal to match Office output. Three new namespace prefixes are registered in oxml/ns.py: op -> custom-properties; vt -> docPropsVTypes; ds -> customXml. The ZeroOrMore declaration on CT_Properties is named 'prop' rather than 'property' to avoid shadowing Python's @property decorator inside the class body. Public methods preserve *_property naming on the API surface. 63 new unit tests; 96-98% line coverage on the new modules. Existing 2796-test suite still green (2859 total).
1 parent 1bf94bb commit 9ad27ba

6 files changed

Lines changed: 874 additions & 0 deletions

File tree

src/pptx/oxml/__init__.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,36 @@ def register_element_cls(nsptagname: str, cls: Type[BaseOxmlElement]):
217217
register_element_cls("cp:coreProperties", CT_CoreProperties)
218218

219219

220+
# -- Custom Document Properties part (`/docProps/custom.xml`): map the `op:` root
# -- and entry elements, and the five `vt:` typed-value elements, to their
# -- custom element classes.
from pptx.oxml.custom_properties import (  # noqa: E402
    CT_Properties,
    CT_Property,
    CT_VtBool,
    CT_VtFiletime,
    CT_VtI4,
    CT_VtLpwstr,
    CT_VtR8,
)

register_element_cls("op:Properties", CT_Properties)
register_element_cls("op:property", CT_Property)
register_element_cls("vt:bool", CT_VtBool)
register_element_cls("vt:filetime", CT_VtFiletime)
register_element_cls("vt:i4", CT_VtI4)
register_element_cls("vt:lpwstr", CT_VtLpwstr)
register_element_cls("vt:r8", CT_VtR8)
237+
238+
239+
# -- Custom XML item-properties elements (`ds:` namespace): map the datastore
# -- item root and its schema-reference children to their custom element classes.
from pptx.oxml.custom_xml import (  # noqa: E402
    CT_DatastoreItem,
    CT_DatastoreSchemaRef,
    CT_DatastoreSchemaRefs,
)

register_element_cls("ds:datastoreItem", CT_DatastoreItem)
register_element_cls("ds:schemaRef", CT_DatastoreSchemaRef)
register_element_cls("ds:schemaRefs", CT_DatastoreSchemaRefs)
248+
249+
220250
from pptx.oxml.dml.color import ( # noqa: E402
221251
CT_Color,
222252
CT_HslColor,

src/pptx/oxml/custom_properties.py

Lines changed: 321 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,321 @@
1+
"""lxml custom element classes for the Custom Document Properties part.
2+
3+
Models `/docProps/custom.xml` — the `<op:Properties>` root and its `<op:property>`
4+
children, each carrying one of five typed `<vt:*>` value elements.
5+
6+
Schema references: ECMA-376 Part 1, §15.2.12.2 (Custom File Properties Part).
7+
"""
8+
9+
from __future__ import annotations
10+
11+
import datetime as dt
12+
from typing import cast
13+
14+
from lxml.etree import _Element # pyright: ignore[reportPrivateUsage]
15+
16+
from pptx.oxml import parse_xml
17+
from pptx.oxml.ns import nsdecls, qn
18+
from pptx.oxml.simpletypes import XsdString, XsdUnsignedInt
19+
from pptx.oxml.xmlchemy import (
20+
BaseOxmlElement,
21+
RequiredAttribute,
22+
ZeroOrMore,
23+
ZeroOrOne,
24+
)
25+
26+
# Well-known FMTID Office writes on every user-defined custom property.
# `CT_Properties.add_property` stamps this value on each entry it creates.
DEFAULT_FMTID = "{D5CDD505-2E9C-101B-9397-08002B2CF9AE}"

# pid values 0 and 1 are reserved by the OOXML spec; user properties start at 2.
# `CT_Properties._next_pid` begins its search for a free pid here.
_FIRST_PID = 2

# Maximum string length for an lpwstr value. Office-tested limit; longer values
# round-trip but are reported by some inspectors as malformed. The `vt:lpwstr`
# setter raises `ValueError` for strings longer than this.
_LPWSTR_MAX_LEN = 255
35+
36+
37+
class CT_Properties(BaseOxmlElement):
    """`<op:Properties>` element, root of `/docProps/custom.xml`.

    The child declaration is named `prop` rather than `property` because a
    class-body name `property` would shadow Python's built-in `property` and
    break the `@property` decorators used further down in this class. Public
    methods keep the `*_property` naming on the user-facing surface.
    """

    prop = ZeroOrMore("op:property", successors=())

    _properties_tmpl = "<op:Properties %s/>\n" % nsdecls("op", "vt")

    @staticmethod
    def new_properties() -> "CT_Properties":
        """Return a new empty `<op:Properties>` element with op + vt namespaces."""
        return cast("CT_Properties", parse_xml(CT_Properties._properties_tmpl))

    @property
    def property_lst(self) -> "list[CT_Property]":
        """List of `<op:property>` children in document order."""
        return cast("list[CT_Property]", self.prop_lst)

    def add_property(self, name: str, value: object) -> "CT_Property":
        """Append and return a new `<op:property>` child for `(name, value)`.

        The pid is auto-assigned to the next free integer >= 2 within this
        collection and the fmtid is set to `DEFAULT_FMTID`. `value` is
        dispatched by Python type to choose the `<vt:*>` child (see
        `CT_Property.value`).

        Raises whatever `CT_Property.value` raises for an unacceptable value
        (`TypeError` for an unsupported type, `ValueError` for an out-of-range
        one). In that case the partially-built entry is removed again, so a
        failed call leaves this element unchanged.
        """
        prop = cast("CT_Property", self._add_prop())
        try:
            prop.fmtid = DEFAULT_FMTID
            prop.pid = self._next_pid()
            prop.name = name
            prop.value = value
        except Exception:
            # The child is already attached; detach it so a rejected value does
            # not leave a value-less `<op:property>` behind in the part.
            self.remove(prop)
            raise
        return prop

    def get_property(self, name: str) -> "CT_Property | None":
        """Return the `<op:property>` child whose `name` attribute is `name`.

        Returns `None` if no such child exists. The comparison is an exact,
        case-sensitive string match; the first match in document order wins.
        """
        for prop in self.property_lst:
            if prop.name == name:
                return prop
        return None

    def remove_property(self, name: str) -> bool:
        """Remove the `<op:property>` child with `name`, returning True if found."""
        prop = self.get_property(name)
        if prop is None:
            return False
        self.remove(prop)
        return True

    @property
    def property_names(self) -> tuple[str, ...]:
        """Tuple of `name` attributes for every `<op:property>` child, in order."""
        return tuple(p.name for p in self.property_lst)

    def _next_pid(self) -> int:
        """Return the smallest pid >= `_FIRST_PID` not used by any child.

        Children without a `pid` attribute are ignored, so a just-added entry
        that has not been assigned its pid yet does not affect the result.
        """
        used = {p.pid for p in self.property_lst if p.has_pid}
        candidate = _FIRST_PID
        while candidate in used:
            candidate += 1
        return candidate
107+
108+
109+
class CT_Property(BaseOxmlElement):
    """`<op:property>` element — one custom document property entry."""

    fmtid: str = RequiredAttribute(  # pyright: ignore[reportAssignmentType]
        "fmtid", XsdString
    )
    pid: int = RequiredAttribute(  # pyright: ignore[reportAssignmentType]
        "pid", XsdUnsignedInt
    )
    name: str = RequiredAttribute(  # pyright: ignore[reportAssignmentType]
        "name", XsdString
    )

    lpwstr = ZeroOrOne("vt:lpwstr", successors=())
    i4 = ZeroOrOne("vt:i4", successors=())
    r8 = ZeroOrOne("vt:r8", successors=())
    bool_ = ZeroOrOne("vt:bool", successors=())
    filetime = ZeroOrOne("vt:filetime", successors=())

    @property
    def has_pid(self) -> bool:
        """True if the `pid` attribute is present.

        The attribute is required, but it can be absent on malformed input or
        on an entry that is still being built; `CT_Properties._next_pid` uses
        this to skip such entries instead of failing on them.
        """
        return self.get("pid") is not None

    @property
    def value(self) -> str | int | float | bool | dt.datetime | None:
        """The Python-typed value of whichever `<vt:*>` child is present.

        Returns `None` if no value child exists (a malformed but tolerated state).
        Order of precedence on read: lpwstr, i4, r8, bool, filetime — only one
        is expected to be present.
        """
        for child in (self.lpwstr, self.i4, self.r8, self.bool_, self.filetime):
            if child is not None:
                return cast("_VtValueElement", child).value_typed
        return None

    @value.setter
    def value(self, new_value: object) -> None:
        """Replace the current `<vt:*>` child with one matching `new_value`'s type.

        Dispatch table (bool checked BEFORE int because `bool` is a subclass of
        `int` in Python):

            bool              -> <vt:bool>
            int               -> <vt:i4>
            float             -> <vt:r8>
            str               -> <vt:lpwstr>
            datetime.datetime -> <vt:filetime>

        Other types raise `TypeError`. A value of a supported type that the
        target element rejects (e.g. an out-of-range int) propagates that
        element's exception. In both failure cases the previously stored value
        is left in place.
        """
        # Pick the target child first so an unsupported type is rejected before
        # the existing value is touched.
        if isinstance(new_value, bool):
            get_or_add = self.get_or_add_bool_
        elif isinstance(new_value, int):
            get_or_add = self.get_or_add_i4
        elif isinstance(new_value, float):
            get_or_add = self.get_or_add_r8
        elif isinstance(new_value, str):
            get_or_add = self.get_or_add_lpwstr
        elif isinstance(new_value, dt.datetime):
            get_or_add = self.get_or_add_filetime
        else:
            raise TypeError(
                "custom property value must be bool, int, float, str, or datetime; "
                "got %s" % type(new_value).__name__
            )

        # Detach every existing value child, remembering them in document order
        # so they can be put back if the new value fails validation.
        value_tags = {
            qn(tagname)
            for tagname in ("vt:lpwstr", "vt:i4", "vt:r8", "vt:bool", "vt:filetime")
        }
        previous = [elem for elem in self if elem.tag in value_tags]
        for elem in previous:
            self.remove(elem)

        child = cast("_VtValueElement", get_or_add())
        try:
            child.value_typed = new_value
        except Exception:
            # Roll back: drop the empty new child and restore the old value(s)
            # rather than leaving the property empty or half-written.
            self.remove(child)
            self.extend(previous)
            raise
188+
189+
190+
class _VtValueElement(BaseOxmlElement):
    """Common base for the `<vt:*>` typed value element classes.

    Each concrete subclass supplies a `value_typed` property that converts
    between the element's text content and the corresponding Python value.
    """

    # Declared for type-checkers only; subclasses provide the real property.
    value_typed: object  # pyright: ignore[reportUninitializedInstanceVariable]
198+
199+
200+
class CT_VtLpwstr(_VtValueElement):
    """`<vt:lpwstr>` — Unicode string value."""

    @property
    def value_typed(self) -> str:
        """Text content of this element, or `""` when it has none."""
        content = self.text
        return content if content else ""

    @value_typed.setter
    def value_typed(self, value: str) -> None:
        """Store `value` as the element text.

        Raises `TypeError` for a non-str value and `ValueError` when the string
        is longer than `_LPWSTR_MAX_LEN` characters.
        """
        if isinstance(value, str):  # pyright: ignore[reportUnnecessaryIsInstance]
            if len(value) <= _LPWSTR_MAX_LEN:
                self.text = value
                return
            raise ValueError(
                "vt:lpwstr value exceeds %d-character limit" % _LPWSTR_MAX_LEN
            )
        raise TypeError("vt:lpwstr value must be str, got %s" % type(value).__name__)
216+
217+
218+
class CT_VtI4(_VtValueElement):
    """`<vt:i4>` — 32-bit signed integer value."""

    _MIN = -2147483648
    _MAX = 2147483647

    @property
    def value_typed(self) -> int:
        """Element text parsed as an int; `ValueError` if there is no text."""
        raw = self.text
        if raw is not None:
            return int(raw)
        raise ValueError("vt:i4 element has no text content")

    @value_typed.setter
    def value_typed(self, value: int) -> None:
        """Store `value` as decimal text.

        Raises `TypeError` unless `value` is an int that is not a bool, and
        `ValueError` when it falls outside the signed 32-bit range.
        """
        # bool is a subclass of int, so it has to be excluded explicitly.
        is_plain_int = isinstance(value, int) and not isinstance(value, bool)
        if not is_plain_int:
            raise TypeError("vt:i4 value must be int, got %s" % type(value).__name__)
        if not (self._MIN <= value <= self._MAX):
            raise ValueError(
                "vt:i4 value out of range [%d, %d]: %d" % (self._MIN, self._MAX, value)
            )
        self.text = str(value)
240+
241+
242+
class CT_VtR8(_VtValueElement):
    """`<vt:r8>` — IEEE-754 double-precision float value."""

    @property
    def value_typed(self) -> float:
        """Element text parsed as a float; `ValueError` if there is no text.

        Python's `float()` accepts the XSD spellings `INF`, `-INF` and `NaN`, so
        the special values written by the setter read back correctly.
        """
        text = self.text
        if text is None:
            raise ValueError("vt:r8 element has no text content")
        return float(text)

    @value_typed.setter
    def value_typed(self, value: float) -> None:
        """Store `value` (a float or non-bool int) as `xsd:double` text.

        Raises `TypeError` for a bool or any non-numeric value.
        """
        if isinstance(value, bool):
            raise TypeError("vt:r8 value must be float, got bool")
        if not isinstance(value, (int, float)):
            raise TypeError("vt:r8 value must be a number, got %s" % type(value).__name__)
        number = float(value)
        # `repr()` yields 'nan' / 'inf' / '-inf', which are not valid xsd:double
        # lexical forms; the schema spellings are 'NaN', 'INF' and '-INF'.
        if number != number:  # NaN is the only value unequal to itself
            self.text = "NaN"
        elif number == float("inf"):
            self.text = "INF"
        elif number == float("-inf"):
            self.text = "-INF"
        else:
            self.text = repr(number)
259+
260+
261+
class CT_VtBool(_VtValueElement):
    """`<vt:bool>` — boolean value.

    Reading accepts `"1"`, `"0"`, `"true"` and `"false"`, ignoring case and
    surrounding whitespace. Writing always emits `"true"` or `"false"`.
    """

    @property
    def value_typed(self) -> bool:
        """Element text interpreted as a bool; `ValueError` if unrecognized."""
        raw = self.text
        token = raw.strip().lower() if raw else ""
        if token == "true" or token == "1":
            return True
        if token == "false" or token == "0":
            return False
        raise ValueError("vt:bool element has invalid text content: %r" % self.text)

    @value_typed.setter
    def value_typed(self, value: bool) -> None:
        """Store `value` as `"true"`/`"false"`; `TypeError` if it is not a bool."""
        if isinstance(value, bool):  # pyright: ignore[reportUnnecessaryIsInstance]
            self.text = ("false", "true")[value]
            return
        raise TypeError("vt:bool value must be bool, got %s" % type(value).__name__)
282+
283+
284+
class CT_VtFiletime(_VtValueElement):
    """`<vt:filetime>` — ISO-8601 UTC datetime value (always with `Z` suffix)."""

    @property
    def value_typed(self) -> dt.datetime:
        """Element text parsed by `_parse_iso_utc`; `ValueError` if no text."""
        raw = self.text
        if raw is not None:
            return _parse_iso_utc(raw)
        raise ValueError("vt:filetime element has no text content")

    @value_typed.setter
    def value_typed(self, value: dt.datetime) -> None:
        """Store `value` as `YYYY-MM-DDTHH:MM:SSZ` text.

        A tz-aware value is converted to UTC first; a naive value is formatted
        as-is, i.e. treated as already being UTC. Raises `TypeError` if `value`
        is not a `datetime`.
        """
        if not isinstance(value, dt.datetime):  # pyright: ignore[reportUnnecessaryIsInstance]
            raise TypeError(
                "vt:filetime value must be datetime, got %s" % type(value).__name__
            )
        is_aware = value.tzinfo is not None
        utc_naive = (
            value.astimezone(dt.timezone.utc).replace(tzinfo=None) if is_aware else value
        )
        # The trailing 'Z' is a literal in the format string, not a tz directive.
        self.text = utc_naive.strftime("%Y-%m-%dT%H:%M:%SZ")
306+
307+
308+
def _parse_iso_utc(text: str) -> dt.datetime:
309+
"""Parse `text` as ISO-8601, returning a naive UTC `datetime`.
310+
311+
Accepts the `Z` suffix Office writes and the `+HH:MM` form some tools use.
312+
Returns a naive datetime in UTC for symmetry with `_set_element_datetime`
313+
in `coreprops`. Raises `ValueError` on unparsable input.
314+
"""
315+
cleaned = text.strip()
316+
if cleaned.endswith("Z"):
317+
cleaned = cleaned[:-1] + "+00:00"
318+
parsed = dt.datetime.fromisoformat(cleaned)
319+
if parsed.tzinfo is not None:
320+
parsed = parsed.astimezone(dt.timezone.utc).replace(tzinfo=None)
321+
return parsed

0 commit comments

Comments
 (0)