Skip to content

Commit 0fa609a

Browse files
authored
Strip per-save revision markers from WordprocessingML output (#80)
Aspose.Words and Word stamp random w14:paraId, w14:textId, and w:rsid* attributes on paragraphs/runs/sections on every save. These have no semantic meaning but break deterministic conversion for snapshot testing. Add a WordRevisionMarkers helper that strips these attributes; wire it into DocumentPatcher and WordPartPatcher. Broaden WordPartPatcher to match all word/*.xml so headers, footers, settings, styles, etc. get scrubbed (the rels remap remains conditional on a mapping existing).
1 parent 8165c0f commit 0fa609a

26 files changed

Lines changed: 187 additions & 30 deletions

src/DeterministicIoPackaging/Patching/DocumentPatcher.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ public void PatchXml(XDocument xml, string entryName)
1212
{
1313
var root = xml.Root!;
1414

15+
WordRevisionMarkers.Strip(xml);
16+
1517
// Find all elements with id attributes that need normalization
1618
var elementsWithIds = new List<XElement>();
1719
elementsWithIds.AddRange(root.Descendants(wpDocPr));

src/DeterministicIoPackaging/Patching/WordPartPatcher.cs

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
// Patches Word sub-part XML files (headers, footers, etc.) to remap
2-
// relationship IDs that were renumbered by WordPartRelationshipPatcher.
1+
// Patches Word sub-part XML files (headers, footers, etc.) to strip
2+
// per-save revision markers and remap relationship IDs that were
3+
// renumbered by WordPartRelationshipPatcher.
34
//
45
// Matches files like word/footer1.xml, word/header1.xml — any XML file
5-
// under word/ that has a corresponding entry in the relationship patcher's
6-
// ID mappings. Excludes document.xml and numbering.xml which have their
7-
// own dedicated patchers.
6+
// under word/ that is not a relationship file. Excludes document.xml and
7+
// numbering.xml which have their own dedicated patchers.
88
//
99
// Must be registered after WordPartRelationshipPatcher so that ID mappings
1010
// are populated before this patcher runs.
@@ -15,11 +15,12 @@ public bool IsMatch(Entry entry) =>
1515
entry.FullName != "word/document.xml" &&
1616
entry.FullName != "word/numbering.xml" &&
1717
!entry.FullName.Contains("/_rels/") &&
18-
entry.FullName.EndsWith(".xml") &&
19-
relsPatcher.IdMappings.ContainsKey(entry.FullName);
18+
entry.FullName.EndsWith(".xml");
2019

2120
public void PatchXml(XDocument xml, string entryName)
2221
{
22+
WordRevisionMarkers.Strip(xml);
23+
2324
if (relsPatcher.IdMappings.TryGetValue(entryName, out var mapping) && mapping.Count > 0)
2425
{
2526
RelationshipRenumber.RemapIds(xml, mapping);
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
// Strips per-save revision/identity attributes from WordprocessingML XML.
//
// Word and tools like Aspose.Words emit random IDs on every save for
// change-tracking and cross-reference lookup. They have no semantic
// meaning for document content and break deterministic output:
//   - w14:paraId / w14:textId on <w:p> (random hex per paragraph save)
//   - w:rsidR, w:rsidRPr, w:rsidP, w:rsidRDefault, w:rsidDel,
//     w:rsidTr, w:rsidSect (revision save IDs on paragraphs, runs,
//     table rows, section properties)
//
// Only the attributes listed in attributesToRemove are touched; elements
// and all other attributes are left as they are.
static class WordRevisionMarkers
{
    // Namespaces are fixed for the lifetime of the process, so keep them
    // readonly. They must be declared before attributesToRemove because
    // static field initializers run in textual order.
    static readonly XNamespace w = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
    static readonly XNamespace w14 = "http://schemas.microsoft.com/office/word/2010/wordml";

    // Fully-qualified names of every attribute that Strip removes.
    static readonly HashSet<XName> attributesToRemove =
    [
        w14 + "paraId",
        w14 + "textId",
        w + "rsidR",
        w + "rsidRPr",
        w + "rsidP",
        w + "rsidRDefault",
        w + "rsidDel",
        w + "rsidTr",
        w + "rsidSect"
    ];

    // Removes every attribute named in attributesToRemove from the root
    // element and all of its descendants, mutating the document in place.
    // A document without a root element is left untouched.
    public static void Strip(XDocument xml)
    {
        var root = xml.Root;
        if (root is null)
        {
            return;
        }

        // Extensions.Remove copies the matched attributes into a list before
        // detaching them, so the lazy query can be passed straight through
        // without an explicit ToList().
        root.DescendantsAndSelf()
            .Attributes()
            .Where(_ => attributesToRemove.Contains(_.Name))
            .Remove();
    }
}

src/Directory.Build.props

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
<Project>
33
<PropertyGroup>
44
<NoWarn>CS1591;CS0649;CA1416;NU1608;NU1109;NU1510</NoWarn>
5-
<Version>0.25.0</Version>
5+
<Version>0.26.0</Version>
66
<LangVersion>preview</LangVersion>
77
<AssemblyVersion>1.0.0</AssemblyVersion>
88
<Description>Modify System.IO.Packaging (https://learn.microsoft.com/en-us/dotnet/api/system.io.packaging) files to ensure they are deterministic. Helpful for testing, build reproducibility, security verification, and ensuring package integrity across different build environments.</Description>

src/Tests/OpenXmlTests.cs

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -402,6 +402,126 @@ public void ValidateConvertedDocxWithFooterHyperlink()
402402
string.Join(Environment.NewLine, errors.Select(_ => $"{_.Description} ({_.Path})")));
403403
}
404404

405+
// Converts a document that carries revision markers in its body, header and
// footer, then checks that none of the marker attributes remain in any
// non-relationship XML entry under word/.
[Test]
public void RevisionMarkersAreStripped()
{
    // Dispose both the generated input and the converted output once the test ends.
    using var docxStream = CreateDocxWithRevisionMarkers();
    using var result = DeterministicPackage.Convert(docxStream);

    result.Position = 0;
    using var archive = new Archive(result, ZipArchiveMode.Read);

    XNamespace w = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
    XNamespace w14 = "http://schemas.microsoft.com/office/word/2010/wordml";
    var attributesToCheck = new[]
    {
        w14 + "paraId",
        w14 + "textId",
        w + "rsidR",
        w + "rsidRPr",
        w + "rsidP",
        w + "rsidRDefault",
        w + "rsidDel",
        w + "rsidTr",
        w + "rsidSect"
    };

    // Entry paths are identifiers, not linguistic text, so compare them ordinally
    // instead of relying on the current culture.
    var wordXmlEntries = archive.Entries.Where(_ =>
        _.FullName.StartsWith("word/", StringComparison.Ordinal) &&
        _.FullName.EndsWith(".xml", StringComparison.Ordinal) &&
        !_.FullName.Contains("/_rels/"));

    foreach (var entry in wordXmlEntries)
    {
        using var stream = entry.Open();
        var xml = XDocument.Load(stream);
        foreach (var attrName in attributesToCheck)
        {
            var found = xml.Descendants().Attributes(attrName).FirstOrDefault();
            Assert.That(found, Is.Null,
                $"Entry '{entry.FullName}' still contains attribute '{attrName}'");
        }
    }
}
444+
445+
// Two source documents that differ only in their random rsid/paraId values
// must convert to byte-for-byte identical packages.
[Test]
public void RevisionMarkersBinaryEquality()
{
    using var firstConverted = DeterministicPackage.Convert(CreateDocxWithRevisionMarkers());
    using var secondConverted = DeterministicPackage.Convert(CreateDocxWithRevisionMarkers());

    var firstBytes = firstConverted.ToArray();
    var secondBytes = secondConverted.ToArray();

    Assert.That(firstBytes, Is.EqualTo(secondBytes));
}
457+
458+
// Builds an in-memory .docx whose header, footer, body paragraph and section
// properties all carry freshly generated revision markers. The returned
// stream is rewound to position 0.
static MemoryStream CreateDocxWithRevisionMarkers()
{
    // Single-run paragraph stamped with the three markers shared by the
    // header, footer and body paragraphs.
    static W.Paragraph MarkedParagraph(string text)
    {
        var paragraph = new W.Paragraph(
            new W.Run(
                new W.Text(text) { Space = SpaceProcessingModeValues.Preserve }));
        paragraph.ParagraphId = RandomHex8();
        paragraph.TextId = RandomHex8();
        paragraph.RsidParagraphAddition = RandomHex8();
        return paragraph;
    }

    var stream = new MemoryStream();
    using (var document = WordprocessingDocument.Create(stream, WordprocessingDocumentType.Document))
    {
        var mainPart = document.AddMainDocumentPart();

        // Header with revision markers
        var headerPart = mainPart.AddNewPart<HeaderPart>();
        headerPart.Header = new W.Header(MarkedParagraph("Header"));

        // Footer with revision markers
        var footerPart = mainPart.AddNewPart<FooterPart>();
        footerPart.Footer = new W.Footer(MarkedParagraph("Footer"));

        // Body paragraph carries two extra paragraph-level markers
        var bodyParagraph = MarkedParagraph("Body");
        bodyParagraph.RsidParagraphProperties = RandomHex8();
        bodyParagraph.RsidRunAdditionDefault = RandomHex8();

        // Section properties reference the header/footer parts and carry their own marker
        var headerReference = new W.HeaderReference
        {
            Type = W.HeaderFooterValues.Default,
            Id = mainPart.GetIdOfPart(headerPart)
        };
        var footerReference = new W.FooterReference
        {
            Type = W.HeaderFooterValues.Default,
            Id = mainPart.GetIdOfPart(footerPart)
        };
        var sectionProperties = new W.SectionProperties(headerReference, footerReference)
        {
            RsidSect = RandomHex8()
        };

        var body = new W.Body(bodyParagraph, sectionProperties);
        mainPart.Document = new(body);
    }

    stream.Position = 0;
    return stream;
}
521+
522+
// Returns a random non-negative int formatted as an 8-character,
// zero-padded, upper-case hexadecimal string.
static string RandomHex8()
{
    var value = Random.Shared.Next();
    return value.ToString("X8");
}
524+
405525
[Test]
406526
public void ValidateConvertedDocxWithHeaderHyperlink()
407527
{
87 Bytes
Binary file not shown.
-7 Bytes
Binary file not shown.
51 Bytes
Binary file not shown.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
<?xml version="1.0" encoding="utf-8" standalone="yes"?><w:document xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" 
xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14"><w:body><w:p w14:paraId="3CC9AF7E" w14:textId="011C4A72" w:rsidR="003A213A" w:rsidRDefault="000C4CD5"><w:r><w:t>sdcvsdvsdv</w:t></w:r></w:p><w:sectPr w:rsidR="003A213A"><w:pgSz w:w="12240" w:h="15840" /><w:pgMar w:top="1440" w:right="1440" w:bottom="1440" w:left="1440" w:header="720" w:footer="720" w:gutter="0" /><w:cols w:space="720" /><w:docGrid w:linePitch="360" /></w:sectPr></w:body></w:document>
1+
<?xml version="1.0" encoding="utf-8" standalone="yes"?><w:document xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" 
xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14"><w:body><w:p><w:r><w:t>sdcvsdvsdv</w:t></w:r></w:p><w:sectPr><w:pgSz w:w="12240" w:h="15840" /><w:pgMar w:top="1440" w:right="1440" w:bottom="1440" w:left="1440" w:header="720" w:footer="720" w:gutter="0" /><w:cols w:space="720" /><w:docGrid w:linePitch="360" /></w:sectPr></w:body></w:document>
Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1 @@
1-
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
2-
<w:fonts xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du"><w:font w:name="Calibri"><w:panose1 w:val="020F0502020204030204"/><w:charset w:val="00"/><w:family w:val="swiss"/><w:pitch w:val="variable"/><w:sig w:usb0="E4002EFF" w:usb1="C200247B" w:usb2="00000009" w:usb3="00000000" w:csb0="000001FF" w:csb1="00000000"/></w:font><w:font w:name="Times New Roman"><w:panose1 w:val="02020603050405020304"/><w:charset w:val="00"/><w:family w:val="roman"/><w:pitch w:val="variable"/><w:sig w:usb0="E0002EFF" w:usb1="C000785B" w:usb2="00000009" w:usb3="00000000" w:csb0="000001FF" w:csb1="00000000"/></w:font><w:font w:name="Calibri Light"><w:panose1 w:val="020F0302020204030204"/><w:charset w:val="00"/><w:family w:val="swiss"/><w:pitch w:val="variable"/><w:sig w:usb0="E4002EFF" w:usb1="C200247B" w:usb2="00000009" w:usb3="00000000" w:csb0="000001FF" w:csb1="00000000"/></w:font></w:fonts>
1+
<?xml version="1.0" encoding="utf-8" standalone="yes"?><w:fonts xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du"><w:font w:name="Calibri"><w:panose1 w:val="020F0502020204030204" /><w:charset w:val="00" /><w:family w:val="swiss" /><w:pitch w:val="variable" /><w:sig w:usb0="E4002EFF" w:usb1="C200247B" w:usb2="00000009" w:usb3="00000000" w:csb0="000001FF" w:csb1="00000000" /></w:font><w:font w:name="Times New Roman"><w:panose1 w:val="02020603050405020304" /><w:charset w:val="00" /><w:family w:val="roman" /><w:pitch w:val="variable" /><w:sig w:usb0="E0002EFF" w:usb1="C000785B" w:usb2="00000009" w:usb3="00000000" w:csb0="000001FF" w:csb1="00000000" /></w:font><w:font w:name="Calibri Light"><w:panose1 w:val="020F0302020204030204" /><w:charset w:val="00" /><w:family w:val="swiss" /><w:pitch w:val="variable" /><w:sig w:usb0="E4002EFF" w:usb1="C200247B" w:usb2="00000009" w:usb3="00000000" w:csb0="000001FF" w:csb1="00000000" /></w:font></w:fonts>

0 commit comments

Comments
 (0)