Skip to content

Commit b488029

Browse files
authored
Merge pull request #14 from SimonCropp/better-docx-scrubbing
Better docx scrubbing
2 parents f75d363 + 073d7ff commit b488029

29 files changed

Lines changed: 379 additions & 15 deletions

readme.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ Example file formats that leverage System.IO.Packaging
4444
using var sourceStream = File.OpenRead(packagePath);
4545
await DeterministicPackage.ConvertAsync(sourceStream, targetStream);
4646
```
47-
<sup><a href='/src/Tests/Tests.cs#L106-L111' title='Snippet source file'>snippet source</a> | <a href='#snippet-ConvertAsync' title='Start of snippet'>anchor</a></sup>
47+
<sup><a href='/src/Tests/Tests.cs#L130-L135' title='Snippet source file'>snippet source</a> | <a href='#snippet-ConvertAsync' title='Start of snippet'>anchor</a></sup>
4848
<!-- endSnippet -->
4949

5050

@@ -56,7 +56,7 @@ await DeterministicPackage.ConvertAsync(sourceStream, targetStream);
5656
using var sourceStream = File.OpenRead(packagePath);
5757
await DeterministicPackage.ConvertAsync(sourceStream, targetStream);
5858
```
59-
<sup><a href='/src/Tests/Tests.cs#L106-L111' title='Snippet source file'>snippet source</a> | <a href='#snippet-ConvertAsync' title='Start of snippet'>anchor</a></sup>
59+
<sup><a href='/src/Tests/Tests.cs#L130-L135' title='Snippet source file'>snippet source</a> | <a href='#snippet-ConvertAsync' title='Start of snippet'>anchor</a></sup>
6060
<!-- endSnippet -->
6161

6262

src/DeterministicIoPackaging/DeterministicPackage.cs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,9 @@ public static partial class DeterministicPackage
1313
new WorkbookRelationshipPatcher(),
1414
new CorePatcher(),
1515
new SheetRelationshipPatcher(),
16-
new DocumentRelationshipPatcher()
16+
new DocumentRelationshipPatcher(),
17+
new DocumentPatcher(),
18+
new NumberingPatcher()
1719
];
1820

1921
static Archive CreateArchive(Stream target) => new(target, ZipArchiveMode.Create, leaveOpen: true);

src/DeterministicIoPackaging/DeterministicPackage_Convert.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ public static void Convert(Stream source, Stream target)
2222
{
2323
using var sourceArchive = ReadArchive(source);
2424
using var targetArchive = CreateArchive(target);
25-
foreach (var sourceEntry in OrderedEntries(sourceArchive))
25+
foreach (var sourceEntry in sourceArchive.OrderedEntries())
2626
{
2727
DuplicateEntry(sourceEntry, targetArchive);
2828
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
/// <summary>
/// Renumbers the <c>id</c> attribute of drawing property elements in
/// <c>word/document.xml</c> so the ids no longer depend on whatever values the
/// producing tool happened to generate.
/// </summary>
class DocumentPatcher : IPatcher
{
    static XNamespace wp = "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing";
    static XNamespace pic = "http://schemas.openxmlformats.org/drawingml/2006/picture";
    static XName wpDocPr = wp + "docPr";
    static XName picCNvPr = pic + "cNvPr";

    /// <summary>Matches only the main document part, <c>word/document.xml</c>.</summary>
    public bool IsMatch(Entry entry) =>
        entry.FullName is "word/document.xml";

    /// <summary>
    /// Assigns sequential, 1-based ids: first to every <c>wp:docPr</c> in document
    /// order, then (continuing the same counter) to every <c>pic:cNvPr</c> in
    /// document order.
    /// </summary>
    /// <param name="xml">The parsed document part; modified in place.</param>
    public void PatchXml(XDocument xml)
    {
        var root = xml.Root!;

        // Materialize before mutating so the lazy Descendants() queries are not
        // re-evaluated while attributes are being rewritten.
        var elementsWithIds = root.Descendants(wpDocPr)
            .Concat(root.Descendants(picCNvPr))
            .ToList();

        var nextId = 1;
        foreach (var element in elementsWithIds)
        {
            // SetAttributeValue updates an existing id in place and adds one when it
            // is absent, so an element without an id no longer causes a
            // NullReferenceException.
            element.SetAttributeValue(
                "id",
                nextId.ToString(System.Globalization.CultureInfo.InvariantCulture));
            nextId++;
        }
    }
}
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/// <summary>
/// Normalizes <c>word/numbering.xml</c>: strips <c>w:nsid</c> elements, puts the
/// <c>w:abstractNum</c> definitions into a stable order, renumbers them from 0 and
/// rewrites the <c>w:num</c> references to match the new numbers.
/// </summary>
class NumberingPatcher : IPatcher
{
    static XNamespace w = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
    static XName nsid = w + "nsid";
    static XName abstractNum = w + "abstractNum";
    static XName num = w + "num";
    static XName abstractNumId = w + "abstractNumId";
    static XName val = w + "val";

    /// <summary>Matches only the numbering part, <c>word/numbering.xml</c>.</summary>
    public bool IsMatch(Entry entry) =>
        entry.FullName is "word/numbering.xml";

    /// <summary>
    /// Applies the normalization described on the class to <paramref name="xml"/>.
    /// </summary>
    /// <param name="xml">The parsed numbering part; modified in place.</param>
    public void PatchXml(XDocument xml)
    {
        var root = xml.Root!;

        // Remove all w:nsid elements. Snapshot the query first because removing
        // nodes while enumerating Descendants() is not safe.
        foreach (var element in root.Descendants(nsid).ToList())
        {
            element.Remove();
        }

        var abstractNums = root.Elements(abstractNum).ToList();
        if (abstractNums.Count == 0)
        {
            // Nothing to reorder and therefore no reference can need remapping.
            return;
        }

        // Ordinal comparison keeps the order independent of the machine's culture.
        // NOTE(review): the sort key is the full serialized element, which still
        // contains the original abstractNumId value - confirm that is intended.
        var sortedAbstractNums = abstractNums
            .OrderBy(_ => _.ToString(), StringComparer.Ordinal)
            .ToList();

        // Map each old id to the element's position in the SORTED list, and assign
        // that position as the new id. The old id must be read before it is
        // overwritten.
        var idMapping = new Dictionary<string, string>();
        for (var i = 0; i < sortedAbstractNums.Count; i++)
        {
            var element = sortedAbstractNums[i];
            var newId = i.ToString(System.Globalization.CultureInfo.InvariantCulture);

            var oldId = element.Attribute(abstractNumId)?.Value;
            if (oldId != null)
            {
                idMapping[oldId] = newId;
            }

            element.SetAttributeValue(abstractNumId, newId);
        }

        // Reinsert the sorted elements where the first abstractNum used to be.
        // Only the abstractNum elements are moved: the root's attributes (including
        // namespace declarations) and all other children stay untouched.
        var anchor = abstractNums[0].PreviousNode;
        foreach (var element in abstractNums)
        {
            element.Remove();
        }

        if (anchor is null)
        {
            root.AddFirst(sortedAbstractNums);
        }
        else
        {
            anchor.AddAfterSelf(sortedAbstractNums);
        }

        // Update references in num elements. Each reference is translated exactly
        // once, from the original id to the new one.
        foreach (var numElement in root.Elements(num))
        {
            var reference = numElement.Element(abstractNumId)?.Attribute(val);
            if (reference != null && idMapping.TryGetValue(reference.Value, out var newId))
            {
                reference.Value = newId;
            }
        }
    }
}

src/Directory.Build.props

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
<Project>
33
<PropertyGroup>
44
<NoWarn>CS1591;CS0649;CA1416;NU1608;NU1109</NoWarn>
5-
<Version>0.13.0</Version>
5+
<Version>0.14.0</Version>
66
<LangVersion>preview</LangVersion>
77
<AssemblyVersion>1.0.0</AssemblyVersion>
88
<Description>Modify System.IO.Packaging (https://learn.microsoft.com/en-us/dotnet/api/system.io.packaging) files to ensure they are deterministic. Helpful for testing, build reproducibility, security verification, and ensuring package integrity across different build environments.</Description>
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" xmlns:pic="http://schemas.openxmlformats.org/drawingml/2006/picture">
2+
<w:body>
3+
<w:p>
4+
<w:r>
5+
<w:drawing>
6+
<wp:inline>
7+
<wp:docPr id="1" name="Picture 1" />
8+
<a:graphic>
9+
<a:graphicData>
10+
<pic:pic>
11+
<pic:nvPicPr>
12+
<pic:cNvPr id="4" name="Picture 1" />
13+
</pic:nvPicPr>
14+
</pic:pic>
15+
</a:graphicData>
16+
</a:graphic>
17+
</wp:inline>
18+
</w:drawing>
19+
</w:r>
20+
</w:p>
21+
<w:p>
22+
<w:r>
23+
<w:drawing>
24+
<wp:inline>
25+
<wp:docPr id="2" name="Picture 2" />
26+
<a:graphic>
27+
<a:graphicData>
28+
<pic:pic>
29+
<pic:nvPicPr>
30+
<pic:cNvPr id="5" name="Picture 2" />
31+
</pic:nvPicPr>
32+
</pic:pic>
33+
</a:graphicData>
34+
</a:graphic>
35+
</wp:inline>
36+
</w:drawing>
37+
</w:r>
38+
</w:p>
39+
<w:p>
40+
<w:r>
41+
<w:drawing>
42+
<wp:inline>
43+
<wp:docPr id="3" name="Picture 3" />
44+
<a:graphic>
45+
<a:graphicData>
46+
<pic:pic>
47+
<pic:nvPicPr>
48+
<pic:cNvPr id="6" name="Picture 3" />
49+
</pic:nvPicPr>
50+
</pic:pic>
51+
</a:graphicData>
52+
</a:graphic>
53+
</wp:inline>
54+
</w:drawing>
55+
</w:r>
56+
</w:p>
57+
</w:body>
58+
</w:document>
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" xmlns:pic="http://schemas.openxmlformats.org/drawingml/2006/picture">
2+
<w:body>
3+
<w:p>
4+
<w:r>
5+
<w:drawing>
6+
<wp:inline>
7+
<wp:docPr id="1" name="Picture 1" />
8+
<a:graphic>
9+
<a:graphicData>
10+
<pic:pic>
11+
<pic:nvPicPr>
12+
<pic:cNvPr id="4" name="Picture 1" />
13+
</pic:nvPicPr>
14+
</pic:pic>
15+
</a:graphicData>
16+
</a:graphic>
17+
</wp:inline>
18+
</w:drawing>
19+
</w:r>
20+
</w:p>
21+
<w:p>
22+
<w:r>
23+
<w:drawing>
24+
<wp:inline>
25+
<wp:docPr id="2" name="Picture 2" />
26+
<a:graphic>
27+
<a:graphicData>
28+
<pic:pic>
29+
<pic:nvPicPr>
30+
<pic:cNvPr id="5" name="Picture 2" />
31+
</pic:nvPicPr>
32+
</pic:pic>
33+
</a:graphicData>
34+
</a:graphic>
35+
</wp:inline>
36+
</w:drawing>
37+
</w:r>
38+
</w:p>
39+
<w:p>
40+
<w:r>
41+
<w:drawing>
42+
<wp:inline>
43+
<wp:docPr id="3" name="Picture 3" />
44+
<a:graphic>
45+
<a:graphicData>
46+
<pic:pic>
47+
<pic:nvPicPr>
48+
<pic:cNvPr id="6" name="Picture 3" />
49+
</pic:nvPicPr>
50+
</pic:pic>
51+
</a:graphicData>
52+
</a:graphic>
53+
</wp:inline>
54+
</w:drawing>
55+
</w:r>
56+
</w:p>
57+
</w:body>
58+
</w:document>
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
[TestFixture]
public class DocumentPatcherTests
{
    /// <summary>
    /// Runs <c>DocumentPatcher</c> over a document containing three inline pictures
    /// whose <c>wp:docPr</c> and <c>pic:cNvPr</c> ids are large arbitrary numbers,
    /// and snapshot-verifies the patched result.
    /// </summary>
    [Test]
    public Task Patch()
    {
        var source =
            """
            <?xml version="1.0" encoding="utf-8" standalone="yes"?>
            <w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
            xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"
            xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
            xmlns:pic="http://schemas.openxmlformats.org/drawingml/2006/picture">
            <w:body>
            <w:p>
            <w:r>
            <w:drawing>
            <wp:inline>
            <wp:docPr id="1627681933" name="Picture 1" />
            <a:graphic>
            <a:graphicData>
            <pic:pic>
            <pic:nvPicPr>
            <pic:cNvPr id="1627681933" name="Picture 1" />
            </pic:nvPicPr>
            </pic:pic>
            </a:graphicData>
            </a:graphic>
            </wp:inline>
            </w:drawing>
            </w:r>
            </w:p>
            <w:p>
            <w:r>
            <w:drawing>
            <wp:inline>
            <wp:docPr id="805879261" name="Picture 2" />
            <a:graphic>
            <a:graphicData>
            <pic:pic>
            <pic:nvPicPr>
            <pic:cNvPr id="805879261" name="Picture 2" />
            </pic:nvPicPr>
            </pic:pic>
            </a:graphicData>
            </a:graphic>
            </wp:inline>
            </w:drawing>
            </w:r>
            </w:p>
            <w:p>
            <w:r>
            <w:drawing>
            <wp:inline>
            <wp:docPr id="999999999" name="Picture 3" />
            <a:graphic>
            <a:graphicData>
            <pic:pic>
            <pic:nvPicPr>
            <pic:cNvPr id="999999999" name="Picture 3" />
            </pic:nvPicPr>
            </pic:pic>
            </a:graphicData>
            </a:graphic>
            </wp:inline>
            </w:drawing>
            </w:r>
            </w:p>
            </w:body>
            </w:document>
            """;

        // Patch and hand the result straight to the snapshot verifier.
        return Verify(PatchHelper.Patch<DocumentPatcher>(source));
    }
}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
<numbering xmlns="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
2+
<abstractNum p2:abstractNumId="0" xmlns:p2="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
3+
<p2:multiLevelType p2:val="hybridMultilevel" />
4+
</abstractNum>
5+
<abstractNum p2:abstractNumId="1" xmlns:p2="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
6+
<p2:multiLevelType p2:val="hybridMultilevel" />
7+
</abstractNum>
8+
<num p2:numId="1" xmlns:p2="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
9+
<p2:abstractNumId p2:val="1" />
10+
</num>
11+
<num p2:numId="2" xmlns:p2="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
12+
<p2:abstractNumId p2:val="0" />
13+
</num>
14+
</numbering>

0 commit comments

Comments
 (0)