Skip to content

Commit 444dc1b

Browse files
committed
Support disabling external file access
1 parent 7ef256b commit 444dc1b

9 files changed

Lines changed: 179 additions & 29 deletions

File tree

NEWS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
* Ignore style definitions using a style ID that has already been used.
44

5+
* Support disabling external file access using the disableExternalFileAccess() method.
6+
57
# 1.10.0
68

79
* Add "Heading" and "Body" styles, as found in documents created by Apple Pages,

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,10 @@ Methods:
255255
if the document contains an embedded style map, then it is combined with the default style map.
256256
Call this to ignore any embedded style maps.
257257

258+
* `DocumentConverter disableExternalFileAccess()`: Source documents may reference files outside of the source document.
259+
Call this to disable access to any such external files during the conversion process.
260+
This is highly recommended when converting untrusted user input.
261+
258262
* `DocumentConverter preserveEmptyParagraphs()`: by default, empty paragraphs are ignored.
259263
Call this to preserve empty paragraphs in the output.
260264

src/main/java/org/zwobble/mammoth/DocumentConverter.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,15 @@ public DocumentConverter disableEmbeddedStyleMap() {
6060
return new DocumentConverter(options.disableEmbeddedStyleMap());
6161
}
6262

63+
/**
64+
* Source documents may reference files outside of the source document.
65+
* Call this to disable access to any such external files during the conversion process.
66+
* This is highly recommended when converting untrusted user input.
67+
*/
68+
public DocumentConverter disableExternalFileAccess() {
69+
return new DocumentConverter(options.disableExternalFileAccess());
70+
}
71+
6372
/**
6473
* By default, images are converted to {@code <img>} elements with the source included inline in the {@code src} attribute.
6574
* Call this to change how images are converted.

src/main/java/org/zwobble/mammoth/internal/InternalDocumentConverter.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ private InternalResult<String> convertToHtml(Optional<Path> path, Archive zipFil
4545
Optional<StyleMap> styleMap = readEmbeddedStyleMap(zipFile).map(StyleMapParser::parse);
4646
DocumentToHtmlOptions conversionOptions = styleMap.map(options::addEmbeddedStyleMap).orElse(options);
4747

48-
return readDocument(path, zipFile)
48+
return readDocument(path, zipFile, conversionOptions.externalFileAccess())
4949
.flatMap(nodes -> DocumentToHtml.convertToHtml(nodes, conversionOptions))
5050
.map(Html::stripEmpty)
5151
.map(Html::collapse)
@@ -69,7 +69,7 @@ public InternalResult<String> extractRawText(File file) throws IOException {
6969
}
7070

7171
private InternalResult<String> extractRawText(Optional<Path> path, Archive zipFile) {
72-
return readDocument(path, zipFile)
72+
return readDocument(path, zipFile, options.externalFileAccess())
7373
.map(RawText::extractRawText);
7474
}
7575

src/main/java/org/zwobble/mammoth/internal/conversion/DocumentToHtmlOptions.java

Lines changed: 82 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ public class DocumentToHtmlOptions {
1616
StyleMap.EMPTY,
1717
false,
1818
false,
19+
false,
1920
InternalImageConverter.imgElement(image -> {
2021
String base64 = Base64Encoding.streamToBase64(image::getInputStream);
2122
String src = "data:" + image.getContentType() + ";base64," + base64;
@@ -30,6 +31,7 @@ public class DocumentToHtmlOptions {
3031
private final StyleMap embeddedStyleMap;
3132
private final boolean disableDefaultStyleMap;
3233
private final boolean disableEmbeddedStyleMap;
34+
private final boolean disableExternalFileAccess;
3335
private final InternalImageConverter imageConverter;
3436

3537
public DocumentToHtmlOptions(
@@ -39,6 +41,7 @@ public DocumentToHtmlOptions(
3941
StyleMap embeddedStyleMap,
4042
boolean disableDefaultStyleMap,
4143
boolean disableEmbeddedStyleMap,
44+
boolean disableExternalFileAccess,
4245
InternalImageConverter imageConverter
4346
) {
4447
this.idPrefix = idPrefix;
@@ -47,35 +50,103 @@ public DocumentToHtmlOptions(
4750
this.embeddedStyleMap = embeddedStyleMap;
4851
this.disableDefaultStyleMap = disableDefaultStyleMap;
4952
this.disableEmbeddedStyleMap = disableEmbeddedStyleMap;
53+
this.disableExternalFileAccess = disableExternalFileAccess;
5054
this.imageConverter = imageConverter;
5155
}
5256

5357
public DocumentToHtmlOptions idPrefix(String prefix) {
54-
return new DocumentToHtmlOptions(prefix, preserveEmptyParagraphs, styleMap, embeddedStyleMap, disableDefaultStyleMap, disableEmbeddedStyleMap, imageConverter);
58+
return new DocumentToHtmlOptions(
59+
prefix,
60+
preserveEmptyParagraphs,
61+
styleMap,
62+
embeddedStyleMap,
63+
disableDefaultStyleMap,
64+
disableEmbeddedStyleMap,
65+
disableExternalFileAccess,
66+
imageConverter
67+
);
5568
}
5669

5770
public DocumentToHtmlOptions preserveEmptyParagraphs() {
58-
return new DocumentToHtmlOptions(idPrefix, true, styleMap, embeddedStyleMap, disableDefaultStyleMap, disableEmbeddedStyleMap, imageConverter);
71+
return new DocumentToHtmlOptions(
72+
idPrefix,
73+
true,
74+
styleMap,
75+
embeddedStyleMap,
76+
disableDefaultStyleMap,
77+
disableEmbeddedStyleMap,
78+
disableExternalFileAccess,
79+
imageConverter
80+
);
5981
}
6082

6183
public DocumentToHtmlOptions addStyleMap(String styleMap) {
6284
return addStyleMap(StyleMapParser.parse(styleMap));
6385
}
6486

6587
public DocumentToHtmlOptions addStyleMap(StyleMap styleMap) {
66-
return new DocumentToHtmlOptions(idPrefix, preserveEmptyParagraphs, this.styleMap.update(styleMap), embeddedStyleMap, disableDefaultStyleMap, disableEmbeddedStyleMap, imageConverter);
88+
return new DocumentToHtmlOptions(
89+
idPrefix,
90+
preserveEmptyParagraphs,
91+
this.styleMap.update(styleMap),
92+
embeddedStyleMap,
93+
disableDefaultStyleMap,
94+
disableEmbeddedStyleMap,
95+
disableExternalFileAccess,
96+
imageConverter
97+
);
6798
}
6899

69100
public DocumentToHtmlOptions disableDefaultStyleMap() {
70-
return new DocumentToHtmlOptions(idPrefix, preserveEmptyParagraphs, styleMap, embeddedStyleMap, true, disableEmbeddedStyleMap, imageConverter);
101+
return new DocumentToHtmlOptions(
102+
idPrefix,
103+
preserveEmptyParagraphs,
104+
styleMap,
105+
embeddedStyleMap,
106+
true,
107+
disableEmbeddedStyleMap,
108+
disableExternalFileAccess,
109+
imageConverter
110+
);
71111
}
72112

73113
public DocumentToHtmlOptions disableEmbeddedStyleMap() {
74-
return new DocumentToHtmlOptions(idPrefix, preserveEmptyParagraphs, styleMap, embeddedStyleMap, disableDefaultStyleMap, true, imageConverter);
114+
return new DocumentToHtmlOptions(
115+
idPrefix,
116+
preserveEmptyParagraphs,
117+
styleMap,
118+
embeddedStyleMap,
119+
disableDefaultStyleMap,
120+
true,
121+
disableExternalFileAccess,
122+
imageConverter
123+
);
124+
}
125+
126+
public DocumentToHtmlOptions disableExternalFileAccess() {
127+
return new DocumentToHtmlOptions(
128+
idPrefix,
129+
preserveEmptyParagraphs,
130+
styleMap,
131+
embeddedStyleMap,
132+
disableDefaultStyleMap,
133+
disableEmbeddedStyleMap,
134+
true,
135+
imageConverter
136+
);
75137
}
76138

77139
public DocumentToHtmlOptions addEmbeddedStyleMap(StyleMap embeddedStyleMap) {
78-
return new DocumentToHtmlOptions(idPrefix, preserveEmptyParagraphs, styleMap, embeddedStyleMap, disableDefaultStyleMap, disableEmbeddedStyleMap, imageConverter);
140+
return new DocumentToHtmlOptions(
141+
idPrefix,
142+
preserveEmptyParagraphs,
143+
styleMap,
144+
embeddedStyleMap,
145+
disableDefaultStyleMap,
146+
disableEmbeddedStyleMap,
147+
disableExternalFileAccess,
148+
imageConverter
149+
);
79150
}
80151

81152
public DocumentToHtmlOptions imageConverter(ImageConverter.ImgElement imageConverter) {
@@ -86,6 +157,7 @@ public DocumentToHtmlOptions imageConverter(ImageConverter.ImgElement imageConve
86157
embeddedStyleMap,
87158
disableDefaultStyleMap,
88159
disableEmbeddedStyleMap,
160+
disableExternalFileAccess,
89161
InternalImageConverter.imgElement(imageConverter)
90162
);
91163
}
@@ -110,6 +182,10 @@ public StyleMap styleMap() {
110182
return styleMap;
111183
}
112184

185+
public boolean externalFileAccess() {
186+
return !this.disableExternalFileAccess;
187+
}
188+
113189
public InternalImageConverter imageConverter() {
114190
return imageConverter;
115191
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
package org.zwobble.mammoth.internal.docx;
2+
3+
import java.io.IOException;
4+
import java.io.InputStream;
5+
6+
public class DisabledFileReader implements FileReader {
7+
@Override
8+
public InputStream getInputStream(String uri) throws IOException {
9+
throw new IOException("could not open external image '" + uri + "': external file access is disabled");
10+
}
11+
}

src/main/java/org/zwobble/mammoth/internal/docx/DocumentReader.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,21 @@
2222
import static org.zwobble.mammoth.internal.util.Strings.trimLeft;
2323

2424
public class DocumentReader {
25-
public static InternalResult<Document> readDocument(Optional<Path> path, Archive zipFile) {
25+
public static InternalResult<Document> readDocument(
26+
Optional<Path> path,
27+
Archive zipFile,
28+
boolean externalFileAccess
29+
) {
2630
PartPaths partPaths = findPartPaths(zipFile);
2731

2832
Styles styles = readStyles(zipFile, partPaths);
2933
Numbering numbering = readNumbering(zipFile, partPaths, styles);
3034
ContentTypes contentTypes = readContentTypes(zipFile);
31-
FileReader fileReader = new PathRelativeFileReader(path);
35+
36+
FileReader fileReader = externalFileAccess
37+
? new PathRelativeFileReader(path)
38+
: new DisabledFileReader();
39+
3240
PartWithBodyReader partReader = new PartWithBodyReader(zipFile, contentTypes, fileReader, numbering, styles);
3341
return InternalResult.flatMap(
3442
readNotes(partReader, partPaths),

src/test/java/org/zwobble/mammoth/tests/MammothTests.java

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,29 @@ public void warnIfDocumentHasImagesStoredOutsideOfDocumentWhenPathOfDocumentIsUn
120120
}
121121
}
122122

123+
@Test
124+
public void warnIfDocumentHasImagesStoredOutsideOfDocumentWhenExternalFileAccessIsDisabled() throws IOException {
125+
Path tempDirectory = Files.createTempDirectory("mammoth-");
126+
try {
127+
Path documentPath = tempDirectory.resolve("external-picture.docx");
128+
Files.copy(TestData.file("external-picture.docx").toPath(), documentPath);
129+
Files.copy(TestData.file("tiny-picture.png").toPath(), tempDirectory.resolve("tiny-picture.png"));
130+
assertThat(
131+
new DocumentConverter()
132+
.disableExternalFileAccess()
133+
.convertToHtml(documentPath.toFile()),
134+
allOf(
135+
hasProperty("value", equalTo("")),
136+
hasProperty("warnings", contains(
137+
equalTo("could not open external image 'tiny-picture.png': external file access is disabled")
138+
))
139+
)
140+
);
141+
} finally {
142+
tempDirectory.toFile().delete();
143+
}
144+
}
145+
123146
@Test
124147
public void warnIfImagesStoredOutsideOfDocumentAreNotFound() throws IOException {
125148
Path tempDirectory = Files.createTempDirectory("mammoth-");

src/test/java/org/zwobble/mammoth/tests/docx/DocumentReaderTests.java

Lines changed: 36 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -38,30 +38,47 @@ public class DocumentReaderTests {
3838

3939
@Test
4040
public void mainDocumentIsFoundUsingPackageRelationships() {
41-
InternalResult<Document> result = DocumentReader.readDocument(Optional.empty(), InMemoryArchive.fromStrings(map(
41+
Archive archive = InMemoryArchive.fromStrings(map(
4242
"word/document2.xml", XmlWriter.toString(
43-
element("w:document", list(
44-
element("w:body", list(
45-
element("w:p", list(
46-
element("w:r", list(
47-
element("w:t", list(text("Hello.")))
48-
))
49-
))
50-
))
51-
)),
43+
element(
44+
"w:document", list(
45+
element(
46+
"w:body", list(
47+
element(
48+
"w:p", list(
49+
element(
50+
"w:r", list(
51+
element("w:t", list(text("Hello.")))
52+
)
53+
)
54+
)
55+
)
56+
)
57+
)
58+
)
59+
),
5260
mainDocumentNamespaces
5361
),
5462
"_rels/.rels", XmlWriter.toString(
55-
element("Relationships", list(
56-
element("Relationship", map(
57-
"Id", "rId1",
58-
"Target", "/word/document2.xml",
59-
"Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"
60-
))
61-
)),
63+
element(
64+
"Relationships", list(
65+
element(
66+
"Relationship", map(
67+
"Id", "rId1",
68+
"Target", "/word/document2.xml",
69+
"Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"
70+
)
71+
)
72+
)
73+
),
6274
relationshipsNamespaces
6375
)
64-
)));
76+
));
77+
InternalResult<Document> result = DocumentReader.readDocument(
78+
Optional.empty(),
79+
archive,
80+
false
81+
);
6582

6683
assertThat(result, isInternalSuccess(document(
6784
withChildren(paragraphWithText("Hello."))
@@ -84,7 +101,7 @@ public void errorIsThrownWhenMainDocumentPartDoesNotExist() {
84101
));
85102
PassThroughException exception = assertThrows(
86103
PassThroughException.class,
87-
() -> DocumentReader.readDocument(Optional.empty(), archive)
104+
() -> DocumentReader.readDocument(Optional.empty(), archive, false)
88105
);
89106
assertThat(exception.getMessage(), equalTo("java.io.IOException: Could not find main document part. Are you sure this is a valid .docx file?"));
90107
}

0 commit comments

Comments
 (0)