diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
index aeebd8881e..1c7ce15581 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
@@ -212,10 +212,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
         xhtml.startDocument();
         try {
             Metadata entrydata = new Metadata();
-            if (cis instanceof GzipCompressorInputStream) {
-                extractGzipMetadata((GzipCompressorInputStream) cis, entrydata);
-            }
-            setName(metadata, entrydata);
+            setNameAndInternalPath(cis, metadata, entrydata);
 
             // Use the delegate parser to parse the compressed document
             EmbeddedDocumentExtractor extractor =
@@ -230,33 +227,56 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
         xhtml.endDocument();
     }
 
-    private void extractGzipMetadata(GzipCompressorInputStream gzcis, Metadata metadata) {
-        GzipParameters gzipParameters = gzcis.getMetaData();
-        if (gzipParameters == null) {
-            return;
-        }
-        String name = gzipParameters.getFileName();
-        if (!StringUtils.isBlank(name)) {
-            metadata.set(TikaCoreProperties.INTERNAL_PATH, name);
-        }
-        //TODO: modification, OS, comment
+    /**
+     * Returns the file name stored in the gzip metadata of the given stream.
+     *
+     * @param cis the compressor stream being parsed
+     * @return the gzip file name, or {@code null} if the stream is not gzip, has no
+     *         gzip parameters, or its file name is blank
+     */
+    private String getNameFromGzipMetadataIfPossible(CompressorInputStream cis) {
+        if (!(cis instanceof GzipCompressorInputStream)) {
+            return null;
+        }
+        GzipParameters gzipParameters = ((GzipCompressorInputStream) cis).getMetaData();
+        if (gzipParameters == null) {
+            return null;
+        }
+        String name = gzipParameters.getFileName();
+        return StringUtils.isBlank(name) ? null : name;
     }
 
-    private void setName(Metadata parentMetadata, Metadata metadata) {
-        String name = parentMetadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
-        //if parent's name is blank stop now
-        if (StringUtils.isBlank(name)) {
-            return;
-        }
-        if (name.endsWith(".tgz") || name.endsWith(".tbz") || name.endsWith(".tbz2")) {
-            name = name.substring(0, name.lastIndexOf(".")) + ".tar";
-        } else if (name.endsWith(".bz") || name.endsWith("gz") || name.endsWith(".bz2") || name.endsWith(".xz") || name.endsWith(".zlib") || name.endsWith(".pack") ||
-                name.endsWith(".br")) {
-            name = name.substring(0, name.lastIndexOf("."));
-        } else if (!name.isEmpty()) {
-            name = GzipUtils.getUncompressedFileName(name);
+    /**
+     * Sets the resource name and the internal path of the embedded entry.
+     * The file name from the gzip metadata is preferred; when there is none, the name is
+     * derived from the parent's resource name by rewriting its compression suffix.
+     * Nothing is set if neither source yields a name.
+     *
+     * @param cis the compressor stream being parsed
+     * @param parentMetadata metadata of the container; read for its resource name
+     * @param metadata metadata of the embedded entry; updated in place
+     */
+    private void setNameAndInternalPath(CompressorInputStream cis, Metadata parentMetadata, Metadata metadata) {
+        String name = getNameFromGzipMetadataIfPossible(cis);
+
+        if (name == null) {
+            name = parentMetadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
+            //if parent's name is blank stop now
+            if (StringUtils.isBlank(name)) {
+                return;
+            }
+            if (name.endsWith(".tgz") || name.endsWith(".tbz") || name.endsWith(".tbz2")) {
+                name = name.substring(0, name.lastIndexOf(".")) + ".tar";
+            } else if (name.endsWith(".bz") || name.endsWith(".gz") || name.endsWith(".bz2") || name.endsWith(".xz") || name.endsWith(".zlib") || name.endsWith(".pack") ||
+                    name.endsWith(".br")) {
+                name = name.substring(0, name.lastIndexOf("."));
+            } else if (!name.isEmpty()) {
+                name = GzipUtils.getUncompressedFileName(name);
+            }
         }
+
         metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
+        metadata.set(TikaCoreProperties.INTERNAL_PATH, name);
     }
 
     /**
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
index 99f12427b2..c99f6d582b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
@@ -50,7 +50,7 @@ public void testEmbedded() throws Exception {
         assertEquals(1, tracker.mediatypes.size());
         assertEquals(1, tracker.modifiedAts.size());
 
-        assertEquals(null, tracker.filenames.get(0));
+        assertEquals("test-documents.tar", tracker.filenames.get(0));
         assertEquals(null, tracker.mediatypes.get(0));
         assertEquals(null, tracker.modifiedAts.get(0));
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/test-documents-no-name-metadata.tgz b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/test-documents-no-name-metadata.tgz
new file mode 100644
index 0000000000..9dee3973bf
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/test-documents-no-name-metadata.tgz differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 69a0eacccb..09defc3531 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -174,6 +174,16 @@ public void testTarball() throws Exception {
                 "/test-documents.tar"), actualEmbeddedPaths);
     }
 
+    @Test
+    public void testTarballWithoutGzipNameMetadata() throws Exception {
+        List<Metadata> list = getRecursiveMetadata("test-documents-no-name-metadata.tgz");
+        Metadata last = list.get(list.size() - 1);
+        String internalPath = last.get(TikaCoreProperties.INTERNAL_PATH);
+        String embeddedResourcePath = last.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
+        assertEquals("test-documents-no-name-metadata.tar", internalPath);
+        assertEquals("/test-documents-no-name-metadata.tar", embeddedResourcePath);
+    }
+
     @Test
     public void testCharLimitNoThrowOnWriteLimit() throws Exception {
         ParseContext context = new ParseContext();