Task 5 (#5)

lfoppiano · web-flow · commit 824461572604 · 2026-01-16T15:33:36.000+01:00
* ignore .idea, target

* add pom.xml, Readme.md and the data files

* add makefile

* add read warc

* add CI + spotless

* add figures, editorconfig, .gitignore from the python repository brother

* remove unclear make install, remove venv info from readme

* update read class, add recompress,

* cleanup, removing the rest of the python stuff for task 0,1,2

* fix missing make install

* move data under 'data' directory

* add Apache header in the code

* make sure we build before running

* update .gitignore

* Implement WARC compression validation for  Task 5

* Ignore gzip validation if is uncompressed

* fix compression check, update Readme.md

* add missing apache licence

* add commons-compress library

* Fix CI script

* place Github Actions in the correct directory

* Fix cache, update build description

* Fix formatting

* Fix method signature

* remove non-implemented part - to avoid confusion

* demonstrate compress-at-record access to WARC file using only JWARC

* fix: typos
diff --git a/.github/workflows/ci-build-unstable.yml b/.github/workflows/ci-build-unstable.yml
@@ -14,8 +14,8 @@ jobs:
         with:
           java-version: '11'
           distribution: 'temurin'
-          cache: 'mvn'
-      - name: Build with Gradle
+          cache: maven
+      - name: Build with Maven
         run: mvn -B clean compile
       - name: Check with spotless
-        run: mvn spotless:check
+        run: mvn spotless:check
diff --git a/Makefile b/Makefile
@@ -63,19 +63,28 @@ wreck_the_warc: build get_jwarc
 	rm -f data/testing.warc
 	gzip -d data/testing.warc.gz  # windows gunzip no work-a
 	@echo
-	@echo iterate over this uncompressed warc: works
-	mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc"
-	@echo
 	@echo compress it the wrong way
 	gzip data/testing.warc
 	@echo
-	@echo iterating over this compressed warc fails
-	mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc.gz" || /usr/bin/true
+	@echo showing the records in the compressed warc - note the offsets of request and response are identical
+	java -jar jwarc-0.33.0.jar ls data/testing.warc.gz
+	@echo
+	@echo access the request record - failing
+	java -jar jwarc-0.33.0.jar extract data/testing.warc.gz 3734 || /usr/bin/true
+	@echo
+	@echo access the response record - failing
+	java -jar jwarc-0.33.0.jar extract data/testing.warc.gz 3734 || /usr/bin/true
 	@echo
 	@echo "now let's do it the right way"
 	gzip -d data/testing.warc.gz
 	mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.RecompressWARC -Dexec.args="data/testing.warc data/testing.warc.gz"
 	@echo
-	@echo and now iterating works
-	mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc.gz"
+	@echo showing the records in the compressed warc - note the skewed offsets of request and response
+	java -jar jwarc-0.33.0.jar ls data/testing.warc.gz
+	@echo
+	@echo access the request record - works
+	java -jar jwarc-0.33.0.jar extract data/testing.warc.gz 518 | head
+	@echo
+	@echo access the response record - works
+	java -jar jwarc-0.33.0.jar extract data/testing.warc.gz 1027 | head -n 20
 	@echo
diff --git a/README.md b/README.md
@@ -422,7 +422,109 @@ TBA
 
 ## Task 5: Wreck the WARC by compressing it wrong
 
-TBA
+As mentioned earlier, WARC/WET/WAT files look like they're normal gzipped files, but they're actually gzipped in a particular way that allows random access. This means that you can't `gunzip` and then `gzip` a warc without wrecking random access. This example:
+
+* creates a copy of one of the warc files in the repo
+* lists the records and their respective offsets using JWARC
+* accesses one of the records in the middle of the archive to show that it works
+* uncompresses it
+* recompresses it the wrong way
+* accesses one of the records in the middle of the wrongly compressed archive, showing that it fails
+* recompresses it the right way using `org.commoncrawl.whirlwind.RecompressWARC`
+* shows that it now works by accessing one of the records in the middle of the archive
+
+Run
+
+```make wreck_the_warc```
+
+and read through the output. You should get something like the output below:
+
+<details>
+  <summary>Click to view output</summary>
+
+```
+we will break and then fix this warc
+cp data/whirlwind.warc.gz data/testing.warc.gz
+rm -f data/testing.warc
+gzip -d data/testing.warc.gz  # windows gunzip no work-a
+
+compress it the wrong way
+gzip data/testing.warc
+
+showing the records in the compressed warc - note the offsets of request and response are identical 
+java -jar jwarc.jar ls data/testing.warc.gz
+         0 warcinfo   -    -
+      3734 request    GET  https://an.wikipedia.org/wiki/Escopete
+      3734 response   200  https://an.wikipedia.org/wiki/Escopete
+     18386 metadata   -    https://an.wikipedia.org/wiki/Escopete
+
+access the request record - failing
+java -jar jwarc.jar extract data/testing.warc.gz 3734 || /usr/bin/true
+Exception in thread "main" org.netpreserve.jwarc.ParsingException: invalid WARC record at position 0: <-- HERE -->\xffffff87@\r\xffffffa1\xffffffca\xffffff84\x1d\xffffffca\x0f0\xffffffb4\xffffff93\xfffffff9\xffffffc5\xfffffff3\xffffff89\xffffffeb?\x1b\xffffff87,q\xffffffed\xffffffb3!s\xffffffc1\x08\xffffff83\\xffffffe0T\xffffffadG\xffffffdcd5\x02\xffffffbaQ... (offset 3734)
+        at org.netpreserve.jwarc.WarcParser.parse(WarcParser.java:356)
+        at org.netpreserve.jwarc.WarcReader.next(WarcReader.java:181)
+        at org.netpreserve.jwarc.tools.ExtractTool.main(ExtractTool.java:141)
+        at org.netpreserve.jwarc.tools.WarcTool.main(WarcTool.java:26)
+
+access the response record - failing
+java -jar jwarc.jar extract data/testing.warc.gz 3734 || /usr/bin/true
+Exception in thread "main" org.netpreserve.jwarc.ParsingException: invalid WARC record at position 0: <-- HERE -->\xffffff87@\r\xffffffa1\xffffffca\xffffff84\x1d\xffffffca\x0f0\xffffffb4\xffffff93\xfffffff9\xffffffc5\xfffffff3\xffffff89\xffffffeb?\x1b\xffffff87,q\xffffffed\xffffffb3!s\xffffffc1\x08\xffffff83\\xffffffe0T\xffffffadG\xffffffdcd5\x02\xffffffbaQ... (offset 3734)
+        at org.netpreserve.jwarc.WarcParser.parse(WarcParser.java:356)
+        at org.netpreserve.jwarc.WarcReader.next(WarcReader.java:181)
+        at org.netpreserve.jwarc.tools.ExtractTool.main(ExtractTool.java:141)
+        at org.netpreserve.jwarc.tools.WarcTool.main(WarcTool.java:26)
+
+now let's do it the right way
+gzip -d data/testing.warc.gz
+mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.RecompressWARC -Dexec.args="data/testing.warc data/testing.warc.gz"
+
+showing the records in the compressed warc
+java -jar jwarc.jar ls data/testing.warc.gz
+         0 warcinfo   -    -
+       518 request    GET  https://an.wikipedia.org/wiki/Escopete
+      1027 response   200  https://an.wikipedia.org/wiki/Escopete
+     18383 metadata   -    https://an.wikipedia.org/wiki/Escopete
+
+access the request record - works
+java -jar jwarc.jar extract data/testing.warc.gz 518 | head
+WARC/1.0
+Content-Length: 265
+Content-Type: application/http; msgtype=request
+WARC-Block-Digest: sha1:IE7NEN3QEJHUCYRRGVMHDDW3BEHFRQ6V
+WARC-Date: 2024-05-18T01:58:10Z
+WARC-IP-Address: 208.80.154.224
+WARC-Payload-Digest: sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ
+WARC-Record-ID: <urn:uuid:292f457d-203c-42f2-a1b5-69a4dabefd4f>
+WARC-Target-URI: https://an.wikipedia.org/wiki/Escopete
+WARC-Type: request
+
+access the response record - works
+java -jar jwarc.jar extract data/testing.warc.gz 1027 | head -n 20
+WARC/1.0
+Content-Length: 74581
+Content-Type: application/http; msgtype=response
+WARC-Block-Digest: sha1:35FTUGFVNWRVTZQGCWIX2MQA3LMYC7X7
+WARC-Concurrent-To: <urn:uuid:292f457d-203c-42f2-a1b5-69a4dabefd4f>
+WARC-Date: 2024-05-18T01:58:10Z
+WARC-Identified-Payload-Type: text/html
+WARC-IP-Address: 208.80.154.224
+WARC-Payload-Digest: sha1:RY7PLBUFQNI2FFV5FTUQK72W6SNPXLQU
+WARC-Record-ID: <urn:uuid:2aabeff2-67f5-4608-8466-e87c6296e2b6>
+WARC-Target-URI: https://an.wikipedia.org/wiki/Escopete
+WARC-Type: response
+WARC-Warcinfo-ID: <urn:uuid:668d88fc-4208-41fc-b327-1aa6cb783331>
+
+HTTP/1.1 200 OK
+date: Sat, 18 May 2024 01:58:10 GMT
+server: mw-web.eqiad.canary-bb67b76b8-jtwdb
+x-content-type-options: nosniff
+content-language: an
+origin-trial: AonOP4SwCrqpb0nhZbg554z9iJimP3DxUDB8V4yu9fyyepauGKD0NXqTknWi4gnuDfMG6hNb7TDUDTsl0mDw9gIAAABmeyJvcmlnaW4iOiJodHRwczovL3dpa2lwZWRpYS5vcmc6NDQzIiwiZmVhdHVyZSI6IlRvcExldmVsVHBjZCIsImV4cGlyeSI6MTczNTM0Mzk5OSwiaXNTdWJkb21haW4iOnRydWV9
+```
+
+</details>
+
+Make sure you compress WARCs the right way!
 
 ## Task 6: Use cdx_toolkit to query the full CDX index and download those captures from AWS S3
 
diff --git a/pom.xml b/pom.xml
@@ -16,11 +16,17 @@
 	</properties>
 
 	<dependencies>
+		<dependency>
+			<groupId>org.apache.commons</groupId>
+			<artifactId>commons-compress</artifactId>
+			<version>1.28.0</version>
+		</dependency>
 		<dependency>
 			<groupId>org.netpreserve</groupId>
 			<artifactId>jwarc</artifactId>
 			<version>0.33.0</version>
 		</dependency>
+
 	</dependencies>
 
 	<build>
diff --git a/src/main/java/org/commoncrawl/whirlwind/ReadWARC.java b/src/main/java/org/commoncrawl/whirlwind/ReadWARC.java
@@ -29,33 +29,42 @@
 
 public class ReadWARC {
 
-    public static void main(String[] args) throws IOException {
-
-        if (args.length != 1) {
-            System.err.println("Usage: java ReadWARC <input-warc-file>");
-            System.exit(1);
-        }
-
-        Path requested = Path.of(args[0]).toAbsolutePath().normalize();
-        if (!Files.isRegularFile(requested)) {
-            throw new SecurityException("Invalid WARC path");
-        }
-
-        final List<String> RESPONSE_TYPES = Arrays.asList("request", "response", "conversion", "metadata");
-
-        try (
-                InputStream in = Files.newInputStream(requested);
-                WarcReader reader = new WarcReader(in)
-        ) {
-            reader.records().forEach(record -> {
-                System.out.println(" WARC-Type: " + record.type());
-                if (RESPONSE_TYPES.contains(record.type())) {
-                    MessageHeaders headers = record.headers();
-                    for (String header : headers.all("WARC-Target-URI")) {
-                        System.out.println(" WARC-Target-URI " + header);
-                    }
-                }
-            });
-        }
-    }
+	private static final List<String> RESPONSE_TYPES = Arrays.asList("request", "response", "conversion", "metadata");
+
+	public static void main(String[] args) throws IOException {
+
+		if (args.length != 1) {
+			System.err.println("Usage: java ReadWARC <input-warc-file>");
+			System.exit(1);
+		}
+
+		Path requested = Path.of(args[0]).toAbsolutePath().normalize();
+		if (!Files.isRegularFile(requested)) {
+			throw new SecurityException("Invalid WARC path");
+		}
+
+		if (requested.toString().endsWith("gz") || requested.toString().endsWith("gzip")) {
+			try {
+				ValidateWARC.validateRandomAccessWarcOrFail(requested);
+			} catch (IOException e) {
+				System.out.println("This file is probably not a multi-member gzip but a single gzip file." + "\n"
+						+ "To allow seek, a gzipped WARC must have each record compressed into a single gzip member and concatenated together."
+						+ "\n" + "\n" + "This file is likely still valid and can be fixed by running:" + "\n"
+						+ "mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.RecompressWARC -Dexec.args=\"testing.warc testing.warc.gz\"");
+				System.exit(-1);
+			}
+		}
+
+		try (InputStream in = Files.newInputStream(requested); WarcReader reader = new WarcReader(in)) {
+			reader.records().forEach(record -> {
+				System.out.println(" WARC-Type: " + record.type());
+				if (RESPONSE_TYPES.contains(record.type())) {
+					MessageHeaders headers = record.headers();
+					for (String header : headers.all("WARC-Target-URI")) {
+						System.out.println(" WARC-Target-URI " + header);
+					}
+				}
+			});
+		}
+	}
 }
diff --git a/src/main/java/org/commoncrawl/whirlwind/RecompressWARC.java b/src/main/java/org/commoncrawl/whirlwind/RecompressWARC.java
@@ -32,39 +32,38 @@
 
 public class RecompressWARC {
 
-    public static void main(String[] args) throws IOException {
+	public static void main(String[] args) throws IOException {
 
-        if (args.length != 2) {
-            System.err.println("Usage: java RecompressWarc <input-uncompressed-warc-file> <output-compressed-warc-file>");
-            System.exit(1);
-        }
+		if (args.length != 2) {
+			System.err
+					.println("Usage: java RecompressWarc <input-uncompressed-warc-file> <output-compressed-warc-file>");
+			System.exit(1);
+		}
 
-        Path inputPath = Path.of(args[0]).toAbsolutePath().normalize();
-        Path outputPath = Path.of(args[1]).toAbsolutePath().normalize();
+		Path inputPath = Path.of(args[0]).toAbsolutePath().normalize();
+		Path outputPath = Path.of(args[1]).toAbsolutePath().normalize();
 
-        if (!Files.isRegularFile(inputPath)) {
-            throw new SecurityException("Invalid input WARC path");
-        }
+		if (!Files.isRegularFile(inputPath)) {
+			throw new SecurityException("Invalid input WARC path");
+		}
 
-        if (inputPath.endsWith(".gz")) {
-            System.out.println("Input WARC file is already compressed");
-            System.exit(1);
-        }
+		if (inputPath.toString().endsWith(".gz")) { // Path.endsWith(String) matches whole name elements, never a ".gz" suffix
+			System.out.println("Input WARC file is already compressed");
+			System.exit(1);
+		}
 
-        try (
-                InputStream in = Files.newInputStream(inputPath);
-                WarcReader reader = new WarcReader(in);
-                OutputStream out = Files.newOutputStream(outputPath);
-                WritableByteChannel outChannel = Channels.newChannel(out);
-                WarcWriter writer = new WarcWriter(outChannel, WarcCompression.GZIP)
-        ) {
-            reader.forEach(record -> {
-                try {
-                    writer.write(record);
-                } catch (IOException e) {
-                    throw new UncheckedIOException(e);
-                }
-            });
-        }
-    }
+		try (InputStream in = Files.newInputStream(inputPath);
+				WarcReader reader = new WarcReader(in);
+				OutputStream out = Files.newOutputStream(outputPath);
+				WritableByteChannel outChannel = Channels.newChannel(out);
+				WarcWriter writer = new WarcWriter(outChannel, WarcCompression.GZIP)) {
+			reader.forEach(record -> {
+				try {
+					writer.write(record);
+				} catch (IOException e) {
+					throw new UncheckedIOException(e);
+				}
+			});
+		}
+	}
 }
diff --git a/src/main/java/org/commoncrawl/whirlwind/ValidateWARC.java b/src/main/java/org/commoncrawl/whirlwind/ValidateWARC.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.commoncrawl.whirlwind;
+
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.concurrent.atomic.AtomicInteger;
+
+public class ValidateWARC { // Counts gzip members in a file to tell whole-file gzip apart from multi-member (per-record) gzip.
+	public static void main(String[] args) throws Exception { // CLI entry point: expects exactly one argument, the path of the gzip file.
+		if (args.length != 1) {
+			System.err.println("Usage: java ValidateWARC <file.gz>");
+			System.exit(2); // wrong argument count exits with status 2
+		}
+
+		Path requested = Path.of(args[0]).toAbsolutePath().normalize();
+		if (!Files.isRegularFile(requested)) { // rejects directories, missing files and anything else that is not a regular file
+			throw new SecurityException("Invalid WARC path");
+		}
+
+		int n = getWarcCompressionInformation(requested);
+		if (n <= 1) { // zero or one member is reported as whole-file gzip
+			System.out.println("Single-member gzip (likely whole-file gzip). members=" + n);
+		} else {
+			System.out.println("Concatenated multi-member gzip (record-compressed). members=" + n);
+		}
+
+	}
+
+	public static int getWarcCompressionInformation(Path inputWarc) throws IllegalArgumentException { // Returns how many times the member-end callback fired while decompressing inputWarc.
+		final AtomicInteger memberCount = new AtomicInteger(0); // mutable counter so the callback lambda below can update it
+
+		try (InputStream fis = Files.newInputStream(inputWarc);
+				BufferedInputStream bis = new BufferedInputStream(fis);
+				GzipCompressorInputStream gz = GzipCompressorInputStream.builder().setDecompressConcatenated(true) // keep decoding past the first member
+						.setOnMemberEnd(x -> memberCount.incrementAndGet()).setInputStream(bis).get()) {
+
+			byte[] buf = new byte[64 * 1024]; // scratch buffer; the decompressed bytes are discarded
+			while (gz.read(buf) != -1) {
+				// Read the entire stream to trigger member processing
+				// We might not need to read the whole stream, just enough to get an idea
+			}
+		} catch (IOException e) { // any I/O or decoding failure is surfaced as an unchecked IllegalArgumentException with the cause attached
+			throw new IllegalArgumentException("The file is either not a gzip file or is corrupted.", e);
+		}
+
+		return memberCount.get();
+	}
+
+	public static void validateRandomAccessWarcOrFail(Path inputWarc) throws IOException { // Throws IOException when inputWarc has at most one gzip member.
+		int n = getWarcCompressionInformation(inputWarc); // an IllegalArgumentException from this call propagates unchanged
+
+		if (n <= 1) {
+			throw new IOException(
+					"Non-chunked gzip file detected, gzip block continues\n" + "    beyond single record. " + n);
+		}
+
+	}
+}