commoncrawl · lfoppiano · Jan 16, 2026 · Dec 16, 2025 · Dec 16, 2025 · Dec 16, 2025
diff --git a/.github/ci-build-unstable.yml → .github/workflows/ci-build-unstable.yml b/.github/ci-build-unstable.yml → .github/workflows/ci-build-unstable.yml
@@ -14,8 +14,8 @@ jobs:
         with:
           java-version: '11'
           distribution: 'temurin'
-          cache: 'mvn'
-      - name: Build with Gradle
+          cache: maven
+      - name: Build with Maven
         run: mvn -B clean compile
       - name: Check with spotless
-        run: mvn spotless:check
+        run: mvn spotless:check
diff --git a/Makefile b/Makefile
@@ -1,32 +1,19 @@
 build:
 	mvn clean package
 
-iterate: build
-	@echo iterating over all of the local warcs:
-	@echo
-	@echo warc:
-	mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/whirlwind.warc.gz"
-	@echo
-	@echo wet:
-	mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/whirlwind.warc.wet.gz"
-	@echo
-	@echo wat:
-	mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/whirlwind.warc.wat.gz"
-	@echo
+cdxj: build ensure_jwarc
+	@echo "creating *.cdxj index files from the local warcs"
+	java -jar jwarc.jar cdxj data/whirlwind.warc.gz > whirlwind.warc.cdxj
+	mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wet.gz --records conversion" > whirlwind.warc.wet.cdxj
+	mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wat.gz --records metadata" > whirlwind.warc.wat.cdxj
 
-# cdxj:
-# 	@echo "creating *.cdxj index files from the local warcs"
-# 	cdxj-indexer whirlwind.warc.gz > whirlwind.warc.cdxj
-# 	cdxj-indexer --records conversion whirlwind.warc.wet.gz > whirlwind.warc.wet.cdxj
-# 	cdxj-indexer whirlwind.warc.wat.gz > whirlwind.warc.wat.cdxj
+extract: ensure_jwarc  # every recipe line below runs jwarc.jar, so make sure it has been downloaded first
+	@echo "creating extraction.* from local warcs, the offset numbers are from the cdxj index"
+	java -jar jwarc.jar extract --payload data/whirlwind.warc.gz 1023 > extraction.html
+	java -jar jwarc.jar extract --payload data/whirlwind.warc.wet.gz 466 > extraction.txt
+	java -jar jwarc.jar extract --payload data/whirlwind.warc.wat.gz 443 > extraction.json
+	@echo "hint: python -m json.tool extraction.json"
 
-# extract:
-# 	@echo "creating extraction.* from local warcs, the offset numbers are from the cdxj index"
-# 	warcio extract --payload whirlwind.warc.gz 1023 > extraction.html
-# 	warcio extract --payload whirlwind.warc.wet.gz 466 > extraction.txt
-# 	warcio extract --payload whirlwind.warc.wat.gz 443 > extraction.json
-# 	@echo "hint: python -m json.tool extraction.json"
-#
 # cdx_toolkit:
 # 	@echo demonstrate that we have this entry in the index
 # 	cdxt --crawl CC-MAIN-2024-22 --from 20240518015810 --to 20240518015810 iter an.wikipedia.org/wiki/Escopete
@@ -44,15 +31,15 @@ iterate: build
 # 	python ./warcio-iterator.py TEST-000000.extracted.warc.gz
 # 	@echo
 #
-# download_collinfo:
-# 	@echo "downloading collinfo.json so we can find out the crawl name"
-# 	curl -O https://index.commoncrawl.org/collinfo.json
-#
-# CC-MAIN-2024-22.warc.paths.gz:
-# 	@echo "downloading the list from s3, requires s3 auth even though it is free"
-# 	@echo "note that this file should be in the repo"
-# 	 aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ | awk '{print $$4}' | gzip -9 > CC-MAIN-2024-22.warc.paths.gz
-#
+download_collinfo:
+	@echo "downloading collinfo.json so we can find out the crawl name"
+	curl -O https://index.commoncrawl.org/collinfo.json
+
+CC-MAIN-2024-22.warc.paths.gz:
+	@echo "downloading the list from s3, requires s3 auth even though it is free"
+	@echo "note that this file should be in the repo"
+	 aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ | awk '{print $$4}' | gzip -9 > CC-MAIN-2024-22.warc.paths.gz
+
 # duck_local_files:
 # 	@echo "warning! 300 gigabyte download"
 # 	python duck.py local_files
@@ -65,7 +52,20 @@ iterate: build
 # 	@echo "warning! this might take 1-10 minutes"
 # 	python duck.py cloudfront
 #
-wreck_the_warc: build
+ensure_jwarc:  # fetch jwarc.jar only when it is missing; download to a temp name so an interrupted transfer is never mistaken for a complete JAR
+	@echo "Ensuring JWarc JAR is present"
+	@if [ ! -f jwarc.jar ] ; then \
+	  echo "jwarc.jar not found, downloading..." ; \
+	  curl -fL -o jwarc.jar.tmp https://github.com/iipc/jwarc/releases/download/v0.33.0/jwarc-0.33.0.jar && mv -f jwarc.jar.tmp jwarc.jar ; \
+	else \
+	  echo "jwarc.jar found." ; \
+	fi
+
+get_jwarc:
+	@echo "downloading JWarc JAR"
+	curl -fL -o jwarc.jar https://github.com/iipc/jwarc/releases/download/v0.33.0/jwarc-0.33.0.jar
+
+wreck_the_warc: build ensure_jwarc
 	@echo
 	@echo we will break and then fix this warc
 	cp data/whirlwind.warc.gz data/testing.warc.gz

diff --git a/README.md b/README.md
@@ -208,24 +208,22 @@ We have two versions of the index: the CDX index and the columnar index. The CDX
 
 ### CDX(J) index
 
-**TBA**: Did not find a good java library that implements this feature, ideally can be implemented in jwarc
-
 The CDX index files are sorted plain-text files, with each line containing information about a single capture in the WARC. Technically, Common Crawl uses CDXJ index files since the information about each capture is formatted as JSON. We'll use CDX and CDXJ interchangeably in this tour for legacy reasons 💅
 
 We can create our own CDXJ index from the local WARCs by running:
 
 ```make cdxj```
 
-This uses the [cdxj-indexer](https://github.com/webrecorder/cdxj-indexer) library to generate CDXJ index files for our WARC files by running the code below: 
+This uses the JWARC library for the WARC file and, for the WET and WAT files, a small home-grown indexer (`CdxjIndexer`) that we wrote to support those record types, to generate the CDXJ index files by running the code below:
 
 <details>
   <summary>Click to view code</summary>
 
 ```
 creating *.cdxj index files from the local warcs
-cdxj-indexer whirlwind.warc.gz > whirlwind.warc.cdxj
-cdxj-indexer --records conversion whirlwind.warc.wet.gz > whirlwind.warc.wet.cdxj
-cdxj-indexer whirlwind.warc.wat.gz > whirlwind.warc.wat.cdxj
+java -jar jwarc.jar cdxj data/whirlwind.warc.gz > whirlwind.warc.cdxj
+mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wet.gz --records conversion" > whirlwind.warc.wet.cdxj
+mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wat.gz --records metadata" > whirlwind.warc.wat.cdxj
 ```
 
 </details>
@@ -249,16 +247,16 @@ Run:
 ```make extract```
 
 to run a set of extractions from your local
-`whirlwind.*.gz` files with `warcio` using the code below:
+`whirlwind.*.gz` files with `JWARC` using the commands below:
 
 <details>
   <summary>Click to view code</summary>
 
 ```
 creating extraction.* from local warcs, the offset numbers are from the cdxj index
-warcio extract --payload whirlwind.warc.gz 1023 > extraction.html
-warcio extract --payload whirlwind.warc.wet.gz 466 > extraction.txt
-warcio extract --payload whirlwind.warc.wat.gz 443 > extraction.json
+java -jar jwarc.jar extract --payload data/whirlwind.warc.gz 1023 > extraction.html
+java -jar jwarc.jar extract --payload data/whirlwind.warc.wet.gz 466 > extraction.txt
+java -jar jwarc.jar extract --payload data/whirlwind.warc.wat.gz 443 > extraction.json
 hint: python -m json.tool extraction.json
 ```
 
@@ -291,12 +289,12 @@ and read through the output. You should get something like the output below:
 
 ```
 we will break and then fix this warc
-cp whirlwind.warc.gz testing.warc.gz
-rm -f testing.warc
-gzip -d testing.warc.gz  # windows gunzip no work-a
+cp data/whirlwind.warc.gz data/testing.warc.gz
+rm -f data/testing.warc
+gzip -d data/testing.warc.gz  # windows gunzip no work-a
 
 iterate over this uncompressed warc: works
-mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="testing.warc"
+mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc"
  WARC-Type: warcinfo
  WARC-Type: request
  WARC-Target-URI https://an.wikipedia.org/wiki/Escopete
@@ -306,25 +304,22 @@ mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args
  WARC-Target-URI https://an.wikipedia.org/wiki/Escopete
 
 compress it the wrong way
-gzip testing.warc
+gzip data/testing.warc
 
--- HERE IT DOES NOT FAIL --
 iterating over this compressed warc fails
-mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="testing.warc.gz" || /usr/bin/true
- WARC-Type: warcinfo
- WARC-Type: request
- WARC-Target-URI https://an.wikipedia.org/wiki/Escopete
- WARC-Type: response
- WARC-Target-URI https://an.wikipedia.org/wiki/Escopete
- WARC-Type: metadata
- WARC-Target-URI https://an.wikipedia.org/wiki/Escopete
+mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc.gz" || /usr/bin/true
+This file is probably not a multi-member gzip but a single gzip file.
+To allow seek, a gzipped WARC must have each record compressed into a single gzip member and concatenated together.
 
-now let's do it the right way
-gzip -d testing.warc.gz
+This file is likely still valid and can be fixed by running:
 mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.RecompressWARC -Dexec.args="testing.warc testing.warc.gz"
 
+now let's do it the right way
+gzip -d data/testing.warc.gz
+mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.RecompressWARC -Dexec.args="data/testing.warc data/testing.warc.gz"
+
 and now iterating works
-mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="testing.warc.gz"
+mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc.gz"
  WARC-Type: warcinfo
  WARC-Type: request
  WARC-Target-URI https://an.wikipedia.org/wiki/Escopete

diff --git a/pom.xml b/pom.xml
@@ -16,10 +16,15 @@
 	</properties>
 
 	<dependencies>
+		<dependency>
+			<groupId>org.apache.commons</groupId>
+			<artifactId>commons-compress</artifactId>
+			<version>1.28.0</version>
+		</dependency>
 		<dependency>
 			<groupId>org.netpreserve</groupId>
 			<artifactId>jwarc</artifactId>
-			<version>0.32.0</version>
+			<version>0.33.0</version>
 		</dependency>
 	</dependencies>