@@ -52,20 +52,12 @@ CC-MAIN-2024-22.warc.paths.gz:
5252# @echo "warning! this might take 1-10 minutes"
5353# python duck.py cloudfront
5454#
55- ensure_jwarc:
56- @echo " Ensuring JWarc JAR is present"
57- @if [ ! -f jwarc.jar ] ; then \
58- echo " jwarc.jar not found, downloading..." ; \
59- curl -fL -o jwarc.jar https://github.com/iipc/jwarc/releases/download/v0.33.0/jwarc-0.33.0.jar ; \
60- else \
61- echo " jwarc.jar found." ; \
62- fi
6355
64- get_jwarc:
56+ jwarc.jar:
6557 @echo " downloading JWarc JAR"
6658 curl -fL -o jwarc.jar https://github.com/iipc/jwarc/releases/download/v0.33.0/jwarc-0.33.0.jar
6759
68- wreck_the_warc: build ensure_jwarc
60+ wreck_the_warc: build jwarc.jar
6961 @echo
7062 @echo we will break and then fix this warc
7163 cp data/whirlwind.warc.gz data/testing.warc.gz
@@ -76,24 +68,24 @@ wreck_the_warc: build ensure_jwarc
7668 gzip data/testing.warc
7769 @echo
7870 @echo showing the records in the compressed warc - note the offsets of request and response are
79- java -jar jwarc-0.33.0.jar ls data/testing.warc.gz
71+ java -jar jwarc.jar ls data/testing.warc.gz
8072 @echo
8173 @echo access the request record - failing
82- java -jar jwarc-0.33.0.jar extract data/testing.warc.gz 3734 || /usr/bin/true
74+ java -jar jwarc.jar extract data/testing.warc.gz 3734 || /usr/bin/true
8375 @echo
8476 @echo access the response record - failing
85- java -jar jwarc-0.33.0.jar extract data/testing.warc.gz 3734 || /usr/bin/true
77+ java -jar jwarc.jar extract data/testing.warc.gz 3734 || /usr/bin/true
8678 @echo
8779 @echo " now let's do it the right way"
8880 gzip -d data/testing.warc.gz
8981 mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.RecompressWARC -Dexec.args=" data/testing.warc data/testing.warc.gz"
9082 @echo
9183 @echo showing the records in the compressed warc - note the skewed offsets of request and response
92- java -jar jwarc-0.33.0.jar ls data/testing.warc.gz
84+ java -jar jwarc.jar ls data/testing.warc.gz
9385 @echo
9486 @echo access the request record - works
95- java -jar jwarc-0.33.0.jar extract data/testing.warc.gz 518 | head
87+ java -jar jwarc.jar extract data/testing.warc.gz 518 | head
9688 @echo
9789 @echo access the response record - works
98- java -jar jwarc-0.33.0.jar extract data/testing.warc.gz 1027 | head -n 20
90+ java -jar jwarc.jar extract data/testing.warc.gz 1027 | head -n 20
9991 @echo
0 commit comments