# Compile and package the project with Maven, starting from a clean tree.
# Declared phony because `build` names an action, not a file this rule
# creates; a stray file or directory called `build` must not suppress it.
.PHONY: build
build:
	mvn clean package
33
# Create CDXJ index files next to the local WARC, WET and WAT archives in data/.
# Prerequisites: `build` (mvn clean package) and the jwarc.jar file.
# The WARC index is produced by jwarc; the WET and WAT indexes are produced by
# the project's CdxjIndexer class, selecting `conversion` and `metadata`
# records respectively.
# Declared phony because no file named `cdxj` is created.
.PHONY: cdxj
cdxj: build jwarc.jar
	@echo " creating *.cdxj index files from the local warcs"
	java -jar jwarc.jar cdxj data/whirlwind.warc.gz > data/whirlwind.warc.cdxj
	mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args=" data/whirlwind.warc.wet.gz --records conversion" > data/whirlwind.warc.wet.cdxj
	mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args=" data/whirlwind.warc.wat.gz --records metadata" > data/whirlwind.warc.wat.cdxj
99
# Extract one record payload from each local archive (WARC, WET, WAT) into
# extraction.html, extraction.txt and extraction.json.
# The numeric arguments are record offsets; per the message below they are
# taken from the cdxj index, so they must be updated if the archives change.
# Declared phony because no file named `extract` is created.
.PHONY: extract
extract: jwarc.jar
	@echo " creating extraction.* from local warcs, the offset numbers are from the cdxj index"
	java -jar jwarc.jar extract --payload data/whirlwind.warc.gz 1023 > extraction.html
	java -jar jwarc.jar extract --payload data/whirlwind.warc.wet.gz 466 > extraction.txt
	java -jar jwarc.jar extract --payload data/whirlwind.warc.wat.gz 443 > extraction.json
	@echo " hint: python -m json.tool extraction.json"
1616
17- cdx_toolkit :
17+ cdx_toolkit : jwarc.jar
1818 @echo demonstrate that we have this entry in the index
1919 curl ' https://index.commoncrawl.org/CC-MAIN-2024-22-index?url=an.wikipedia.org/wiki/Escopete&output=json&from=20240518015810&to=20240518015810'
2020 @echo
@@ -33,7 +33,7 @@ cdx_toolkit:
3333
# Download collinfo.json from the Common Crawl index server into data/.
# -f makes curl fail on an HTTP error instead of saving the error page as
# data/collinfo.json; -L follows redirects. Both match the jwarc.jar download.
# Declared phony because no file named `download_collinfo` is created.
.PHONY: download_collinfo
download_collinfo:
	@echo " downloading collinfo.json so we can find out the crawl name"
	curl -fL -o data/collinfo.json https://index.commoncrawl.org/collinfo.json
3737
3838CC-MAIN-2024-22.warc.paths.gz :
3939 @echo " downloading the list from s3, requires s3 auth even though it is free"
@@ -48,7 +48,6 @@ duck_cloudfront: build
4848 @echo " warning! this might take 1-10 minutes"
4949 mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.Duck -Dexec.args" cloudfront"
5050
51-
# Download the JWarc 0.33.0 release JAR from GitHub.
# The file is fetched under a temporary name and renamed only after curl
# succeeds, so a failed or interrupted download never leaves a truncated
# jwarc.jar that make would treat as up to date.
jwarc.jar:
	@echo " downloading JWarc JAR"
	curl -fL -o $@.tmp https://github.com/iipc/jwarc/releases/download/v0.33.0/jwarc-0.33.0.jar
	mv -f $@.tmp $@
0 commit comments