Skip to content

Commit f988d21

Browse files
committed
fix
1 parent 81edcc2 commit f988d21

1 file changed

Lines changed: 4 additions & 5 deletions

File tree

Makefile

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,20 @@
11
build:
22
mvn clean package
33

4-
cdxj: build ensure_jwarc
4+
cdxj: build jwarc.jar
55
@echo "creating *.cdxj index files from the local warcs"
66
java -jar jwarc.jar cdxj data/whirlwind.warc.gz > data/whirlwind.warc.cdxj
77
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wet.gz --records conversion" > data/whirlwind.warc.wet.cdxj
88
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wat.gz --records metadata" > data/whirlwind.warc.wat.cdxj
99

10-
extract:
10+
extract: jwarc.jar
1111
@echo "creating extraction.* from local warcs, the offset numbers are from the cdxj index"
1212
java -jar jwarc.jar extract --payload data/whirlwind.warc.gz 1023 > extraction.html
1313
java -jar jwarc.jar extract --payload data/whirlwind.warc.wet.gz 466 > extraction.txt
1414
java -jar jwarc.jar extract --payload data/whirlwind.warc.wat.gz 443 > extraction.json
1515
@echo "hint: python -m json.tool extraction.json"
1616

17-
cdx_toolkit:
17+
cdx_toolkit: jwarc.jar
1818
@echo demonstrate that we have this entry in the index
1919
curl 'https://index.commoncrawl.org/CC-MAIN-2024-22-index?url=an.wikipedia.org/wiki/Escopete&output=json&from=20240518015810&to=20240518015810'
2020
@echo
@@ -33,7 +33,7 @@ cdx_toolkit:
3333

3434
download_collinfo:
3535
@echo "downloading collinfo.json so we can find out the crawl name"
36-
curl -O https://index.commoncrawl.org/collinfo.json
36+
curl -o data/collinfo.json https://index.commoncrawl.org/collinfo.json
3737

3838
CC-MAIN-2024-22.warc.paths.gz:
3939
@echo "downloading the list from s3, requires s3 auth even though it is free"
@@ -48,7 +48,6 @@ duck_cloudfront: build
4848
@echo "warning! this might take 1-10 minutes"
4949
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.Duck -Dexec.args="cloudfront"
5050

51-
5251
jwarc.jar:
5352
@echo "downloading JWarc JAR"
5453
curl -fL -o jwarc.jar https://github.com/iipc/jwarc/releases/download/v0.33.0/jwarc-0.33.0.jar

0 commit comments

Comments
 (0)