Skip to content

Commit 87c6ca9

Browse files
committed
fix(makefile): write stuff in data/
1 parent 40bb84a commit 87c6ca9

1 file changed

Lines changed: 5 additions & 5 deletions

File tree

Makefile

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -3,9 +3,9 @@ build:
33

44
cdxj: build ensure_jwarc
55
@echo "creating *.cdxj index files from the local warcs"
6-
java -jar jwarc.jar cdxj data/whirlwind.warc.gz > whirlwind.warc.cdxj
7-
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wet.gz --records conversion" > whirlwind.warc.wet.cdxj
8-
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wat.gz --records metadata" > whirlwind.warc.wat.cdxj
6+
java -jar jwarc.jar cdxj data/whirlwind.warc.gz > data/whirlwind.warc.cdxj
7+
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wet.gz --records conversion" > data/whirlwind.warc.wet.cdxj
8+
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wat.gz --records metadata" > data/whirlwind.warc.wat.cdxj
99

1010
extract:
1111
@echo "creating extraction.* from local warcs, the offset numbers are from the cdxj index"
@@ -33,12 +33,12 @@ extract:
3333
#
3434
download_collinfo:
3535
@echo "downloading collinfo.json so we can find out the crawl name"
36-
curl -O https://index.commoncrawl.org/collinfo.json
36+
curl -o data/collinfo.json https://index.commoncrawl.org/collinfo.json
3737

3838
CC-MAIN-2024-22.warc.paths.gz:
3939
@echo "downloading the list from s3, requires s3 auth even though it is free"
4040
@echo "note that this file should be in the repo"
41-
aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ | awk '{print $$4}' | gzip -9 > CC-MAIN-2024-22.warc.paths.gz
41+
aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ | awk '{print $$4}' | gzip -9 > data/CC-MAIN-2024-22.warc.paths.gz
4242

4343
# duck_local_files:
4444
# @echo "warning! 300 gigabyte download"

0 commit comments

Comments (0)