File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change 33
44cdxj : build ensure_jwarc
55 @echo " creating *.cdxj index files from the local warcs"
6- java -jar jwarc.jar cdxj data/whirlwind.warc.gz > whirlwind.warc.cdxj
7- mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args=" data/whirlwind.warc.wet.gz --records conversion" > whirlwind.warc.wet.cdxj
8- mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args=" data/whirlwind.warc.wat.gz --records metadata" > whirlwind.warc.wat.cdxj
6+ java -jar jwarc.jar cdxj data/whirlwind.warc.gz > data/whirlwind.warc.cdxj
7+ mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args=" data/whirlwind.warc.wet.gz --records conversion" > data/whirlwind.warc.wet.cdxj
8+ mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args=" data/whirlwind.warc.wat.gz --records metadata" > data/whirlwind.warc.wat.cdxj
99
1010extract :
1111 @echo " creating extraction.* from local warcs, the offset numbers are from the cdxj index"
@@ -33,12 +33,12 @@ extract:
3333#
3434download_collinfo :
3535 @echo " downloading collinfo.json so we can find out the crawl name"
36- curl -O https://index.commoncrawl.org/collinfo.json
36+ curl -o data/collinfo.json https://index.commoncrawl.org/collinfo.json
3737
3838CC-MAIN-2024-22.warc.paths.gz :
3939 @echo " downloading the list from s3, requires s3 auth even though it is free"
4040 @echo " note that this file should be in the repo"
41- aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ | awk ' {print $$4}' | gzip -9 > CC-MAIN-2024-22.warc.paths.gz
41+ aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ | awk ' {print $$4}' | gzip -9 > data/CC-MAIN-2024-22.warc.paths.gz
4242
4343# duck_local_files:
4444# @echo "warning! 300 gigabyte download"
You can’t perform that action at this time.
0 commit comments