|
| 1 | +build: # compile and package the project by running mvn clean package |
| 2 | + mvn clean package |
| 3 | + |
| 4 | +iterate: build # after build, run the ReadWARC main class on the warc, wet and wat sample files under data/ |
| 5 | + @echo iterating over all of the local warcs: |
| 6 | + @echo |
| 7 | + @echo warc: |
| 8 | + mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/whirlwind.warc.gz" |
| 9 | + @echo |
| 10 | + @echo wet: |
| 11 | + mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/whirlwind.warc.wet.gz" |
| 12 | + @echo |
| 13 | + @echo wat: |
| 14 | + mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/whirlwind.warc.wat.gz" |
| 15 | + @echo |
| 16 | + |
| 17 | +# cdxj: |
| 18 | +# @echo "creating *.cdxj index files from the local warcs" |
| 19 | +# cdxj-indexer whirlwind.warc.gz > whirlwind.warc.cdxj |
| 20 | +# cdxj-indexer --records conversion whirlwind.warc.wet.gz > whirlwind.warc.wet.cdxj |
| 21 | +# cdxj-indexer whirlwind.warc.wat.gz > whirlwind.warc.wat.cdxj |
| 22 | + |
| 23 | +# extract: |
| 24 | +# @echo "creating extraction.* from local warcs, the offset numbers are from the cdxj index" |
| 25 | +# warcio extract --payload whirlwind.warc.gz 1023 > extraction.html |
| 26 | +# warcio extract --payload whirlwind.warc.wet.gz 466 > extraction.txt |
| 27 | +# warcio extract --payload whirlwind.warc.wat.gz 443 > extraction.json |
| 28 | +# @echo "hint: python -m json.tool extraction.json" |
| 29 | +# |
| 30 | +# cdx_toolkit: |
| 31 | +# @echo demonstrate that we have this entry in the index |
| 32 | +# cdxt --crawl CC-MAIN-2024-22 --from 20240518015810 --to 20240518015810 iter an.wikipedia.org/wiki/Escopete |
| 33 | +# @echo |
| 34 | +# @echo cleanup previous work |
| 35 | +# rm -f TEST-000000.extracted.warc.gz |
| 36 | +# @echo retrieve the content from the commoncrawl s3 bucket |
| 37 | +# cdxt --crawl CC-MAIN-2024-22 --from 20240518015810 --to 20240518015810 warc an.wikipedia.org/wiki/Escopete |
| 38 | +# @echo |
| 39 | +# @echo index this new warc |
| 40 | +# cdxj-indexer TEST-000000.extracted.warc.gz > TEST-000000.extracted.warc.cdxj |
| 41 | +# cat TEST-000000.extracted.warc.cdxj |
| 42 | +# @echo |
| 43 | +# @echo iterate this new warc |
| 44 | +# python ./warcio-iterator.py TEST-000000.extracted.warc.gz |
| 45 | +# @echo |
| 46 | +# |
| 47 | +# download_collinfo: |
| 48 | +# @echo "downloading collinfo.json so we can find out the crawl name" |
| 49 | +# curl -O https://index.commoncrawl.org/collinfo.json |
| 50 | +# |
| 51 | +# CC-MAIN-2024-22.warc.paths.gz: |
| 52 | +# @echo "downloading the list from s3, requires s3 auth even though it is free" |
| 53 | +# @echo "note that this file should be in the repo" |
| 54 | +# aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ | awk '{print $$4}' | gzip -9 > CC-MAIN-2024-22.warc.paths.gz |
| 55 | +# |
| 56 | +# duck_local_files: |
| 57 | +# @echo "warning! 300 gigabyte download" |
| 58 | +# python duck.py local_files |
| 59 | +# |
| 60 | +# duck_ccf_local_files: |
| 61 | +# @echo "warning! only works on Common Crawl Foundation's development machine" |
| 62 | +# python duck.py ccf_local_files |
| 63 | +# |
| 64 | +# duck_cloudfront: |
| 65 | +# @echo "warning! this might take 1-10 minutes" |
| 66 | +# python duck.py cloudfront |
| 67 | +# |
| 68 | +wreck_the_warc: build # demo: recompress a copy of the sample warc with plain gzip (reading it is expected to fail), then rebuild it with RecompressWARC and read it again |
| 69 | + @echo |
| 70 | + @echo we will break and then fix this warc |
| 71 | + cp data/whirlwind.warc.gz data/testing.warc.gz |
| 72 | + rm -f data/testing.warc |
| 73 | + gzip -d data/testing.warc.gz # windows gunzip no work-a |
| 74 | + @echo |
| 75 | + @echo iterate over this uncompressed warc: works |
| 76 | + mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc" |
| 77 | + @echo |
| 78 | + @echo compress it the wrong way |
| 79 | + gzip data/testing.warc |
| 80 | + @echo |
| 81 | + @echo iterating over this compressed warc fails |
| 82 | + mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc.gz" || true # failure is expected here, keep going |
| 83 | + @echo |
| 84 | + @echo "now let's do it the right way" |
| 85 | + gzip -d data/testing.warc.gz |
| 86 | + mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.RecompressWARC -Dexec.args="data/testing.warc data/testing.warc.gz" |
| 87 | + @echo |
| 88 | + @echo and now iterating works |
| 89 | + mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc.gz" |
| 90 | + @echo |
0 commit comments