Skip to content

Commit 1e5ac92

Browse files
authored
Implementation part 1 (Tasks 0, 1, 2) (#1)
1 parent fab0a8d commit 1e5ac92

18 files changed

Lines changed: 2573 additions & 1 deletion

.editorconfig

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# see http://EditorConfig.org
2+
3+
root = true
4+
5+
[*]
6+
end_of_line = lf
7+
insert_final_newline = true
8+
9+
# LF: not sure about this
10+
# [*.java]
11+
# charset = utf-8
12+
# indent_style = space
13+
# indent_size = 4
14+
15+
[Makefile]
16+
indent_style = tab
17+
18+
# YAML files, both .yml and .yaml extensions: two-space indentation.
[*.{yml,yaml}]
19+
indent_style = space
20+
indent_size = 2

.github/ci-build-unstable.yml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
name: Build unstable
2+
3+
on: [ push ]
4+
5+
6+
jobs:
7+
build:
8+
runs-on: ubuntu-latest
9+
10+
steps:
11+
- uses: actions/checkout@v4
12+
- name: Set up JDK 11
13+
uses: actions/setup-java@v4
14+
with:
15+
java-version: '11'
16+
distribution: 'temurin'
17+
cache: 'maven' # setup-java accepts only maven, gradle or sbt as cache type
18+
- name: Build with Maven
19+
run: mvn -B clean compile # -B: non-interactive (batch) mode
20+
- name: Check with spotless
21+
run: mvn spotless:check

.gitignore

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,19 @@ Icon
2424
Network Trash Folder
2525
Temporary Items
2626
.apdisk
27-
.idea
27+
28+
# Borrowed from whirlwind-python
29+
*.cdxj
30+
construction/*.gz
31+
**/CC-MAIN*files
32+
**/TEST*.gz
33+
extraction.*
34+
testing.*
35+
**/whirlwind.parquet
36+
**/collinfo.json
37+
**/*.jar
38+
.venv/
39+
.vscode/
40+
.idea/
41+
target/
42+
build/

Makefile

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
# Every target in this file is a command name, not a file it produces.
# Declaring them phony keeps a stray file or directory with the same name
# (e.g. "build") from making make report the target as up to date and
# skip its recipe.
.PHONY: build iterate wreck_the_warc

# build: clean and package the project with Maven.
build:
2+
mvn clean package
3+
4+
# iterate: after `build`, run the ReadWARC main class once for each of the
# three sample files under data/ (.warc.gz, .warc.wet.gz, .warc.wat.gz),
# printing a short label before each run.
iterate: build
5+
@echo iterating over all of the local warcs:
6+
@echo
7+
@echo warc:
8+
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/whirlwind.warc.gz"
9+
@echo
10+
@echo wet:
11+
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/whirlwind.warc.wet.gz"
12+
@echo
13+
@echo wat:
14+
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/whirlwind.warc.wat.gz"
15+
@echo
16+
17+
# cdxj:
18+
# @echo "creating *.cdxj index files from the local warcs"
19+
# cdxj-indexer whirlwind.warc.gz > whirlwind.warc.cdxj
20+
# cdxj-indexer --records conversion whirlwind.warc.wet.gz > whirlwind.warc.wet.cdxj
21+
# cdxj-indexer whirlwind.warc.wat.gz > whirlwind.warc.wat.cdxj
22+
23+
# extract:
24+
# @echo "creating extraction.* from local warcs, the offset numbers are from the cdxj index"
25+
# warcio extract --payload whirlwind.warc.gz 1023 > extraction.html
26+
# warcio extract --payload whirlwind.warc.wet.gz 466 > extraction.txt
27+
# warcio extract --payload whirlwind.warc.wat.gz 443 > extraction.json
28+
# @echo "hint: python -m json.tool extraction.json"
29+
#
30+
# cdx_toolkit:
31+
# @echo demonstrate that we have this entry in the index
32+
# cdxt --crawl CC-MAIN-2024-22 --from 20240518015810 --to 20240518015810 iter an.wikipedia.org/wiki/Escopete
33+
# @echo
34+
# @echo cleanup previous work
35+
# rm -f TEST-000000.extracted.warc.gz
36+
# @echo retrieve the content from the commoncrawl s3 bucket
37+
# cdxt --crawl CC-MAIN-2024-22 --from 20240518015810 --to 20240518015810 warc an.wikipedia.org/wiki/Escopete
38+
# @echo
39+
# @echo index this new warc
40+
# cdxj-indexer TEST-000000.extracted.warc.gz > TEST-000000.extracted.warc.cdxj
41+
# cat TEST-000000.extracted.warc.cdxj
42+
# @echo
43+
# @echo iterate this new warc
44+
# python ./warcio-iterator.py TEST-000000.extracted.warc.gz
45+
# @echo
46+
#
47+
# download_collinfo:
48+
# @echo "downloading collinfo.json so we can find out the crawl name"
49+
# curl -O https://index.commoncrawl.org/collinfo.json
50+
#
51+
# CC-MAIN-2024-22.warc.paths.gz:
52+
# @echo "downloading the list from s3, requires s3 auth even though it is free"
53+
# @echo "note that this file should be in the repo"
54+
# aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ | awk '{print $$4}' | gzip -9 > CC-MAIN-2024-22.warc.paths.gz
55+
#
56+
# duck_local_files:
57+
# @echo "warning! 300 gigabyte download"
58+
# python duck.py local_files
59+
#
60+
# duck_ccf_local_files:
61+
# @echo "warning! only works on Common Crawl Foundation's development machine"
62+
# python duck.py ccf_local_files
63+
#
64+
# duck_cloudfront:
65+
# @echo "warning! this might take 1-10 minutes"
66+
# python duck.py cloudfront
67+
#
68+
# wreck_the_warc: after `build`, copy the sample WARC to data/testing.warc.gz,
# decompress it and read it, recompress it with plain gzip and read it again
# (per the echoed messages this read is expected to fail), then rebuild the
# .gz with the RecompressWARC main class and read it once more.
# The read that is expected to fail is followed by `|| true` so that make
# carries on; the shell's own `true` is used instead of an absolute path
# because the location of the binary differs between systems.
wreck_the_warc: build
69+
@echo
70+
@echo we will break and then fix this warc
71+
cp data/whirlwind.warc.gz data/testing.warc.gz
72+
rm -f data/testing.warc
73+
gzip -d data/testing.warc.gz # windows gunzip no work-a
74+
@echo
75+
@echo iterate over this uncompressed warc: works
76+
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc"
77+
@echo
78+
@echo compress it the wrong way
79+
gzip data/testing.warc
80+
@echo
81+
@echo iterating over this compressed warc fails
82+
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc.gz" || true
83+
@echo
84+
@echo "now let's do it the right way"
85+
gzip -d data/testing.warc.gz
86+
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.RecompressWARC -Dexec.args="data/testing.warc data/testing.warc.gz"
87+
@echo
88+
@echo and now iterating works
89+
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc.gz"
90+
@echo

0 commit comments

Comments
 (0)