Skip to content
Merged
Show file tree
Hide file tree
Changes from 32 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
7c63ebb
ignore .idea, target
lfoppiano Dec 16, 2025
3f584e5
add pom.xml, Readme.md and the data files
lfoppiano Dec 16, 2025
f9d929e
add makefile
lfoppiano Dec 16, 2025
7e7b3f5
add read warc
lfoppiano Dec 16, 2025
a998133
add CI + spotless
lfoppiano Dec 16, 2025
c808d8c
add figures, editorconfig, .gitignore from the python repository brother
lfoppiano Dec 16, 2025
fa3f707
remove unclear make install, remove venv info from readme
lfoppiano Dec 16, 2025
f6d62bb
update read class, add recompress,
lfoppiano Dec 17, 2025
4aa252a
cleanup, removing the rest of the python stuff for task 0,1,2
lfoppiano Dec 17, 2025
5b018e9
fix missing make install
lfoppiano Dec 18, 2025
817862c
move data under 'data' directory
lfoppiano Dec 18, 2025
620ebee
add Apache header in the code
lfoppiano Dec 18, 2025
886ff0b
make sure we build before running
lfoppiano Dec 18, 2025
6180fce
update .gitignore
lfoppiano Dec 19, 2025
d35e3d8
Implement WARC compression validation for Task 5
lfoppiano Dec 20, 2025
e20c81e
Ignore gzip validation if is uncompressed
lfoppiano Dec 20, 2025
07c9f8b
Merge branch 'main' into luca/feature/part2
lfoppiano Dec 22, 2025
0fa930e
fix compression check, update Readme.md
lfoppiano Dec 22, 2025
78fbac6
add missing apache licence
lfoppiano Dec 22, 2025
6f97782
add commons-compress library
lfoppiano Dec 22, 2025
52fca8c
place Github Actions in the correct directory
lfoppiano Dec 22, 2025
75af0e1
Add CDJX indexer using unreleased JARC code
lfoppiano Dec 23, 2025
077f904
Implement Task 3 and 4
lfoppiano Dec 28, 2025
3a2791a
fix: CI build
lfoppiano Dec 28, 2025
df257e4
fix: Reformat with spotless
lfoppiano Dec 28, 2025
3ed8d61
fix: Rename class
lfoppiano Dec 29, 2025
b3c7252
feat: task 7
lfoppiano Dec 29, 2025
e55c48a
feat: Task 8, duck DB with local file
lfoppiano Jan 5, 2026
43fc088
chore: Run spotless
lfoppiano Jan 5, 2026
a716f7d
feat: update editorconfig
lfoppiano Jan 9, 2026
e38a0ce
feat: task 6
lfoppiano Jan 9, 2026
a91e52f
chore(docu): minor changes
lfoppiano Jan 12, 2026
f88927a
fix: move data files in data
lfoppiano Jan 16, 2026
81edcc2
Merge branch 'main' into luca/feature/part5
lfoppiano Jan 16, 2026
f988d21
fix
lfoppiano Jan 16, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,10 @@ root = true
end_of_line = lf
insert_final_newline = true

# LF: not sure about this
# [*.java]
# charset = utf-8
# indent_style = space
# indent_size = 4
[*.java]
charset = utf-8
indent_style = space
indent_size = 4

[Makefile]
indent_style = tab
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ jobs:
with:
java-version: '11'
distribution: 'temurin'
cache: 'mvn'
- name: Build with Gradle
cache: maven
- name: Build with Maven
run: mvn -B clean compile
- name: Check with spotless
run: mvn spotless:check
run: mvn spotless:check
Binary file added CC-MAIN-2024-22.warc.paths.gz
Binary file not shown.
115 changes: 56 additions & 59 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,71 +1,68 @@
# Package the project with Maven; "clean" removes previous build output first.
# Other targets list this as a prerequisite so they run against a fresh build.
build:
mvn clean package

iterate: build
@echo iterating over all of the local warcs:
# Create CDXJ index files for the three local files under data/.
# The .warc.gz file is indexed with the jwarc.jar command-line tool (hence ensure_jwarc);
# the WET and WAT files are indexed by this project's CdxjIndexer class, run through
# Maven's exec:java goal, selecting "conversion" and "metadata" records respectively.
# Each command's standard output is redirected into the matching *.cdxj file.
cdxj: build ensure_jwarc
@echo "creating *.cdxj index files from the local warcs"
java -jar jwarc.jar cdxj data/whirlwind.warc.gz > whirlwind.warc.cdxj
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wet.gz --records conversion" > whirlwind.warc.wet.cdxj
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wat.gz --records metadata" > whirlwind.warc.wat.cdxj

# Extract one record payload from each local file into extraction.html,
# extraction.txt and extraction.json. The numeric argument on each line is the
# record offset, taken from the cdxj index (see the echo below).
# Depends on ensure_jwarc because every command runs jwarc.jar; without the
# prerequisite the target fails on a fresh checkout where the JAR has not been
# downloaded yet. This matches the other targets that invoke jwarc.jar.
extract: ensure_jwarc
	@echo "creating extraction.* from local warcs, the offset numbers are from the cdxj index"
	java -jar jwarc.jar extract --payload data/whirlwind.warc.gz 1023 > extraction.html
	java -jar jwarc.jar extract --payload data/whirlwind.warc.wet.gz 466 > extraction.txt
	java -jar jwarc.jar extract --payload data/whirlwind.warc.wat.gz 443 > extraction.json
	@echo "hint: python -m json.tool extraction.json"

cdx_toolkit:
@echo demonstrate that we have this entry in the index
curl 'https://index.commoncrawl.org/CC-MAIN-2024-22-index?url=an.wikipedia.org/wiki/Escopete&output=json&from=20240518015810&to=20240518015810'
@echo
@echo warc:
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/whirlwind.warc.gz"
@echo cleanup previous work
rm -f TEST-000000.extracted.warc.gz
@echo retrieve the content from the commoncrawl data server
curl --request GET --url 'https://data.commoncrawl.org/crawl-data/CC-MAIN-2024-22/segments/1715971057216.39/warc/CC-MAIN-20240517233122-20240518023122-00000.warc.gz' --header 'Range: bytes=80610731-80628153' > TEST-000000.extracted.warc.gz
@echo
@echo wet:
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/whirlwind.warc.wet.gz"
@echo index this new warc
java -jar jwarc.jar cdxj TEST-000000.extracted.warc.gz > TEST-000000.extracted.warc.cdxj
cat TEST-000000.extracted.warc.cdxj
@echo
@echo wat:
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/whirlwind.warc.wat.gz"
@echo iterate this new warc
java -jar jwarc.jar ls TEST-000000.extracted.warc.gz
@echo

# cdxj:
# @echo "creating *.cdxj index files from the local warcs"
# cdxj-indexer whirlwind.warc.gz > whirlwind.warc.cdxj
# cdxj-indexer --records conversion whirlwind.warc.wet.gz > whirlwind.warc.wet.cdxj
# cdxj-indexer whirlwind.warc.wat.gz > whirlwind.warc.wat.cdxj
# Download collinfo.json from index.commoncrawl.org into the current directory
# (curl -O keeps the remote file name).
download_collinfo:
@echo "downloading collinfo.json so we can find out the crawl name"
curl -O https://index.commoncrawl.org/collinfo.json

# Regenerate the gzip-compressed listing of the CC-MAIN-2024-22 index table files.
# Lists the S3 prefix, keeps only the fourth whitespace-separated column of each
# listing line, and compresses the result into the target file.
# "$$4" is the make-escaped form of awk's "$4"; a single "$" would be expanded by make.
# NOTE(review): the pipeline's exit status is that of gzip, so a failing "aws" call
# still leaves a (possibly empty) target file behind — confirm whether that matters.
CC-MAIN-2024-22.warc.paths.gz:
@echo "downloading the list from s3, requires s3 auth even though it is free"
@echo "note that this file should be in the repo"
aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ | awk '{print $$4}' | gzip -9 > CC-MAIN-2024-22.warc.paths.gz

# Run the project's Duck main class with the single argument "ccf_local_files".
# The argument is handed over through the exec plugin's exec.args property, which
# must be written as -Dexec.args="..."; without the "=" the shell passes
# -Dexec.argsccf_local_files, a different property, and Duck gets no argument.
duck_ccf_local_files: build
	@echo "warning! only works on Common Crawl Foundadtion's development machine"
	mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.Duck -Dexec.args="ccf_local_files"

# Run the project's Duck main class with the single argument "cloudfront".
# The argument is handed over through the exec plugin's exec.args property, which
# must be written as -Dexec.args="..."; without the "=" the shell passes
# -Dexec.argscloudfront, a different property, and Duck gets no argument.
duck_cloudfront: build
	@echo "warning! this might take 1-10 minutes"
	mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.Duck -Dexec.args="cloudfront"


# Guarantee that jwarc.jar is available in the working directory.
# When the file already exists nothing is downloaded; otherwise the v0.33.0
# release JAR is fetched from the jwarc GitHub releases and saved as jwarc.jar.
# The whole check is one shell command (backslash continuations) because each
# recipe line otherwise runs in its own shell.
ensure_jwarc:
	@echo "Ensuring JWarc JAR is present"
	@if [ -f jwarc.jar ] ; then \
		echo "jwarc.jar found." ; \
	else \
		echo "jwarc.jar not found, downloading..." ; \
		curl -fL -o jwarc.jar https://github.com/iipc/jwarc/releases/download/v0.33.0/jwarc-0.33.0.jar ; \
	fi

# Unconditionally download the jwarc v0.33.0 release JAR and save it as jwarc.jar,
# overwriting any existing copy. curl -f fails on HTTP errors and -L follows redirects.
get_jwarc:
@echo "downloading JWarc JAR"
curl -fL -o jwarc.jar https://github.com/iipc/jwarc/releases/download/v0.33.0/jwarc-0.33.0.jar

# extract:
# @echo "creating extraction.* from local warcs, the offset numbers are from the cdxj index"
# warcio extract --payload whirlwind.warc.gz 1023 > extraction.html
# warcio extract --payload whirlwind.warc.wet.gz 466 > extraction.txt
# warcio extract --payload whirlwind.warc.wat.gz 443 > extraction.json
# @echo "hint: python -m json.tool extraction.json"
#
# cdx_toolkit:
# @echo demonstrate that we have this entry in the index
# cdxt --crawl CC-MAIN-2024-22 --from 20240518015810 --to 20240518015810 iter an.wikipedia.org/wiki/Escopete
# @echo
# @echo cleanup previous work
# rm -f TEST-000000.extracted.warc.gz
# @echo retrieve the content from the commoncrawl s3 bucket
# cdxt --crawl CC-MAIN-2024-22 --from 20240518015810 --to 20240518015810 warc an.wikipedia.org/wiki/Escopete
# @echo
# @echo index this new warc
# cdxj-indexer TEST-000000.extracted.warc.gz > TEST-000000.extracted.warc.cdxj
# cat TEST-000000.extracted.warc.cdxj
# @echo
# @echo iterate this new warc
# python ./warcio-iterator.py TEST-000000.extracted.warc.gz
# @echo
#
# download_collinfo:
# @echo "downloading collinfo.json so we can find out the crawl name"
# curl -O https://index.commoncrawl.org/collinfo.json
#
# CC-MAIN-2024-22.warc.paths.gz:
# @echo "downloading the list from s3, requires s3 auth even though it is free"
# @echo "note that this file should be in the repo"
# aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ | awk '{print $$4}' | gzip -9 > CC-MAIN-2024-22.warc.paths.gz
#
# duck_local_files:
# @echo "warning! 300 gigabyte download"
# python duck.py local_files
#
# duck_ccf_local_files:
# @echo "warning! only works on Common Crawl Foundadtion's development machine"
# python duck.py ccf_local_files
#
# duck_cloudfront:
# @echo "warning! this might take 1-10 minutes"
# python duck.py cloudfront
#
wreck_the_warc: build
wreck_the_warc: build ensure_jwarc
@echo
@echo we will break and then fix this warc
cp data/whirlwind.warc.gz data/testing.warc.gz
Expand Down
Loading