Skip to content

Commit 077f904

Browse files
committed
Implement Task 3 and 4
1 parent 75af0e1 commit 077f904

5 files changed

Lines changed: 363 additions & 51 deletions

File tree

Makefile

Lines changed: 25 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,19 @@
11
build:
2-
mvn -q install:install-file -Dfile=libs/jwarc-0.32.1-SNAPSHOT.jar -DgroupId=org.netpreserve -DartifactId=jwarc -Dversion=0.32.1-SNAPSHOT -Dpackaging=jar
32
mvn clean package
43

5-
iterate: build
6-
@echo iterating over all of the local warcs:
7-
@echo
8-
@echo warc:
9-
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/whirlwind.warc.gz"
10-
@echo
11-
@echo wet:
12-
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/whirlwind.warc.wet.gz"
13-
@echo
14-
@echo wat:
15-
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/whirlwind.warc.wat.gz"
16-
@echo
17-
18-
cdxj:
4+
cdxj: build ensure_jwarc
195
@echo "creating *.cdxj index files from the local warcs"
20-
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.gz" > whirlwind.warc.cdxj
21-
# cdxj-indexer --records conversion whirlwind.warc.wet.gz > whirlwind.warc.wet.cdxj
22-
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wet.gz" > whirlwind.warc.wet.cdxj
23-
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wat.gz" > whirlwind.warc.wat.cdxj
6+
java -jar jwarc.jar cdxj data/whirlwind.warc.gz > whirlwind.warc.cdxj
7+
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wet.gz --records conversion" > whirlwind.warc.wet.cdxj
8+
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wat.gz --records metadata" > whirlwind.warc.wat.cdxj
9+
10+
# Extract one record payload from each of the local WARC, WET and WAT files
# using the jwarc command-line JAR. The numeric arguments are byte offsets
# taken from the CDXJ index files. ensure_jwarc is a prerequisite so that
# jwarc.jar is fetched first when it is not already present; without it this
# target fails on a fresh checkout.
.PHONY: extract
extract: ensure_jwarc
	@echo "creating extraction.* from local warcs, the offset numbers are from the cdxj index"
	java -jar jwarc.jar extract --payload data/whirlwind.warc.gz 1023 > extraction.html
	java -jar jwarc.jar extract --payload data/whirlwind.warc.wet.gz 466 > extraction.txt
	java -jar jwarc.jar extract --payload data/whirlwind.warc.wat.gz 443 > extraction.json
	@echo "hint: python -m json.tool extraction.json"
2416

25-
# extract:
26-
# @echo "creating extraction.* from local warcs, the offset numbers are from the cdxj index"
27-
# warcio extract --payload whirlwind.warc.gz 1023 > extraction.html
28-
# warcio extract --payload whirlwind.warc.wet.gz 466 > extraction.txt
29-
# warcio extract --payload whirlwind.warc.wat.gz 443 > extraction.json
30-
# @echo "hint: python -m json.tool extraction.json"
31-
#
3217
# cdx_toolkit:
3318
# @echo demonstrate that we have this entry in the index
3419
# cdxt --crawl CC-MAIN-2024-22 --from 20240518015810 --to 20240518015810 iter an.wikipedia.org/wiki/Escopete
@@ -67,7 +52,20 @@ cdxj:
6752
# @echo "warning! this might take 1-10 minutes"
6853
# python duck.py cloudfront
6954
#
70-
wreck_the_warc: build
55+
# Release JAR of the jwarc command-line tool that the recipes in this file
# invoke as `java -jar jwarc.jar`. Kept in one place so both download targets
# below always fetch the same version.
JWARC_URL := https://github.com/iipc/jwarc/releases/download/v0.33.0/jwarc-0.33.0.jar

# Neither target names a file it produces, so mark them phony.
.PHONY: ensure_jwarc get_jwarc

# Download jwarc.jar only when it is not already in the working directory.
# The transfer goes to a temporary file that is renamed on success, so a
# failed or interrupted download never leaves a truncated jwarc.jar behind
# that a later run would report as "found".
ensure_jwarc:
	@echo "Ensuring JWarc JAR is present"
	@if [ ! -f jwarc.jar ] ; then \
		echo "jwarc.jar not found, downloading..." ; \
		curl -fL -o jwarc.jar.tmp $(JWARC_URL) && mv -f jwarc.jar.tmp jwarc.jar \
			|| { rm -f jwarc.jar.tmp ; exit 1 ; } ; \
	else \
		echo "jwarc.jar found." ; \
	fi

# Unconditionally (re-)download jwarc.jar, replacing any existing copy only
# after the new one has been fetched completely.
get_jwarc:
	@echo "downloading JWarc JAR"
	curl -fL -o jwarc.jar.tmp $(JWARC_URL) && mv -f jwarc.jar.tmp jwarc.jar \
		|| { rm -f jwarc.jar.tmp ; exit 1 ; }
67+
68+
wreck_the_warc: build ensure_jwarc
7169
@echo
7270
@echo we will break and then fix this warc
7371
cp data/whirlwind.warc.gz data/testing.warc.gz

README.md

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -214,17 +214,16 @@ We can create our own CDXJ index from the local WARCs by running:
214214

215215
```make cdxj```
216216

217-
This uses the [cdxj-indexer](https://github.com/webrecorder/cdxj-indexer) library to generate CDXJ index files for our WARC files by running the code below:
217+
This uses the JWARC library, together with some custom code that we wrote to support WET and WAT records, to generate CDXJ index files for our WARC files by running the commands below:
218218

219219
<details>
220220
<summary>Click to view code</summary>
221221

222222
```
223223
creating *.cdxj index files from the local warcs
224-
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.gz"
225-
cdxj-indexer data/whirlwind.warc.gz > data/whirlwind.warc.cdxj
226-
cdxj-indexer --records conversion data/whirlwind.warc.wet.gz > data/whirlwind.warc.wet.cdxj
227-
cdxj-indexer data/whirlwind.warc.wat.gz > data/whirlwind.warc.wat.cdxj
224+
java -jar jwarc.jar cdxj data/whirlwind.warc.gz > whirlwind.warc.cdxj
225+
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wet.gz --records conversion" > whirlwind.warc.wet.cdxj
226+
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wat.gz --records metadata" > whirlwind.warc.wat.cdxj
228227
```
229228

230229
</details>
@@ -248,16 +247,16 @@ Run:
248247
```make extract```
249248

250249
to run a set of extractions from your local
251-
`whirlwind.*.gz` files with `warcio` using the code below:
250+
`whirlwind.*.gz` files with `JWARC` using the commands below:
252251

253252
<details>
254253
<summary>Click to view code</summary>
255254

256255
```
257256
creating extraction.* from local warcs, the offset numbers are from the cdxj index
258-
warcio extract --payload whirlwind.warc.gz 1023 > extraction.html
259-
warcio extract --payload whirlwind.warc.wet.gz 466 > extraction.txt
260-
warcio extract --payload whirlwind.warc.wat.gz 443 > extraction.json
257+
java -jar jwarc.jar extract --payload data/whirlwind.warc.gz 1023 > extraction.html
258+
java -jar jwarc.jar extract --payload data/whirlwind.warc.wet.gz 466 > extraction.txt
259+
java -jar jwarc.jar extract --payload data/whirlwind.warc.wat.gz 443 > extraction.json
261260
hint: python -m json.tool extraction.json
262261
```
263262

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
<dependency>
2525
<groupId>org.netpreserve</groupId>
2626
<artifactId>jwarc</artifactId>
27-
<version>0.32.1-SNAPSHOT</version>
27+
<version>0.33.0</version>
2828
</dependency>
2929
</dependencies>
3030

Lines changed: 270 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,270 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.commoncrawl.whirlwind;
19+
20+
import org.netpreserve.jwarc.*;
21+
import org.netpreserve.jwarc.cdx.CdxFormat;
22+
import org.netpreserve.jwarc.cdx.CdxRequestEncoder;
23+
import org.netpreserve.jwarc.cdx.CdxWriter;
24+
25+
import java.io.IOException;
26+
import java.io.Writer;
27+
import java.net.URI;
28+
import java.time.ZoneOffset;
29+
import java.time.format.DateTimeFormatter;
30+
import java.util.function.Consumer;
31+
import java.util.function.Predicate;
32+
33+
34+
public class CdxFilterWithDynamicFiltering extends CdxWriter {
35+
private static final DateTimeFormatter DATE_FORMAT = DateTimeFormatter.ofPattern("yyyyMMddHHmmss")
36+
.withZone(ZoneOffset.UTC);
37+
38+
private final Writer writer;
39+
private CdxFormat format = CdxFormat.CDXJ;
40+
private boolean postAppend = false;
41+
private Consumer<String> warningHandler;
42+
private Predicate<WarcRecord> recordFilter = null;
43+
44+
public CdxFilterWithDynamicFiltering(Writer writer) {
45+
super(writer);
46+
this.writer = writer;
47+
}
48+
49+
@Override
50+
public void setFormat(CdxFormat format) {
51+
super.setFormat(format);
52+
this.format = format;
53+
}
54+
55+
public CdxFormat getFormat() {
56+
return this.format;
57+
}
58+
59+
@Override
60+
public void setPostAppend(boolean postAppend) {
61+
super.setPostAppend(postAppend);
62+
this.postAppend = postAppend;
63+
}
64+
65+
@Override
66+
public void onWarning(Consumer<String> warningHandler) {
67+
super.onWarning(warningHandler);
68+
this.warningHandler = warningHandler;
69+
}
70+
71+
@Override
72+
public void process(WarcReader reader, String filename) throws IOException {
73+
74+
if (recordFilter == null) {
75+
super.process(reader, filename);
76+
return;
77+
}
78+
79+
// Custom processing for filtered record types, since we are filtering, we get and process
80+
// every record here.
81+
WarcRecord record = reader.next().orElse(null);
82+
while (record != null) {
83+
try {
84+
String recordType = record.type().toLowerCase();
85+
86+
long position = reader.position();
87+
88+
// Handle WarcCaptureRecord types (response, resource, revisit, request)
89+
if (record instanceof WarcCaptureRecord) {
90+
WarcCaptureRecord capture = (WarcCaptureRecord) record;
91+
URI id = record.version().getProtocol().equals("ARC") ? null : record.id();
92+
93+
// Ensure HTTP header is parsed for revisit records
94+
if (record instanceof WarcRevisit && record.contentType().base().equals(MediaType.HTTP)) {
95+
((WarcRevisit) record).http();
96+
}
97+
98+
// Advance to next record to calculate length
99+
record = reader.next().orElse(null);
100+
long length = reader.position() - position;
101+
102+
// Skip records without a date
103+
if (!capture.headers().first("WARC-Date").isPresent()) {
104+
emitWarning(filename, position, "Skipping record due to missing or invalid date");
105+
continue;
106+
}
107+
108+
String encodedRequest = null;
109+
if (postAppend) {
110+
while (encodedRequest == null && record instanceof WarcCaptureRecord
111+
&& ((WarcCaptureRecord) record).concurrentTo().contains(id)) {
112+
if (record instanceof WarcRequest) {
113+
HttpRequest httpRequest = ((WarcRequest) record).http();
114+
encodedRequest = CdxRequestEncoder.encode(httpRequest);
115+
}
116+
record = reader.next().orElse(null);
117+
}
118+
}
119+
120+
write(capture, filename, position, length, encodedRequest);
121+
}
122+
// Handle WarcConversion (from WET files) and other WarcTargetRecord types
123+
else if (record instanceof WarcTargetRecord) {
124+
WarcTargetRecord targetRecord = (WarcTargetRecord) record;
125+
126+
// Advance to next record to calculate length
127+
record = reader.next().orElse(null);
128+
long length = reader.position() - position;
129+
130+
// Skip records without a date
131+
if (!targetRecord.headers().first("WARC-Date").isPresent()) {
132+
emitWarning(filename, position, "Skipping record due to missing or invalid date");
133+
continue;
134+
}
135+
136+
writeTargetRecord(targetRecord, filename, position, length);
137+
} else {
138+
// Skip non-target records (like warcinfo)
139+
record = reader.next().orElse(null);
140+
}
141+
} catch (ParsingException e) {
142+
emitWarning(filename, reader.position(), "ParsingException: " + e.getBaseMessage());
143+
record = reader.next().orElse(null);
144+
}
145+
}
146+
}
147+
148+
@Override
149+
public void setRecordFilter(Predicate<WarcRecord> recordFilter) {
150+
super.setRecordFilter(recordFilter);
151+
this.recordFilter = recordFilter;
152+
}
153+
154+
/**
155+
* Writes a CDXJ record for a WarcTargetRecord (like WarcConversion from WET
156+
* files).
157+
*
158+
* TODO: make it more generic and integrated into jwarc
159+
*/
160+
private void writeTargetRecord(WarcTargetRecord record, String filename,
161+
long position, long length) throws IOException {
162+
String target = record.target();
163+
if (target == null) {
164+
emitWarning(filename, position, "Skipping record due to missing target URI");
165+
return;
166+
}
167+
168+
// Build CDXJ line: surt timestamp {json}
169+
StringBuilder line = new StringBuilder();
170+
171+
// SURT-formatted URL key
172+
String surt = URIs.toNormalizedSurt(target);
173+
line.append(escape(surt));
174+
line.append(' ');
175+
176+
// Timestamp
177+
String timestamp = DATE_FORMAT.format(record.date());
178+
line.append(timestamp);
179+
line.append(' ');
180+
181+
// JSON block
182+
line.append('{');
183+
184+
// URL
185+
line.append("\"url\": \"");
186+
escapeJsonString(line, target);
187+
line.append("\"");
188+
189+
// MIME type
190+
try {
191+
if (record.payload().isPresent()) {
192+
MediaType mime = record.payload().get().type();
193+
if (mime != null) {
194+
line.append(", \"mime\": \"");
195+
escapeJsonString(line, mime.base().toString());
196+
line.append("\"");
197+
}
198+
}
199+
} catch (IOException e) {
200+
// Skip mime if payload can't be read
201+
}
202+
203+
// Digest
204+
record.payloadDigest().ifPresent(digest -> {
205+
line.append(", \"digest\": \"");
206+
escapeJsonString(line, digest.raw());
207+
line.append("\"");
208+
});
209+
210+
// Filename
211+
if (filename != null) {
212+
line.append(", \"filename\": \"");
213+
escapeJsonString(line, filename);
214+
line.append("\"");
215+
}
216+
217+
// Offset
218+
line.append(", \"offset\": \"");
219+
line.append(position);
220+
line.append("\"");
221+
222+
// Length
223+
line.append(", \"length\": \"");
224+
line.append(length);
225+
line.append("\"");
226+
227+
line.append('}');
228+
229+
writer.write(line.toString());
230+
writer.write('\n');
231+
}
232+
233+
private void emitWarning(String filename, long position, String message) {
234+
if (warningHandler == null)
235+
return;
236+
warningHandler.accept(filename + " (offset " + position + ") " + message);
237+
}
238+
239+
// Borrowed from org.netpreserve.jwarc.cdx.CdxWriter
240+
// TODO: remove duplication
241+
private static String escape(String str) {
242+
if (str == null) return null;
243+
return str.replace(" ", "%20")
244+
.replace("\n", "%0A")
245+
.replace("\0", "%00");
246+
}
247+
248+
249+
// Borrowed from org.netpreserve.jwarc.cdx.CdxWriter
250+
// TODO: remove duplication
251+
private static void escapeJsonString(StringBuilder out, String value) {
252+
for (int i = 0; i < value.length(); i++) {
253+
char c = value.charAt(i);
254+
if (c == '"') out.append("\\\"");
255+
else if (c == '\\') out.append("\\\\");
256+
else if (c == '\b') out.append("\\b");
257+
else if (c == '\f') out.append("\\f");
258+
else if (c == '\n') out.append("\\n");
259+
else if (c == '\r') out.append("\\r");
260+
else if (c == '\t') out.append("\\t");
261+
else if (c <= 0x1f) {
262+
out.append("\\u00");
263+
out.append(Character.forDigit((c & 0xf0) >>> 4, 16));
264+
out.append(Character.forDigit(c & 0xf, 16));
265+
} else {
266+
out.append(c);
267+
}
268+
}
269+
}
270+
}

0 commit comments

Comments
 (0)