Skip to content

Commit 75af0e1

Browse files
committed
Add CDXJ indexer using unreleased jwarc code
1 parent 52fca8c commit 75af0e1

4 files changed

Lines changed: 129 additions & 60 deletions

File tree

Makefile

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
build:
2+
mvn -q install:install-file -Dfile=libs/jwarc-0.32.1-SNAPSHOT.jar -DgroupId=org.netpreserve -DartifactId=jwarc -Dversion=0.32.1-SNAPSHOT -Dpackaging=jar
23
mvn clean package
34

45
iterate: build
@@ -14,11 +15,12 @@ iterate: build
1415
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/whirlwind.warc.wat.gz"
1516
@echo
1617

17-
# cdxj:
18-
# @echo "creating *.cdxj index files from the local warcs"
19-
# cdxj-indexer whirlwind.warc.gz > whirlwind.warc.cdxj
18+
cdxj:
19+
@echo "creating *.cdxj index files from the local warcs"
20+
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.gz" > whirlwind.warc.cdxj
2021
# cdxj-indexer --records conversion whirlwind.warc.wet.gz > whirlwind.warc.wet.cdxj
21-
# cdxj-indexer whirlwind.warc.wat.gz > whirlwind.warc.wat.cdxj
22+
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wet.gz" > whirlwind.warc.wet.cdxj
23+
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wat.gz" > whirlwind.warc.wat.cdxj
2224

2325
# extract:
2426
# @echo "creating extraction.* from local warcs, the offset numbers are from the cdxj index"

README.md

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -208,8 +208,6 @@ We have two versions of the index: the CDX index and the columnar index. The CDX
208208

209209
### CDX(J) index
210210

211-
**TBA**: Did not find a good java library that implements this feature, ideally can be implemented in jwarc
212-
213211
The CDX index files are sorted plain-text files, with each line containing information about a single capture in the WARC. Technically, Common Crawl uses CDXJ index files since the information about each capture is formatted as JSON. We'll use CDX and CDXJ interchangeably in this tour for legacy reasons 💅
214212

215213
We can create our own CDXJ index from the local WARCs by running:
@@ -223,9 +221,10 @@ This uses the [cdxj-indexer](https://github.com/webrecorder/cdxj-indexer) librar
223221

224222
```
225223
creating *.cdxj index files from the local warcs
226-
cdxj-indexer whirlwind.warc.gz > whirlwind.warc.cdxj
227-
cdxj-indexer --records conversion whirlwind.warc.wet.gz > whirlwind.warc.wet.cdxj
228-
cdxj-indexer whirlwind.warc.wat.gz > whirlwind.warc.wat.cdxj
224+
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.gz"
225+
cdxj-indexer data/whirlwind.warc.gz > data/whirlwind.warc.cdxj
226+
cdxj-indexer --records conversion data/whirlwind.warc.wet.gz > data/whirlwind.warc.wet.cdxj
227+
cdxj-indexer data/whirlwind.warc.wat.gz > data/whirlwind.warc.wat.cdxj
229228
```
230229

231230
</details>

pom.xml

Lines changed: 50 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -24,58 +24,57 @@
2424
<dependency>
2525
<groupId>org.netpreserve</groupId>
2626
<artifactId>jwarc</artifactId>
27-
<version>0.32.0</version>
27+
<version>0.32.1-SNAPSHOT</version>
2828
</dependency>
29-
3029
</dependencies>
3130

32-
<build>
33-
<plugins>
34-
<plugin>
35-
<groupId>org.apache.maven.plugins</groupId>
36-
<artifactId>maven-compiler-plugin</artifactId>
37-
<version>3.11.0</version>
38-
<configuration>
39-
<release>${maven.compiler.release}</release>
40-
</configuration>
41-
</plugin>
42-
<plugin>
43-
<groupId>org.apache.maven.plugins</groupId>
44-
<artifactId>maven-jar-plugin</artifactId>
45-
<version>3.3.0</version>
46-
<configuration>
47-
<archive>
48-
<manifest>
49-
<mainClass>org.commoncrawl.whirlwind.ReadWARC</mainClass>
50-
</manifest>
51-
</archive>
52-
</configuration>
53-
</plugin>
54-
<plugin>
55-
<groupId>com.diffplug.spotless</groupId>
56-
<artifactId>spotless-maven-plugin</artifactId>
57-
<version>2.46.1</version>
58-
<configuration>
59-
<pom>
60-
<!-- These are the defaults, you can override if you want -->
61-
<includes>
62-
<include>pom.xml</include>
63-
</includes>
64-
<sortPom>
65-
<indentAttribute>all</indentAttribute>
66-
<keepBlankLines>true</keepBlankLines>
67-
<expandEmptyElements>false</expandEmptyElements>
68-
<nrOfIndentSpace>-1</nrOfIndentSpace>
69-
<predefinedSortOrder>recommended_2008_06</predefinedSortOrder>
70-
</sortPom>
71-
</pom>
72-
<java>
73-
<eclipse>
74-
<file>${project.basedir}/eclipse-formatter.xml</file>
75-
</eclipse>
76-
</java>
77-
</configuration>
78-
</plugin>
79-
</plugins>
80-
</build>
31+
<build>
32+
<plugins>
33+
<plugin>
34+
<groupId>org.apache.maven.plugins</groupId>
35+
<artifactId>maven-compiler-plugin</artifactId>
36+
<version>3.11.0</version>
37+
<configuration>
38+
<release>${maven.compiler.release}</release>
39+
</configuration>
40+
</plugin>
41+
<plugin>
42+
<groupId>org.apache.maven.plugins</groupId>
43+
<artifactId>maven-jar-plugin</artifactId>
44+
<version>3.3.0</version>
45+
<configuration>
46+
<archive>
47+
<manifest>
48+
<mainClass>org.commoncrawl.whirlwind.ReadWARC</mainClass>
49+
</manifest>
50+
</archive>
51+
</configuration>
52+
</plugin>
53+
<plugin>
54+
<groupId>com.diffplug.spotless</groupId>
55+
<artifactId>spotless-maven-plugin</artifactId>
56+
<version>2.46.1</version>
57+
<configuration>
58+
<pom>
59+
<!-- These are the defaults, you can override if you want -->
60+
<includes>
61+
<include>pom.xml</include>
62+
</includes>
63+
<sortPom>
64+
<indentAttribute>all</indentAttribute>
65+
<keepBlankLines>true</keepBlankLines>
66+
<expandEmptyElements>false</expandEmptyElements>
67+
<nrOfIndentSpace>-1</nrOfIndentSpace>
68+
<predefinedSortOrder>recommended_2008_06</predefinedSortOrder>
69+
</sortPom>
70+
</pom>
71+
<java>
72+
<eclipse>
73+
<file>${project.basedir}/eclipse-formatter.xml</file>
74+
</eclipse>
75+
</java>
76+
</configuration>
77+
</plugin>
78+
</plugins>
79+
</build>
8180
</project>
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.commoncrawl.whirlwind;
19+
20+
import org.netpreserve.jwarc.WarcReader;
21+
import org.netpreserve.jwarc.cdx.CdxFormat;
22+
import org.netpreserve.jwarc.cdx.CdxWriter;
23+
24+
import java.io.IOException;
25+
import java.io.InputStream;
26+
import java.io.OutputStreamWriter;
27+
import java.nio.file.Files;
28+
import java.nio.file.Path;
29+
30+
public class CdxjIndexer {
31+
32+
public static void main(String[] args) throws IOException {
33+
34+
if (args.length != 1) {
35+
System.err.println("Usage: java CdxjIndexer <input-warc-file>");
36+
System.exit(1);
37+
}
38+
39+
Path requested = Path.of(args[0]).toAbsolutePath().normalize();
40+
if (!Files.isRegularFile(requested)) {
41+
throw new SecurityException("Invalid WARC path");
42+
}
43+
44+
if (requested.toString().endsWith("gz") || requested.toString().endsWith("gzip")) {
45+
try {
46+
ValidateWARC.validateRandomAccessWarcOrFail(requested);
47+
} catch (IOException e) {
48+
System.out.println("This file is probably not a multi-member gzip but a single gzip file." +
49+
"\n" +
50+
"To allow seek, a gzipped WARC must have each record compressed into a single gzip member and concatenated together." +
51+
"\n" +
52+
"\n" +
53+
"This file is likely still valid and can be fixed by running:" +
54+
"\n" +
55+
"mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.RecompressWARC -Dexec.args=\"testing.warc testing.warc.gz\"");
56+
System.exit(-1);
57+
}
58+
}
59+
60+
try (
61+
InputStream in = Files.newInputStream(requested);
62+
CdxWriter cdxjWriter = new CdxWriter(new OutputStreamWriter(System.out));
63+
WarcReader reader = new WarcReader(in)
64+
) {
65+
cdxjWriter.setFormat(CdxFormat.CDXJ);
66+
cdxjWriter.process(reader, requested.toString());
67+
}
68+
}
69+
}

0 commit comments

Comments
 (0)