Skip to content

Commit 07f185a

Browse files
Domain graph cc-main-2026-feb-mar-apr-domain not properly sorted (#34)
- Add unit test to reproduce the issue. - Throw runtime exception if output is not strictly monotonically sorted. - Fix issue: do not emit domain containing hyphen if there is a domain queued containing a dot at the same string index position. - Improve documentation.
1 parent f210ffd commit 07f185a

4 files changed

Lines changed: 169 additions & 46 deletions

File tree

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ The assembly jar file includes also the [WebGraph](https://webgraph.di.unimi.it/
1616

1717
### Javadocs
1818

19-
The Javadocs are created by `mvn javadoc:javadoc`. Then open the file `target/site/apidocs/index.html` in a browser.
19+
The Javadocs are created by `mvn javadoc:javadoc`. Then open the file `target/reports/apidocs/index.html` in a browser.
2020

2121

2222
### Source Code Formatting
@@ -41,7 +41,7 @@ The host-level web graph is built with help of PySpark, the corresponding code i
4141

4242
### Domain-Level Web Graph
4343

44-
The domain-level web graph is distilled from the host-level graph by mapping host names to domain names. The ID mapping is kept in memory as an int array or [FastUtil's big array](https://fastutil.di.unimi.it/docs/it/unimi/dsi/fastutil/BigArrays.html) if the host-level graph has more vertices than a Java array can hold (around 2³¹). The Java tool to fold the host graph is best run from the script [host2domaingraph.sh](src/script/host2domaingraph.sh).
44+
The domain-level web graph is distilled from the host-level graph by mapping host names to domain names. The ID mapping is kept in memory as an int array or [FastUtil's big array](https://fastutil.di.unimi.it/docs/it/unimi/dsi/fastutil/BigArrays.html) if the host-level graph has more vertices than a Java array can hold (around 2³¹). The Java tool to fold the host graph is best run from the script [host2domaingraph.sh](src/script/host2domaingraph.sh). Please see the script and the Java class [HostToDomainGraph](src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java) for further details.
4545

4646
### Processing Graphs using the WebGraph Framework
4747

src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ public class HostToDomainGraph {
8181
private long numInputLinesEdges = 0;
8282
protected String lastRevHost = null;
8383
protected Domain lastDomain = null;
84+
protected String lastOutputDomain = null;
8485
private TreeMap<String, Domain> domainQueue = new TreeMap<>();
8586
private int maxQueueUsed = 0;
8687

@@ -197,17 +198,26 @@ public static int compareRevDomainsSafe(String d1, String d2) {
197198
char c1 = d1.charAt(i);
198199
char c2 = d2.charAt(i);
199200
if (c1 != c2) {
201+
if (c1 == HYPHEN && c2 == DOT) {
202+
/*
203+
* Cannot finish "no.hedmark-folkemusikklag" unless "no.hedmark.os.www" is done
204+
* because input which is mapped to a suffix (a prefix in reversed domain name
205+
* notation) is still expected, e.g. "no.hedmark.www" which is mapped to
206+
* "no.hedmark".
207+
*/
208+
return 0;
209+
}
200210
return c1 - c2;
201211
} else if (c1 == HYPHEN) {
202212
/*
203-
* cannot finish "org.example-domain" unless "org.example" is done
213+
* Cannot finish "org.example-domain" unless "org.example" is done.
204214
*/
205215
return 0;
206216
} else if (c1 == DOT) {
207217
dots++;
208218
if (dots > 1) {
209219
/*
210-
* cannot finish "name.his.forgot.foobar" unless "name.his" is done
220+
* Cannot finish "name.his.forgot.foobar" unless "name.his" is done.
211221
*
212222
* This is a special case of multi-part suffixes with more than two parts when
213223
* the first part is also a public suffix, e.g. (in reversed domain name
@@ -401,7 +411,7 @@ private Domain queueDomain(StringBuilder sb, String domainName) {
401411
String firstDomain = domainQueue.firstKey();
402412
if (!Domain.isSafeToOutput(firstDomain, revDomainName)) {
403413
/*
404-
* queued domains are sorted lexicographically: if the first/current domain
414+
* Queued domains are sorted lexicographically: if the first/current domain
405415
* cannot be safely dequeued and written to output, this is also the case for
406416
* the following ones.
407417
*/
@@ -430,6 +440,7 @@ private String getNodeLine(Domain domain) {
430440
}
431441

432442
private void getNodeLine(StringBuilder b, Domain domain) {
443+
String domainName = null;
433444
if (domain == null)
434445
return;
435446
if (domain.id >= 0 && domain.name != null) {
@@ -438,7 +449,8 @@ private void getNodeLine(StringBuilder b, Domain domain) {
438449
}
439450
b.append(domain.id);
440451
b.append('\t');
441-
b.append(reverseHost(domain.name));
452+
domainName = reverseHost(domain.name);
453+
b.append(domainName);
442454
if (countHosts) {
443455
b.append('\t');
444456
b.append(domain.numberOfHosts);
@@ -447,6 +459,13 @@ private void getNodeLine(StringBuilder b, Domain domain) {
447459
for (Long hostId : domain.ids) {
448460
setValue(hostId.longValue(), domain.id);
449461
}
462+
if (lastOutputDomain != null && lastOutputDomain.compareTo(domainName) >= 0) {
463+
String msg = "Output domains are not strictly monotonically sorted: " + lastOutputDomain + " <> "
464+
+ domainName;
465+
LOG.error(msg);
466+
throw new RuntimeException(msg);
467+
}
468+
lastOutputDomain = domainName;
450469
}
451470

452471
public String convertEdge(String line) {
@@ -540,8 +559,7 @@ private static void showHelp() {
540559
System.err.println(" \tpublic suffix list, ");
541560
System.err.println(
542561
" \tsee https://github.com/publicsuffix/list/wiki/Format#divisions)");
543-
System.err
544-
.println(" \t- host-without-www: strip the www. prefix (keep the ");
562+
System.err.println(" \t- host-without-www: strip the www. prefix (keep the ");
545563
System.err.println(" \tfull host otherwise)");
546564
System.err.println(" --multipart-suffixes-as-domains\toutput host names which are equal to multi-part");
547565
System.err.println(" \tpublic suffixes (the suffix contains a dot) as domain");

src/script/host2domaingraph.sh

Lines changed: 84 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -65,64 +65,111 @@ PARALLEL_SORT_THREADS=2
6565
# 1 C locale is mandatory to keep reversed hosts of one domain or top-level domain
6666
# together in a single block:
6767
# echo -e "com.opus\ncom.opera\nco.mopus\nco.mopera" | shuf | LC_ALL=en_US.utf8 sort
68+
# co.mopera
69+
# com.opera
70+
# com.opus
71+
# co.mopus
6872
# vs.
6973
# echo -e "com.opus\ncom.opera\nco.mopus\nco.mopera" | shuf | LC_ALL=C sort
74+
# co.mopera
75+
# co.mopus
76+
# com.opera
77+
# com.opus
7078
# This requirement is met by the output of the cc-pyspark job.
7179
#
72-
# 2 The second problem stems from the fact that a hyphen (valid in host and
73-
# subdomain names) is sorted before the dot:
80+
# In an older version, the input was re-sorted to try to group
81+
# domains and their subdomains together:
82+
#
83+
# 2 Sorting with the C locale places a hyphen (valid in host and subdomain names)
84+
# before a dot:
7485
# ac.gov
7586
# ac.gov.ascension
7687
# ac.gov.ascension-island
7788
# ac.gov.ascension.mail
78-
# Unfortunately the output of the cc-pyspark job does not completely meet this
79-
# sorting criterion.
89+
# As a result, the domain "ac.gov.ascension" and its subdomain "ac.gov.ascension.mail"
90+
# end up in two separate blocks of the input, even when sorting with the C locale.
91+
#
8092
# The initial solution to ensure that the subdomains of "ac.gov.ascension" are not split
8193
# into two blocks, was to add an artificial dot temporarily to the end of each host
8294
# name during sorting:
8395
# zcat vertices.txt.gz | sed -e 's/$/./' \
8496
# | sort $SORTOPTS -t$'\t' -k2,2 | sed -e 's/\.$//'
8597
# The domain name "ac.gov.ascension" in the example above becomes temporarily
8698
# "ac.gov.ascension." and is now sorted after "ac.gov.ascension-island."
87-
#
88-
# To avoid this step (re-sorting billions of lines is expensive), the HostToDomainGraph
89-
# class now caches potentially "missorted" candidates and processes them later together
90-
# with the related subdomains / host names.
9199
#
92-
# Note: The final sorting of the domain names is the same as if there would be
93-
# a trailing dot:
94-
# ac.gov.ascension-island
95-
# ac.gov.ascension
100+
# A sort order that keeps hosts/domains of a common suffix in one block can be
101+
# also achieved if dots are replaced by commas:
102+
# zcat vertices.txt.gz | tr . , \
103+
# | sort $SORTOPTS -t$'\t' -k2,2 | tr , .
104+
# This approach is utilized by the "Sort-friendly URI Reordering Transform" (SURT),
105+
# see <http://crawler.archive.org/articles/user_manual/glossary.html#surt>.
106+
#
107+
# However, the public suffix list adds a further issue, which makes it impossible
108+
# to group domains and subdomains together, by simply sorting the input:
96109
#
97-
# 3 The public suffix list adds a further issue: there are multi-part suffixes,
98-
# such as "co.uk" (or "uk.co" in reverse domain name notation). And the suffixes
99-
# of a multi-part suffix can be public suffixes themselves: also "uk" is a public
100-
# suffix. But they do not need to. For example: "no" and "os.hordaland.no" are
101-
# in the public suffix list but "hordaland.no" is not. In this situation,
102-
# adding a trailing dot does not even guarantee that all hosts of a domain under
103-
# a public suffix is in a contiguous block:
104-
#
105-
# $> cat hordaland.txt
106-
# no.hordaland
107-
# no.hordaland-teater
108-
# no.hordaland.os
109-
# no.hordaland.os.bibliotek
110-
# no.hordaland.oygarden
111-
# no.hordalandfolkemusikklag
112-
#
113-
# $> cat hordaland.txt | sed 's/$/./' | LC_ALL=C sort
114-
# no.hordaland-teater.
115-
# no.hordaland.
116-
# no.hordaland.os.
117-
# no.hordaland.os.bibliotek.
118-
# no.hordaland.oygarden.
119-
# no.hordalandfolkemusikklag.
110+
# 3 There are multi-part suffixes, such as "co.uk" (or "uk.co" in reverse domain name
111+
# notation). And the suffixes of a multi-part suffix can be public suffixes themselves:
112+
# also "uk" is a public suffix. But they do not need to. For example: "no" and
113+
# "os.hordaland.no" are in the public suffix list but "hordaland.no" is not.
114+
# In this situation, adding a trailing dot does not even guarantee that all hosts of
115+
# a domain under a public suffix are in a contiguous block:
116+
#
117+
# $> cat hordaland.txt
118+
# no.hordaland
119+
# no.hordaland-teater
120+
# no.hordaland.os
121+
# no.hordaland.os.bibliotek
122+
# no.hordaland.oygarden
123+
# no.hordalandfolkemusikklag
124+
#
125+
# $> cat hordaland.txt | sed 's/$/./' | LC_ALL=C sort
126+
# no.hordaland-teater.
127+
# no.hordaland.
128+
# no.hordaland.os.
129+
# no.hordaland.os.bibliotek.
130+
# no.hordaland.oygarden.
131+
# no.hordalandfolkemusikklag.
120132
#
121133
# The host names "no.hordaland." and "no.hordaland.oygarden." both
122134
# are under the domain "no.hordaland" (public suffix is "no").
123135
#
124-
# Please see https://github.com/commoncrawl/cc-webgraph/issues/3
125-
# for further details.
136+
# To address this issue (point 3), the HostToDomainGraph class now caches
137+
# potentially "missorted" candidates and processes them later together
138+
# with the related subdomains / host names.
139+
#
140+
# 4 This also addresses the fact that re-sorting billions of input lines is
141+
# computationally expensive.
142+
#
143+
# Output sorting:
144+
#
145+
# 5 Ideally, the domain output should be lexicographically sorted
146+
# as well. This is a requirement to store the map of node names and IDs
147+
# in an "immutable external prefix map" (IEPM).
148+
# If a trailing dot is added and then removed (and no cache is used), the
149+
# output sorting would be consequently the same as if there is a trailing dot:
150+
# ac.gov.ascension-island.
151+
# ac.gov.ascension.
152+
# respectively (after removing the trailing dot)
153+
# ac.gov.ascension-island
154+
# ac.gov.ascension
155+
#
156+
# The required ASCII sorting is:
157+
# ac.gov.ascension
158+
# ac.gov.ascension-island
159+
#
160+
# We cannot re-sort the output because this would also require changing
161+
# the node IDs, since the WebGraph framework expects the arc/edge input
162+
# to be numerically sorted. And the vertices/nodes are enumerated as they
163+
# are sorted, i.e. node IDs are line numbers starting with zero.
164+
#
165+
# Note: The approach to replace dots by commas ensures proper lexicographic
166+
# sorting even if the replacement is inverted. However, it does not guarantee
167+
# that all domains of one suffix are in a contiguous block if that suffix
168+
# is a suffix of another suffix. See point 3.
169+
#
170+
# Please see https://github.com/commoncrawl/cc-webgraph/issues/3
171+
# and https://github.com/commoncrawl/cc-webgraph/issues/33
172+
# for further details.
126173
#
127174

128175
export LC_ALL=C

src/test/java/org/commoncrawl/webgraph/TestHostToDomainGraph.java

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,44 @@ class TestHostToDomainGraph {
110110
"3\tno.hordalandfolkemusikklag\t1", //
111111
};
112112

113+
/*
114+
* Issue #33: domain output not sorted if a domain name which is a string suffix
115+
* of a public suffix appears only after the longer suffix.
116+
*/
117+
String[] hostGraphDomainInSuffixA = { //
118+
"0\tno.hedland", //
119+
"1\tno.hedmark-folkemusikklag", //
120+
"2\tno.hedmark-trafikk", //
121+
"3\tno.hedmark.m", //
122+
"4\tno.hedmark.os.www", //
123+
"5\tno.hedmark.www", //
124+
"6\tno.hedmarktrafikk", //
125+
};
126+
String[] hostGraphDomainInSuffixB = { //
127+
"0\tno.hedland", //
128+
"1\tno.hedmark-folkemusikklag", //
129+
"2\tno.hedmark-trafikk", //
130+
"3\tno.hedmark.os.www", //
131+
"4\tno.hedmark.www", //
132+
"5\tno.hedmarktrafikk", //
133+
};
134+
String[] domainGraphDomainInSuffixA = { //
135+
"0\tno.hedland\t1", //
136+
"1\tno.hedmark\t2", //
137+
"2\tno.hedmark-folkemusikklag\t1", //
138+
"3\tno.hedmark-trafikk\t1", //
139+
"4\tno.hedmark.os.www\t1", //
140+
"5\tno.hedmarktrafikk\t1", //
141+
};
142+
String[] domainGraphDomainInSuffixB = { //
143+
"0\tno.hedland\t1", //
144+
"1\tno.hedmark\t1", //
145+
"2\tno.hedmark-folkemusikklag\t1", //
146+
"3\tno.hedmark-trafikk\t1", //
147+
"4\tno.hedmark.os.www\t1", //
148+
"5\tno.hedmarktrafikk\t1", //
149+
};
150+
113151
/**
114152
* <code>forgot.his.name</code> is in the "private section" of the public suffix
115153
* list, while <code>name</code> is in the ICANN section, see
@@ -194,7 +232,7 @@ private long[] getNodeIDs(String[] graph) {
194232
}
195233

196234
/**
197-
* test whether node names are properly sorted and IDs are correctly assigned
235+
* Test whether node names are properly sorted and IDs are correctly assigned
198236
* (sequentially, strictly monotonically increasing, no gaps)
199237
*/
200238
void testSorted(String[] graph) {
@@ -276,6 +314,26 @@ void testConvertNodesHyphenatedDomainsIncludingMultiPartSuffixes() {
276314
convert(converter, hostGraphHyphenatedDomains));
277315
}
278316

317+
@Test
318+
void testConvertNodesEnsureSortedOutputA() {
319+
testSorted(hostGraphDomainInSuffixA);
320+
testSorted(domainGraphDomainInSuffixA);
321+
converter.doCount(true);
322+
String[] output = convert(converter, hostGraphDomainInSuffixA);
323+
testSorted(output);
324+
assertArrayEquals(domainGraphDomainInSuffixA, output);
325+
}
326+
327+
@Test
328+
void testConvertNodesEnsureSortedOutputB() {
329+
testSorted(hostGraphDomainInSuffixB);
330+
testSorted(domainGraphDomainInSuffixB);
331+
converter.doCount(true);
332+
String[] output = convert(converter, hostGraphDomainInSuffixB);
333+
testSorted(output);
334+
assertArrayEquals(domainGraphDomainInSuffixB, output);
335+
}
336+
279337
@Test
280338
void testConvertPrivateDomain() {
281339
// verify sorting of input and expected output

0 commit comments

Comments
 (0)