From 4da9e0566b293c62227da203e600560a0432c8fe Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 1 May 2026 16:33:15 +0200 Subject: [PATCH 1/5] Domain graph cc-main-2026-feb-mar-apr-domain not properly sorted #33 Add unit test to reproduce the issue. --- .../webgraph/TestHostToDomainGraph.java | 57 ++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/src/test/java/org/commoncrawl/webgraph/TestHostToDomainGraph.java b/src/test/java/org/commoncrawl/webgraph/TestHostToDomainGraph.java index 147b706..66538f0 100644 --- a/src/test/java/org/commoncrawl/webgraph/TestHostToDomainGraph.java +++ b/src/test/java/org/commoncrawl/webgraph/TestHostToDomainGraph.java @@ -110,6 +110,41 @@ class TestHostToDomainGraph { "3\tno.hordalandfolkemusikklag\t1", // }; + // issue #33 : domain output not sorted with input B + String[] hostGraphOutputSortingA = { // + "0\tno.hedland", // + "1\tno.hedmark-folkemusikklag", // + "2\tno.hedmark-trafikk", // + "3\tno.hedmark.m", // + "4\tno.hedmark.os.www", // + "5\tno.hedmark.www", // + "6\tno.hedmarktrafikk", // + }; + String[] hostGraphOutputSortingB = { // + "0\tno.hedland", // + "1\tno.hedmark-folkemusikklag", // + "2\tno.hedmark-trafikk", // + "3\tno.hedmark.os.www", // + "4\tno.hedmark.www", // + "5\tno.hedmarktrafikk", // + }; + String[] domainGraphOutputSortingA = { // + "0\tno.hedland\t1", // + "1\tno.hedmark\t2", // + "2\tno.hedmark-folkemusikklag\t1", // + "3\tno.hedmark-trafikk\t1", // + "4\tno.hedmark.os.www\t1", // + "5\tno.hedmarktrafikk\t1", // + }; + String[] domainGraphOutputSortingB = { // + "0\tno.hedland\t1", // + "1\tno.hedmark\t1", // + "2\tno.hedmark-folkemusikklag\t1", // + "3\tno.hedmark-trafikk\t1", // + "4\tno.hedmark.os.www\t1", // + "5\tno.hedmarktrafikk\t1", // + }; + /** * forgot.his.name is in the "private section" of the public suffix * list, while name is in the ICANN section, see @@ -194,7 +229,7 @@ private long[] getNodeIDs(String[] graph) { } /** - * test whether node names 
are properly sorted and IDs are correctly assigned + * Test whether node names are properly sorted and IDs are correctly assigned * (sequentially, strictly monotonically increasing, no gaps) */ void testSorted(String[] graph) { @@ -276,6 +311,26 @@ void testConvertNodesHyphenatedDomainsIncludingMultiPartSuffixes() { convert(converter, hostGraphHyphenatedDomains)); } + @Test + void testConvertNodesOutputSortingA() { + testSorted(hostGraphOutputSortingA); + testSorted(domainGraphOutputSortingA); + converter.doCount(true); + String[] output = convert(converter, hostGraphOutputSortingA); + testSorted(output); + assertArrayEquals(domainGraphOutputSortingA, output); + } + + @Test + void testConvertNodesOutputSortingB() { + testSorted(hostGraphOutputSortingB); + testSorted(domainGraphOutputSortingB); + converter.doCount(true); + String[] output = convert(converter, hostGraphOutputSortingB); + testSorted(output); + assertArrayEquals(domainGraphOutputSortingB, output); + } + @Test void testConvertPrivateDomain() { // verify sorting of input and expected output From c083007230e6d2cf4753e47dfec01f63875a2bec Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 1 May 2026 16:57:45 +0200 Subject: [PATCH 2/5] Domain graph cc-main-2026-feb-mar-apr-domain not properly sorted #33 Throw runtime exception if output is not strictly monotonically sorted. 
--- .../commoncrawl/webgraph/HostToDomainGraph.java | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java b/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java index fb0ee1f..83cac12 100644 --- a/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java +++ b/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java @@ -81,6 +81,7 @@ public class HostToDomainGraph { private long numInputLinesEdges = 0; protected String lastRevHost = null; protected Domain lastDomain = null; + protected String lastOutputDomain = null; private TreeMap domainQueue = new TreeMap<>(); private int maxQueueUsed = 0; @@ -401,7 +402,7 @@ private Domain queueDomain(StringBuilder sb, String domainName) { String firstDomain = domainQueue.firstKey(); if (!Domain.isSafeToOutput(firstDomain, revDomainName)) { /* - * queued domains are sorted lexicographically: if the first/current domain + * Queued domains are sorted lexicographically: if the first/current domain * cannot be safely dequeued and written to output, this is also the case for * the following ones. 
*/ @@ -430,6 +431,7 @@ private String getNodeLine(Domain domain) { } private void getNodeLine(StringBuilder b, Domain domain) { + String domainName = null; if (domain == null) return; if (domain.id >= 0 && domain.name != null) { @@ -438,7 +440,8 @@ private void getNodeLine(StringBuilder b, Domain domain) { } b.append(domain.id); b.append('\t'); - b.append(reverseHost(domain.name)); + domainName = reverseHost(domain.name); + b.append(domainName); if (countHosts) { b.append('\t'); b.append(domain.numberOfHosts); @@ -447,6 +450,13 @@ private void getNodeLine(StringBuilder b, Domain domain) { for (Long hostId : domain.ids) { setValue(hostId.longValue(), domain.id); } + if (lastOutputDomain != null && lastOutputDomain.compareTo(domainName) >= 0) { + String msg = "Output domains are not strictly monotonically sorted: " + lastOutputDomain + " <> " + + domainName; + LOG.error(msg); + throw new RuntimeException(msg); + } + lastOutputDomain = domainName; } public String convertEdge(String line) { From f03b9962a267ab2a7590203ec4a762c57ea26925 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 1 May 2026 17:46:33 +0200 Subject: [PATCH 3/5] Domain graph cc-main-2026-feb-mar-apr-domain not properly sorted #33 Fix issue: do not emit domain containing hyphen if there is a domain queued containing a dot at the same string index position. 
--- .../commoncrawl/webgraph/HostToDomainGraph.java | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java b/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java index 83cac12..3c0638e 100644 --- a/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java +++ b/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java @@ -198,17 +198,26 @@ public static int compareRevDomainsSafe(String d1, String d2) { char c1 = d1.charAt(i); char c2 = d2.charAt(i); if (c1 != c2) { + if (c1 == HYPHEN && c2 == DOT) { + /* + * Cannot finish "no.hedmark-folkemusikklag" unless "no.hedmark.os.www" is done + * because input which is mapped to a suffix (a prefix in reversed domain name + * notation) is still expected, e.g. "no.hedmark.www" which is mapped to + * "no.hedmark". + */ + return 0; + } return c1 - c2; } else if (c1 == HYPHEN) { /* - * cannot finish "org.example-domain" unless "org.example" is done + * Cannot finish "org.example-domain" unless "org.example" is done. */ return 0; } else if (c1 == DOT) { dots++; if (dots > 1) { /* - * cannot finish "name.his.forgot.foobar" unless "name.his" is done + * Cannot finish "name.his.forgot.foobar" unless "name.his" is done. * * This is a special case of multi-part suffixes with more than two parts when * the first part is also a public suffix, e.g. (in reversed domain name @@ -550,8 +559,7 @@ private static void showHelp() { System.err.println(" \tpublic suffix list, "); System.err.println( " \tsee https://github.com/publicsuffix/list/wiki/Format#divisions)"); - System.err - .println(" \t- host-without-www: strip the www. prefix (keep the "); + System.err.println(" \t- host-without-www: strip the www. 
prefix (keep the "); System.err.println(" \tfull host otherwise)"); System.err.println(" --multipart-suffixes-as-domains\toutput host names which are equal to multi-part"); System.err.println(" \tpublic suffixes (the suffix contains a dot) as domain"); From 3a97ef4a550eb8090b10c8ad3ed235828b6374b5 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 1 May 2026 17:54:03 +0200 Subject: [PATCH 4/5] Domain graph cc-main-2026-feb-mar-apr-domain not properly sorted #33 Improve documentation. --- README.md | 4 ++-- src/script/host2domaingraph.sh | 39 ++++++++++++++++++++++++++-------- 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index f335392..6003d0d 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ The assembly jar file includes also the [WebGraph](https://webgraph.di.unimi.it/ ### Javadocs -The Javadocs are created by `mvn javadoc:javadoc`. Then open the file `target/site/apidocs/index.html` in a browser. +The Javadocs are created by `mvn javadoc:javadoc`. Then open the file `target/reports/apidocs/index.html` in a browser. ### Source Code Formatting @@ -41,7 +41,7 @@ The host-level web graph is built with help of PySpark, the corresponding code i ### Domain-Level Web Graph -The domain-level web graph is distilled from the host-level graph by mapping host names to domain names. The ID mapping is kept in memory as an int array or [FastUtil's big array](https://fastutil.di.unimi.it/docs/it/unimi/dsi/fastutil/BigArrays.html) if the host-level graph has more vertices than a Java array can hold (around 2³¹). The Java tool to fold the host graph is best run from the script [host2domaingraph.sh](src/script/host2domaingraph.sh). +The domain-level web graph is distilled from the host-level graph by mapping host names to domain names. 
The ID mapping is kept in memory as an int array or [FastUtil's big array](https://fastutil.di.unimi.it/docs/it/unimi/dsi/fastutil/BigArrays.html) if the host-level graph has more vertices than a Java array can hold (around 2³¹). The Java tool to fold the host graph is best run from the script [host2domaingraph.sh](src/script/host2domaingraph.sh). Please, see the script and the Java class [HostToDomainGraph](src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java) for further details. ### Processing Graphs using the WebGraph Framework diff --git a/src/script/host2domaingraph.sh b/src/script/host2domaingraph.sh index a339b53..21b3c90 100755 --- a/src/script/host2domaingraph.sh +++ b/src/script/host2domaingraph.sh @@ -84,15 +84,17 @@ PARALLEL_SORT_THREADS=2 # | sort $SORTOPTS -t$'\t' -k2,2 | sed -e 's/\.$//' # The domain name "ac.gov.ascension" in the example above becomes temporarily # "ac.gov.ascension." and is now sorted after "ac.gov.ascension-island." -# -# To avoid this step (re-sorting billions of lines is expensive), the HostToDomainGraph -# class now caches potentially "missorted" candidates and processes them later together -# with the related subdomains / host names. # -# Note: The final sorting of the domain names is the same as if there would be -# a trailing dot: -# ac.gov.ascension-island -# ac.gov.ascension +# A sort order that keeps hosts/domains of a common suffix in one block can be +# also achieved if dots are replaced by commas: +# zcat vertices.txt.gz | tr . , \ +# | sort $SORTOPTS -t$'\t' -k2,2 | tr , . +# This approach is utilized by the "Sort-friendly URI Reordering Transform" (SURT), +# see . +# +# To avoid the re-sorting of the input (sorting billions of lines is expensive), +# the HostToDomainGraph class now caches potentially "missorted" candidates and +# processes them later together with the related subdomains / host names. 
# # 3 The public suffix list adds a further issue: there are multi-part suffixes, # such as "co.uk" (or "uk.co" in reverse domain name notation). And the suffixes @@ -100,7 +102,7 @@ PARALLEL_SORT_THREADS=2 # suffix. But they do not need to. For example: "no" and "os.hordaland.no" are # in the public suffix list but "hordaland.no" is not. In this situation, # adding a trailing dot does not even guarantee that all hosts of a domain under -# a public suffix is in a contiguous block: +# a public suffix are in a contiguous block: # # $> cat hordaland.txt # no.hordaland @@ -121,7 +123,26 @@ PARALLEL_SORT_THREADS=2 # The host names "no.hordaland." and "no.hordaland.oygarden." both # are under the domain ""no.hordaland" (public suffix is "no"). # +# 4 Ideally, the domain output should be lexicographically sorted +# as well. This is a requirement to store the map of node names and IDs +# in an "immutable external prefix map" (IEPM). +# If a trailing dot is added and then removed (and no cache is used), the +# output sorting would be consequently the same as if there is a trailing dot: +# ac.gov.ascension-island. +# ac.gov.ascension. +# respectively +# ac.gov.ascension-island +# ac.gov.ascension +# +# The required ASCII sorting is: +# ac.gov.ascension +# ac.gov.ascension-island +# +# Note: The approach to replace dots by commas ensures proper lexicographic +# sorting even if the replacement is inverted. +# # Please see https://github.com/commoncrawl/cc-webgraph/issues/3 +# and https://github.com/commoncrawl/cc-webgraph/issues/33 # for further details. # From 5af3cc3c70a0d607eddd907715249242954a2930 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Mon, 4 May 2026 22:59:24 +0200 Subject: [PATCH 5/5] Domain graph cc-main-2026-feb-mar-apr-domain not properly sorted #33 Address review comments: rename unit test, improve documentation. 
--- src/script/host2domaingraph.sh | 130 +++++++++++------- .../webgraph/TestHostToDomainGraph.java | 33 +++-- 2 files changed, 96 insertions(+), 67 deletions(-) diff --git a/src/script/host2domaingraph.sh b/src/script/host2domaingraph.sh index 21b3c90..ea51359 100755 --- a/src/script/host2domaingraph.sh +++ b/src/script/host2domaingraph.sh @@ -65,18 +65,30 @@ PARALLEL_SORT_THREADS=2 # 1 C locale is mandatory to keep reversed hosts of one domain or top-level domain # together in a single block: # echo -e "com.opus\ncom.opera\nco.mopus\nco.mopera" | shuf | LC_ALL=en_US.utf8 sort +# co.mopera +# com.opera +# com.opus +# co.mopus # vs. # echo -e "com.opus\ncom.opera\nco.mopus\nco.mopera" | shuf | LC_ALL=C sort +# co.mopera +# co.mopus +# com.opera +# com.opus # This requirement is met by the output of the cc-pyspark job. # -# 2 The second problem stems from the fact that a hyphen (valid in host and -# subdomain names) is sorted before the dot: +# In an older version, the input was re-sorted to try to group +# domains and their subdomains together: +# +# 2 Sorting with the C locale places a hyphen (valid in host and subdomain names) +# before a dot: # ac.gov # ac.gov.ascension # ac.gov.ascension-island # ac.gov.ascension.mail -# Unfortunately the output of the cc-pyspark job does not completely meet this -# sorting criterion. +# As a result, the domain "ac.gov.ascension" and its subdomain "ac.gov.ascension.mail" +# end up in two separate blocks of the input, even when sorting with the C locale. +# # The initial solution to ensure that the subdomains of "ac.gov.ascension" are not split # into two blocks, was to add an artificial dot temporarily to the end of each host # name during sorting: @@ -92,58 +104,72 @@ PARALLEL_SORT_THREADS=2 # This approach is utilized by the "Sort-friendly URI Reordering Transform" (SURT), # see . 
# -# To avoid the re-sorting of the input (sorting billions of lines is expensive), -# the HostToDomainGraph class now caches potentially "missorted" candidates and -# processes them later together with the related subdomains / host names. -# -# 3 The public suffix list adds a further issue: there are multi-part suffixes, -# such as "co.uk" (or "uk.co" in reverse domain name notation). And the suffixes -# of a multi-part suffix can be public suffixes themselves: also "uk" is a public -# suffix. But they do not need to. For example: "no" and "os.hordaland.no" are -# in the public suffix list but "hordaland.no" is not. In this situation, -# adding a trailing dot does not even guarantee that all hosts of a domain under -# a public suffix are in a contiguous block: -# -# $> cat hordaland.txt -# no.hordaland -# no.hordaland-teater -# no.hordaland.os -# no.hordaland.os.bibliotek -# no.hordaland.oygarden -# no.hordalandfolkemusikklag -# -# $> cat hordaland.txt | sed 's/$/./' | LC_ALL=C sort -# no.hordaland-teater. -# no.hordaland. -# no.hordaland.os. -# no.hordaland.os.bibliotek. -# no.hordaland.oygarden. -# no.hordalandfolkemusikklag. +# However, the public suffix list adds a further issue, which makes it impossible +# to group domains and subdomains together, by simply sorting the input: +# +# 3 There are multi-part suffixes, such as "co.uk" (or "uk.co" in reverse domain name +# notation). And the suffixes of a multi-part suffix can be public suffixes themselves: +# also "uk" is a public suffix. But they do not need to. For example: "no" and +# "os.hordaland.no" are in the public suffix list but "hordaland.no" is not. 
+# In this situation, adding a trailing dot does not even guarantee that all hosts of +# a domain under a public suffix are in a contiguous block: +# +# $> cat hordaland.txt +# no.hordaland +# no.hordaland-teater +# no.hordaland.os +# no.hordaland.os.bibliotek +# no.hordaland.oygarden +# no.hordalandfolkemusikklag +# +# $> cat hordaland.txt | sed 's/$/./' | LC_ALL=C sort +# no.hordaland-teater. +# no.hordaland. +# no.hordaland.os. +# no.hordaland.os.bibliotek. +# no.hordaland.oygarden. +# no.hordalandfolkemusikklag. # # The host names "no.hordaland." and "no.hordaland.oygarden." both # are under the domain ""no.hordaland" (public suffix is "no"). # -# 4 Ideally, the domain output should be lexicographically sorted -# as well. This is a requirement to store the map of node names and IDs -# in an "immutable external prefix map" (IEPM). -# If a trailing dot is added and then removed (and no cache is used), the -# output sorting would be consequently the same as if there is a trailing dot: -# ac.gov.ascension-island. -# ac.gov.ascension. -# respectively -# ac.gov.ascension-island -# ac.gov.ascension -# -# The required ASCII sorting is: -# ac.gov.ascension -# ac.gov.ascension-island -# -# Note: The approach to replace dots by commas ensures proper lexicographic -# sorting even if the replacement is inverted. -# -# Please see https://github.com/commoncrawl/cc-webgraph/issues/3 -# and https://github.com/commoncrawl/cc-webgraph/issues/33 -# for further details. +# To address this issue (point 3), the HostToDomainGraph class now caches +# potentially "missorted" candidates and processes them later together +# with the related subdomains / host names. +# +# 4 This also addresses the fact, that re-sorting billions of input lines is +# computationally expensive. +# +# Output sorting: +# +# 5 Ideally, the domain output should be lexicographically sorted +# as well. This is a requirement to store the map of node names and IDs +# in an "immutable external prefix map" (IEPM). 
+# If a trailing dot is added and then removed (and no cache is used), the +# output sorting would be consequently the same as if there is a trailing dot: +# ac.gov.ascension-island. +# ac.gov.ascension. +# respectively (after removing the trailing dot) +# ac.gov.ascension-island +# ac.gov.ascension +# +# The required ASCII sorting is: +# ac.gov.ascension +# ac.gov.ascension-island +# +# We cannot re-sort the output because this would also require changing +# the node IDs because the WebGraph framework expects the arc/edge input +# to be numerically sorted. And the vertices/nodes are enumerated as they +# are sorted, i.e. node IDs are line numbers starting with zero. +# +# Note: The approach to replace dots by commas ensures proper lexicographic +# sorting even if the replacement is inverted. However, it does not guarantee +# that all domains of one suffix are in a contiguous block, if that suffix +# is a suffix of another suffix. See point 3. +# +# Please see https://github.com/commoncrawl/cc-webgraph/issues/3 +# and https://github.com/commoncrawl/cc-webgraph/issues/33 +# for further details. # export LC_ALL=C diff --git a/src/test/java/org/commoncrawl/webgraph/TestHostToDomainGraph.java b/src/test/java/org/commoncrawl/webgraph/TestHostToDomainGraph.java index 66538f0..35bca6c 100644 --- a/src/test/java/org/commoncrawl/webgraph/TestHostToDomainGraph.java +++ b/src/test/java/org/commoncrawl/webgraph/TestHostToDomainGraph.java @@ -110,8 +110,11 @@ class TestHostToDomainGraph { "3\tno.hordalandfolkemusikklag\t1", // }; - // issue #33 : domain output not sorted with input B - String[] hostGraphOutputSortingA = { // + /* + * Issue #33 : domain output not sorted if a domain name, which is a string suffix of + * a public suffix, appears only after the longer suffix. 
+ */ + String[] hostGraphDomainInSuffixA = { // "0\tno.hedland", // "1\tno.hedmark-folkemusikklag", // "2\tno.hedmark-trafikk", // @@ -120,7 +123,7 @@ class TestHostToDomainGraph { "5\tno.hedmark.www", // "6\tno.hedmarktrafikk", // }; - String[] hostGraphOutputSortingB = { // + String[] hostGraphDomainInSuffixB = { // "0\tno.hedland", // "1\tno.hedmark-folkemusikklag", // "2\tno.hedmark-trafikk", // @@ -128,7 +131,7 @@ class TestHostToDomainGraph { "4\tno.hedmark.www", // "5\tno.hedmarktrafikk", // }; - String[] domainGraphOutputSortingA = { // + String[] domainGraphDomainInSuffixA = { // "0\tno.hedland\t1", // "1\tno.hedmark\t2", // "2\tno.hedmark-folkemusikklag\t1", // @@ -136,7 +139,7 @@ class TestHostToDomainGraph { "4\tno.hedmark.os.www\t1", // "5\tno.hedmarktrafikk\t1", // }; - String[] domainGraphOutputSortingB = { // + String[] domainGraphDomainInSuffixB = { // "0\tno.hedland\t1", // "1\tno.hedmark\t1", // "2\tno.hedmark-folkemusikklag\t1", // @@ -312,23 +315,23 @@ void testConvertNodesHyphenatedDomainsIncludingMultiPartSuffixes() { } @Test - void testConvertNodesOutputSortingA() { - testSorted(hostGraphOutputSortingA); - testSorted(domainGraphOutputSortingA); + void testConvertNodesEnsureSortedOutputA() { + testSorted(hostGraphDomainInSuffixA); + testSorted(domainGraphDomainInSuffixA); converter.doCount(true); - String[] output = convert(converter, hostGraphOutputSortingA); + String[] output = convert(converter, hostGraphDomainInSuffixA); testSorted(output); - assertArrayEquals(domainGraphOutputSortingA, output); + assertArrayEquals(domainGraphDomainInSuffixA, output); } @Test - void testConvertNodesOutputSortingB() { - testSorted(hostGraphOutputSortingB); - testSorted(domainGraphOutputSortingB); + void testConvertNodesEnsureSortedOutputB() { + testSorted(hostGraphDomainInSuffixB); + testSorted(domainGraphDomainInSuffixB); converter.doCount(true); - String[] output = convert(converter, hostGraphOutputSortingB); + String[] output = convert(converter, 
hostGraphDomainInSuffixB); testSorted(output); - assertArrayEquals(domainGraphOutputSortingB, output); + assertArrayEquals(domainGraphDomainInSuffixB, output); } @Test