diff --git a/README.md b/README.md index f335392..6003d0d 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ The assembly jar file includes also the [WebGraph](https://webgraph.di.unimi.it/ ### Javadocs -The Javadocs are created by `mvn javadoc:javadoc`. Then open the file `target/site/apidocs/index.html` in a browser. +The Javadocs are created by `mvn javadoc:javadoc`. Then open the file `target/reports/apidocs/index.html` in a browser. ### Source Code Formatting @@ -41,7 +41,7 @@ The host-level web graph is built with help of PySpark, the corresponding code i ### Domain-Level Web Graph -The domain-level web graph is distilled from the host-level graph by mapping host names to domain names. The ID mapping is kept in memory as an int array or [FastUtil's big array](https://fastutil.di.unimi.it/docs/it/unimi/dsi/fastutil/BigArrays.html) if the host-level graph has more vertices than a Java array can hold (around 2³¹). The Java tool to fold the host graph is best run from the script [host2domaingraph.sh](src/script/host2domaingraph.sh). +The domain-level web graph is distilled from the host-level graph by mapping host names to domain names. The ID mapping is kept in memory as an int array or [FastUtil's big array](https://fastutil.di.unimi.it/docs/it/unimi/dsi/fastutil/BigArrays.html) if the host-level graph has more vertices than a Java array can hold (around 2³¹). The Java tool to fold the host graph is best run from the script [host2domaingraph.sh](src/script/host2domaingraph.sh). Please see the script and the Java class [HostToDomainGraph](src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java) for further details. 
### Processing Graphs using the WebGraph Framework diff --git a/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java b/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java index fb0ee1f..3c0638e 100644 --- a/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java +++ b/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java @@ -81,6 +81,7 @@ public class HostToDomainGraph { private long numInputLinesEdges = 0; protected String lastRevHost = null; protected Domain lastDomain = null; + protected String lastOutputDomain = null; private TreeMap domainQueue = new TreeMap<>(); private int maxQueueUsed = 0; @@ -197,17 +198,26 @@ public static int compareRevDomainsSafe(String d1, String d2) { char c1 = d1.charAt(i); char c2 = d2.charAt(i); if (c1 != c2) { + if (c1 == HYPHEN && c2 == DOT) { + /* + * Cannot finish "no.hedmark-folkemusikklag" unless "no.hedmark.os.www" is done + * because input which is mapped to a suffix (a prefix in reversed domain name + * notation) is still expected, e.g. "no.hedmark.www" which is mapped to + * "no.hedmark". + */ + return 0; + } return c1 - c2; } else if (c1 == HYPHEN) { /* - * cannot finish "org.example-domain" unless "org.example" is done + * Cannot finish "org.example-domain" unless "org.example" is done. */ return 0; } else if (c1 == DOT) { dots++; if (dots > 1) { /* - * cannot finish "name.his.forgot.foobar" unless "name.his" is done + * Cannot finish "name.his.forgot.foobar" unless "name.his" is done. * * This is a special case of multi-part suffixes with more than two parts when * the first part is also a public suffix, e.g. 
(in reversed domain name @@ -401,7 +411,7 @@ private Domain queueDomain(StringBuilder sb, String domainName) { String firstDomain = domainQueue.firstKey(); if (!Domain.isSafeToOutput(firstDomain, revDomainName)) { /* - * queued domains are sorted lexicographically: if the first/current domain + * Queued domains are sorted lexicographically: if the first/current domain * cannot be safely dequeued and written to output, this is also the case for * the following ones. */ @@ -430,6 +440,7 @@ private String getNodeLine(Domain domain) { } private void getNodeLine(StringBuilder b, Domain domain) { + String domainName = null; if (domain == null) return; if (domain.id >= 0 && domain.name != null) { @@ -438,7 +449,8 @@ private void getNodeLine(StringBuilder b, Domain domain) { } b.append(domain.id); b.append('\t'); - b.append(reverseHost(domain.name)); + domainName = reverseHost(domain.name); + b.append(domainName); if (countHosts) { b.append('\t'); b.append(domain.numberOfHosts); @@ -447,6 +459,13 @@ private void getNodeLine(StringBuilder b, Domain domain) { for (Long hostId : domain.ids) { setValue(hostId.longValue(), domain.id); } + if (lastOutputDomain != null && lastOutputDomain.compareTo(domainName) >= 0) { + String msg = "Output domains are not strictly monotonically sorted: " + lastOutputDomain + " <> " + + domainName; + LOG.error(msg); + throw new RuntimeException(msg); + } + lastOutputDomain = domainName; } public String convertEdge(String line) { @@ -540,8 +559,7 @@ private static void showHelp() { System.err.println(" \tpublic suffix list, "); System.err.println( " \tsee https://github.com/publicsuffix/list/wiki/Format#divisions)"); - System.err - .println(" \t- host-without-www: strip the www. prefix (keep the "); + System.err.println(" \t- host-without-www: strip the www. 
prefix (keep the "); System.err.println(" \tfull host otherwise)"); System.err.println(" --multipart-suffixes-as-domains\toutput host names which are equal to multi-part"); System.err.println(" \tpublic suffixes (the suffix contains a dot) as domain"); diff --git a/src/script/host2domaingraph.sh b/src/script/host2domaingraph.sh index a339b53..ea51359 100755 --- a/src/script/host2domaingraph.sh +++ b/src/script/host2domaingraph.sh @@ -65,18 +65,30 @@ PARALLEL_SORT_THREADS=2 # 1 C locale is mandatory to keep reversed hosts of one domain or top-level domain # together in a single block: # echo -e "com.opus\ncom.opera\nco.mopus\nco.mopera" | shuf | LC_ALL=en_US.utf8 sort +# co.mopera +# com.opera +# com.opus +# co.mopus # vs. # echo -e "com.opus\ncom.opera\nco.mopus\nco.mopera" | shuf | LC_ALL=C sort +# co.mopera +# co.mopus +# com.opera +# com.opus # This requirement is met by the output of the cc-pyspark job. # -# 2 The second problem stems from the fact that a hyphen (valid in host and -# subdomain names) is sorted before the dot: +# In an older version, the input was re-sorted to try to group +# domains and their subdomains together: +# +# 2 Sorting with the C locale places a hyphen (valid in host and subdomain names) +# before a dot: # ac.gov # ac.gov.ascension # ac.gov.ascension-island # ac.gov.ascension.mail -# Unfortunately the output of the cc-pyspark job does not completely meet this -# sorting criterion. +# As a result, the domain "ac.gov.ascension" and its subdomain "ac.gov.ascension.mail" +# end up in two separate blocks of the input, even when sorting with the C locale. 
+# # The initial solution to ensure that the subdomains of "ac.gov.ascension" are not split # into two blocks, was to add an artificial dot temporarily to the end of each host # name during sorting: @@ -84,45 +96,80 @@ PARALLEL_SORT_THREADS=2 # | sort $SORTOPTS -t$'\t' -k2,2 | sed -e 's/\.$//' # The domain name "ac.gov.ascension" in the example above becomes temporarily # "ac.gov.ascension." and is now sorted after "ac.gov.ascension-island." -# -# To avoid this step (re-sorting billions of lines is expensive), the HostToDomainGraph -# class now caches potentially "missorted" candidates and processes them later together -# with the related subdomains / host names. # -# Note: The final sorting of the domain names is the same as if there would be -# a trailing dot: -# ac.gov.ascension-island -# ac.gov.ascension +# A sort order that keeps hosts/domains of a common suffix in one block can +# also be achieved if dots are replaced by commas: +# zcat vertices.txt.gz | tr . , \ +# | sort $SORTOPTS -t$'\t' -k2,2 | tr , . +# This approach is utilized by the "Sort-friendly URI Reordering Transform" (SURT), +# see the SURT entry in the glossary of the Heritrix user manual. +# +# However, the public suffix list adds a further issue, which makes it impossible +# to group domains and subdomains together by simply sorting the input: # -# 3 The public suffix list adds a further issue: there are multi-part suffixes, -# such as "co.uk" (or "uk.co" in reverse domain name notation). And the suffixes -# of a multi-part suffix can be public suffixes themselves: also "uk" is a public -# suffix. But they do not need to. For example: "no" and "os.hordaland.no" are -# in the public suffix list but "hordaland.no" is not. 
In this situation, -# adding a trailing dot does not even guarantee that all hosts of a domain under -# a public suffix is in a contiguous block: -# -# $> cat hordaland.txt -# no.hordaland -# no.hordaland-teater -# no.hordaland.os -# no.hordaland.os.bibliotek -# no.hordaland.oygarden -# no.hordalandfolkemusikklag -# -# $> cat hordaland.txt | sed 's/$/./' | LC_ALL=C sort -# no.hordaland-teater. -# no.hordaland. -# no.hordaland.os. -# no.hordaland.os.bibliotek. -# no.hordaland.oygarden. -# no.hordalandfolkemusikklag. +# 3 There are multi-part suffixes, such as "co.uk" (or "uk.co" in reverse domain name +# notation). And the suffixes of a multi-part suffix can be public suffixes themselves: +# also "uk" is a public suffix. But they do not need to be. For example: "no" and +# "os.hordaland.no" are in the public suffix list but "hordaland.no" is not. +# In this situation, adding a trailing dot does not even guarantee that all hosts of +# a domain under a public suffix are in a contiguous block: +# +# $> cat hordaland.txt +# no.hordaland +# no.hordaland-teater +# no.hordaland.os +# no.hordaland.os.bibliotek +# no.hordaland.oygarden +# no.hordalandfolkemusikklag +# +# $> cat hordaland.txt | sed 's/$/./' | LC_ALL=C sort +# no.hordaland-teater. +# no.hordaland. +# no.hordaland.os. +# no.hordaland.os.bibliotek. +# no.hordaland.oygarden. +# no.hordalandfolkemusikklag. # # The host names "no.hordaland." and "no.hordaland.oygarden." both # are under the domain ""no.hordaland" (public suffix is "no"). # -# Please see https://github.com/commoncrawl/cc-webgraph/issues/3 -# for further details. +# To address this issue (point 3), the HostToDomainGraph class now caches +# potentially "missorted" candidates and processes them later together +# with the related subdomains / host names. +# +# 4 This also addresses the fact that re-sorting billions of input lines is +# computationally expensive. 
+# +# Output sorting: +# +# 5 Ideally, the domain output should be lexicographically sorted +# as well. This is a requirement to store the map of node names and IDs +# in an "immutable external prefix map" (IEPM). +# If a trailing dot is added and then removed (and no cache is used), the +# output sorting would consequently be the same as if there were a trailing dot: +# ac.gov.ascension-island. +# ac.gov.ascension. +# respectively (after removing the trailing dot) +# ac.gov.ascension-island +# ac.gov.ascension +# +# The required ASCII sorting is: +# ac.gov.ascension +# ac.gov.ascension-island +# +# We cannot re-sort the output because this would also require changing +# the node IDs because the WebGraph framework expects the arc/edge input +# to be numerically sorted. And the vertices/nodes are enumerated as they +# are sorted, i.e. node IDs are line numbers starting with zero. +# +# Note: The approach to replace dots by commas ensures proper lexicographic +# sorting even if the replacement is inverted. However, it does not guarantee +# that all domains of one suffix are in a contiguous block, if that suffix +# is a suffix of another suffix. See point 3. +# +# Please see https://github.com/commoncrawl/cc-webgraph/issues/3 +# and https://github.com/commoncrawl/cc-webgraph/issues/33 +# for further details. # export LC_ALL=C diff --git a/src/test/java/org/commoncrawl/webgraph/TestHostToDomainGraph.java b/src/test/java/org/commoncrawl/webgraph/TestHostToDomainGraph.java index 147b706..35bca6c 100644 --- a/src/test/java/org/commoncrawl/webgraph/TestHostToDomainGraph.java +++ b/src/test/java/org/commoncrawl/webgraph/TestHostToDomainGraph.java @@ -110,6 +110,44 @@ class TestHostToDomainGraph { "3\tno.hordalandfolkemusikklag\t1", // }; + /* + * Issue #33: domain output is not sorted if a domain name that is a string suffix of + * a public suffix appears only after the longer suffix. 
+ */ + String[] hostGraphDomainInSuffixA = { // + "0\tno.hedland", // + "1\tno.hedmark-folkemusikklag", // + "2\tno.hedmark-trafikk", // + "3\tno.hedmark.m", // + "4\tno.hedmark.os.www", // + "5\tno.hedmark.www", // + "6\tno.hedmarktrafikk", // + }; + String[] hostGraphDomainInSuffixB = { // + "0\tno.hedland", // + "1\tno.hedmark-folkemusikklag", // + "2\tno.hedmark-trafikk", // + "3\tno.hedmark.os.www", // + "4\tno.hedmark.www", // + "5\tno.hedmarktrafikk", // + }; + String[] domainGraphDomainInSuffixA = { // + "0\tno.hedland\t1", // + "1\tno.hedmark\t2", // + "2\tno.hedmark-folkemusikklag\t1", // + "3\tno.hedmark-trafikk\t1", // + "4\tno.hedmark.os.www\t1", // + "5\tno.hedmarktrafikk\t1", // + }; + String[] domainGraphDomainInSuffixB = { // + "0\tno.hedland\t1", // + "1\tno.hedmark\t1", // + "2\tno.hedmark-folkemusikklag\t1", // + "3\tno.hedmark-trafikk\t1", // + "4\tno.hedmark.os.www\t1", // + "5\tno.hedmarktrafikk\t1", // + }; + /** * forgot.his.name is in the "private section" of the public suffix * list, while name is in the ICANN section, see @@ -194,7 +232,7 @@ private long[] getNodeIDs(String[] graph) { } /** - * test whether node names are properly sorted and IDs are correctly assigned + * Test whether node names are properly sorted and IDs are correctly assigned * (sequentially, strictly monotonically increasing, no gaps) */ void testSorted(String[] graph) { @@ -276,6 +314,26 @@ void testConvertNodesHyphenatedDomainsIncludingMultiPartSuffixes() { convert(converter, hostGraphHyphenatedDomains)); } + @Test + void testConvertNodesEnsureSortedOutputA() { + testSorted(hostGraphDomainInSuffixA); + testSorted(domainGraphDomainInSuffixA); + converter.doCount(true); + String[] output = convert(converter, hostGraphDomainInSuffixA); + testSorted(output); + assertArrayEquals(domainGraphDomainInSuffixA, output); + } + + @Test + void testConvertNodesEnsureSortedOutputB() { + testSorted(hostGraphDomainInSuffixB); + testSorted(domainGraphDomainInSuffixB); + 
converter.doCount(true); + String[] output = convert(converter, hostGraphDomainInSuffixB); + testSorted(output); + assertArrayEquals(domainGraphDomainInSuffixB, output); + } + @Test void testConvertPrivateDomain() { // verify sorting of input and expected output