@@ -68,6 +68,8 @@ public class HostToDomainGraph {
6868
6969 protected boolean countHosts = false ;
7070 protected boolean privateDomains = false ;
71+
72+ protected boolean stripWww = false ;
7173 protected boolean includeMultiPartSuffixes = false ;
7274
7375 protected long maxSize ;
@@ -84,6 +86,13 @@ public class HostToDomainGraph {
8486
8587 private static Pattern SPLIT_HOST_PATTERN = Pattern .compile ("\\ ." );
8688
89+ public final static String AGGREGATION_HOST_WITHOUT_WWW = "host-without-www" ;
90+ public final static String AGGREGATION_PRIVATE_DOMAIN = "private-domain" ;
91+ public final static String AGGREGATION_REGISTERED_DOMAIN = "registered-domain" ;
92+
93+ private final static List <String > ALLOWED_AGGREGATION_PARAMS = java .util .Arrays
94+ .asList (AGGREGATION_REGISTERED_DOMAIN , AGGREGATION_PRIVATE_DOMAIN , AGGREGATION_HOST_WITHOUT_WWW );
95+
8796 private Consumer <? super String > reporterInputNodes = (String line ) -> {
8897 if ((numInputLinesNodes % 500000 ) != 0 || numInputLinesNodes == 0 ) {
8998 return ;
@@ -281,6 +290,13 @@ public void multiPartSuffixesAsDomains(boolean include) {
281290 this .includeMultiPartSuffixes = include ;
282291 }
283292
293+ /**
294+ * @param stripWww if true the www. prefix is stripped
295+ */
296+ public void setStripWww (boolean stripWww ) {
297+ this .stripWww = stripWww ;
298+ }
299+
284300 /**
285301 * Reverse host name, eg. <code>www.example.com</code> is reversed to
286302 * <code>com.example.www</code>. Can also be used to "unreverse" a reversed host
@@ -327,13 +343,23 @@ public String convertNode(String line) {
327343 }
328344 lastRevHost = revHost ;
329345 String host = reverseHost (revHost );
330- String domain = EffectiveTldFinder . getAssignedDomain ( host , true , ! privateDomains ) ;
346+ String domain = null ;
331347 StringBuilder sb = new StringBuilder ();
332- if (domain == null && includeMultiPartSuffixes ) {
333- if (EffectiveTldFinder .getEffectiveTLDs ().containsKey (host ) && host .indexOf ('.' ) != -1 ) {
334- LOG .info ("Accepting public suffix (containing dot) as domain: {}" , host );
348+ if (this .stripWww ) {
349+ if (host .startsWith ("www." ) && host .indexOf ('.' , 4 ) != -1 ) {
350+ // strip leading 'www' to reduce number of "duplicate" hosts,
351+ // but leave at least 2 trailing parts (www.com is a valid domain)
352+ host = host .substring (4 );
335353 }
336354 domain = host ;
355+ } else {
356+ domain = EffectiveTldFinder .getAssignedDomain (host , true , !privateDomains );
357+ if (domain == null && includeMultiPartSuffixes ) {
358+ if (EffectiveTldFinder .getEffectiveTLDs ().containsKey (host ) && host .indexOf ('.' ) != -1 ) {
359+ LOG .info ("Accepting public suffix (containing dot) as domain: {}" , host );
360+ }
361+ domain = host ;
362+ }
337363 }
338364 if (domain == null ) {
339365 LOG .warn ("No domain for host: {}" , host );
@@ -499,9 +525,24 @@ private static void showHelp() {
499525 System .err .println ("Options:" );
500526 System .err .println (" -h\t (also -? or --help) show usage message and exit" );
501527 System .err .println (" -c\t count hosts per domain (additional column in <nodes_out>" );
502- System .err .println (" --private-domains\t convert to private domains (include suffixes from the" );
528+ System .err .println (" --private-domains\t (deprecated - use --aggregation-level)" );
529+ System .err .println (" \t convert to private domains (include suffixes from the" );
503530 System .err .println (" \t PRIVATE domains subdivision of the public suffix list," );
504- System .err .println (" \t see https://github.com/publicsuffix/list/wiki/Format#divisions" );
531+ System .err .println (" \t see https://github.com/publicsuffix/list/wiki/Format#divisions)" );
532+ System .err .println (" --aggregation-level <level>\t define the strategy on which hosts are folded to domains." );
533+ System .err
534+ .println (" \t <level> values: registered-domain (default), private-domain, " );
535+ System .err .println (" \t host-without-www. " );
536+ System .err .println (" \t - registered-domain: convert only the registered domains " );
537+ System .err .println (" \t - private-domain: convert to private domains " );
538+ System .err .println (
539+ " \t (include suffixes from the PRIVATE domains subdivision of the " );
540+ System .err .println (" \t public suffix list, " );
541+ System .err .println (
542+ " \t see https://github.com/publicsuffix/list/wiki/Format#divisions)" );
543+ System .err
544+ .println (" \t - host-without-www: strip the www. prefix (keep the " );
545+ System .err .println (" \t full host otherwise)" );
505546 System .err .println (" --multipart-suffixes-as-domains\t output host names which are equal to multi-part" );
506547 System .err .println (" \t public suffixes (the suffix contains a dot) as domain" );
507548 System .err .println (" \t names, eg. `gov.uk', `freight.aero' or `altoadige.it'." );
@@ -512,6 +553,8 @@ public static void main(String[] args) {
512553 boolean countHosts = false ;
513554 boolean includeMultiPartSuffixes = false ;
514555 boolean privateDomains = false ;
556+ String aggregationLevel = null ;
557+ boolean stripWww = false ;
515558 int argpos = 0 ;
516559 while (argpos < args .length && args [argpos ].startsWith ("-" )) {
517560 switch (args [argpos ]) {
@@ -528,9 +571,28 @@ public static void main(String[] args) {
528571 includeMultiPartSuffixes = true ;
529572 break ;
530573 case "--private-domains" :
531- case "--private" : // back-ward compatibility
574+ case "--private" : // back-ward compatibility (but deprecated in favour of --aggregation-level)
575+ LOG .warn (
576+ "The parameter --private / --private-domains is deprecated, in favour of --aggregation-level with value private-domain" );
532577 privateDomains = true ;
533578 break ;
579+ case "--aggregation-level" :
580+ if ((argpos + 1 ) >= args .length ) {
581+ LOG .error ("Missing value for option " + args [argpos ]);
582+ showHelp ();
583+ System .exit (1 );
584+ }
585+ String value = args [argpos + 1 ];
586+
587+ if (!ALLOWED_AGGREGATION_PARAMS .contains (value )) {
588+ LOG .error ("Unknown value for option " + args [argpos ] + ": " + value );
589+ showHelp ();
590+ System .exit (1 );
591+ } else {
592+ aggregationLevel = value ;
593+ }
594+ argpos ++;
595+ break ;
534596 default :
535597 System .err .println ("Unknown option " + args [argpos ]);
536598 showHelp ();
@@ -549,15 +611,37 @@ public static void main(String[] args) {
549611 LOG .error ("Invalid number: " + args [argpos + 0 ]);
550612 System .exit (1 );
551613 }
614+ if (aggregationLevel != null ) {
615+ if (privateDomains ) {
616+ LOG .error (
617+ "You cannot specify both --private or --private-domains, and --aggregation-level. "
618+ + "Prefer --aggregation-level [level] because it will supersede the other option." );
619+ System .exit (1 );
620+ } else {
621+ switch (aggregationLevel ) {
622+ case AGGREGATION_REGISTERED_DOMAIN :
623+ break ;
624+ case AGGREGATION_PRIVATE_DOMAIN :
625+ privateDomains = true ;
626+ break ;
627+ case AGGREGATION_HOST_WITHOUT_WWW :
628+ stripWww = true ;
629+ break ;
630+ }
631+ }
632+ }
633+
552634 HostToDomainGraph converter ;
553635 if (maxSize <= Arrays .MAX_ARRAY_SIZE ) {
554636 converter = new HostToDomainGraph ((int ) maxSize );
555637 } else {
556638 converter = new HostToDomainGraphBig (maxSize );
557639 }
640+
558641 converter .doCount (countHosts );
559642 converter .multiPartSuffixesAsDomains (includeMultiPartSuffixes );
560643 converter .doPrivateDomains (privateDomains );
644+ converter .setStripWww (stripWww );
561645 converter .reportConfig ();
562646 String nodesIn = args [argpos + 1 ];
563647 String nodesOut = args [argpos + 2 ];
0 commit comments