From b568cc9d837eecf80f2d121e1d5c95004435e92e Mon Sep 17 00:00:00 2001 From: cube Date: Sun, 29 Sep 2024 12:21:47 +0200 Subject: [PATCH 01/10] [NUTCH-2856] Implement a protocol-smb plugin based on hierynomus/smbj Draft version of a protocol-smb plugin. Lots of todo comments still, but it seems to work. --- .gitignore | 4 + conf/log4j2.xml | 3 +- runNutch.sh | 32 ++ src/plugin/build.xml | 1 + src/plugin/protocol-smb/build.xml | 22 ++ src/plugin/protocol-smb/ivy.xml | 47 +++ src/plugin/protocol-smb/plugin.xml | 53 ++++ .../apache/nutch/protocol/smb/Handler.java | 29 ++ .../org/apache/nutch/protocol/smb/Smb.java | 292 ++++++++++++++++++ .../nutch/protocol/smb/SmbURLConnection.java | 57 ++++ 10 files changed, 539 insertions(+), 1 deletion(-) create mode 100755 runNutch.sh create mode 100755 src/plugin/protocol-smb/build.xml create mode 100755 src/plugin/protocol-smb/ivy.xml create mode 100755 src/plugin/protocol-smb/plugin.xml create mode 100644 src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/Handler.java create mode 100755 src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/Smb.java create mode 100644 src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbURLConnection.java diff --git a/.gitignore b/.gitignore index 8c521aa68e..f1af65b8b7 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,7 @@ lib/spotbugs-* ivy/dependency-check-ant/* .gradle* ivy/apache-rat-* +.vscode +crawl +urls +solr_datadir \ No newline at end of file diff --git a/conf/log4j2.xml b/conf/log4j2.xml index 9eb807b4fa..4aa7b2dd58 100644 --- a/conf/log4j2.xml +++ b/conf/log4j2.xml @@ -25,7 +25,8 @@ - + + diff --git a/runNutch.sh b/runNutch.sh new file mode 100755 index 0000000000..ca6ced22ae --- /dev/null +++ b/runNutch.sh @@ -0,0 +1,32 @@ +#/bin/bash +echo "Will remove existing CrawlDb..." +sleep 5 +echo "Removing existing CrawlDb..." 
+rm -rf crawl/* + +./runtime/local/bin/nutch inject crawl/crawldb urls + +while true +do + ./runtime/local/bin/nutch generate crawl/crawldb crawl/segments/ + segment=`ls crawl/segments/ | tail -1` + echo "Found segment $segment" + sleep 5 + if [ "$?" == "0" ] && [ ! -z "$segment" ] + then + ./runtime/local/bin/nutch fetch crawl/segments/$segment + if [ "$?" == "0" ] + then + sleep 5 + ./runtime/local/bin/nutch parse crawl/segments/$segment + sleep 5 + ./runtime/local/bin/nutch updatedb crawl/crawldb crawl/segments/$segment + sleep 5 + ./runtime/local/bin/nutch index crawl/crawldb crawl/segments/$segment + sleep 10 + rm -rf crawl/segments/$segment + fi + else + sleep 30 + fi +done \ No newline at end of file diff --git a/src/plugin/build.xml b/src/plugin/build.xml index 498259a950..975a35dad9 100755 --- a/src/plugin/build.xml +++ b/src/plugin/build.xml @@ -78,6 +78,7 @@ + diff --git a/src/plugin/protocol-smb/build.xml b/src/plugin/protocol-smb/build.xml new file mode 100755 index 0000000000..54e6d24059 --- /dev/null +++ b/src/plugin/protocol-smb/build.xml @@ -0,0 +1,22 @@ + + + + + + + diff --git a/src/plugin/protocol-smb/ivy.xml b/src/plugin/protocol-smb/ivy.xml new file mode 100755 index 0000000000..9fd708aad7 --- /dev/null +++ b/src/plugin/protocol-smb/ivy.xml @@ -0,0 +1,47 @@ + + + + + + + + + Apache Nutch + + + + + + + + + + + + + + + + + + + + + diff --git a/src/plugin/protocol-smb/plugin.xml b/src/plugin/protocol-smb/plugin.xml new file mode 100755 index 0000000000..39df2e14c9 --- /dev/null +++ b/src/plugin/protocol-smb/plugin.xml @@ -0,0 +1,53 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/Handler.java b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/Handler.java new file mode 100644 index 0000000000..7c349c5c05 --- /dev/null +++ b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/Handler.java @@ -0,0 +1,29 @@ +/** + * Licensed to the 
Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.smb; + +import java.net.URL; +import java.net.URLConnection; +import java.net.URLStreamHandler; + +public class Handler extends URLStreamHandler { + + @Override + protected URLConnection openConnection(URL u) { + return new SmbURLConnection(u); + } +} diff --git a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/Smb.java b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/Smb.java new file mode 100755 index 0000000000..359315bfb8 --- /dev/null +++ b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/Smb.java @@ -0,0 +1,292 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.smb; + +import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.EnumSet; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.HttpDateFormat; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.protocol.ProtocolStatus; +import org.apache.nutch.protocol.RobotRulesParser; +import com.hierynomus.msdtyp.AccessMask; +import com.hierynomus.msfscc.FileAttributes; +import com.hierynomus.msfscc.fileinformation.FileAllInformation; +import com.hierynomus.msfscc.fileinformation.FileIdBothDirectoryInformation; +import com.hierynomus.mssmb2.SMB2CreateDisposition; +import com.hierynomus.mssmb2.SMB2CreateOptions; +import com.hierynomus.mssmb2.SMB2ShareAccess; +import com.hierynomus.smbj.auth.AuthenticationContext; +import com.hierynomus.smbj.connection.Connection; +import com.hierynomus.smbj.session.Session; +import com.hierynomus.smbj.share.DiskShare; +import com.hierynomus.smbj.share.File; +import 
com.hierynomus.smbj.SMBClient; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import crawlercommons.robots.BaseRobotRules; + +public class Smb implements Protocol { + protected static final Logger LOG = LoggerFactory.getLogger(Smb.class); + + private Configuration conf; + + private String user; + private String password; + private String domain; + private int contentLimit; + private Set ignoreFiles; + + public Smb() { + // todo: files that should be skipped could be configurable. + this.ignoreFiles = new HashSet<>(); + ignoreFiles.add("."); + ignoreFiles.add(".."); + ignoreFiles.add(".svn"); + ignoreFiles.add(".git"); + } + + @Override + public Configuration getConf() { + LOG.debug("getConf()"); + return this.conf; + } + + @Override + public void setConf(Configuration conf) { + this.conf = conf; + + // todo: is it possible to use configuration "per server" or "per share"? + user = conf.getTrimmed("smb.user"); + if (user == null || user.isEmpty()) { + throw new IllegalArgumentException("Config parameter 'smb.user' not set."); + } + password = conf.getTrimmed("smb.password"); + if (password == null || password.isEmpty()) { + throw new IllegalArgumentException("Config parameter 'smb.password' not set."); + } + domain = conf.getTrimmed("smb.domain"); + contentLimit = conf.getInt("smb.content.limit", Integer.MAX_VALUE); + } + + /** + * list directory. + * + * @return some HTML string + */ + private String getDirectoryContent(DiskShare share, String shareName, String path) throws UnsupportedEncodingException { + StringBuffer sb = new StringBuffer(); + sb.append(""); + sb.append("Index of ").append("/").append(shareName).append(path).append(""); + sb.append(""); + sb.append("

Index of ").append("/").append(shareName).append(path).append("

"); + sb.append("
");
+      for (FileIdBothDirectoryInformation f : share.list(path)) {
+        if (ignoreFiles.contains(f.getFileName())) {
+          LOG.warn("File skipped: " + f.getFileName());
+          continue;
+        }
+        boolean isDir = share.folderExists(path + "/" + f.getFileName());
+
+        sb.append("").append(f.getFileName());
+        if (isDir) {
+          sb.append("/");
+        }
+        sb.append("\t").append(f.getLastWriteTime()).append("\n");
+      }
+      sb.append("
"); + sb.append(""); + + return sb.toString(); + } + + private static final char[] HEX_ARRAY = "0123456789ABCDEF".toCharArray(); + + /** + * Get the {@link ProtocolOutput} for a given url and crawldatum. + * + * @param url canonical url + * @param datum associated {@link org.apache.nutch.crawl.CrawlDatum} + * @return the {@link ProtocolOutput} + * @see https://github.com/apache/nutch/blob/master/src/java/org/apache/nutch/crawl/CrawlDatum.java + */ + @Override + public ProtocolOutput getProtocolOutput(Text urlstr, CrawlDatum datum) { + LOG.warn("getProtocolOutput({}, {})", urlstr, datum); + + + + try { + String u = java.net.URLDecoder.decode(urlstr.toString(), StandardCharsets.UTF_8.name()); + u = u.split("://")[1]; + LOG.warn("u={}", u); + String[] components = u.split("[:/]", 2); + String hostname = components[0]; + String shareAndPath = components[1]; + LOG.warn("hostname={}", hostname); + LOG.warn("shareAndPath={}", shareAndPath); + components = shareAndPath.split("/", 2); + String shareName = components[0]; + String path = components.length>1 ? "/" + components[1]: "/"; + LOG.warn("share={}", shareName); + LOG.warn("path={}", path); + + // todo: we construct and destruct the connection for each and every URL. Can connection pools improve? 
+ SMBClient client = new SMBClient(); + try(Connection connection = client.connect(hostname)) { + + AuthenticationContext ac = new AuthenticationContext(user, password.toCharArray(), domain); + Session session = connection.authenticate(ac); + + // Connect to Share + try (DiskShare share = (DiskShare) session.connectShare(shareName)) { + + // now get the content + if (share.folderExists(path)) { + String c = getDirectoryContent(share, shareName, path); + + String base = urlstr.toString(); + if (base.endsWith("/")) { + base = base + "."; + } + if (!base.endsWith("/.")) { + base = base + "/."; + } + + LOG.warn("base={}", base); + LOG.warn("directory={}", c); + + return new ProtocolOutput( + new Content(base, base, c.getBytes(), "text/html", new Metadata(), getConf()), + ProtocolStatus.STATUS_SUCCESS + ); + } else if (share.fileExists(path)) { + // todo: how can we store this, and maybe more metadata? + Metadata metadata = new Metadata(); + metadata.set(Metadata.CONTENT_TYPE, "application/octet-stream"); + + FileAllInformation fileInfo = share.getFileInformation(path); + File file = share.openFile(path, EnumSet.of(AccessMask.GENERIC_READ), null, SMB2ShareAccess.ALL, SMB2CreateDisposition.FILE_OPEN, null); + + InputStream fileIn = file.getInputStream(); + byte[] bytes = null; + long fileSize = fileInfo.getStandardInformation().getEndOfFile(); + long fetchSize = fileSize; + metadata.add("fileSize", String.valueOf(fileSize)); + + // todo: we run into issues if the file is bigger than 2 GB. I made the limit configurable + // but e.g. zip can no longer be evaluated if too big. + if (fetchSize > contentLimit) { + LOG.warn("trunkating {}", urlstr); + fetchSize = contentLimit; + + // todo: this metadata seems to be not available for the indexer. 
However it might be useful to know the content + // discovery is incomplete + metadata.add("truncated", String.valueOf(fetchSize)); + } + + bytes = IOUtils.toByteArray(fileIn, fetchSize); // read inputstream into byte array + + LOG.warn("retrieved {} bytes", bytes.length); + + StringBuilder sb = new StringBuilder(); + for (int i=0; i>>4]).append(HEX_ARRAY[b & 0xF]); + } + LOG.warn("retrieved {} bytes starting with {}", bytes.length, sb.toString()); + LOG.warn("metadata={}", metadata); + + // create content and return result + String base = urlstr.toString(); + return new ProtocolOutput( + new Content(base, base, bytes, "application/octet-stream", metadata, getConf()), + ProtocolStatus.STATUS_SUCCESS + ); + } else { + // communicate error + String message = "File not found: " + urlstr; + LOG.warn(message); + String base = urlstr.toString(); + return new ProtocolOutput( + new Content(base, base, message.getBytes(), "text/plain", new Metadata(), getConf()), + ProtocolStatus.STATUS_NOTFOUND + ); + } + + } + + } catch (Exception e) { + LOG.error("Could not establish session", e); + + // todo: we can communicate the reason for error as ProtocolStatus + } + + throw new UnsupportedOperationException("neither directory nor file: " + urlstr); + } catch(Exception e) { + LOG.error("Could not get protocol output for " + urlstr, e); + return new ProtocolOutput(null, new ProtocolStatus(e)); + } + } + + /** + * Retrieve robot rules applicable for this URL. + * + * @param url + * URL to check + * @param datum + * page datum + * @param robotsTxtContent + * container to store responses when fetching the robots.txt file for + * debugging or archival purposes. Instead of a robots.txt file, it + * may include redirects or an error page (404, etc.). Response + * {@link Content} is appended to the passed list. If null is passed + * nothing is stored. 
+ * @return robot rules (specific for this URL or default), never null + */ + @Override + public BaseRobotRules getRobotRules(Text url, CrawlDatum datum, + List robotsTxtContent) { + LOG.debug("getRobotRules({}, {}, {})", url, datum, robotsTxtContent); + + // todo: we should read some robots file from the smb share + return RobotRulesParser.EMPTY_RULES; + } +} diff --git a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbURLConnection.java b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbURLConnection.java new file mode 100644 index 0000000000..369af4e11e --- /dev/null +++ b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbURLConnection.java @@ -0,0 +1,57 @@ +package org.apache.nutch.protocol.smb; + +import java.io.UnsupportedEncodingException; +import java.net.URL; +import java.net.URLConnection; +import java.nio.charset.StandardCharsets; + +public class SmbURLConnection extends URLConnection { + + private String schema; + private String host; + private int port; + private String share; + private String path; + + public SmbURLConnection(URL url) throws UnsupportedEncodingException { + super(url); + + String u = java.net.URLDecoder.decode(url.toString(), StandardCharsets.UTF_8.name()); + String[] parts = u.split("://"); + schema = parts[0]; + u = parts[1]; + + parts = u.split("[:/]", 2); + host = parts[0]; + u = parts[1]; // we have share and path now + + parts = u.split("/", 2); + share = parts[0]; + + path = "/" + parts[1]; + } + + public String getSchema() { + return schema; + } + + public String getHost() { + return host; + } + + public int getPort() { + return port; + } + + public String getShare() { + return share; + } + + public String getPath() { + return path; + } + + public void connect() { + + } +} \ No newline at end of file From a5ad1da935e5a0f646eabedd93abf6c8db2973de Mon Sep 17 00:00:00 2001 From: cube Date: Sat, 5 Oct 2024 00:51:31 +0200 Subject: [PATCH 02/10] Reduce logging Improve error handling 
Rename class as requested Added license header Improve url parsing added robots.txt --- runNutch.sh | 29 +- src/java/org/apache/nutch/crawl/Injector.java | 2 +- src/plugin/protocol-smb/plugin.xml | 4 +- .../org/apache/nutch/protocol/smb/Smb.java | 292 --------------- .../smb/{Handler.java => SmbHandler.java} | 2 +- .../nutch/protocol/smb/SmbProtocol.java | 333 ++++++++++++++++++ .../nutch/protocol/smb/SmbURLConnection.java | 42 ++- 7 files changed, 395 insertions(+), 309 deletions(-) delete mode 100755 src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/Smb.java rename src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/{Handler.java => SmbHandler.java} (95%) create mode 100755 src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java diff --git a/runNutch.sh b/runNutch.sh index ca6ced22ae..8769c2134f 100755 --- a/runNutch.sh +++ b/runNutch.sh @@ -1,32 +1,57 @@ #/bin/bash + +if [ -z "$JAVA_HOME" ] +then + echo ERROR: JAVA_HOME is not set. + exit 1 +fi + + echo "Will remove existing CrawlDb..." sleep 5 echo "Removing existing CrawlDb..." 
-rm -rf crawl/* +banner "Delete DB" +rm -rf crawl/* || exit 1 +docker exec -it solr_nutch solr delete -c nutch || exit 1 +banner "Inject URLs" ./runtime/local/bin/nutch inject crawl/crawldb urls +banner "Create Solr" +cp src/plugin/indexer-solr/schema.xml solr_datadir +docker exec -it solr_nutch cp /opt/solr-9.7.0/server/solr/configsets/_default/conf/solrconfig.xml /var/solr/data/nutch +docker exec -it solr_nutch cp /opt/solr-9.7.0/server/solr/configsets/_default/conf/stopwords.txt /var/solr/data/nutch +docker exec -it solr_nutch cp /opt/solr-9.7.0/server/solr/configsets/_default/conf/protwords.txt /var/solr/data/nutch +docker exec -it solr_nutch cp /opt/solr-9.7.0/server/solr/configsets/_default/conf/synonyms.txt /var/solr/data/nutch +docker exec -it solr_nutch solr create_core -c nutch -d /var/solr/data/nutch || exit 1 + while true do + sleep 5 + banner Generate Segment ./runtime/local/bin/nutch generate crawl/crawldb crawl/segments/ segment=`ls crawl/segments/ | tail -1` echo "Found segment $segment" sleep 5 if [ "$?" == "0" ] && [ ! -z "$segment" ] then + banner "Fetch" ./runtime/local/bin/nutch fetch crawl/segments/$segment if [ "$?" 
== "0" ] then sleep 5 + banner "Parse" ./runtime/local/bin/nutch parse crawl/segments/$segment sleep 5 + banner UpdateDB ./runtime/local/bin/nutch updatedb crawl/crawldb crawl/segments/$segment sleep 5 + banner Index ./runtime/local/bin/nutch index crawl/crawldb crawl/segments/$segment sleep 10 rm -rf crawl/segments/$segment fi else - sleep 30 + exit 5 fi done \ No newline at end of file diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java index 0d3740eb44..f1258daab7 100644 --- a/src/java/org/apache/nutch/crawl/Injector.java +++ b/src/java/org/apache/nutch/crawl/Injector.java @@ -158,7 +158,7 @@ private String filterNormalize(String url) { if (filters != null) url = filters.filter(url); // filter the url } catch (Exception e) { - LOG.warn("Skipping " + url + ":" + e); + LOG.warn("Skipping {}", url, e); url = null; } } diff --git a/src/plugin/protocol-smb/plugin.xml b/src/plugin/protocol-smb/plugin.xml index 39df2e14c9..420ff89d9c 100755 --- a/src/plugin/protocol-smb/plugin.xml +++ b/src/plugin/protocol-smb/plugin.xml @@ -43,9 +43,9 @@ point="org.apache.nutch.protocol.Protocol"> + class="org.apache.nutch.protocol.smb.SmbProtocol"> - + diff --git a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/Smb.java b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/Smb.java deleted file mode 100755 index 359315bfb8..0000000000 --- a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/Smb.java +++ /dev/null @@ -1,292 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.protocol.smb; - -import java.io.IOException; -import java.io.InputStream; -import java.io.UnsupportedEncodingException; -import java.net.MalformedURLException; -import java.net.URI; -import java.net.URL; -import java.nio.charset.StandardCharsets; -import java.util.Arrays; -import java.util.EnumSet; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import java.util.TreeSet; - -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.net.protocols.HttpDateFormat; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.ProtocolOutput; -import org.apache.nutch.protocol.ProtocolStatus; -import org.apache.nutch.protocol.RobotRulesParser; -import com.hierynomus.msdtyp.AccessMask; -import com.hierynomus.msfscc.FileAttributes; -import com.hierynomus.msfscc.fileinformation.FileAllInformation; -import com.hierynomus.msfscc.fileinformation.FileIdBothDirectoryInformation; -import com.hierynomus.mssmb2.SMB2CreateDisposition; -import com.hierynomus.mssmb2.SMB2CreateOptions; -import com.hierynomus.mssmb2.SMB2ShareAccess; -import com.hierynomus.smbj.auth.AuthenticationContext; -import com.hierynomus.smbj.connection.Connection; -import com.hierynomus.smbj.session.Session; -import com.hierynomus.smbj.share.DiskShare; -import com.hierynomus.smbj.share.File; -import 
com.hierynomus.smbj.SMBClient; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import crawlercommons.robots.BaseRobotRules; - -public class Smb implements Protocol { - protected static final Logger LOG = LoggerFactory.getLogger(Smb.class); - - private Configuration conf; - - private String user; - private String password; - private String domain; - private int contentLimit; - private Set ignoreFiles; - - public Smb() { - // todo: files that should be skipped could be configurable. - this.ignoreFiles = new HashSet<>(); - ignoreFiles.add("."); - ignoreFiles.add(".."); - ignoreFiles.add(".svn"); - ignoreFiles.add(".git"); - } - - @Override - public Configuration getConf() { - LOG.debug("getConf()"); - return this.conf; - } - - @Override - public void setConf(Configuration conf) { - this.conf = conf; - - // todo: is it possible to use configuration "per server" or "per share"? - user = conf.getTrimmed("smb.user"); - if (user == null || user.isEmpty()) { - throw new IllegalArgumentException("Config parameter 'smb.user' not set."); - } - password = conf.getTrimmed("smb.password"); - if (password == null || password.isEmpty()) { - throw new IllegalArgumentException("Config parameter 'smb.password' not set."); - } - domain = conf.getTrimmed("smb.domain"); - contentLimit = conf.getInt("smb.content.limit", Integer.MAX_VALUE); - } - - /** - * list directory. - * - * @return some HTML string - */ - private String getDirectoryContent(DiskShare share, String shareName, String path) throws UnsupportedEncodingException { - StringBuffer sb = new StringBuffer(); - sb.append(""); - sb.append("Index of ").append("/").append(shareName).append(path).append(""); - sb.append(""); - sb.append("

Index of ").append("/").append(shareName).append(path).append("

"); - sb.append("
");
-      for (FileIdBothDirectoryInformation f : share.list(path)) {
-        if (ignoreFiles.contains(f.getFileName())) {
-          LOG.warn("File skipped: " + f.getFileName());
-          continue;
-        }
-        boolean isDir = share.folderExists(path + "/" + f.getFileName());
-
-        sb.append("").append(f.getFileName());
-        if (isDir) {
-          sb.append("/");
-        }
-        sb.append("\t").append(f.getLastWriteTime()).append("\n");
-      }
-      sb.append("
"); - sb.append(""); - - return sb.toString(); - } - - private static final char[] HEX_ARRAY = "0123456789ABCDEF".toCharArray(); - - /** - * Get the {@link ProtocolOutput} for a given url and crawldatum. - * - * @param url canonical url - * @param datum associated {@link org.apache.nutch.crawl.CrawlDatum} - * @return the {@link ProtocolOutput} - * @see https://github.com/apache/nutch/blob/master/src/java/org/apache/nutch/crawl/CrawlDatum.java - */ - @Override - public ProtocolOutput getProtocolOutput(Text urlstr, CrawlDatum datum) { - LOG.warn("getProtocolOutput({}, {})", urlstr, datum); - - - - try { - String u = java.net.URLDecoder.decode(urlstr.toString(), StandardCharsets.UTF_8.name()); - u = u.split("://")[1]; - LOG.warn("u={}", u); - String[] components = u.split("[:/]", 2); - String hostname = components[0]; - String shareAndPath = components[1]; - LOG.warn("hostname={}", hostname); - LOG.warn("shareAndPath={}", shareAndPath); - components = shareAndPath.split("/", 2); - String shareName = components[0]; - String path = components.length>1 ? "/" + components[1]: "/"; - LOG.warn("share={}", shareName); - LOG.warn("path={}", path); - - // todo: we construct and destruct the connection for each and every URL. Can connection pools improve? 
- SMBClient client = new SMBClient(); - try(Connection connection = client.connect(hostname)) { - - AuthenticationContext ac = new AuthenticationContext(user, password.toCharArray(), domain); - Session session = connection.authenticate(ac); - - // Connect to Share - try (DiskShare share = (DiskShare) session.connectShare(shareName)) { - - // now get the content - if (share.folderExists(path)) { - String c = getDirectoryContent(share, shareName, path); - - String base = urlstr.toString(); - if (base.endsWith("/")) { - base = base + "."; - } - if (!base.endsWith("/.")) { - base = base + "/."; - } - - LOG.warn("base={}", base); - LOG.warn("directory={}", c); - - return new ProtocolOutput( - new Content(base, base, c.getBytes(), "text/html", new Metadata(), getConf()), - ProtocolStatus.STATUS_SUCCESS - ); - } else if (share.fileExists(path)) { - // todo: how can we store this, and maybe more metadata? - Metadata metadata = new Metadata(); - metadata.set(Metadata.CONTENT_TYPE, "application/octet-stream"); - - FileAllInformation fileInfo = share.getFileInformation(path); - File file = share.openFile(path, EnumSet.of(AccessMask.GENERIC_READ), null, SMB2ShareAccess.ALL, SMB2CreateDisposition.FILE_OPEN, null); - - InputStream fileIn = file.getInputStream(); - byte[] bytes = null; - long fileSize = fileInfo.getStandardInformation().getEndOfFile(); - long fetchSize = fileSize; - metadata.add("fileSize", String.valueOf(fileSize)); - - // todo: we run into issues if the file is bigger than 2 GB. I made the limit configurable - // but e.g. zip can no longer be evaluated if too big. - if (fetchSize > contentLimit) { - LOG.warn("trunkating {}", urlstr); - fetchSize = contentLimit; - - // todo: this metadata seems to be not available for the indexer. 
However it might be useful to know the content - // discovery is incomplete - metadata.add("truncated", String.valueOf(fetchSize)); - } - - bytes = IOUtils.toByteArray(fileIn, fetchSize); // read inputstream into byte array - - LOG.warn("retrieved {} bytes", bytes.length); - - StringBuilder sb = new StringBuilder(); - for (int i=0; i>>4]).append(HEX_ARRAY[b & 0xF]); - } - LOG.warn("retrieved {} bytes starting with {}", bytes.length, sb.toString()); - LOG.warn("metadata={}", metadata); - - // create content and return result - String base = urlstr.toString(); - return new ProtocolOutput( - new Content(base, base, bytes, "application/octet-stream", metadata, getConf()), - ProtocolStatus.STATUS_SUCCESS - ); - } else { - // communicate error - String message = "File not found: " + urlstr; - LOG.warn(message); - String base = urlstr.toString(); - return new ProtocolOutput( - new Content(base, base, message.getBytes(), "text/plain", new Metadata(), getConf()), - ProtocolStatus.STATUS_NOTFOUND - ); - } - - } - - } catch (Exception e) { - LOG.error("Could not establish session", e); - - // todo: we can communicate the reason for error as ProtocolStatus - } - - throw new UnsupportedOperationException("neither directory nor file: " + urlstr); - } catch(Exception e) { - LOG.error("Could not get protocol output for " + urlstr, e); - return new ProtocolOutput(null, new ProtocolStatus(e)); - } - } - - /** - * Retrieve robot rules applicable for this URL. - * - * @param url - * URL to check - * @param datum - * page datum - * @param robotsTxtContent - * container to store responses when fetching the robots.txt file for - * debugging or archival purposes. Instead of a robots.txt file, it - * may include redirects or an error page (404, etc.). Response - * {@link Content} is appended to the passed list. If null is passed - * nothing is stored. 
- * @return robot rules (specific for this URL or default), never null - */ - @Override - public BaseRobotRules getRobotRules(Text url, CrawlDatum datum, - List robotsTxtContent) { - LOG.debug("getRobotRules({}, {}, {})", url, datum, robotsTxtContent); - - // todo: we should read some robots file from the smb share - return RobotRulesParser.EMPTY_RULES; - } -} diff --git a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/Handler.java b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbHandler.java similarity index 95% rename from src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/Handler.java rename to src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbHandler.java index 7c349c5c05..cb7135f689 100644 --- a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/Handler.java +++ b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbHandler.java @@ -20,7 +20,7 @@ import java.net.URLConnection; import java.net.URLStreamHandler; -public class Handler extends URLStreamHandler { +public class SmbHandler extends URLStreamHandler { @Override protected URLConnection openConnection(URL u) { diff --git a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java new file mode 100755 index 0000000000..01cf73f9c9 --- /dev/null +++ b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java @@ -0,0 +1,333 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.smb; + +import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.EnumSet; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.HttpDateFormat; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.protocol.ProtocolStatus; +import org.apache.nutch.protocol.RobotRulesParser; +import com.hierynomus.msdtyp.AccessMask; +import com.hierynomus.msfscc.FileAttributes; +import com.hierynomus.msfscc.fileinformation.FileAllInformation; +import com.hierynomus.msfscc.fileinformation.FileIdBothDirectoryInformation; +import com.hierynomus.mssmb2.SMB2CreateDisposition; +import com.hierynomus.mssmb2.SMB2CreateOptions; +import com.hierynomus.mssmb2.SMB2ShareAccess; +import com.hierynomus.smbj.auth.AuthenticationContext; +import com.hierynomus.smbj.connection.Connection; +import com.hierynomus.smbj.session.Session; +import com.hierynomus.smbj.share.DiskShare; +import 
com.hierynomus.smbj.share.File; +import com.hierynomus.smbj.SMBClient; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import crawlercommons.robots.BaseRobotRules; + +public class SmbProtocol implements Protocol { + protected static final Logger LOG = LoggerFactory.getLogger(SmbProtocol.class); + + private Configuration conf; + + private String user; + private String password; + private String domain; + private int contentLimit; + private Set ignoreFiles; + + public SmbProtocol() { + // todo: files that should be skipped could be configurable. + this.ignoreFiles = new HashSet<>(); + ignoreFiles.add("."); + ignoreFiles.add(".."); + ignoreFiles.add(".svn"); + ignoreFiles.add(".git"); + } + + @Override + public Configuration getConf() { + LOG.debug("getConf()"); + return this.conf; + } + + @Override + public void setConf(Configuration conf) { + this.conf = conf; + + // todo: is it possible to use configuration "per server" or "per share"? + user = conf.getTrimmed("smb.user"); + if (user == null || user.isEmpty()) { + throw new IllegalArgumentException("Config parameter 'smb.user' not set."); + } + password = conf.getTrimmed("smb.password"); + if (password == null || password.isEmpty()) { + throw new IllegalArgumentException("Config parameter 'smb.password' not set."); + } + domain = conf.getTrimmed("smb.domain"); + contentLimit = conf.getInt("smb.content.limit", Integer.MAX_VALUE); + } + + /** + * list directory. + * + * @return some HTML string + */ + private String getDirectoryContent(DiskShare share, String shareName, String path) throws UnsupportedEncodingException { + StringBuffer sb = new StringBuffer(); + sb.append(""); + sb.append("Index of ").append("/").append(shareName).append(path).append(""); + sb.append(""); + sb.append("

Index of ").append("/").append(shareName).append(path).append("

"); + sb.append("
");
+      for (FileIdBothDirectoryInformation f : share.list(path)) {
+        if (ignoreFiles.contains(f.getFileName())) {
+          LOG.warn("File skipped: " + f.getFileName());
+          continue;
+        }
+        boolean isDir = share.folderExists(path + "/" + f.getFileName());
+
+        sb.append("").append(f.getFileName());
+        if (isDir) {
+          sb.append("/");
+        }
+        sb.append("\t").append(f.getLastWriteTime()).append("\n");
+      }
+      sb.append("
"); + sb.append(""); + + return sb.toString(); + } + + private static final char[] HEX_ARRAY = "0123456789ABCDEF".toCharArray(); + + private DiskShare getDiskShare(URL url) throws UnsupportedEncodingException, IOException { + String hostname = url.getHost(); + int port = url.getPort(); + String shareAndPath = url.getPath(); + + if (port == -1) { + port = 445; + } + String[] components = shareAndPath.split("/", 3); + String shareName = components[1]; + shareName = java.net.URLDecoder.decode(shareName, StandardCharsets.UTF_8.name()); + String path = components.length>2 ? "/" + components[2]: "/"; + path = java.net.URLDecoder.decode(path, StandardCharsets.UTF_8.name()); + + LOG.trace("hostname={}", hostname); + LOG.trace("port={}", port); + LOG.trace("shareAndPath={}", shareAndPath); + LOG.trace("share={}", shareName); + LOG.trace("path={}", path); + + // todo: we construct and destruct the connection for each and every URL. Can connection pools improve? + SMBClient client = new SMBClient(); + Connection connection = client.connect(hostname, port); + Session session = connection.authenticate( + new AuthenticationContext(user, password.toCharArray(), domain) + ); + // Connect to Share + DiskShare share = (DiskShare) session.connectShare(shareName); + return share; + } + + /** + * Splits an absolute path into share and path. + * The share is the top level directory, everything else will become the path. + * Since the whole structure can be transported via URLs, URL-decoding is also + * applied. + * + * @param url the url to parse + * @return an array consisting of [share, path] + */ + private String[] getSmbShareAndPath(URL url) throws UnsupportedEncodingException { + String shareAndPath = url.getPath(); + String[] components = shareAndPath.split("/", 3); + String shareName = components[1]; + shareName = java.net.URLDecoder.decode(shareName, StandardCharsets.UTF_8.name()); + String path = components.length>2 ? 
"/" + components[2]: "/"; + path = java.net.URLDecoder.decode(path, StandardCharsets.UTF_8.name()); + + return new String[]{shareName, path}; + } + + /** + * Get the {@link ProtocolOutput} for a given url and crawldatum. + * + * @param url canonical url + * @param datum associated {@link org.apache.nutch.crawl.CrawlDatum} + * @return the {@link ProtocolOutput} + * @see https://github.com/apache/nutch/blob/master/src/java/org/apache/nutch/crawl/CrawlDatum.java + */ + @Override + public ProtocolOutput getProtocolOutput(Text urlstr, CrawlDatum datum) { + LOG.debug("getProtocolOutput({}, {})", urlstr, datum); + + try { + URL url = new URI(urlstr.toString()).toURL(); + String[] shareAndPath = getSmbShareAndPath(url); + String shareName = shareAndPath[0]; + String path = shareAndPath[1]; + + DiskShare share = getDiskShare(url); + + // now get the content + if (share.folderExists(path)) { + String htmlContent = getDirectoryContent(share, shareName, path); + + // construct a suitable base + String base = urlstr.toString(); + if (base.endsWith("/")) { + base = base + "."; + } + if (!base.endsWith("/.")) { + base = base + "/."; + } + + LOG.trace("base={}", base); + LOG.trace("directory={}", htmlContent); + + return new ProtocolOutput( + new Content(base, base, htmlContent.getBytes(), "text/html", new Metadata(), getConf()), + ProtocolStatus.STATUS_SUCCESS + ); + } else if (share.fileExists(path)) { + // todo: how can we store this, and maybe more metadata? 
+ Metadata metadata = new Metadata(); + metadata.set(Metadata.CONTENT_TYPE, "application/octet-stream"); + + FileAllInformation fileInfo = share.getFileInformation(path); + File file = share.openFile(path, EnumSet.of(AccessMask.GENERIC_READ), null, SMB2ShareAccess.ALL, SMB2CreateDisposition.FILE_OPEN, null); + + InputStream fileIn = file.getInputStream(); + byte[] bytes = null; + long fileSize = fileInfo.getStandardInformation().getEndOfFile(); + long fetchSize = fileSize; + metadata.add("fileSize", String.valueOf(fileSize)); + + // todo: we run into issues if the file is bigger than 2 GB. I made the limit configurable + // but e.g. zip can no longer be evaluated if too big. + if (fetchSize > contentLimit) { + LOG.info("trunkating {}", urlstr); + fetchSize = contentLimit; + + // todo: this metadata seems to be not available for the indexer. However it might be useful to know the content + // discovery is incomplete + metadata.add("truncated", String.valueOf(fetchSize)); + } + + bytes = IOUtils.toByteArray(fileIn, fetchSize); // read inputstream into byte array + + LOG.trace("retrieved {} bytes", bytes.length); + + if (LOG.isTraceEnabled()) { + StringBuilder sb = new StringBuilder(); + for (int i=0; i>>4]).append(HEX_ARRAY[b & 0xF]); + } + LOG.trace("retrieved {} bytes starting with {}", bytes.length, sb.toString()); + } + LOG.trace("metadata={}", metadata); + + // create content and return result + String base = urlstr.toString(); + return new ProtocolOutput( + new Content(base, base, bytes, "application/octet-stream", metadata, getConf()), + ProtocolStatus.STATUS_SUCCESS + ); + } else { + // communicate error + String message = "File not found: " + urlstr; + LOG.info(message); + String base = urlstr.toString(); + return new ProtocolOutput( + new Content(base, base, message.getBytes(), "text/plain", new Metadata(), getConf()), + ProtocolStatus.STATUS_NOTFOUND + ); + } + + } catch(Exception e) { + LOG.error("Could not get protocol output for {}", urlstr, e); + 
return new ProtocolOutput(null, new ProtocolStatus(e)); + } + } + + /** + * Retrieve robot rules applicable for this URL. + * + * @param url + * URL to check + * @param datum + * page datum + * @param robotsTxtContent + * container to store responses when fetching the robots.txt file for + * debugging or archival purposes. Instead of a robots.txt file, it + * may include redirects or an error page (404, etc.). Response + * {@link Content} is appended to the passed list. If null is passed + * nothing is stored. + * @return robot rules (specific for this URL or default), never null + */ + @Override + public BaseRobotRules getRobotRules(Text urlstr, CrawlDatum datum, List robotsTxtContent) { + LOG.trace("getRobotRules({}, {}, {})", urlstr, datum, robotsTxtContent); + + try { + URL url = new URI(urlstr.toString()).toURL(); + DiskShare share = getDiskShare(url); + if (!share.fileExists("/robots.txt")) { + // no robots file? Then we can scan everything + LOG.debug("No robots.txt found -> crawl everything"); + return RobotRulesParser.EMPTY_RULES; + } + + // todo: we should read some robots file from the smb share + // until then we simply do nothing + LOG.info("/robots.txt found -> we do not crawl"); + return RobotRulesParser.FORBID_ALL_RULES; + } catch (Exception e) { + LOG.info("Could not get robot rules for {}", e); + return RobotRulesParser.DEFER_VISIT_RULES; + } + } +} diff --git a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbURLConnection.java b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbURLConnection.java index 369af4e11e..22170478e0 100644 --- a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbURLConnection.java +++ b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbURLConnection.java @@ -1,3 +1,19 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.nutch.protocol.smb; import java.io.UnsupportedEncodingException; @@ -13,22 +29,26 @@ public class SmbURLConnection extends URLConnection { private String share; private String path; - public SmbURLConnection(URL url) throws UnsupportedEncodingException { + public SmbURLConnection(URL url) { super(url); - String u = java.net.URLDecoder.decode(url.toString(), StandardCharsets.UTF_8.name()); - String[] parts = u.split("://"); - schema = parts[0]; - u = parts[1]; + try { + String u = java.net.URLDecoder.decode(url.toString(), StandardCharsets.UTF_8.name()); + String[] parts = u.split("://"); + schema = parts[0]; + u = parts[1]; - parts = u.split("[:/]", 2); - host = parts[0]; - u = parts[1]; // we have share and path now + parts = u.split("[:/]", 2); + host = parts[0]; + u = parts[1]; // we have share and path now - parts = u.split("/", 2); - share = parts[0]; + parts = u.split("/", 2); + share = parts[0]; - path = "/" + parts[1]; + path = "/" + parts[1]; + } catch (UnsupportedEncodingException e) { + throw new IllegalArgumentException("could not decypher given url", e); + } } public String getSchema() { From 2d9d4c2079158d1a4952d20582818957be21a470 Mon Sep 17 00:00:00 2001 From: cube Date: Mon, 7 Oct 2024 11:50:29 +0200 Subject: [PATCH 03/10] fix connection 
problems --- .../nutch/protocol/smb/SmbProtocol.java | 241 +++++++++++------- 1 file changed, 146 insertions(+), 95 deletions(-) diff --git a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java index 01cf73f9c9..4b99ae74c0 100755 --- a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java +++ b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java @@ -25,6 +25,7 @@ import java.net.URL; import java.nio.charset.StandardCharsets; import java.util.Arrays; +import java.util.Collection; import java.util.EnumSet; import java.util.HashSet; import java.util.List; @@ -55,10 +56,13 @@ import com.hierynomus.smbj.share.DiskShare; import com.hierynomus.smbj.share.File; import com.hierynomus.smbj.SMBClient; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import crawlercommons.robots.BaseRobotRules; +import crawlercommons.robots.SimpleRobotRules; +import crawlercommons.robots.SimpleRobotRulesParser; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class SmbProtocol implements Protocol { protected static final Logger LOG = LoggerFactory.getLogger(SmbProtocol.class); @@ -70,6 +74,7 @@ public class SmbProtocol implements Protocol { private String domain; private int contentLimit; private Set ignoreFiles; + private Collection agentNames; public SmbProtocol() { // todo: files that should be skipped could be configurable. @@ -90,14 +95,19 @@ public Configuration getConf() { public void setConf(Configuration conf) { this.conf = conf; + agentNames = conf.getTrimmedStringCollection("smb.agent.name"); + if (agentNames == null || agentNames.isEmpty()) { + throw new IllegalArgumentException("Config parameter 'smb.agent.name' not set or empty."); + } + // todo: is it possible to use configuration "per server" or "per share"? 
user = conf.getTrimmed("smb.user"); if (user == null || user.isEmpty()) { - throw new IllegalArgumentException("Config parameter 'smb.user' not set."); + throw new IllegalArgumentException("Config parameter 'smb.user' not set or empty."); } password = conf.getTrimmed("smb.password"); if (password == null || password.isEmpty()) { - throw new IllegalArgumentException("Config parameter 'smb.password' not set."); + throw new IllegalArgumentException("Config parameter 'smb.password' not set or empty."); } domain = conf.getTrimmed("smb.domain"); contentLimit = conf.getInt("smb.content.limit", Integer.MAX_VALUE); @@ -117,7 +127,7 @@ private String getDirectoryContent(DiskShare share, String shareName, String pat sb.append("
");
       for (FileIdBothDirectoryInformation f : share.list(path)) {
         if (ignoreFiles.contains(f.getFileName())) {
-          LOG.warn("File skipped: " + f.getFileName());
+          LOG.debug("File skipped: " + f.getFileName());
           continue;
         }
         boolean isDir = share.folderExists(path + "/" + f.getFileName());
@@ -140,7 +150,7 @@ private String getDirectoryContent(DiskShare share, String shareName, String pat
 
   private static final char[] HEX_ARRAY = "0123456789ABCDEF".toCharArray();
 
-  private DiskShare getDiskShare(URL url) throws UnsupportedEncodingException, IOException {
+  private Connection getSMBConnection(URL url) throws UnsupportedEncodingException, IOException {
     String hostname = url.getHost();
     int port = url.getPort();
     String shareAndPath = url.getPath();
@@ -163,6 +173,21 @@ private DiskShare getDiskShare(URL url) throws UnsupportedEncodingException, IOE
     // todo: we construct and destruct the connection for each and every URL. Can connection pools improve?
     SMBClient client = new SMBClient();
     Connection connection = client.connect(hostname, port);
+    return connection;
+  }
+
+  private DiskShare getDiskShare(URL url, Connection connection) throws UnsupportedEncodingException, IOException {
+    String shareAndPath = url.getPath();
+    String[] components = shareAndPath.split("/", 3);
+    String shareName = components[1];
+    shareName = java.net.URLDecoder.decode(shareName, StandardCharsets.UTF_8.name());
+    String path = components.length>2 ? "/" + components[2]: "/";
+    path = java.net.URLDecoder.decode(path, StandardCharsets.UTF_8.name());
+
+    LOG.trace("shareAndPath={}", shareAndPath);
+    LOG.trace("share={}", shareName);
+    LOG.trace("path={}", path);
+
     Session session = connection.authenticate(
       new AuthenticationContext(user, password.toCharArray(), domain)
     );
@@ -191,6 +216,58 @@ private String[] getSmbShareAndPath(URL url) throws UnsupportedEncodingException
     return new String[]{shareName, path};
   }
 
+  private Content getFileContent(String urlstr, String base, DiskShare share, String path, Metadata metadata) throws IOException {
+    FileAllInformation fileInfo = share.getFileInformation(path);
+    File file = share.openFile(path, EnumSet.of(AccessMask.GENERIC_READ), null, SMB2ShareAccess.ALL, SMB2CreateDisposition.FILE_OPEN, null);
+
+    InputStream fileIn = file.getInputStream();
+    byte[] bytes = null;
+    long fileSize = fileInfo.getStandardInformation().getEndOfFile();
+    long fetchSize = fileSize;
+    metadata.add("fileSize", String.valueOf(fileSize));
+
+    // todo: we run into issues if the file is bigger than 2 GB. I made the limit configurable
+    // but e.g. zip can no longer be evaluated if too big.
+    if (fetchSize > contentLimit) {
+      LOG.info("truncating {}", urlstr);
+      fetchSize = contentLimit;
+
+      // todo: this metadata seems to be not available for the indexer. However it might be useful to know the content
+      // discovery is incomplete
+      metadata.add("truncated", String.valueOf(fetchSize));
+    }
+
+    bytes = IOUtils.toByteArray(fileIn, fetchSize); // read inputstream into byte array
+
+    LOG.trace("retrieved {} bytes", bytes.length);
+
+    if (LOG.isTraceEnabled()) {
+      StringBuilder sb = new StringBuilder();
+      for (int i=0; i>>4]).append(HEX_ARRAY[b & 0xF]);
+      }
+      LOG.trace("retrieved {} bytes starting with {}", bytes.length, sb.toString());
+    }
+    LOG.trace("metadata={}", metadata);
+
+    return new Content(urlstr, base, bytes, "application/octet-stream", metadata, getConf());
+  }
+
+  private String getBase(Text urlstr) {
+          // construct a suitable base
+          String base = urlstr.toString();
+          if (base.endsWith("/")) {
+            base = base + ".";
+          }
+          if (!base.endsWith("/.")) {
+            base = base + "/.";
+          }
+
+          LOG.trace("base={}", base);
+          return base;
+  }
+
   /**
    * Get the {@link ProtocolOutput} for a given url and crawldatum.
    * 
@@ -205,86 +282,48 @@ public ProtocolOutput getProtocolOutput(Text urlstr, CrawlDatum datum) {
 
     try {
       URL url = new URI(urlstr.toString()).toURL();
-      String[] shareAndPath = getSmbShareAndPath(url);
-      String shareName = shareAndPath[0];
-      String path = shareAndPath[1];
-
-      DiskShare share = getDiskShare(url);
-
-      // now get the content
-      if (share.folderExists(path)) {
-        String htmlContent = getDirectoryContent(share, shareName, path);
-
-        // construct a suitable base
-        String base = urlstr.toString();
-        if (base.endsWith("/")) {
-          base = base + ".";
-        }
-        if (!base.endsWith("/.")) {
-          base = base + "/.";
-        }
-
-        LOG.trace("base={}", base);
-        LOG.trace("directory={}", htmlContent);
-
-        return new ProtocolOutput(
-          new Content(base, base, htmlContent.getBytes(), "text/html", new Metadata(), getConf()), 
-            ProtocolStatus.STATUS_SUCCESS
-          );
-      } else if (share.fileExists(path)) {
-        // todo: how can we store this, and maybe more metadata?
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.CONTENT_TYPE, "application/octet-stream");
-
-        FileAllInformation fileInfo = share.getFileInformation(path);
-        File file = share.openFile(path, EnumSet.of(AccessMask.GENERIC_READ), null, SMB2ShareAccess.ALL, SMB2CreateDisposition.FILE_OPEN, null);
-
-        InputStream fileIn = file.getInputStream();
-        byte[] bytes = null;
-        long fileSize = fileInfo.getStandardInformation().getEndOfFile();
-        long fetchSize = fileSize;
-        metadata.add("fileSize", String.valueOf(fileSize));
-
-        // todo: we run into issues if the file is bigger than 2 GB. I made the limit configurable
-        // but e.g. zip can no longer be evaluated if too big.
-        if (fetchSize > contentLimit) {
-          LOG.info("trunkating {}", urlstr);
-          fetchSize = contentLimit;
-
-          // todo: this metadata seems to be not available for the indexer. However it might be useful to know the content
-          // discovery is incomplete
-          metadata.add("truncated", String.valueOf(fetchSize));
-        }
-
-        bytes = IOUtils.toByteArray(fileIn, fetchSize); // read inputstream into byte array
-
-        LOG.trace("retrieved {} bytes", bytes.length);
-
-        if (LOG.isTraceEnabled()) {
-          StringBuilder sb = new StringBuilder();
-          for (int i=0; i>>4]).append(HEX_ARRAY[b & 0xF]);
+      String[] components = getSmbShareAndPath(url);
+      String shareName = components[0];
+      String path = components[1];
+
+      try (Connection connection = getSMBConnection(url)) {
+        try (DiskShare share = getDiskShare(url, connection)) {
+
+          // now get the content
+          if (share.folderExists(path)) {
+            String htmlContent = getDirectoryContent(share, shareName, path);
+            String base = getBase(urlstr);
+            LOG.trace("directory={}", htmlContent);
+
+            return new ProtocolOutput(
+              new Content(base, base, htmlContent.getBytes(), "text/html", new Metadata(), getConf()), 
+                ProtocolStatus.STATUS_SUCCESS
+              );
+          } else if (share.fileExists(path)) {
+            // todo: how can we store this, and maybe more metadata?
+            Metadata metadata = new Metadata();
+            metadata.set(Metadata.CONTENT_TYPE, "application/octet-stream");
+
+            Content content = getFileContent(urlstr.toString(), url.toURI().resolve("..").toString(), share, path, metadata);
+
+            // create content and return result
+            String base = urlstr.toString();
+            return new ProtocolOutput(
+              content, 
+              ProtocolStatus.STATUS_SUCCESS
+            );
+
+          } else {
+            // communicate error
+            String message = "File not found: " + urlstr;
+            LOG.info(message);
+            String base = urlstr.toString();
+            return new ProtocolOutput(
+              new Content(base, base, message.getBytes(), "text/plain", new Metadata(), getConf()),
+              ProtocolStatus.STATUS_NOTFOUND
+            );
           }
-          LOG.trace("retrieved {} bytes starting with {}", bytes.length, sb.toString());
         }
-        LOG.trace("metadata={}", metadata);
-
-        // create content and return result
-        String base = urlstr.toString();
-        return new ProtocolOutput(
-          new Content(base, base, bytes, "application/octet-stream", metadata, getConf()), 
-          ProtocolStatus.STATUS_SUCCESS
-        );
-      } else {
-        // communicate error
-        String message = "File not found: " + urlstr;
-        LOG.info(message);
-        String base = urlstr.toString();
-        return new ProtocolOutput(
-          new Content(base, base, message.getBytes(), "text/plain", new Metadata(), getConf()),
-          ProtocolStatus.STATUS_NOTFOUND
-        );
       }
 
     } catch(Exception e) {
@@ -314,19 +353,31 @@ public BaseRobotRules getRobotRules(Text urlstr, CrawlDatum datum, List
 
     try {
       URL url = new URI(urlstr.toString()).toURL();
-      DiskShare share = getDiskShare(url);
-      if (!share.fileExists("/robots.txt")) {
-        // no robots file? Then we can scan everything
-        LOG.debug("No robots.txt found -> crawl everything");
-        return RobotRulesParser.EMPTY_RULES;
-      }
+      try (Connection connection = getSMBConnection(url)) {
+        try (DiskShare share = getDiskShare(url, connection)) {
+          // search for the file compliant to https://www.rfc-editor.org/rfc/rfc9309.html
+          // chapter 2.3
+          if (!share.fileExists("/robots.txt")) {
+            // no robots file? Then we can scan everything
+            LOG.info("No robots.txt found for {} -> crawl everything", urlstr);
+            return RobotRulesParser.EMPTY_RULES;
+          }
+
+          Metadata metadata = new Metadata();
+          Content content = getFileContent(urlstr.toString(), url.toURI().resolve("..").toString(), share, "/robots.txt", metadata);
 
-      // todo: we should read some robots file from the smb share
-      // until then we simply do nothing
-      LOG.info("/robots.txt found -> we do not crawl");
-      return RobotRulesParser.FORBID_ALL_RULES;
+          // make use of
+          // https://crawler-commons.github.io/crawler-commons/1.4/crawlercommons/robots/SimpleRobotRulesParser.html#parseContent(java.lang.String,byte%5B%5D,java.lang.String,java.util.Collection)
+          SimpleRobotRulesParser simpleRobotsRulesParser = new SimpleRobotRulesParser();
+          SimpleRobotRules rules =  simpleRobotsRulesParser.parseContent(urlstr.toString(), content.getContent(), content.getContentType(), agentNames);
+
+          LOG.info("robots.txt for {} found and parsed", urlstr);
+          return rules;
+        }
+      }
+      
     } catch (Exception e) {
-      LOG.info("Could not get robot rules for {}", e);
+      LOG.info("Could not get robot rules for {}", urlstr, e);
       return RobotRulesParser.DEFER_VISIT_RULES;
     }
   }

From 3dbb911beaf0b37cbae7bb3f885a58341eb46165 Mon Sep 17 00:00:00 2001
From: cube 
Date: Mon, 7 Oct 2024 13:05:21 +0200
Subject: [PATCH 04/10] better handling of 'invalid network name' (invalid
 share names)

---
 .../nutch/protocol/smb/SmbProtocol.java       | 33 ++++++++++++++++---
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java
index 4b99ae74c0..e4a693ca3b 100755
--- a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java
+++ b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java
@@ -44,9 +44,11 @@
 import org.apache.nutch.protocol.ProtocolStatus;
 import org.apache.nutch.protocol.RobotRulesParser;
 import com.hierynomus.msdtyp.AccessMask;
+import com.hierynomus.mserref.NtStatus;
 import com.hierynomus.msfscc.FileAttributes;
 import com.hierynomus.msfscc.fileinformation.FileAllInformation;
 import com.hierynomus.msfscc.fileinformation.FileIdBothDirectoryInformation;
+import com.hierynomus.mssmb2.SMBApiException;
 import com.hierynomus.mssmb2.SMB2CreateDisposition;
 import com.hierynomus.mssmb2.SMB2CreateOptions;
 import com.hierynomus.mssmb2.SMB2ShareAccess;
@@ -287,16 +289,17 @@ public ProtocolOutput getProtocolOutput(Text urlstr, CrawlDatum datum) {
       String path = components[1];
 
       try (Connection connection = getSMBConnection(url)) {
+        String base = getBase(urlstr);
+
         try (DiskShare share = getDiskShare(url, connection)) {
 
           // now get the content
           if (share.folderExists(path)) {
             String htmlContent = getDirectoryContent(share, shareName, path);
-            String base = getBase(urlstr);
             LOG.trace("directory={}", htmlContent);
 
             return new ProtocolOutput(
-              new Content(base, base, htmlContent.getBytes(), "text/html", new Metadata(), getConf()), 
+              new Content(urlstr.toString(), base, htmlContent.getBytes(), "text/html", new Metadata(), getConf()), 
                 ProtocolStatus.STATUS_SUCCESS
               );
           } else if (share.fileExists(path)) {
@@ -307,7 +310,6 @@ public ProtocolOutput getProtocolOutput(Text urlstr, CrawlDatum datum) {
             Content content = getFileContent(urlstr.toString(), url.toURI().resolve("..").toString(), share, path, metadata);
 
             // create content and return result
-            String base = urlstr.toString();
             return new ProtocolOutput(
               content, 
               ProtocolStatus.STATUS_SUCCESS
@@ -317,12 +319,23 @@ public ProtocolOutput getProtocolOutput(Text urlstr, CrawlDatum datum) {
             // communicate error
             String message = "File not found: " + urlstr;
             LOG.info(message);
-            String base = urlstr.toString();
             return new ProtocolOutput(
-              new Content(base, base, message.getBytes(), "text/plain", new Metadata(), getConf()),
+              new Content(urlstr.toString(), base, message.getBytes(), "text/plain", new Metadata(), getConf()),
               ProtocolStatus.STATUS_NOTFOUND
             );
           }
+        } catch (SMBApiException e) {
+          if (e.getStatus() == NtStatus.STATUS_BAD_NETWORK_NAME) {
+
+            // this URL makes no sense to be scanned. Make sure this URL gets evicted from the CrawlDB.
+            LOG.error("Bad network name: {}", urlstr);
+            return new ProtocolOutput(
+              new Content(urlstr.toString(), base, e.getMessage().getBytes(), "text/plain", new Metadata(), getConf()),
+              ProtocolStatus.STATUS_NOTFOUND
+            );
+          } else {
+            throw e;
+          }
         }
       }
 
@@ -373,6 +386,16 @@ public BaseRobotRules getRobotRules(Text urlstr, CrawlDatum datum, List
 
           LOG.info("robots.txt for {} found and parsed", urlstr);
           return rules;
+        } catch (SMBApiException e) {
+          if (e.getStatus() == NtStatus.STATUS_BAD_NETWORK_NAME) {
+
+            // this URL makes no sense to be scanned. But we assume 'empty rules' as no robots.txt exists and
+            // in getProtocolOutput we can make sure this URL gets evicted from the CrawlDB.
+            LOG.error("Bad network name: {} -> crawl everything", urlstr);
+            return RobotRulesParser.EMPTY_RULES;
+          } else {
+            throw e;
+          }
         }
       }
       

From 17a9103bbdbbfdf2d298de88d35928533de95478 Mon Sep 17 00:00:00 2001
From: cube 
Date: Mon, 14 Oct 2024 22:46:59 +0200
Subject: [PATCH 05/10] project cleanup

---
 .gitignore           |  3 ---
 runNutch.sh          | 57 --------------------------------------------
 src/plugin/build.xml |  2 ++
 3 files changed, 2 insertions(+), 60 deletions(-)
 delete mode 100755 runNutch.sh

diff --git a/.gitignore b/.gitignore
index f1af65b8b7..9cac3379cb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,6 +27,3 @@ ivy/dependency-check-ant/*
 .gradle*
 ivy/apache-rat-*
 .vscode
-crawl
-urls
-solr_datadir
\ No newline at end of file
diff --git a/runNutch.sh b/runNutch.sh
deleted file mode 100755
index 8769c2134f..0000000000
--- a/runNutch.sh
+++ /dev/null
@@ -1,57 +0,0 @@
-#/bin/bash
-
-if [ -z "$JAVA_HOME" ]
-then
-  echo ERROR: JAVA_HOME is not set.
-  exit 1
-fi
-
-
-echo "Will remove existing CrawlDb..."
-sleep 5
-echo "Removing existing CrawlDb..."
-banner "Delete DB"
-rm -rf crawl/* || exit 1
-docker exec -it solr_nutch solr delete -c nutch || exit 1
-
-banner "Inject URLs"
-./runtime/local/bin/nutch inject crawl/crawldb urls
-
-banner "Create Solr"
-cp src/plugin/indexer-solr/schema.xml solr_datadir
-docker exec -it solr_nutch cp /opt/solr-9.7.0/server/solr/configsets/_default/conf/solrconfig.xml /var/solr/data/nutch
-docker exec -it solr_nutch cp /opt/solr-9.7.0/server/solr/configsets/_default/conf/stopwords.txt /var/solr/data/nutch
-docker exec -it solr_nutch cp /opt/solr-9.7.0/server/solr/configsets/_default/conf/protwords.txt /var/solr/data/nutch
-docker exec -it solr_nutch cp /opt/solr-9.7.0/server/solr/configsets/_default/conf/synonyms.txt /var/solr/data/nutch
-docker exec -it solr_nutch solr create_core -c nutch -d /var/solr/data/nutch || exit 1
-
-while true
-do
-  sleep 5
-  banner Generate Segment
-  ./runtime/local/bin/nutch generate crawl/crawldb crawl/segments/
-  segment=`ls crawl/segments/ | tail -1`
-  echo "Found segment $segment"
-  sleep 5
-  if [ "$?" == "0" ] && [ ! -z "$segment" ]
-  then
-    banner "Fetch"
-    ./runtime/local/bin/nutch fetch crawl/segments/$segment
-    if [ "$?" == "0" ]
-    then
-      sleep 5
-      banner "Parse"
-      ./runtime/local/bin/nutch parse crawl/segments/$segment
-      sleep 5
-      banner UpdateDB
-      ./runtime/local/bin/nutch updatedb crawl/crawldb crawl/segments/$segment
-      sleep 5
-      banner Index
-      ./runtime/local/bin/nutch index crawl/crawldb crawl/segments/$segment
-      sleep 10
-      rm -rf crawl/segments/$segment
-    fi
-  else
-    exit 5
-  fi
-done
\ No newline at end of file
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index 975a35dad9..92430dd9f4 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -143,6 +143,7 @@
      
      
      
+     
      
      
      
@@ -227,6 +228,7 @@
     
     
     
+    
     
     
     

From c354088d54b499a3a853ce5b05cb54aeb4acab8c Mon Sep 17 00:00:00 2001
From: cube 
Date: Mon, 14 Oct 2024 23:56:19 +0200
Subject: [PATCH 06/10] remove non-required dependencies

---
 src/plugin/protocol-smb/ivy.xml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/plugin/protocol-smb/ivy.xml b/src/plugin/protocol-smb/ivy.xml
index 9fd708aad7..5a15095a4d 100755
--- a/src/plugin/protocol-smb/ivy.xml
+++ b/src/plugin/protocol-smb/ivy.xml
@@ -38,10 +38,15 @@
 
   
     
+
   
   
 

From a7dc278a2444a362c9732824af72eb1b20a2cd73 Mon Sep 17 00:00:00 2001
From: cube 
Date: Wed, 16 Oct 2024 21:18:22 +0200
Subject: [PATCH 07/10] can use different authentication per URL pattern

---
 conf/url-authentication.xml.template          |   6 +
 .../nutch/protocol/smb/SmbProtocol.java       |  47 ++++---
 .../nutch/protocol/smb/URLAuthentication.java | 120 ++++++++++++++++++
 3 files changed, 153 insertions(+), 20 deletions(-)
 create mode 100644 conf/url-authentication.xml.template
 create mode 100644 src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/URLAuthentication.java

diff --git a/conf/url-authentication.xml.template b/conf/url-authentication.xml.template
new file mode 100644
index 0000000000..84ddc51c16
--- /dev/null
+++ b/conf/url-authentication.xml.template
@@ -0,0 +1,6 @@
+
+
+
+    
+    
+
\ No newline at end of file
diff --git a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java
index e4a693ca3b..605ef66ee8 100755
--- a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java
+++ b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java
@@ -43,6 +43,8 @@
 import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.nutch.protocol.ProtocolStatus;
 import org.apache.nutch.protocol.RobotRulesParser;
+import org.apache.nutch.protocol.smb.URLAuthentication.Authentication;
+import org.xml.sax.InputSource;
 import com.hierynomus.msdtyp.AccessMask;
 import com.hierynomus.mserref.NtStatus;
 import com.hierynomus.msfscc.FileAttributes;
@@ -70,21 +72,18 @@ public class SmbProtocol implements Protocol {
   protected static final Logger LOG = LoggerFactory.getLogger(SmbProtocol.class);
 
   private Configuration conf;
+  private URLAuthentication urlAuthentication;
 
-  private String user;
-  private String password;
-  private String domain;
   private int contentLimit;
   private Set<String> ignoreFiles;
   private Collection<String> agentNames;
 
   public SmbProtocol() {
-    // todo: files that should be skipped could be configurable.
+    // Place here only files that SMB needs to ignore. Other files such as
+    // version control (.git, .svn) can be ignored via the regex url filter.
     this.ignoreFiles = new HashSet<>();
     ignoreFiles.add(".");
     ignoreFiles.add("..");
-    ignoreFiles.add(".svn");
-    ignoreFiles.add(".git");
   }
 
   @Override
@@ -102,17 +101,11 @@ public void setConf(Configuration conf) {
       throw new IllegalArgumentException("Config parameter 'smb.agent.name' not set or empty.");
     }
 
-    // todo: is it possible to use configuration "per server" or "per share"?
-    user = conf.getTrimmed("smb.user");
-    if (user == null || user.isEmpty()) {
-      throw new IllegalArgumentException("Config parameter 'smb.user' not set or empty.");
-    }
-    password = conf.getTrimmed("smb.password");
-    if (password == null || password.isEmpty()) {
-      throw new IllegalArgumentException("Config parameter 'smb.password' not set or empty.");
-    }
-    domain = conf.getTrimmed("smb.domain");
-    contentLimit = conf.getInt("smb.content.limit", Integer.MAX_VALUE);
+    // load authentication data
+    String filename = conf.get("smb.url-authentication.file", "url-authentication.xml");
+    InputStream ssInputStream = conf.getConfResourceAsInputStream(filename);
+    InputSource inputSource = new InputSource(ssInputStream);
+    urlAuthentication = URLAuthentication.loadAuthentication(inputSource);
   }
 
   /**
@@ -179,6 +172,10 @@ private Connection getSMBConnection(URL url) throws UnsupportedEncodingException
   }
 
   private DiskShare getDiskShare(URL url, Connection connection) throws UnsupportedEncodingException, IOException {
+    if (urlAuthentication == null) {
+      throw new IllegalStateException("urlAuthentication must not be null");
+    }
+
     String shareAndPath = url.getPath();
     String[] components = shareAndPath.split("/", 3);
     String shareName = components[1];
@@ -190,9 +187,19 @@ private DiskShare getDiskShare(URL url, Connection connection) throws Unsupporte
     LOG.trace("share={}", shareName);
     LOG.trace("path={}", path);
 
-    Session session = connection.authenticate(
-      new AuthenticationContext(user, password.toCharArray(), domain)
-    );
+    Authentication auth = urlAuthentication.getAuthenticationFor(url.toString());
+    Session session = null;
+    if (auth == null) {
+      LOG.trace("Anonymously connecting to {}", url);
+      session = connection.authenticate(
+        AuthenticationContext.anonymous()
+      );
+    } else {
+      LOG.trace("Authenticating with {}", auth);
+      session = connection.authenticate(
+        new AuthenticationContext(auth.getUser(), auth.getPassword(), auth.getDomain())
+      );
+    }
     // Connect to Share
     DiskShare share = (DiskShare) session.connectShare(shareName);
     return share;
diff --git a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/URLAuthentication.java b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/URLAuthentication.java
new file mode 100644
index 0000000000..2eeca7bca0
--- /dev/null
+++ b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/URLAuthentication.java
@@ -0,0 +1,120 @@
+package org.apache.nutch.protocol.smb;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+
+public class URLAuthentication {
+    protected static final Logger LOG = LoggerFactory.getLogger(URLAuthentication.class);
+
+    public static class Authentication {
+        protected static final Logger LOG = LoggerFactory.getLogger(Authentication.class);
+
+        private Pattern pattern;
+        private String user;
+        private String domain;
+        private char[] password;
+
+        protected Authentication(String pattern, String user, String domain, char[] password) {
+            LOG.debug("Authentication({}, {}, {}, *****)", pattern, user, domain);
+            if (pattern == null || pattern.isEmpty()) {
+                throw new IllegalArgumentException("pattern must not be null");
+            }
+            if (user == null || user.isEmpty()) {
+                throw new IllegalArgumentException("user must not be null");
+            }
+            if (password == null) {
+                throw new IllegalArgumentException("password must not be null");
+            }
+            this.pattern = Pattern.compile(pattern);
+            this.user = user;
+            this.domain = domain;
+            this.password = password;
+        }
+
+        public boolean matches(String url) {
+            LOG.debug("matches({})", url);
+            return pattern.matcher(url).matches();
+        }
+
+        protected Pattern getPattern() {
+            return pattern;
+        }
+
+        public String getUser() {
+            return user;
+        }
+
+        public char[] getPassword() {
+            return password;
+        }
+
+        public String getDomain() {
+            return domain;
+        }
+    }
+
+    private List<Authentication> authentications;
+    
+    public static URLAuthentication loadAuthentication(InputSource inputSource) {
+        LOG.debug("loadAuthentication(...)");
+
+        URLAuthentication result = new URLAuthentication();
+
+        try {
+            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+            DocumentBuilder builder = factory.newDocumentBuilder();
+            Document document = builder.parse(inputSource);
+            Element rootElement = document.getDocumentElement();
+            NodeList authList = rootElement.getElementsByTagName("authentication");
+            for (int i = 0; i < authList.getLength(); i++) {
+                // NOTE(review): this span was garbled in the patch text (angle-bracket content
+                // stripped); reconstructed — verify attribute names against url-authentication.xml.template
+                Element authElement = (Element) authList.item(i);
+                String pattern = authElement.getAttribute("pattern");
+                String user = authElement.getAttribute("user");
+                String domain = authElement.getAttribute("domain");
+                String password = authElement.getAttribute("password");
+                result.addAuthentication(new Authentication(pattern, user, domain, password.toCharArray()));
+            }
+        } catch (Exception e) {
+            LOG.error("Could not load URL authentication configuration", e);
+        }
+
+        return result;
+    }
+
+    private URLAuthentication() {
+        authentications = new ArrayList<>();
+    }
+
+    private void addAuthentication(Authentication auth) {
+        LOG.debug("addAuthentication({})", auth);
+        authentications.add(auth);
+    }
+
+    public Authentication getAuthenticationFor(String url) {
+        LOG.debug("getAuthenticationFor({})", url);
+
+        for (Authentication auth: authentications) {
+            if (auth.matches(url)) {
+                LOG.trace("matched pattern {}", auth.getPattern());
+                return auth;
+            } else {
+                LOG.trace("missed pattern {}", auth.getPattern());
+            }
+        }
+
+        LOG.trace("Nothing found in {} entries", authentications.size());
+        return null;
+    }
+
+}

From d2726adf73c458036192a0689c0d7b7f27557696 Mon Sep 17 00:00:00 2001
From: cube 
Date: Thu, 17 Oct 2024 20:39:09 +0200
Subject: [PATCH 08/10] added ALv2 header

---
 conf/url-authentication.xml.template             | 16 ++++++++++++++++
 .../nutch/protocol/smb/URLAuthentication.java    | 16 ++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/conf/url-authentication.xml.template b/conf/url-authentication.xml.template
index 84ddc51c16..17c13fb962 100644
--- a/conf/url-authentication.xml.template
+++ b/conf/url-authentication.xml.template
@@ -1,4 +1,20 @@
 
+
+
 
 
     
diff --git a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/URLAuthentication.java b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/URLAuthentication.java
index 2eeca7bca0..d623b28ee1 100644
--- a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/URLAuthentication.java
+++ b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/URLAuthentication.java
@@ -1,3 +1,19 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.nutch.protocol.smb;
 
 import java.util.ArrayList;

From 379ccf74284910462113585f3f7112573ffb0d11 Mon Sep 17 00:00:00 2001
From: cube 
Date: Fri, 18 Oct 2024 21:40:07 +0200
Subject: [PATCH 09/10] Cache robots.txt to cut network requests in half

---
 src/plugin/protocol-smb/ivy.xml               |   2 +
 .../nutch/protocol/smb/SmbProtocol.java       | 124 +++++++++++++-----
 2 files changed, 93 insertions(+), 33 deletions(-)

diff --git a/src/plugin/protocol-smb/ivy.xml b/src/plugin/protocol-smb/ivy.xml
index 5a15095a4d..94c0bb3bb4 100755
--- a/src/plugin/protocol-smb/ivy.xml
+++ b/src/plugin/protocol-smb/ivy.xml
@@ -47,6 +47,8 @@
     
     
 -->
+    
+
   
   
 
diff --git a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java
index 605ef66ee8..c74029eec4 100755
--- a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java
+++ b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java
@@ -64,11 +64,13 @@
 import crawlercommons.robots.BaseRobotRules;
 import crawlercommons.robots.SimpleRobotRules;
 import crawlercommons.robots.SimpleRobotRulesParser;
+import java.util.Map;
+import java.util.TreeMap;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-public class SmbProtocol implements Protocol {
+public class SmbProtocol implements Protocol, AutoCloseable {
   protected static final Logger LOG = LoggerFactory.getLogger(SmbProtocol.class);
 
   private Configuration conf;
@@ -77,6 +79,12 @@ public class SmbProtocol implements Protocol {
   private int contentLimit;
   private Set ignoreFiles;
   private Collection agentNames;
+  
+  private long scannedFolderCount;
+  private long scannedFileCount;
+  private long truncatedFileCount;
+  
+  private Map<String, BaseRobotRules> robotsCache = new TreeMap<>();
 
   public SmbProtocol() {
     // Place here only files that SMB needs to ignore. Other files such as
@@ -170,6 +178,13 @@ private Connection getSMBConnection(URL url) throws UnsupportedEncodingException
     Connection connection = client.connect(hostname, port);
     return connection;
   }
+  
+  private URL getRobotsUrl(URL url) throws URISyntaxException, MalformedURLException {
+    String shareAndPath = url.getPath();
+    String[] components = shareAndPath.split("/", 3);
+    String shareName = components[1];
+    return new URI(url.getProtocol(), url.getUserInfo(), url.getHost(), url.getPort(), "/" + shareName + "/robots.txt", null, null).toURL();
+  }
 
   private DiskShare getDiskShare(URL url, Connection connection) throws UnsupportedEncodingException, IOException {
     if (urlAuthentication == null) {
@@ -244,6 +259,7 @@ private Content getFileContent(String urlstr, String base, DiskShare share, Stri
       // todo: this metadata seems to be not available for the indexer. However it might be useful to know the content
       // discovery is incomplete
       metadata.add("truncated", String.valueOf(fetchSize));
+      truncatedFileCount++;
     }
 
     bytes = IOUtils.toByteArray(fileIn, fetchSize); // read inputstream into byte array
@@ -304,6 +320,7 @@ public ProtocolOutput getProtocolOutput(Text urlstr, CrawlDatum datum) {
           if (share.folderExists(path)) {
             String htmlContent = getDirectoryContent(share, shareName, path);
             LOG.trace("directory={}", htmlContent);
+            scannedFolderCount++;
 
             return new ProtocolOutput(
               new Content(urlstr.toString(), base, htmlContent.getBytes(), "text/html", new Metadata(), getConf()), 
@@ -315,6 +332,7 @@ public ProtocolOutput getProtocolOutput(Text urlstr, CrawlDatum datum) {
             metadata.set(Metadata.CONTENT_TYPE, "application/octet-stream");
 
             Content content = getFileContent(urlstr.toString(), url.toURI().resolve("..").toString(), share, path, metadata);
+            scannedFileCount++;
 
             // create content and return result
             return new ProtocolOutput(
@@ -371,44 +389,84 @@ public ProtocolOutput getProtocolOutput(Text urlstr, CrawlDatum datum) {
   public BaseRobotRules getRobotRules(Text urlstr, CrawlDatum datum, List robotsTxtContent) {
     LOG.trace("getRobotRules({}, {}, {})", urlstr, datum, robotsTxtContent);
 
+    URL url = null;
+    URL robotsURL = null;
     try {
-      URL url = new URI(urlstr.toString()).toURL();
-      try (Connection connection = getSMBConnection(url)) {
-        try (DiskShare share = getDiskShare(url, connection)) {
-          // search for the file compliant to https://www.rfc-editor.org/rfc/rfc9309.html
-          // chapter 2.3
-          if (!share.fileExists("/robots.txt")) {
-            // no robots file? Then we can scan everything
-            LOG.info("No robots.txt found for {} -> crawl everything", urlstr);
-            return RobotRulesParser.EMPTY_RULES;
-          }
+      // calculate new URL
+      url = new URI(urlstr.toString()).toURL();
+      robotsURL = getRobotsUrl(url);
+      LOG.debug("Robots URL = {}", robotsURL);
 
-          Metadata metadata = new Metadata();
-          Content content = getFileContent(urlstr.toString(), url.toURI().resolve("..").toString(), share, "/robots.txt", metadata);
-
-          // make use of
-          // https://crawler-commons.github.io/crawler-commons/1.4/crawlercommons/robots/SimpleRobotRulesParser.html#parseContent(java.lang.String,byte%5B%5D,java.lang.String,java.util.Collection)
-          SimpleRobotRulesParser simpleRobotsRulesParser = new SimpleRobotRulesParser();
-          SimpleRobotRules rules =  simpleRobotsRulesParser.parseContent(urlstr.toString(), content.getContent(), content.getContentType(), agentNames);
-
-          LOG.info("robots.txt for {} found and parsed", urlstr);
-          return rules;
-        } catch (SMBApiException e) {
-          if (e.getStatus() == NtStatus.STATUS_BAD_NETWORK_NAME) {
-
-            // this URL makes to sense to be scanned. But we assume 'empty rules' as no robots.txt exists and
-            // in getProtocolOutput we can make sure this URL gets evicted from the CrawlDB.
-            LOG.error("Bad network name: {} -> crawl everything", urlstr);
-            return RobotRulesParser.EMPTY_RULES;
-          } else {
-            throw e;
-          }
+      
+      // if we are running multithreaded, make only one thread at a time check
+      // the cache. It means if we miss, only one thread will go and fetch/parse
+      // robots.txt while other threads will wait
+      synchronized(robotsCache) {          
+        if (robotsCache.containsKey(robotsURL.toString())) {
+            LOG.debug("Found {} in cache", robotsURL);
+            return robotsCache.get(robotsURL.toString());
         }
-      }
+      
+        try (Connection connection = getSMBConnection(url)) {
+          try (DiskShare share = getDiskShare(url, connection)) {
+            // search for the file compliant to https://www.rfc-editor.org/rfc/rfc9309.html
+            // chapter 2.3
+            if (!share.fileExists("/robots.txt")) {
+              // no robots file? Then we can scan everything
+              LOG.info("No robots.txt found for {} -> crawl everything", robotsURL);
+              BaseRobotRules rules = RobotRulesParser.EMPTY_RULES;
+              robotsCache.put(robotsURL.toString(), rules); // cache the value - we will need it more often
+              return rules;
+            }
+
+            Metadata metadata = new Metadata();
+            Content content = getFileContent(urlstr.toString(), url.toURI().resolve("..").toString(), share, "/robots.txt", metadata);
+
+            // make use of
+            // https://crawler-commons.github.io/crawler-commons/1.4/crawlercommons/robots/SimpleRobotRulesParser.html#parseContent(java.lang.String,byte%5B%5D,java.lang.String,java.util.Collection)
+            SimpleRobotRulesParser simpleRobotsRulesParser = new SimpleRobotRulesParser();
+            SimpleRobotRules rules =  simpleRobotsRulesParser.parseContent(urlstr.toString(), content.getContent(), content.getContentType(), agentNames);
+            robotsCache.put(robotsURL.toString(), rules); // cache the value - we will need it more often
+            LOG.info("found and parsed {}", robotsURL);
+            return rules;
+          } catch (SMBApiException e) {
+            if (e.getStatus() == NtStatus.STATUS_BAD_NETWORK_NAME) {
+
+              // this URL makes no sense to be scanned. But we assume 'empty rules' as no robots.txt exists and
+              // in getProtocolOutput we can make sure this URL gets evicted from the CrawlDB.
+              LOG.error("Bad network name: {} -> crawl everything", urlstr);
+              BaseRobotRules rules = RobotRulesParser.EMPTY_RULES;
+              robotsCache.put(robotsURL.toString(), rules); // cache the value - we will need it more often
+              return rules;
+            } else {
+              throw e;
+            }
+          } // DiskShare
+        } // Connection
+      } // synchronized
       
     } catch (Exception e) {
-      LOG.info("Could not get robot rules for {}", urlstr, e);
+      LOG.info("Could not get robot rules for {} (initially {})", robotsURL, urlstr, e);
       return RobotRulesParser.DEFER_VISIT_RULES;
     }
   }
+
+  /**
+   * Closes this resource, relinquishing any underlying resources.
+   * 
+   * Some statistics are printed.
+   */
+  public void close() {
+    LOG.info("Closing plugin");
+    LOG.info("Scanned folders: {}", scannedFolderCount);
+    LOG.info("Scanned files    {}", scannedFileCount);
+    LOG.info("Truncated files  {}", truncatedFileCount);
+  }
+  
+  /**
+   * As Nutch does not close protocols let's do that before GC.
+   */
+  public void finalize() {
+      close();
+  }
 }

From 4596f71803f5f447aa462791857e6a10a2df9096 Mon Sep 17 00:00:00 2001
From: cube 
Date: Sun, 3 Nov 2024 16:36:39 +0100
Subject: [PATCH 10/10] bugfix: Instead of truncating at zero bytes by default
 grab a maximum of content

---
 .../nutch/protocol/smb/SmbProtocol.java       |  3 ++
 .../nutch/protocol/smb/TestSmbProtocol.java   | 51 +++++++++++++++++++
 2 files changed, 54 insertions(+)
 create mode 100644 src/plugin/protocol-smb/src/test/org/apache/nutch/protocol/smb/TestSmbProtocol.java

diff --git a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java
index c74029eec4..8e8262d53f 100755
--- a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java
+++ b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java
@@ -114,6 +114,9 @@ public void setConf(Configuration conf) {
     InputStream ssInputStream = conf.getConfResourceAsInputStream(filename);
     InputSource inputSource = new InputSource(ssInputStream);
     urlAuthentication = URLAuthentication.loadAuthentication(inputSource);
+
+    contentLimit = conf.getInt("smb.content.limit", Integer.MAX_VALUE-100);
+    LOG.info("Understood smb.content.limit={}", contentLimit);
   }
 
   /**
diff --git a/src/plugin/protocol-smb/src/test/org/apache/nutch/protocol/smb/TestSmbProtocol.java b/src/plugin/protocol-smb/src/test/org/apache/nutch/protocol/smb/TestSmbProtocol.java
new file mode 100644
index 0000000000..7243148251
--- /dev/null
+++ b/src/plugin/protocol-smb/src/test/org/apache/nutch/protocol/smb/TestSmbProtocol.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.smb;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class TestSmbProtocol {
+    protected static final Logger LOG = LoggerFactory.getLogger(TestSmbProtocol.class);
+    
+    @Before
+    public void setUp() {
+        LOG.warn("setUp()");
+        Assert.fail();
+    }
+  
+    @Test
+    public void testSetContentType1() {
+        LOG.warn("testSetContentType1()");
+        Assert.fail();
+    }
+  
+    @Test
+    public void testSetContentType2() {
+        LOG.warn("testSetContentType2()");
+        Assert.fail();
+    }
+  
+}