Enforce http.time.limit in protocol-okhttp as a timeout for the entire call.

* HttpBase.setConf: warn when http.time.limit (seconds) is shorter than
  http.timeout (milliseconds). The comparison is done in long arithmetic so
  that a large http.time.limit cannot overflow int and trigger a false warning.
* HttpBase: pass the exception object to the logger instead of a stringified
  stack trace; log units together with the timeout and content limits.
* OkHttp.setConf: set the call timeout from http.time.limit if non-negative.
* OkHttpResponse.toByteArray: mark partially fetched content as truncated by
  TIME if the fetch failed with an InterruptedIOException, else by DISCONNECT.

diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index caa3f861ea..63b9224b0d 100755
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -223,6 +223,12 @@ public void setConf(Configuration conf) {
     this.timeout = conf.getInt("http.timeout", 10000);
     this.maxContent = conf.getInt("http.content.limit", 1024 * 1024);
     this.maxDuration = conf.getInt("http.time.limit", -1);
+    if (maxDuration >= 0 && (maxDuration * 1000L) < timeout) {
+      LOG.warn(
+          "The configuration property http.time.limit ({} seconds) is less than http.timeout ({} ms), "
+              + "the entire request will time out before individual reads are timed out.",
+          maxDuration, timeout);
+    }
     this.partialAsTruncated = conf.getBoolean("http.partial.truncated", false);
     this.userAgent = getAgentString(conf.get("http.agent.name"),
         conf.get("http.agent.version"), conf.get("http.agent.description"),
@@ -272,8 +278,8 @@ public void setConf(Configuration conf) {
         }
 
       } catch (Exception e) {
-        this.logger.warn("Failed to read http.agent.rotate.file {}: {}", agentsFile,
-            StringUtils.stringifyException(e));
+        this.logger.warn("Failed to read http.agent.rotate.file {}:",
+            agentsFile, e);
         this.userAgentNames = null;
       } finally {
         if (br != null) {
@@ -314,8 +320,8 @@ public void setConf(Configuration conf) {
           }
         }
       } catch (Exception e) {
-        this.logger.warn("Failed to read http.agent.host.cookie.file {}: {}",
-            cookieFile, StringUtils.stringifyException(e));
+        this.logger.warn("Failed to read http.agent.host.cookie.file {}:",
+            cookieFile, e);
         this.hostCookies = null;
       } finally {
         if (br != null) {
@@ -614,8 +620,9 @@ protected void logConf() {
     this.logger.info("http.proxy.host = {}", this.proxyHost);
     this.logger.info("http.proxy.port = {}", this.proxyPort);
     this.logger.info("http.proxy.exception.list = {}", this.useProxy);
-    this.logger.info("http.timeout = {}", this.timeout);
-    this.logger.info("http.content.limit = {}", this.maxContent);
+    this.logger.info("http.timeout = {} ms", this.timeout);
+    this.logger.info("http.time.limit = {} seconds", this.maxDuration);
+    this.logger.info("http.content.limit = {} bytes", this.maxContent);
     this.logger.info("http.agent = {}", this.userAgent);
     this.logger.info("http.accept.language = {}", this.acceptLanguage);
     this.logger.info("http.accept = {}", this.accept);
diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
index a9d2b14d42..b47142cb24 100644
--- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
+++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
@@ -119,6 +119,11 @@ public void setConf(Configuration conf) {
         .writeTimeout(this.timeout, TimeUnit.MILLISECONDS)
         .readTimeout(this.timeout, TimeUnit.MILLISECONDS);
 
+    if (this.maxDuration >= 0) {
+      // timeout for the entire request
+      builder.callTimeout(this.maxDuration, TimeUnit.SECONDS);
+    }
+
     if (!this.tlsCheckCertificate) {
       try {
         SSLContext trustAllSslContext = SSLContext.getInstance("TLS");
diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
index 605c03390f..9aa1526157 100644
--- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
+++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
@@ -17,6 +17,7 @@
 package org.apache.nutch.protocol.okhttp;
 
 import java.io.IOException;
+import java.io.InterruptedIOException;
 import java.lang.invoke.MethodHandles;
 import java.net.URL;
 import java.util.Base64;
@@ -179,7 +180,13 @@ private final byte[] toByteArray(final ResponseBody responseBody,
       } catch (IOException e) {
         if (partialAsTruncated && source.getBuffer().size() > 0) {
           // treat already fetched content as truncated
-          truncated.setReason(TruncatedContentReason.DISCONNECT);
+          if (e instanceof InterruptedIOException) {
+            // thrown by OkHttp if the call timeout is hit; note that
+            // SocketTimeoutException is a subclass and matches as well
+            truncated.setReason(TruncatedContentReason.TIME);
+          } else {
+            truncated.setReason(TruncatedContentReason.DISCONNECT);
+          }
           LOG.info("Truncated content for {}, partial fetch caused by:",
               this.url, e);
         } else {