diff --git a/.codespellrc b/.codespellrc new file mode 100644 index 0000000000..377447ad9a --- /dev/null +++ b/.codespellrc @@ -0,0 +1,5 @@ +[codespell] +# JUnit 5 annotations incorrectly flagged as spelling errors +ignore-words-list = AfterAll,BeforeAll +# CHANGES.md is a historical changelog not maintained by this branch +skip = CHANGES.md diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml index 95cfcd1c25..6afc610b38 100644 --- a/.github/workflows/master-build.yml +++ b/.github/workflows/master-build.yml @@ -224,6 +224,8 @@ jobs: - 'src/plugin/**' indexer_plugins: - 'src/plugin/indexer-*/**' + protocol_plugins: + - 'src/plugin/protocol-*/**' buildconf: - 'build.xml' - 'ivy/ivy.xml' @@ -244,6 +246,10 @@ jobs: - name: test indexer integration if: ${{ steps.filter.outputs.indexer_plugins == 'true' && matrix.os == 'ubuntu-latest' }} run: ant clean test-indexer-integration -buildfile build.xml + # run protocol integration tests when protocol plugin files change (Docker required, ubuntu-latest only) + - name: test protocol integration + if: ${{ steps.filter.outputs.protocol_plugins == 'true' && matrix.os == 'ubuntu-latest' }} + run: ant clean test-protocol-integration -buildfile build.xml - name: Check for test results id: check_tests if: always() && matrix.os == 'ubuntu-latest' diff --git a/.yetus/blanks-eol.txt b/.yetus/blanks-eol.txt index 2362619874..fed8fe9e3a 100644 --- a/.yetus/blanks-eol.txt +++ b/.yetus/blanks-eol.txt @@ -1,3 +1,4 @@ # Ignore trailing blanks in Yetus-generated patch/diff and logs (not source files). # See --blanks-eol-ignore-file in the blanks plugin. ^out/ +CHANGES.md diff --git a/.yetus/blanks-tabs.txt b/.yetus/blanks-tabs.txt index 07e4fb8c9c..389b33dbcf 100644 --- a/.yetus/blanks-tabs.txt +++ b/.yetus/blanks-tabs.txt @@ -1,3 +1,4 @@ # Ignore tabs in Yetus-generated patch dir (not source files). # See --blanks-tabs-ignore-file in the blanks plugin. 
^out/ +CHANGES.md diff --git a/build.xml b/build.xml index b6fa266a4a..3f8fad7c94 100644 --- a/build.xml +++ b/build.xml @@ -99,7 +99,7 @@ - + @@ -528,6 +528,10 @@ + + + + @@ -1079,7 +1083,7 @@ projectName="Apache Nutch Spotbugs Analysis" stylesheet="fancy-hist.xsl" > - + diff --git a/conf/log4j2.xml b/conf/log4j2.xml index 713bfdc7fe..6faf4329fa 100644 --- a/conf/log4j2.xml +++ b/conf/log4j2.xml @@ -19,16 +19,16 @@ - ${sys:hadoop.log.dir:-./logs} - ${sys:hadoop.log.file:-hadoop.log} + ${sys:hadoop.log.dir:-./logs} + ${sys:hadoop.log.file:-hadoop.log} - + - + diff --git a/ivy/ivy.xml b/ivy/ivy.xml index 16d44674fa..70763adc02 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -122,9 +122,13 @@ - + + + + + diff --git a/src/plugin/build-plugin.xml b/src/plugin/build-plugin.xml index 7b07810ae2..0690863361 100755 --- a/src/plugin/build-plugin.xml +++ b/src/plugin/build-plugin.xml @@ -87,6 +87,7 @@ + @@ -277,6 +278,33 @@ Indexer integration tests failed! + + + + + + + + + + + + + + + + + + + + + + + + + Protocol integration tests failed! 
+ + diff --git a/src/plugin/build.xml b/src/plugin/build.xml index 18d00da3b3..dacc5fd616 100755 --- a/src/plugin/build.xml +++ b/src/plugin/build.xml @@ -183,6 +183,19 @@ + + + + + + + + + + + + + diff --git a/src/plugin/protocol-ftp/ivy.xml b/src/plugin/protocol-ftp/ivy.xml index 7749a873ff..5e6a0d8c72 100644 --- a/src/plugin/protocol-ftp/ivy.xml +++ b/src/plugin/protocol-ftp/ivy.xml @@ -37,7 +37,8 @@ - + + diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java index 8cf58f75e7..3570d91188 100644 --- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java +++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java @@ -183,6 +183,7 @@ public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) { } catch (Exception e) { LOG.error("Could not get protocol output for {}: {}", url, e.getMessage()); + datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY, new Text("500")); return new ProtocolOutput(null, new ProtocolStatus(e)); } } diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java index d6f7fd64a4..8796cfc0b3 100644 --- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java +++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java @@ -164,7 +164,8 @@ public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf) Ftp.LOG.info("connect to {}", addr); } - ftp.client.connect(addr); + int port = url.getPort(); + ftp.client.connect(addr, port > 0 ? 
port : FTP.DEFAULT_PORT); if (!FTPReply.isPositiveCompletion(ftp.client.getReplyCode())) { ftp.client.disconnect(); Ftp.LOG.warn("ftp.client.connect() failed: {} {}", addr, @@ -206,6 +207,11 @@ public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf) try { ftp.parser = null; String parserKey = ftp.client.getSystemName(); + // strip surrounding quotes that some servers include in SYST reply + if (parserKey.length() > 2 && parserKey.charAt(0) == '"' + && parserKey.charAt(parserKey.length() - 1) == '"') { + parserKey = parserKey.substring(1, parserKey.length() - 1); + } // some server reports as UNKNOWN Type: L8, but in fact UNIX Type: L8 if (parserKey.startsWith("UNKNOWN Type: L8")) parserKey = "UNIX Type: L8"; @@ -302,6 +308,11 @@ private void getFileAsHttpResponse(String path, long lastModified) list = new LinkedList(); ftp.client.retrieveList(path, list, ftp.maxContentLength, ftp.parser); + if (list.isEmpty()) { + this.code = 404; // file not found (server returned empty listing) + return; + } + FTPFile ftpFile = (FTPFile) list.get(0); this.headers.set(Response.CONTENT_LENGTH, Long.valueOf(ftpFile.getSize()).toString()); diff --git a/src/plugin/protocol-ftp/src/test/org/apache/nutch/protocol/ftp/FtpProtocolIT.java b/src/plugin/protocol-ftp/src/test/org/apache/nutch/protocol/ftp/FtpProtocolIT.java new file mode 100644 index 0000000000..ccd3cd1ccb --- /dev/null +++ b/src/plugin/protocol-ftp/src/test/org/apache/nutch/protocol/ftp/FtpProtocolIT.java @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.ftp; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.nio.charset.StandardCharsets; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.protocol.ProtocolPluginIntegrationTest; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockftpserver.fake.FakeFtpServer; +import org.mockftpserver.fake.UserAccount; +import org.mockftpserver.fake.filesystem.DirectoryEntry; +import org.mockftpserver.fake.filesystem.FileEntry; +import org.mockftpserver.fake.filesystem.UnixFakeFileSystem; + +/** + * Integration tests for protocol-ftp using an in-process FakeFtpServer. + * + *

FTP passive mode with Testcontainers requires that the PASV response IP + * matches the host-visible address of the container, which is not reliable + * across Docker Desktop (macOS/Windows) and Linux Docker environments. An + * in-process {@link FakeFtpServer} from MockFtpServer avoids this constraint + * while still testing the Nutch FTP client against a real FTP protocol + * implementation. + */ +public class FtpProtocolIT implements ProtocolPluginIntegrationTest { + + private static final String FTP_USER = "testuser"; + private static final String FTP_PASS = "testpass"; + private static final String FTP_HOME = "/home/testuser"; + private static final String TEST_FILE = "test.txt"; + private static final String TEST_CONTENT = "FTP integration test content"; + + private static FakeFtpServer fakeFtpServer; + private Ftp protocol; + + @BeforeAll + static void startFtpServer() { + fakeFtpServer = new FakeFtpServer(); + fakeFtpServer.setServerControlPort(0); // bind to a random free port + + UserAccount userAccount = new UserAccount(FTP_USER, FTP_PASS, FTP_HOME); + fakeFtpServer.addUserAccount(userAccount); + + UnixFakeFileSystem fileSystem = new UnixFakeFileSystem(); + fileSystem.add(new DirectoryEntry(FTP_HOME)); + fileSystem.add(new FileEntry(FTP_HOME + "/" + TEST_FILE, TEST_CONTENT)); + fakeFtpServer.setFileSystem(fileSystem); + + fakeFtpServer.start(); + } + + @AfterAll + static void stopFtpServer() { + if (fakeFtpServer != null) { + fakeFtpServer.stop(); + } + } + + @BeforeEach + @Override + public void setUpProtocol() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.set("plugin.includes", "protocol-ftp|nutch-extensionpoints"); + conf.set("http.agent.name", "NutchFtpProtocolIT"); + conf.set("ftp.username", FTP_USER); + conf.set("ftp.password", FTP_PASS); + conf.setInt("ftp.timeout", 10000); + protocol = new Ftp(); + protocol.setConf(conf); + } + + @AfterEach + @Override + public void tearDownProtocol() { + protocol = null; + } + + 
@Override + public Protocol getProtocol() { + return protocol; + } + + @Override + public String getTestUrl() { + return "ftp://localhost:" + fakeFtpServer.getServerControlPort() + + FTP_HOME + "/" + TEST_FILE; + } + + @Test + void testFtpFileDownload() throws Exception { + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = protocol.getProtocolOutput(new Text(getTestUrl()), datum); + + assertNotNull(output, "ProtocolOutput must not be null"); + int code = Integer.parseInt( + datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString()); + assertEquals(200, code, "Expected FTP 200 for file download"); + + assertNotNull(output.getContent(), "Content must not be null"); + String body = new String(output.getContent().getContent(), StandardCharsets.UTF_8); + assertTrue(body.contains(TEST_CONTENT), + "Downloaded content must match the file on the FTP server"); + } + + @Test + void testFtpDirectoryListing() throws Exception { + String dirUrl = "ftp://localhost:" + fakeFtpServer.getServerControlPort() + + FTP_HOME + "/"; + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = protocol.getProtocolOutput(new Text(dirUrl), datum); + + assertNotNull(output, "ProtocolOutput for directory listing must not be null"); + int code = Integer.parseInt( + datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString()); + assertEquals(200, code, "Expected FTP 200 for directory listing"); + } + + @Test + void testFtpMissingFileReturnsError() throws Exception { + String missingUrl = "ftp://localhost:" + fakeFtpServer.getServerControlPort() + + FTP_HOME + "/nonexistent.txt"; + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = protocol.getProtocolOutput(new Text(missingUrl), datum); + assertNotNull(output, "ProtocolOutput must not be null even for missing files"); + // FTP 550 "No such file" maps to a non-200 Nutch status + int code = Integer.parseInt( + datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString()); + assertTrue(code != 200, 
"Expected non-200 code for missing FTP file, got: " + code); + } +} diff --git a/src/plugin/protocol-htmlunit/src/test/org/apache/nutch/protocol/htmlunit/HtmlUnitProtocolIT.java b/src/plugin/protocol-htmlunit/src/test/org/apache/nutch/protocol/htmlunit/HtmlUnitProtocolIT.java new file mode 100644 index 0000000000..42c551b72f --- /dev/null +++ b/src/plugin/protocol-htmlunit/src/test/org/apache/nutch/protocol/htmlunit/HtmlUnitProtocolIT.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.htmlunit; + +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.protocol.AbstractProtocolPluginIT; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +/** + * Integration tests for protocol-htmlunit using a real nginx container. 
+ */ +@Testcontainers(disabledWithoutDocker = true) +public class HtmlUnitProtocolIT extends AbstractProtocolPluginIT { + + @Container + private static final GenericContainer nginx = + new GenericContainer<>("nginx:alpine").withExposedPorts(80); + + private Http protocol; + + @Override + public void setUpProtocol() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.set("plugin.includes", + "protocol-htmlunit|lib-htmlunit|lib-http|nutch-extensionpoints"); + conf.set("http.agent.name", "Nutch-Test"); + conf.setInt("http.timeout", 10000); + conf.setBoolean("store.http.headers", true); + protocol = new Http(); + protocol.setConf(conf); + } + + @Override + public void tearDownProtocol() { + protocol = null; + } + + @Override + public Protocol getProtocol() { + return protocol; + } + + @Override + public String getTestUrl() { + return "http://" + nginx.getHost() + ":" + nginx.getMappedPort(80) + "/"; + } + + @Test + void testFetchReturnsContent() throws Exception { + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = protocol.getProtocolOutput( + new org.apache.hadoop.io.Text(getTestUrl()), datum); + assertNotNull(output.getContent(), + "protocol-htmlunit must return non-null content for a live nginx page"); + } +} diff --git a/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/HttpProtocolIT.java b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/HttpProtocolIT.java new file mode 100644 index 0000000000..87db32335b --- /dev/null +++ b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/HttpProtocolIT.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.http; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.protocol.AbstractProtocolPluginIT; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +/** + * Integration tests for protocol-http using a real nginx container. 
+ */ +@Testcontainers(disabledWithoutDocker = true) +public class HttpProtocolIT extends AbstractProtocolPluginIT { + + @Container + private static final GenericContainer nginx = + new GenericContainer<>("nginx:alpine").withExposedPorts(80); + + private Http protocol; + + @Override + public void setUpProtocol() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.set("plugin.includes", "protocol-http|lib-http|nutch-extensionpoints"); + conf.set("http.agent.name", "Nutch-Test"); + conf.setInt("http.timeout", 10000); + conf.setBoolean("store.http.headers", true); + protocol = new Http(); + protocol.setConf(conf); + } + + @Override + public void tearDownProtocol() { + protocol = null; + } + + @Override + public Protocol getProtocol() { + return protocol; + } + + @Override + public String getTestUrl() { + return "http://" + nginx.getHost() + ":" + nginx.getMappedPort(80) + "/"; + } + + @Test + void testFetchRedirect301() throws Exception { + // NOTE: despite its name, this test does not exercise a redirect; it fetches + // the default nginx welcome page by its explicit path (/index.html) + String redirectUrl = + "http://" + nginx.getHost() + ":" + nginx.getMappedPort(80) + "/index.html"; + CrawlDatum datum = new CrawlDatum(); + protocol.getProtocolOutput(new Text(redirectUrl), datum); + int code = getHttpStatusCode(datum); + // nginx serves index.html directly with 200; the base test covers 200/404 + assertEquals(200, code, "Expected 200 for index.html from nginx"); + } +} diff --git a/src/plugin/protocol-httpclient/ivy.xml b/src/plugin/protocol-httpclient/ivy.xml index 0b3ce0af73..e5987074b8 100644 --- a/src/plugin/protocol-httpclient/ivy.xml +++ b/src/plugin/protocol-httpclient/ivy.xml @@ -38,6 +38,7 @@ + diff --git a/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/HttpClientProtocolIT.java b/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/HttpClientProtocolIT.java new file mode 
100644 index 0000000000..7345e4b029 --- /dev/null +++ b/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/HttpClientProtocolIT.java @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.httpclient; + +import static com.github.tomakehurst.wiremock.client.WireMock.aResponse; +import static com.github.tomakehurst.wiremock.client.WireMock.get; +import static com.github.tomakehurst.wiremock.client.WireMock.urlEqualTo; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import com.github.tomakehurst.wiremock.WireMockServer; +import com.github.tomakehurst.wiremock.core.WireMockConfiguration; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.protocol.ProtocolPluginIntegrationTest; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import 
org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +/** + * Integration tests for protocol-httpclient using an in-process WireMock + * server. + * + *

WireMock runs in the test JVM so no Docker container is required. The + * Nutch httpclient plugin connects to it over a real TCP socket, exercising + * the full HTTP client stack including header handling and Basic-auth + * challenge/response. + */ +public class HttpClientProtocolIT implements ProtocolPluginIntegrationTest { + + private static WireMockServer wireMock; + private Http protocol; + + @BeforeAll + static void startWireMock() { + wireMock = new WireMockServer(WireMockConfiguration.options().dynamicPort()); + wireMock.start(); + + wireMock.stubFor(get(urlEqualTo("/")) + .willReturn(aResponse() + .withStatus(200) + .withHeader("Content-Type", "text/html") + .withBody("Integration test"))); + + wireMock.stubFor(get(urlEqualTo("/notfound")) + .willReturn(aResponse().withStatus(404))); + + wireMock.stubFor(get(urlEqualTo("/secure")) + .withBasicAuth("testuser", "testpass") + .willReturn(aResponse() + .withStatus(200) + .withHeader("Content-Type", "text/html") + .withBody("Authenticated"))); + + wireMock.stubFor(get(urlEqualTo("/secure")) + .willReturn(aResponse() + .withStatus(401) + .withHeader("WWW-Authenticate", "Basic realm=\"Test\"") + .withBody("Unauthorized"))); + } + + @AfterAll + static void stopWireMock() { + if (wireMock != null) { + wireMock.stop(); + } + } + + @BeforeEach + @Override + public void setUpProtocol() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.set("plugin.includes", + "protocol-httpclient|lib-http|nutch-extensionpoints"); + conf.set("http.agent.name", "Nutch-Test"); + conf.setInt("http.timeout", 10000); + conf.setBoolean("store.http.headers", true); + protocol = new Http(); + protocol.setConf(conf); + } + + @AfterEach + @Override + public void tearDownProtocol() { + protocol = null; + } + + @Override + public Protocol getProtocol() { + return protocol; + } + + @Override + public String getTestUrl() { + return "http://localhost:" + wireMock.port() + "/"; + } + + @Test + void testFetch200() throws 
Exception { + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = protocol.getProtocolOutput(new Text(getTestUrl()), datum); + assertNotNull(output, "ProtocolOutput must not be null"); + int code = Integer.parseInt( + datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString()); + assertEquals(200, code, "Expected HTTP 200 from WireMock stub"); + } + + @Test + void testFetch404() throws Exception { + String url = "http://localhost:" + wireMock.port() + "/notfound"; + CrawlDatum datum = new CrawlDatum(); + protocol.getProtocolOutput(new Text(url), datum); + int code = Integer.parseInt( + datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString()); + assertEquals(404, code, "Expected HTTP 404 for /notfound stub"); + } + + @Test + void testUnauthenticatedRequestReturns401() throws Exception { + String secureUrl = "http://localhost:" + wireMock.port() + "/secure"; + CrawlDatum datum = new CrawlDatum(); + protocol.getProtocolOutput(new Text(secureUrl), datum); + int code = Integer.parseInt( + datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString()); + assertEquals(401, code, + "Unauthenticated request to /secure should return 401"); + } +} diff --git a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/OkHttpProtocolIT.java b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/OkHttpProtocolIT.java new file mode 100644 index 0000000000..d5342d8309 --- /dev/null +++ b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/OkHttpProtocolIT.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.okhttp; + +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.protocol.AbstractProtocolPluginIT; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +/** + * Integration tests for protocol-okhttp using a real nginx container. 
+ */ +@Testcontainers(disabledWithoutDocker = true) +public class OkHttpProtocolIT extends AbstractProtocolPluginIT { + + @Container + private static final GenericContainer nginx = + new GenericContainer<>("nginx:alpine").withExposedPorts(80); + + private OkHttp protocol; + + @Override + public void setUpProtocol() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.set("plugin.includes", "protocol-okhttp|lib-http|nutch-extensionpoints"); + conf.set("http.agent.name", "Nutch-Test"); + conf.setInt("http.timeout", 10000); + conf.setBoolean("store.http.headers", true); + protocol = new OkHttp(); + protocol.setConf(conf); + } + + @Override + public void tearDownProtocol() { + protocol = null; + } + + @Override + public Protocol getProtocol() { + return protocol; + } + + @Override + public String getTestUrl() { + return "http://" + nginx.getHost() + ":" + nginx.getMappedPort(80) + "/"; + } + + /** OkHttp transparently decompresses gzip; verify content is returned. */ + @Test + void testFetchWithAcceptEncoding() throws Exception { + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = protocol.getProtocolOutput( + new org.apache.hadoop.io.Text(getTestUrl()), datum); + assertNotNull(output.getContent(), + "Content must be present even when server uses compression"); + } +} diff --git a/src/plugin/protocol-selenium/src/test/org/apache/nutch/protocol/selenium/SeleniumProtocolIT.java b/src/plugin/protocol-selenium/src/test/org/apache/nutch/protocol/selenium/SeleniumProtocolIT.java new file mode 100644 index 0000000000..ec928df64f --- /dev/null +++ b/src/plugin/protocol-selenium/src/test/org/apache/nutch/protocol/selenium/SeleniumProtocolIT.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.selenium; + +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.protocol.AbstractProtocolPluginIT; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +/** + * Integration tests for protocol-selenium using a real nginx container. + * + *

Note: protocol-selenium uses raw HTTP sockets (the same underlying + * transport as protocol-http) rather than a Selenium WebDriver. Tests here + * validate that the plugin connects to and fetches content from a live HTTP + * server. Browser-based rendering is covered by protocol-interactiveselenium + * which is excluded from automated integration tests due to its stateful + * handler requirements. + */ +@Testcontainers(disabledWithoutDocker = true) +public class SeleniumProtocolIT extends AbstractProtocolPluginIT { + + @Container + private static final GenericContainer nginx = + new GenericContainer<>("nginx:alpine").withExposedPorts(80); + + private Http protocol; + + @Override + public void setUpProtocol() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.set("plugin.includes", + "protocol-selenium|lib-http|lib-selenium|nutch-extensionpoints"); + conf.set("http.agent.name", "Nutch-Test"); + conf.setInt("http.timeout", 10000); + conf.setBoolean("store.http.headers", true); + protocol = new Http(); + protocol.setConf(conf); + } + + @Override + public void tearDownProtocol() { + protocol = null; + } + + @Override + public Protocol getProtocol() { + return protocol; + } + + @Override + public String getTestUrl() { + return "http://" + nginx.getHost() + ":" + nginx.getMappedPort(80) + "/"; + } + + @Test + void testFetchReturnsContent() throws Exception { + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = protocol.getProtocolOutput( + new org.apache.hadoop.io.Text(getTestUrl()), datum); + assertNotNull(output.getContent(), + "protocol-selenium must return non-null content for a live nginx page"); + } +} diff --git a/src/test/org/apache/nutch/protocol/AbstractProtocolPluginIT.java b/src/test/org/apache/nutch/protocol/AbstractProtocolPluginIT.java new file mode 100644 index 0000000000..9469b168fb --- /dev/null +++ b/src/test/org/apache/nutch/protocol/AbstractProtocolPluginIT.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Nutch; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.testcontainers.junit.jupiter.Testcontainers; + +/** + * Abstract base for Protocol plugin integration tests using Testcontainers. + * Provides common test logic for fetching URLs and verifying status codes. + * + *

Subclasses declare a static {@code @Container} field for the server
+ * container, implement {@link ProtocolPluginIntegrationTest}, and may add
+ * protocol-specific tests (e.g., redirect handling, authentication).
+ */
+@Testcontainers(disabledWithoutDocker = true)
+public abstract class AbstractProtocolPluginIT implements ProtocolPluginIntegrationTest {
+
+  @BeforeEach
+  void setUp() throws Exception {
+    setUpProtocol();
+  }
+
+  @AfterEach
+  void tearDown() throws Exception {
+    tearDownProtocol();
+  }
+
+  /** Fetch the test URL and assert an HTTP 200 response. */
+  @Test
+  void testFetch200() throws Exception {
+    CrawlDatum datum = new CrawlDatum();
+    ProtocolOutput output = getProtocol()
+        .getProtocolOutput(new Text(getTestUrl()), datum);
+    assertNotNull(output, "ProtocolOutput must not be null");
+    assertEquals(200, getHttpStatusCode(datum),
+        "Expected HTTP 200 for " + getTestUrl());
+    verifyFetchedContent(output, datum);
+  }
+
+  /** Fetch a non-existent path and assert an HTTP 404 response. */
+  @Test
+  void testFetch404() throws Exception {
+    String url = get404Url();
+    CrawlDatum datum = new CrawlDatum();
+    getProtocol().getProtocolOutput(new Text(url), datum);
+    assertEquals(404, getHttpStatusCode(datum),
+        "Expected HTTP 404 for " + url);
+  }
+
+  /**
+   * Returns a URL expected to produce a 404. Default appends a fixed,
+   * non-existent path segment to {@link #getTestUrl()}; override if the
+   * server needs a specific path.
+   */
+  protected String get404Url() {
+    String base = getTestUrl();
+    if (base.endsWith("/")) {
+      return base + "nonexistent-path-xyz";
+    }
+    return base + "/nonexistent-path-xyz";
+  }
+
+  /**
+   * Reads the HTTP status code stored in the CrawlDatum metadata by Nutch
+   * protocol plugins. Returns -1 if no status code was stored.
+   */
+  protected static int getHttpStatusCode(CrawlDatum datum) {
+    Object code = datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY);
+    if (code == null) {
+      return -1; // no status code was recorded in the metadata
+    }
+    return Integer.parseInt(code.toString());
+  }
+}
diff --git a/src/test/org/apache/nutch/protocol/ProtocolPluginIntegrationTest.java b/src/test/org/apache/nutch/protocol/ProtocolPluginIntegrationTest.java
new file mode 100644
index 0000000000..b3778077d9
--- /dev/null
+++ b/src/test/org/apache/nutch/protocol/ProtocolPluginIntegrationTest.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol;
+
+import org.apache.nutch.crawl.CrawlDatum;
+
+/**
+ * Contract for Protocol plugin integration tests. Implementations run against
+ * real server backends (via Testcontainers or embedded servers).
+ */
+public interface ProtocolPluginIntegrationTest {
+
+  /** Set up the protocol plugin and its backing server before tests. */
+  void setUpProtocol() throws Exception;
+
+  /** Shut down the protocol plugin after tests. */
+  void tearDownProtocol() throws Exception;
+
+  /** The Protocol under test. */
+  Protocol getProtocol();
+
+  /**
+   * A URL that the backing server will serve with a 200/success response.
+   * Must point into the container or embedded server started by this test.
+   */
+  String getTestUrl();
+
+  /**
+   * Optional extra verification after a successful fetch.
+   * Default is a no-op; override to inspect content, headers, etc.
+   */
+  default void verifyFetchedContent(ProtocolOutput output, CrawlDatum datum)
+      throws Exception {
+    // no-op
+  }
+}