FTP passive mode with Testcontainers requires that the PASV response IP
+ * matches the host-visible address of the container, which is not reliable
+ * across Docker Desktop (macOS/Windows) and Linux Docker environments. An
+ * in-process {@link FakeFtpServer} from MockFtpServer avoids this constraint
+ * while still testing the Nutch FTP client against a real FTP protocol
+ * implementation.
+ */
+public class FtpProtocolIT implements ProtocolPluginIntegrationTest {
+
+  private static final String FTP_USER = "testuser";
+  private static final String FTP_PASS = "testpass";
+  private static final String FTP_HOME = "/home/testuser";
+  private static final String TEST_FILE = "test.txt";
+  private static final String TEST_CONTENT = "FTP integration test content";
+
+  /** In-process FTP server shared by every test method of this class. */
+  private static FakeFtpServer fakeFtpServer;
+
+  /** Plugin instance under test; created before and cleared after each test. */
+  private Ftp protocol;
+
+  /**
+   * Starts the fake FTP server with a single user account whose home
+   * directory contains one text file.
+   */
+  @BeforeAll
+  static void startFtpServer() {
+    UnixFakeFileSystem fileSystem = new UnixFakeFileSystem();
+    fileSystem.add(new DirectoryEntry(FTP_HOME));
+    fileSystem.add(new FileEntry(FTP_HOME + "/" + TEST_FILE, TEST_CONTENT));
+
+    fakeFtpServer = new FakeFtpServer();
+    fakeFtpServer.setServerControlPort(0); // bind to a random free port
+    fakeFtpServer.addUserAccount(new UserAccount(FTP_USER, FTP_PASS, FTP_HOME));
+    fakeFtpServer.setFileSystem(fileSystem);
+    fakeFtpServer.start();
+  }
+
+  /** Stops the fake FTP server if it was created. */
+  @AfterAll
+  static void stopFtpServer() {
+    if (fakeFtpServer != null) {
+      fakeFtpServer.stop();
+    }
+  }
+
+  /** Creates a fresh {@link Ftp} plugin configured with the test credentials. */
+  @BeforeEach
+  @Override
+  public void setUpProtocol() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+    conf.set("plugin.includes", "protocol-ftp|nutch-extensionpoints");
+    conf.set("http.agent.name", "NutchFtpProtocolIT");
+    conf.set("ftp.username", FTP_USER);
+    conf.set("ftp.password", FTP_PASS);
+    conf.setInt("ftp.timeout", 10000);
+    protocol = new Ftp();
+    protocol.setConf(conf);
+  }
+
+  /** Drops the reference to the plugin instance used by the previous test. */
+  @AfterEach
+  @Override
+  public void tearDownProtocol() {
+    protocol = null;
+  }
+
+  @Override
+  public Protocol getProtocol() {
+    return protocol;
+  }
+
+  /** Returns the URL of the text file served by the fake FTP server. */
+  @Override
+  public String getTestUrl() {
+    return urlFor(FTP_HOME + "/" + TEST_FILE);
+  }
+
+  /**
+   * Builds an {@code ftp://} URL on the fake server.
+   *
+   * @param absolutePath server-side path, starting with {@code /}
+   * @return URL using the control port the server is actually listening on
+   */
+  private static String urlFor(String absolutePath) {
+    return "ftp://localhost:" + fakeFtpServer.getServerControlPort() + absolutePath;
+  }
+
+  /**
+   * Reads the status code stored under {@link Nutch#PROTOCOL_STATUS_CODE_KEY}
+   * in the datum's metadata.
+   *
+   * <p>Fails the test with a descriptive message, instead of a bare
+   * {@link NullPointerException}, when no status code was stored.
+   *
+   * @param datum the datum that was passed to the fetch
+   * @return the stored status code parsed as an integer
+   */
+  private static int statusCode(CrawlDatum datum) {
+    Object raw = datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY);
+    assertNotNull(raw, "No protocol status code was stored in the CrawlDatum metadata");
+    return Integer.parseInt(raw.toString());
+  }
+
+  /** Downloads the test file and checks both the status code and the body. */
+  @Test
+  void testFtpFileDownload() throws Exception {
+    CrawlDatum datum = new CrawlDatum();
+    ProtocolOutput output = protocol.getProtocolOutput(new Text(getTestUrl()), datum);
+
+    assertNotNull(output, "ProtocolOutput must not be null");
+    assertEquals(200, statusCode(datum), "Expected FTP 200 for file download");
+
+    assertNotNull(output.getContent(), "Content must not be null");
+    String body = new String(output.getContent().getContent(), StandardCharsets.UTF_8);
+    assertTrue(body.contains(TEST_CONTENT),
+        "Downloaded content must match the file on the FTP server");
+  }
+
+  /** Requests the home directory itself and expects a 200 status. */
+  @Test
+  void testFtpDirectoryListing() throws Exception {
+    String dirUrl = urlFor(FTP_HOME + "/");
+    CrawlDatum datum = new CrawlDatum();
+    ProtocolOutput output = protocol.getProtocolOutput(new Text(dirUrl), datum);
+
+    assertNotNull(output, "ProtocolOutput for directory listing must not be null");
+    assertEquals(200, statusCode(datum), "Expected FTP 200 for directory listing");
+  }
+
+  /** Requests a file that was never added to the fake file system. */
+  @Test
+  void testFtpMissingFileReturnsError() throws Exception {
+    String missingUrl = urlFor(FTP_HOME + "/nonexistent.txt");
+    CrawlDatum datum = new CrawlDatum();
+    ProtocolOutput output = protocol.getProtocolOutput(new Text(missingUrl), datum);
+
+    assertNotNull(output, "ProtocolOutput must not be null even for missing files");
+    // FTP 550 "No such file" maps to a non-200 Nutch status
+    int code = statusCode(datum);
+    assertTrue(code != 200, "Expected non-200 code for missing FTP file, got: " + code);
+  }
+}
diff --git a/src/plugin/protocol-htmlunit/src/test/org/apache/nutch/protocol/htmlunit/HtmlUnitProtocolIT.java b/src/plugin/protocol-htmlunit/src/test/org/apache/nutch/protocol/htmlunit/HtmlUnitProtocolIT.java
new file mode 100644
index 0000000000..42c551b72f
--- /dev/null
+++ b/src/plugin/protocol-htmlunit/src/test/org/apache/nutch/protocol/htmlunit/HtmlUnitProtocolIT.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.htmlunit;
+
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.AbstractProtocolPluginIT;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.jupiter.api.Test;
+import org.testcontainers.containers.GenericContainer;
+import org.testcontainers.junit.jupiter.Container;
+import org.testcontainers.junit.jupiter.Testcontainers;
+
+/**
+ * Integration tests for protocol-htmlunit using a real nginx container.
+ *
+ * <p>The plugin is created through {@link #setUpProtocol()} and fetches from
+ * the mapped port 80 of an {@code nginx:alpine} container. The class is
+ * disabled when Docker is not available.
+ */
+@Testcontainers(disabledWithoutDocker = true)
+public class HtmlUnitProtocolIT extends AbstractProtocolPluginIT {
+
+  /** nginx container shared by all test methods of this class. */
+  @Container
+  private static final GenericContainer<?> nginx =
+      new GenericContainer<>("nginx:alpine").withExposedPorts(80);
+
+  /** Plugin instance under test. */
+  private Http protocol;
+
+  /** Creates a fresh htmlunit {@link Http} plugin with a 10 s timeout. */
+  @Override
+  public void setUpProtocol() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+    conf.set("plugin.includes",
+        "protocol-htmlunit|lib-htmlunit|lib-http|nutch-extensionpoints");
+    conf.set("http.agent.name", "Nutch-Test");
+    conf.setInt("http.timeout", 10000);
+    conf.setBoolean("store.http.headers", true);
+    protocol = new Http();
+    protocol.setConf(conf);
+  }
+
+  /** Drops the reference to the plugin instance. */
+  @Override
+  public void tearDownProtocol() {
+    protocol = null;
+  }
+
+  @Override
+  public Protocol getProtocol() {
+    return protocol;
+  }
+
+  /** Returns the root URL of the nginx container as seen from the test JVM. */
+  @Override
+  public String getTestUrl() {
+    return "http://" + nginx.getHost() + ":" + nginx.getMappedPort(80) + "/";
+  }
+
+  /** Fetches the nginx root page and checks that content was returned. */
+  @Test
+  void testFetchReturnsContent() throws Exception {
+    CrawlDatum datum = new CrawlDatum();
+    ProtocolOutput output = protocol.getProtocolOutput(
+        new org.apache.hadoop.io.Text(getTestUrl()), datum);
+    // Check the wrapper first so a missing output fails with a clear message.
+    assertNotNull(output, "ProtocolOutput must not be null");
+    assertNotNull(output.getContent(),
+        "protocol-htmlunit must return non-null content for a live nginx page");
+  }
+}
diff --git a/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/HttpProtocolIT.java b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/HttpProtocolIT.java
new file mode 100644
index 0000000000..87db32335b
--- /dev/null
+++ b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/HttpProtocolIT.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.http;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.AbstractProtocolPluginIT;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.jupiter.api.Test;
+import org.testcontainers.containers.GenericContainer;
+import org.testcontainers.junit.jupiter.Container;
+import org.testcontainers.junit.jupiter.Testcontainers;
+
+/**
+ * Integration tests for protocol-http using a real nginx container.
+ *
+ * <p>The plugin is created through {@link #setUpProtocol()} and fetches from
+ * the mapped port 80 of an {@code nginx:alpine} container. The class is
+ * disabled when Docker is not available.
+ */
+@Testcontainers(disabledWithoutDocker = true)
+public class HttpProtocolIT extends AbstractProtocolPluginIT {
+
+  /** nginx container shared by all test methods of this class. */
+  @Container
+  private static final GenericContainer<?> nginx =
+      new GenericContainer<>("nginx:alpine").withExposedPorts(80);
+
+  /** Plugin instance under test. */
+  private Http protocol;
+
+  /** Creates a fresh protocol-http {@link Http} plugin with a 10 s timeout. */
+  @Override
+  public void setUpProtocol() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+    conf.set("plugin.includes", "protocol-http|lib-http|nutch-extensionpoints");
+    conf.set("http.agent.name", "Nutch-Test");
+    conf.setInt("http.timeout", 10000);
+    conf.setBoolean("store.http.headers", true);
+    protocol = new Http();
+    protocol.setConf(conf);
+  }
+
+  /** Drops the reference to the plugin instance. */
+  @Override
+  public void tearDownProtocol() {
+    protocol = null;
+  }
+
+  @Override
+  public Protocol getProtocol() {
+    return protocol;
+  }
+
+  /** Returns the root URL of the nginx container as seen from the test JVM. */
+  @Override
+  public String getTestUrl() {
+    return "http://" + nginx.getHost() + ":" + nginx.getMappedPort(80) + "/";
+  }
+
+  /**
+   * Fetches {@code /index.html} explicitly and expects a direct 200 response.
+   *
+   * <p>NOTE(review): despite its name, this test does not exercise a 301
+   * redirect; it only asserts that the explicit index path answers 200.
+   * Consider renaming it, or pointing it at a URL that really redirects.
+   */
+  @Test
+  void testFetchRedirect301() throws Exception {
+    String indexUrl =
+        "http://" + nginx.getHost() + ":" + nginx.getMappedPort(80) + "/index.html";
+    CrawlDatum datum = new CrawlDatum();
+    protocol.getProtocolOutput(new Text(indexUrl), datum);
+    // nginx serves index.html directly with 200; the base test covers 200/404
+    assertEquals(200, getHttpStatusCode(datum),
+        "Expected 200 for index.html from nginx");
+  }
+}
diff --git a/src/plugin/protocol-httpclient/ivy.xml b/src/plugin/protocol-httpclient/ivy.xml
index 0b3ce0af73..e5987074b8 100644
--- a/src/plugin/protocol-httpclient/ivy.xml
+++ b/src/plugin/protocol-httpclient/ivy.xml
@@ -38,6 +38,7 @@
WireMock runs in the test JVM so no Docker container is required. The + * Nutch httpclient plugin connects to it over a real TCP socket, exercising + * the full HTTP client stack including header handling and Basic-auth + * challenge/response. + */ +public class HttpClientProtocolIT implements ProtocolPluginIntegrationTest { + + private static WireMockServer wireMock; + private Http protocol; + + @BeforeAll + static void startWireMock() { + wireMock = new WireMockServer(WireMockConfiguration.options().dynamicPort()); + wireMock.start(); + + wireMock.stubFor(get(urlEqualTo("/")) + .willReturn(aResponse() + .withStatus(200) + .withHeader("Content-Type", "text/html") + .withBody("
Integration test"))); + + wireMock.stubFor(get(urlEqualTo("/notfound")) + .willReturn(aResponse().withStatus(404))); + + wireMock.stubFor(get(urlEqualTo("/secure")) + .withBasicAuth("testuser", "testpass") + .willReturn(aResponse() + .withStatus(200) + .withHeader("Content-Type", "text/html") + .withBody("Authenticated"))); + + wireMock.stubFor(get(urlEqualTo("/secure")) + .willReturn(aResponse() + .withStatus(401) + .withHeader("WWW-Authenticate", "Basic realm=\"Test\"") + .withBody("Unauthorized"))); + } + + @AfterAll + static void stopWireMock() { + if (wireMock != null) { + wireMock.stop(); + } + } + + @BeforeEach + @Override + public void setUpProtocol() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.set("plugin.includes", + "protocol-httpclient|lib-http|nutch-extensionpoints"); + conf.set("http.agent.name", "Nutch-Test"); + conf.setInt("http.timeout", 10000); + conf.setBoolean("store.http.headers", true); + protocol = new Http(); + protocol.setConf(conf); + } + + @AfterEach + @Override + public void tearDownProtocol() { + protocol = null; + } + + @Override + public Protocol getProtocol() { + return protocol; + } + + @Override + public String getTestUrl() { + return "http://localhost:" + wireMock.port() + "/"; + } + + @Test + void testFetch200() throws Exception { + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = protocol.getProtocolOutput(new Text(getTestUrl()), datum); + assertNotNull(output, "ProtocolOutput must not be null"); + int code = Integer.parseInt( + datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString()); + assertEquals(200, code, "Expected HTTP 200 from WireMock stub"); + } + + @Test + void testFetch404() throws Exception { + String url = "http://localhost:" + wireMock.port() + "/notfound"; + CrawlDatum datum = new CrawlDatum(); + protocol.getProtocolOutput(new Text(url), datum); + int code = Integer.parseInt( + datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString()); + 
assertEquals(404, code, "Expected HTTP 404 for /notfound stub"); + } + + @Test + void testUnauthenticatedRequestReturns401() throws Exception { + String secureUrl = "http://localhost:" + wireMock.port() + "/secure"; + CrawlDatum datum = new CrawlDatum(); + protocol.getProtocolOutput(new Text(secureUrl), datum); + int code = Integer.parseInt( + datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString()); + assertEquals(401, code, + "Unauthenticated request to /secure should return 401"); + } +} diff --git a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/OkHttpProtocolIT.java b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/OkHttpProtocolIT.java new file mode 100644 index 0000000000..d5342d8309 --- /dev/null +++ b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/OkHttpProtocolIT.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.protocol.okhttp; + +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.protocol.AbstractProtocolPluginIT; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +/** + * Integration tests for protocol-okhttp using a real nginx container. + */ +@Testcontainers(disabledWithoutDocker = true) +public class OkHttpProtocolIT extends AbstractProtocolPluginIT { + + @Container + private static final GenericContainer> nginx = + new GenericContainer<>("nginx:alpine").withExposedPorts(80); + + private OkHttp protocol; + + @Override + public void setUpProtocol() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.set("plugin.includes", "protocol-okhttp|lib-http|nutch-extensionpoints"); + conf.set("http.agent.name", "Nutch-Test"); + conf.setInt("http.timeout", 10000); + conf.setBoolean("store.http.headers", true); + protocol = new OkHttp(); + protocol.setConf(conf); + } + + @Override + public void tearDownProtocol() { + protocol = null; + } + + @Override + public Protocol getProtocol() { + return protocol; + } + + @Override + public String getTestUrl() { + return "http://" + nginx.getHost() + ":" + nginx.getMappedPort(80) + "/"; + } + + /** OkHttp transparently decompresses gzip; verify content is returned. 
*/ + @Test + void testFetchWithAcceptEncoding() throws Exception { + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = protocol.getProtocolOutput( + new org.apache.hadoop.io.Text(getTestUrl()), datum); + assertNotNull(output.getContent(), + "Content must be present even when server uses compression"); + } +} diff --git a/src/plugin/protocol-selenium/src/test/org/apache/nutch/protocol/selenium/SeleniumProtocolIT.java b/src/plugin/protocol-selenium/src/test/org/apache/nutch/protocol/selenium/SeleniumProtocolIT.java new file mode 100644 index 0000000000..ec928df64f --- /dev/null +++ b/src/plugin/protocol-selenium/src/test/org/apache/nutch/protocol/selenium/SeleniumProtocolIT.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.protocol.selenium; + +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.protocol.AbstractProtocolPluginIT; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +/** + * Integration tests for protocol-selenium using a real nginx container. + * + *Note: protocol-selenium uses raw HTTP sockets (the same underlying + * transport as protocol-http) rather than a Selenium WebDriver. Tests here + * validate that the plugin connects to and fetches content from a live HTTP + * server. Browser-based rendering is covered by protocol-interactiveselenium + * which is excluded from automated integration tests due to its stateful + * handler requirements. 
+ */ +@Testcontainers(disabledWithoutDocker = true) +public class SeleniumProtocolIT extends AbstractProtocolPluginIT { + + @Container + private static final GenericContainer> nginx = + new GenericContainer<>("nginx:alpine").withExposedPorts(80); + + private Http protocol; + + @Override + public void setUpProtocol() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.set("plugin.includes", + "protocol-selenium|lib-http|lib-selenium|nutch-extensionpoints"); + conf.set("http.agent.name", "Nutch-Test"); + conf.setInt("http.timeout", 10000); + conf.setBoolean("store.http.headers", true); + protocol = new Http(); + protocol.setConf(conf); + } + + @Override + public void tearDownProtocol() { + protocol = null; + } + + @Override + public Protocol getProtocol() { + return protocol; + } + + @Override + public String getTestUrl() { + return "http://" + nginx.getHost() + ":" + nginx.getMappedPort(80) + "/"; + } + + @Test + void testFetchReturnsContent() throws Exception { + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = protocol.getProtocolOutput( + new org.apache.hadoop.io.Text(getTestUrl()), datum); + assertNotNull(output.getContent(), + "protocol-selenium must return non-null content for a live nginx page"); + } +} diff --git a/src/test/org/apache/nutch/protocol/AbstractProtocolPluginIT.java b/src/test/org/apache/nutch/protocol/AbstractProtocolPluginIT.java new file mode 100644 index 0000000000..9469b168fb --- /dev/null +++ b/src/test/org/apache/nutch/protocol/AbstractProtocolPluginIT.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Nutch; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.testcontainers.junit.jupiter.Testcontainers; + +/** + * Abstract base for Protocol plugin integration tests using Testcontainers. + * Provides common test logic for fetching URLs and verifying status codes. + * + *
Subclasses declare a static {@code @Container} field for the server + * container, implement {@link ProtocolPluginIntegrationTest}, and may add + * protocol-specific tests (e.g., redirect handling, authentication). + */ +@Testcontainers(disabledWithoutDocker = true) +public abstract class AbstractProtocolPluginIT implements ProtocolPluginIntegrationTest { + + @BeforeEach + void setUp() throws Exception { + setUpProtocol(); + } + + @AfterEach + void tearDown() throws Exception { + tearDownProtocol(); + } + + /** Fetch the test URL and assert an HTTP 200 response. */ + @Test + void testFetch200() throws Exception { + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = getProtocol() + .getProtocolOutput(new Text(getTestUrl()), datum); + assertNotNull(output, "ProtocolOutput must not be null"); + assertEquals(200, getHttpStatusCode(datum), + "Expected HTTP 200 for " + getTestUrl()); + verifyFetchedContent(output, datum); + } + + /** Fetch a non-existent path and assert an HTTP 404 response. */ + @Test + void testFetch404() throws Exception { + String url = get404Url(); + CrawlDatum datum = new CrawlDatum(); + getProtocol().getProtocolOutput(new Text(url), datum); + assertEquals(404, getHttpStatusCode(datum), + "Expected HTTP 404 for " + url); + } + + /** + * Returns a URL expected to produce a 404. Default appends a random path + * segment to {@link #getTestUrl()}; override if the server needs a specific + * path. + */ + protected String get404Url() { + String base = getTestUrl(); + if (base.endsWith("/")) { + return base + "nonexistent-path-xyz"; + } + return base + "/nonexistent-path-xyz"; + } + + /** + * Reads the HTTP status code stored in the CrawlDatum metadata by Nutch + * protocol plugins. Returns -1 if no status code was stored. 
+ */ + protected static int getHttpStatusCode(CrawlDatum datum) { + if (datum.getMetaData().containsKey(Nutch.PROTOCOL_STATUS_CODE_KEY)) { + return Integer.parseInt( + datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString()); + } + return -1; + } +} diff --git a/src/test/org/apache/nutch/protocol/ProtocolPluginIntegrationTest.java b/src/test/org/apache/nutch/protocol/ProtocolPluginIntegrationTest.java new file mode 100644 index 0000000000..b3778077d9 --- /dev/null +++ b/src/test/org/apache/nutch/protocol/ProtocolPluginIntegrationTest.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol; + +import org.apache.nutch.crawl.CrawlDatum; + +/** + * Contract for Protocol plugin integration tests. Implementations run against + * real server backends (via Testcontainers or embedded servers). + */ +public interface ProtocolPluginIntegrationTest { + + /** Set up the protocol plugin and its backing server before tests. */ + void setUpProtocol() throws Exception; + + /** Shut down the protocol plugin after tests. */ + void tearDownProtocol() throws Exception; + + /** The Protocol under test. 
*/ + Protocol getProtocol(); + + /** + * A URL that the backing server will serve with a 200/success response. + * Must point into the container or embedded server started by this test. + */ + String getTestUrl(); + + /** + * Optional extra verification after a successful fetch. + * Default is a no-op; override to inspect content, headers, etc. + */ + default void verifyFetchedContent(ProtocolOutput output, CrawlDatum datum) + throws Exception { + // no-op + } +}