Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .codespellrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[codespell]
# JUnit 5 annotations incorrectly flagged as spelling errors
ignore-words-list = AfterAll,BeforeAll
# CHANGES.md is a historical changelog not maintained by this branch
skip = CHANGES.md
6 changes: 6 additions & 0 deletions .github/workflows/master-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,8 @@ jobs:
- 'src/plugin/**'
indexer_plugins:
- 'src/plugin/indexer-*/**'
protocol_plugins:
- 'src/plugin/protocol-*/**'
buildconf:
- 'build.xml'
- 'ivy/ivy.xml'
Expand All @@ -244,6 +246,10 @@ jobs:
- name: test indexer integration
if: ${{ steps.filter.outputs.indexer_plugins == 'true' && matrix.os == 'ubuntu-latest' }}
run: ant clean test-indexer-integration -buildfile build.xml
# run protocol integration tests when protocol plugin files change (Docker required, ubuntu-latest only)
- name: test protocol integration
if: ${{ steps.filter.outputs.protocol_plugins == 'true' && matrix.os == 'ubuntu-latest' }}
run: ant clean test-protocol-integration -buildfile build.xml
- name: Check for test results
id: check_tests
if: always() && matrix.os == 'ubuntu-latest'
Expand Down
1 change: 1 addition & 0 deletions .yetus/blanks-eol.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# Ignore trailing blanks in Yetus-generated patch/diff and logs (not source files).
# See --blanks-eol-ignore-file in the blanks plugin.
^out/
CHANGES.md
1 change: 1 addition & 0 deletions .yetus/blanks-tabs.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# Ignore tabs in Yetus-generated patch dir (not source files).
# See --blanks-tabs-ignore-file in the blanks plugin.
^out/
CHANGES.md
8 changes: 6 additions & 2 deletions build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@
</target>

<target name="dependencytests" depends="resolve-test" description="Show unit tests dependency tree">
<ivy:dependencytree />
<ivy:dependencytree />
</target>

<!-- ====================================================== -->
Expand Down Expand Up @@ -528,6 +528,10 @@
<ant dir="src/plugin" target="test-indexer-integration" inheritAll="false"/>
</target>

<!-- Entry point for the protocol plugin integration tests: after the listed
     prerequisites have run, hand over to the target of the same name in the
     src/plugin build. inheritAll="false" stops this project's properties
     from being passed down to that sub-build. -->
<target name="test-protocol-integration"
        description="--> run protocol plugin integration tests (Testcontainers)"
        depends="resolve-test, compile, compile-core-test, job">
  <ant inheritAll="false"
       target="test-protocol-integration"
       dir="src/plugin"/>
</target>

<target name="nightly" depends="test, tar-src, zip-src" description="--> run the nightly target build">
</target>

Expand Down Expand Up @@ -1079,7 +1083,7 @@
projectName="Apache Nutch Spotbugs Analysis"
stylesheet="fancy-hist.xsl" >
<auxClasspath>
<!-- depency jars required for analysis but not analyzed (not our bugs) -->
<!-- dependency jars required for analysis but not analyzed (not our bugs) -->
<pathelement path="${basedir}/${build.dir}/lib"/>
<fileset dir="${basedir}/${build.dir}/plugins">
<include name="**/*.jar"/>
Expand Down
10 changes: 5 additions & 5 deletions conf/log4j2.xml
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,16 @@
<!-- default values that can be overridden by system properties:
Note: the script bin/nutch sets these properties from the environment variables
NUTCH_LOG_DIR and NUTCH_LOGFILE -->
<Property name="hadoop.log.dir">${sys:hadoop.log.dir:-./logs}</Property>
<Property name="hadoop.log.file">${sys:hadoop.log.file:-hadoop.log}</Property>
<Property name="nutch.log.dir">${sys:hadoop.log.dir:-./logs}</Property>
<Property name="nutch.log.file">${sys:hadoop.log.file:-hadoop.log}</Property>
</Properties>
<Appenders>
<RollingFile name="RollingFile" fileName="${hadoop.log.dir}/${hadoop.log.file}"
filePattern="${hadoop.log.dir}/$${date:yyyy-MM}/nutch-%d{yyyy-MM-dd}.log.gz">
<RollingFile name="RollingFile" fileName="${nutch.log.dir}/${nutch.log.file}"
filePattern="${nutch.log.dir}/$${date:yyyy-MM}/nutch-%d{yyyy-MM-dd}.log.gz">
<PatternLayout pattern="%d %p %c{1.} [%t] %m%n" />
<CronTriggeringPolicy schedule="0 0 0 * * ?" evaluateOnStartup="true" />
<DefaultRolloverStrategy>
<Delete basePath="${hadoop.log.dir}" maxDepth="2">
<Delete basePath="${nutch.log.dir}" maxDepth="2">
<IfFileName glob="*/nutch-*.log.gz" />
<IfLastModified age="60d" />
</Delete>
Expand Down
6 changes: 5 additions & 1 deletion ivy/ivy.xml
Original file line number Diff line number Diff line change
Expand Up @@ -122,9 +122,13 @@
<dependency org="org.mockito" name="mockito-core" rev="5.18.0" conf="test->default"/>
<dependency org="org.mockito" name="mockito-junit-jupiter" rev="5.18.0" conf="test->default"/>

<!-- Testcontainers for indexer plugin integration tests -->
<!-- Testcontainers for indexer and protocol plugin integration tests -->
<dependency org="org.testcontainers" name="testcontainers" rev="2.0.3" conf="test->default"/>
<dependency org="org.testcontainers" name="junit-jupiter" rev="1.21.4" conf="test->default"/>
<!-- WireMock for HTTP mock server in protocol-httpclient integration tests -->
<dependency org="com.github.tomakehurst" name="wiremock-standalone" rev="3.0.1" conf="test->default"/>
<!-- MockFtpServer for in-process FTP server in protocol-ftp integration tests -->
<dependency org="org.mockftpserver" name="MockFtpServer" rev="3.1.0" conf="test->default"/>

<!-- MockServer (https://www.mock-server.com/) for static HTTP content in unit tests. -->
<dependency org="org.mock-server" name="mockserver-netty" rev="5.15.0" conf="test->default">
Expand Down
28 changes: 28 additions & 0 deletions src/plugin/build-plugin.xml
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@
<include name="hamcrest*.jar" />
<include name="junit*.jar" />
<include name="opentest4j*.jar" />
<include name="testcontainers*.jar" />
</fileset>
<path refid="classpath"/>
</path>
Expand Down Expand Up @@ -277,6 +278,33 @@
<fail if="integration.tests.failed">Indexer integration tests failed!</fail>
</target>

<!-- ================================================================== -->
<!-- Run protocol plugin integration tests (Testcontainers) -->
<!-- ================================================================== -->
<target name="test-protocol-integration"
        if="test.available"
        depends="compile-test, deploy">
  <!-- Only runs when the test.available property is set for this plugin. -->
  <echo message="Running protocol integration tests for plugin: ${name}"/>
  <!-- A failing test does not abort the launcher; it sets
       protocol.integration.tests.failed, which is checked by the fail task
       once all selected classes have run. -->
  <junitlauncher failureProperty="protocol.integration.tests.failed"
                 haltOnFailure="false"
                 printSummary="true">
    <classpath refid="test.classpath"/>
    <testclasses outputDir="${build.test}">
      <!-- Plain-text and XML reports, both capturing stdout and stderr. -->
      <listener sendSysErr="true" sendSysOut="true" type="legacy-plain"/>
      <listener sendSysErr="true" sendSysOut="true" type="legacy-xml"/>
      <!-- Every test class gets its own forked JVM. -->
      <fork forkMode="perTestClass">
        <jvmarg value="-Xmx2000m"/>
        <sysproperty key="javax.xml.parsers.DocumentBuilderFactory"
                     value="com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl"/>
        <sysproperty key="testcontainers.reuse.enable" value="true"/>
        <sysproperty key="test.input" value="${root}/data"/>
        <sysproperty key="test.data" value="${build.test}/data"/>
      </fork>
      <!-- Integration tests are selected purely by class-name convention. -->
      <fileset dir="${build.test}">
        <include name="**/*IntegrationTest.class"/>
        <include name="**/*IT.class"/>
        <include name="**/IT*.class"/>
      </fileset>
    </testclasses>
  </junitlauncher>
  <fail if="protocol.integration.tests.failed">Protocol integration tests failed!</fail>
</target>

<!-- target: resolve ================================================= -->
<target name="resolve-default" depends="clean-lib" description="resolve and retrieve dependencies with ivy">
<ivy:resolve file="ivy.xml" conf="default" log="download-only"/>
Expand Down
13 changes: 13 additions & 0 deletions src/plugin/build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,19 @@
<ant dir="indexer-solr" target="test-indexer-integration"/>
</target>

<!-- ====================================================== -->
<!-- Protocol plugin integration tests (Testcontainers) -->
<!-- Run sequentially to avoid container resource contention-->
<!-- ====================================================== -->
<target name="test-protocol-integration">
  <!-- One sub-build per protocol plugin, invoked one after another in the
       order listed. -->
  <ant target="test-protocol-integration" dir="protocol-ftp"/>
  <ant target="test-protocol-integration" dir="protocol-http"/>
  <ant target="test-protocol-integration" dir="protocol-httpclient"/>
  <ant target="test-protocol-integration" dir="protocol-htmlunit"/>
  <ant target="test-protocol-integration" dir="protocol-okhttp"/>
  <ant target="test-protocol-integration" dir="protocol-selenium"/>
</target>

<!-- ====================================================== -->
<!-- Clean all of the plugins. -->
<!-- ====================================================== -->
Expand Down
3 changes: 2 additions & 1 deletion src/plugin/protocol-ftp/ivy.xml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@
</publications>

<dependencies>
<dependency org="commons-net" name="commons-net" rev="1.2.2" conf="*->master"/>
<dependency org="commons-net" name="commons-net" rev="3.9.0" conf="*->master"/>
<dependency org="org.mockftpserver" name="MockFtpServer" rev="3.1.0" conf="test->default"/>
</dependencies>

</ivy-module>
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
} catch (Exception e) {
LOG.error("Could not get protocol output for {}: {}", url,
e.getMessage());
datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY, new Text("500"));
return new ProtocolOutput(null, new ProtocolStatus(e));
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,8 @@ public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf)
Ftp.LOG.info("connect to {}", addr);
}

ftp.client.connect(addr);
int port = url.getPort();
ftp.client.connect(addr, port > 0 ? port : FTP.DEFAULT_PORT);
if (!FTPReply.isPositiveCompletion(ftp.client.getReplyCode())) {
ftp.client.disconnect();
Ftp.LOG.warn("ftp.client.connect() failed: {} {}", addr,
Expand Down Expand Up @@ -206,6 +207,11 @@ public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf)
try {
ftp.parser = null;
String parserKey = ftp.client.getSystemName();
// strip surrounding quotes that some servers include in SYST reply
if (parserKey.length() > 2 && parserKey.charAt(0) == '"'
&& parserKey.charAt(parserKey.length() - 1) == '"') {
parserKey = parserKey.substring(1, parserKey.length() - 1);
}
// some server reports as UNKNOWN Type: L8, but in fact UNIX Type: L8
if (parserKey.startsWith("UNKNOWN Type: L8"))
parserKey = "UNIX Type: L8";
Expand Down Expand Up @@ -302,6 +308,11 @@ private void getFileAsHttpResponse(String path, long lastModified)
list = new LinkedList<FTPFile>();
ftp.client.retrieveList(path, list, ftp.maxContentLength, ftp.parser);

if (list.isEmpty()) {
this.code = 404; // file not found (server returned empty listing)
return;
}

FTPFile ftpFile = (FTPFile) list.get(0);
this.headers.set(Response.CONTENT_LENGTH,
Long.valueOf(ftpFile.getSize()).toString());
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.protocol.ftp;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolPluginIntegrationTest;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockftpserver.fake.FakeFtpServer;
import org.mockftpserver.fake.UserAccount;
import org.mockftpserver.fake.filesystem.DirectoryEntry;
import org.mockftpserver.fake.filesystem.FileEntry;
import org.mockftpserver.fake.filesystem.UnixFakeFileSystem;

/**
 * Integration tests for protocol-ftp using an in-process FakeFtpServer.
 *
 * <p>FTP passive mode with Testcontainers requires that the PASV response IP
 * matches the host-visible address of the container, which is not reliable
 * across Docker Desktop (macOS/Windows) and Linux Docker environments. An
 * in-process {@link FakeFtpServer} from MockFtpServer avoids this constraint
 * while still testing the Nutch FTP client against a real FTP protocol
 * implementation.
 *
 * <p>One server instance is shared by all tests in this class; a fresh
 * {@link Ftp} protocol instance is created for every test.
 */
public class FtpProtocolIT implements ProtocolPluginIntegrationTest {

  private static final String FTP_USER = "testuser";
  private static final String FTP_PASS = "testpass";
  private static final String FTP_HOME = "/home/testuser";
  private static final String TEST_FILE = "test.txt";
  private static final String TEST_CONTENT = "FTP integration test content";

  /** Shared fake server, started once before all tests. */
  private static FakeFtpServer fakeFtpServer;

  /** Protocol under test, recreated before each test. */
  private Ftp protocol;

  /**
   * Starts the fake FTP server with a single user account whose home
   * directory contains exactly one file, {@code test.txt}.
   */
  @BeforeAll
  static void startFtpServer() {
    fakeFtpServer = new FakeFtpServer();
    fakeFtpServer.setServerControlPort(0); // bind to a random free port

    UserAccount userAccount = new UserAccount(FTP_USER, FTP_PASS, FTP_HOME);
    fakeFtpServer.addUserAccount(userAccount);

    UnixFakeFileSystem fileSystem = new UnixFakeFileSystem();
    fileSystem.add(new DirectoryEntry(FTP_HOME));
    fileSystem.add(new FileEntry(FTP_HOME + "/" + TEST_FILE, TEST_CONTENT));
    fakeFtpServer.setFileSystem(fileSystem);

    fakeFtpServer.start();
  }

  /** Stops the fake FTP server if it was created. */
  @AfterAll
  static void stopFtpServer() {
    if (fakeFtpServer != null) {
      fakeFtpServer.stop();
    }
  }

  /**
   * Creates a new {@link Ftp} instance configured with the credentials of
   * the fake server's user account.
   */
  @BeforeEach
  @Override
  public void setUpProtocol() throws Exception {
    Configuration conf = NutchConfiguration.create();
    conf.set("plugin.includes", "protocol-ftp|nutch-extensionpoints");
    conf.set("http.agent.name", "NutchFtpProtocolIT");
    conf.set("ftp.username", FTP_USER);
    conf.set("ftp.password", FTP_PASS);
    conf.setInt("ftp.timeout", 10000);
    protocol = new Ftp();
    protocol.setConf(conf);
  }

  /** Drops the reference to the protocol instance used by the last test. */
  @AfterEach
  @Override
  public void tearDownProtocol() {
    protocol = null;
  }

  @Override
  public Protocol getProtocol() {
    return protocol;
  }

  /** Returns the URL of the single test file on the fake server. */
  @Override
  public String getTestUrl() {
    return ftpUrl(FTP_HOME + "/" + TEST_FILE);
  }

  /**
   * Builds an FTP URL that points at the fake server.
   *
   * @param path
   *          absolute path on the server, starting with "/"
   * @return URL on localhost using the control port the server reports
   */
  private static String ftpUrl(String path) {
    return "ftp://localhost:" + fakeFtpServer.getServerControlPort() + path;
  }

  /**
   * Reads the protocol status code stored in the datum's metadata under
   * {@link Nutch#PROTOCOL_STATUS_CODE_KEY}.
   *
   * <p>Fails the calling test with a descriptive assertion message, rather
   * than a {@link NullPointerException}, when no status code was recorded.
   *
   * @param datum
   *          the datum that was passed to the protocol
   * @return the recorded status code parsed as an integer
   */
  private static int fetchedStatusCode(CrawlDatum datum) {
    Object status = datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY);
    assertNotNull(status,
        "Protocol status code must be recorded in the CrawlDatum metadata");
    return Integer.parseInt(status.toString());
  }

  @Test
  void testFtpFileDownload() throws Exception {
    CrawlDatum datum = new CrawlDatum();
    ProtocolOutput output = protocol.getProtocolOutput(new Text(getTestUrl()), datum);

    assertNotNull(output, "ProtocolOutput must not be null");
    assertEquals(200, fetchedStatusCode(datum),
        "Expected FTP 200 for file download");

    assertNotNull(output.getContent(), "Content must not be null");
    String body = new String(output.getContent().getContent(), StandardCharsets.UTF_8);
    assertTrue(body.contains(TEST_CONTENT),
        "Downloaded content must match the file on the FTP server");
  }

  @Test
  void testFtpDirectoryListing() throws Exception {
    // trailing slash: request the home directory itself
    String dirUrl = ftpUrl(FTP_HOME + "/");
    CrawlDatum datum = new CrawlDatum();
    ProtocolOutput output = protocol.getProtocolOutput(new Text(dirUrl), datum);

    assertNotNull(output, "ProtocolOutput for directory listing must not be null");
    assertEquals(200, fetchedStatusCode(datum),
        "Expected FTP 200 for directory listing");
  }

  @Test
  void testFtpMissingFileReturnsError() throws Exception {
    String missingUrl = ftpUrl(FTP_HOME + "/nonexistent.txt");
    CrawlDatum datum = new CrawlDatum();
    ProtocolOutput output = protocol.getProtocolOutput(new Text(missingUrl), datum);
    assertNotNull(output, "ProtocolOutput must not be null even for missing files");
    // a file that does not exist on the server must not be reported as 200
    int code = fetchedStatusCode(datum);
    assertTrue(code != 200, "Expected non-200 code for missing FTP file, got: " + code);
  }
}
Loading
Loading