From 5b30fcaad8f0f41362696d4299658452217f8e86 Mon Sep 17 00:00:00 2001
From: tallison
Date: Mon, 11 May 2026 09:24:40 -0400
Subject: [PATCH 1/7] update parse modes and configuration.adoc

---
 .../ROOT/pages/pipes/configuration.adoc       |   2 +-
 docs/modules/ROOT/pages/pipes/index.adoc      |   2 +-
 .../modules/ROOT/pages/pipes/parse-modes.adoc | 143 +++++++++++++-----
 3 files changed, 103 insertions(+), 44 deletions(-)

diff --git a/docs/modules/ROOT/pages/pipes/configuration.adoc b/docs/modules/ROOT/pages/pipes/configuration.adoc
index c6614e7811e..e9c75ab0603 100644
--- a/docs/modules/ROOT/pages/pipes/configuration.adoc
+++ b/docs/modules/ROOT/pages/pipes/configuration.adoc
@@ -98,7 +98,7 @@ See also xref:pipes/timeouts.adoc[Timeouts] for the full timeout model.

|`parseMode`
|`RMETA`
-|How embedded documents are handled: `RMETA` (recursive metadata list), `CONCATENATE`, `CONTENT_ONLY`, `UNPACK`. See xref:pipes/parse-modes.adoc[Parse Modes].
+|How embedded documents are handled: `RMETA` (recursive metadata list), `CONCATENATE`, `CONTENT_ONLY`, `NO_PARSE`, `UNPACK`. See xref:pipes/parse-modes.adoc[Parse Modes].

|`onParseException`
|`EMIT`
diff --git a/docs/modules/ROOT/pages/pipes/index.adoc b/docs/modules/ROOT/pages/pipes/index.adoc
index 796f9d7f1f1..7bd20782388 100644
--- a/docs/modules/ROOT/pages/pipes/index.adoc
+++ b/docs/modules/ROOT/pages/pipes/index.adoc
@@ -48,7 +48,7 @@ against problematic files.
* xref:pipes/iterators.adoc[Iterators] -- document enumeration (directory walk, S3 listing, CSV, JDBC, Kafka, etc.)
* xref:pipes/reporters.adoc[Reporters] -- track per-document processing status
* xref:pipes/configuration.adoc[Pipeline Configuration] -- numClients, timeouts, JVM args, parse modes, emit batching
-* xref:pipes/parse-modes.adoc[Parse Modes] -- control how documents are parsed and emitted (`RMETA`, `CONCATENATE`, `CONTENT_ONLY`, `UNPACK`)
+* xref:pipes/parse-modes.adoc[Parse Modes] -- control how documents are parsed and emitted (`RMETA`, `CONCATENATE`, `CONTENT_ONLY`, `NO_PARSE`, `UNPACK`)
* xref:pipes/unpack-config.adoc[Extracting Embedded Bytes] -- extract raw bytes from embedded documents
* xref:pipes/timeouts.adoc[Timeouts] -- two-tier timeout system for handling long-running and hung parsers
diff --git a/docs/modules/ROOT/pages/pipes/parse-modes.adoc b/docs/modules/ROOT/pages/pipes/parse-modes.adoc
index a023d0b4062..2a1af6a5936 100644
--- a/docs/modules/ROOT/pages/pipes/parse-modes.adoc
+++ b/docs/modules/ROOT/pages/pipes/parse-modes.adoc
@@ -16,6 +16,8 @@
//

= Parse Modes
+:toc:
+:toclevels: 3

Tika Pipes uses `ParseMode` to control how documents are parsed and how results are emitted.
The parse mode is set on the `ParseContext` or configured in `PipesConfig`.
@@ -27,28 +29,60 @@
|Mode |Description

|`RMETA`
-|Default mode. Each embedded document produces a separate `Metadata` object.
-Results are returned as a JSON array of metadata objects.
+|Default mode. Each embedded document produces its own `Metadata` object.
+Results are returned as a JSON array of metadata objects, preserving per-embedded metadata.

|`CONCATENATE`
-|All content from embedded documents is concatenated into a single content field.
-Results are returned as a single `Metadata` object with all metadata preserved.
+|All embedded-document text is concatenated into a single content field on the **container's** `Metadata` object.
+Per-embedded metadata is **not** retained in the result. See <<concatenate-mode>>.
|`CONTENT_ONLY`
-|Parses like `CONCATENATE` but emits only the raw extracted content — no JSON wrapper,
-no metadata fields. Useful when you want just the text, markdown, or HTML output.
+|Same parsing as `CONCATENATE`, but emitters write only the raw content — no JSON wrapper,
+no metadata fields. See <<content-only-mode>>.

|`NO_PARSE`
-|Skip parsing entirely. Useful for pipelines that only need to fetch and emit raw bytes.
+|Skips parsing. Container-level MIME detection and digesting (if configured) still run.
+See <<no-parse-mode>>.

|`UNPACK`
|Extract raw bytes from embedded documents. See xref:pipes/unpack-config.adoc[Extracting Embedded Bytes].
|===
+
+== Content Handler Types
+
+The content handler type determines the format of the extracted text. It is set on the
+`ContentHandlerFactory` configured in `parseContext` (or via the CLI `-h` flag), and applies
+to all modes that produce content (`RMETA`, `CONCATENATE`, `CONTENT_ONLY`).
+
+[cols="1,1,2"]
+|===
+|Handler |Extension |Description
+
+|`t` (text)
+|`.txt`
+|Plain text output
+
+|`h` (html)
+|`.html`
+|HTML output
+
+|`x` (xml)
+|`.xml`
+|XHTML output
+
+|`m` (markdown)
+|`.md`
+|Markdown output
+
+|`b` (body)
+|`.txt`
+|Body content handler output (text from the document body only)
+|===
+
+[#concatenate-mode]
== CONCATENATE Mode

-`CONCATENATE` merges all content from embedded documents into a single content field
-while preserving all metadata from parsing:
+`CONCATENATE` merges all extracted text — from the container and all embedded documents — into a single content field on the container's `Metadata` object.

[source,json]
----
{
@@ -59,12 +93,28 @@ while preserving all metadata from parsing:
}
----

-The result is a single `Metadata` object containing the concatenated content in
-`X-TIKA:content` along with all other metadata fields (title, author, content type, etc.).
+=== What's in the result
+
+* A **single** `Metadata` object (the container's).
+* `X-TIKA:content` contains the concatenated text of the container and all reachable embedded documents.
+* Container-level metadata fields (title, author, content type, etc.) are present.
+* The handler type used is recorded in `X-TIKA:content_handler_type`.
+
+=== What's NOT in the result
+
+* **Per-embedded-document metadata is discarded.** If an embedded PDF has its own title and author, those values are not in the output. Only the container's metadata is returned. Use `RMETA` if you need per-embedded metadata.
+* Individual embedded-document parse exceptions are not surfaced as separate entries. They are handled by Tika's embedded document extractor and may appear as embedded-exception fields on the container metadata, but there is no per-embedded `Metadata` object to inspect.
+
+=== Container-level exceptions
+
+If the container parse fails (`SAXException`, `EncryptedDocumentException`, or any other `Exception`), the stack trace is caught, logged, and stored on the container metadata as `X-TIKA:container_exception`. The parse continues to a return value rather than throwing — callers must check this field if they need to detect failure.
+If the configured write limit is reached during concatenation, `X-TIKA:write_limit_reached` is set to `true`.
+
+[#content-only-mode]
== CONTENT_ONLY Mode

-`CONTENT_ONLY` is designed for use cases where you want just the extracted content
+`CONTENT_ONLY` is designed for cases where you want just the extracted content
written to storage — no JSON wrapping, no metadata overhead.
This is particularly useful for: @@ -81,22 +131,20 @@ useful for: } ---- -=== How It Works +=== How it works -1. Documents are parsed identically to `CONCATENATE` mode — all embedded content is - merged into a single content field. -2. A metadata filter automatically strips all metadata except `X-TIKA:content` and - `X-TIKA:CONTAINER_EXCEPTION` (for error tracking). +1. Documents are parsed identically to `CONCATENATE` mode — all embedded text is merged into the container's content field, and the same caveats around per-embedded metadata apply. +2. A metadata filter automatically strips all metadata except `X-TIKA:content` and `X-TIKA:container_exception` (for error tracking). 3. When the emitter is a `StreamEmitter` (such as the filesystem or S3 emitter), the raw content string is written directly as bytes — no JSON serialization. -=== Metadata Filtering +=== Metadata filtering By default, `CONTENT_ONLY` mode applies an `IncludeFieldMetadataFilter` that retains -only `X-TIKA:content` and `X-TIKA:CONTAINER_EXCEPTION`. If you set your own +only `X-TIKA:content` and `X-TIKA:container_exception`. If you set your own `MetadataFilter` on the `ParseContext`, your filter takes priority. -=== CLI Usage +=== CLI usage The `tika-async-cli` batch processor supports `CONTENT_ONLY` via the `--content-only` flag: @@ -107,33 +155,44 @@ java -jar tika-async-cli.jar -i /input -o /output -h m --content-only ---- This produces `.md` files (when using the `m` handler type) containing only the -extracted markdown content. +extracted markdown content. See <<_content_handler_types>> for the available handler types. -=== Content Handler Types +[#no-parse-mode] +== NO_PARSE Mode -The content format depends on the configured handler type: +`NO_PARSE` skips parsing entirely. The container's content type is still detected, and any configured digester still runs against the raw bytes. No text is extracted, no embedded documents are recursed into. -[cols="1,1,2"] -|=== -|Handler |Extension |Description +[source,json] +---- +{ + "parseContext": { + "parseMode": "NO_PARSE" + } +} +---- -|`t` (text) -|`.txt` -|Plain text output +=== What still runs -|`h` (html) -|`.html` -|HTML output +* **MIME detection.** The configured `Detector` runs against the input stream and populates `Content-Type` and `X-TIKA:content_type_parser_override` on the container metadata. +* **Digesting.** If a `DigesterFactory` is configured on the `ParseContext`, it runs against the raw bytes and writes the digest fields (e.g., `X-TIKA:digest:SHA256`) to the container metadata before the parse-mode check. -|`x` (xml) -|`.xml` -|XHTML output +=== What does NOT run -|`m` (markdown) -|`.md` -|Markdown output +* No parser is invoked. `X-TIKA:content` is empty. +* No embedded documents are extracted. +* No content handler is constructed (handler-type configuration is ignored for this mode). -|`b` (body) -|`.txt` -|Body content handler output -|=== +=== When to use + +* **Fetch-and-emit pipelines** that move bytes from one store to another and need only the content type and a fixed-bytes digest for downstream routing or deduplication. +* **Hash-only inventories** of large corpora where parsing every document is too expensive but a stable digest per file is required. +* **MIME triage**: detect content types across a large set so a downstream pipeline can pick the right parser, parse mode, or skip rule. 
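+
+For illustration, the container metadata emitted for a PDF processed in `NO_PARSE` mode
+with a SHA-256 digester configured might look roughly like the sketch below. The digest
+value is illustrative, and the exact field set depends on your detector and digester
+configuration; no `X-TIKA:content` is populated.
+
+[source,json]
+----
+{
+  "Content-Type": "application/pdf",
+  "X-TIKA:content_type_parser_override": "application/pdf",
+  "X-TIKA:digest:SHA256": "b5bb9d8014a0f9b1d61e21e796d78dccdf1352f23cd32812f4850b878ae4944c"
+}
+----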
+ +Because digest and detection run in `_preParse` regardless of parse mode, switching between `NO_PARSE` and the parsing modes leaves digest values stable for the same input — useful for cross-stage joins. + +[#unpack-mode] +== UNPACK Mode + +`UNPACK` extracts the raw bytes of embedded documents (rather than their parsed text) and emits them via the configured emitter. See xref:pipes/unpack-config.adoc[Extracting Embedded Bytes] for the full configuration model. + +The recursive parsing pass for `UNPACK` uses the same code path as `RMETA`; the difference is at setup and emit time, where mandatory byte extraction is enabled and emitted bytes are routed through the `UnpackHandler`. From 87b5cc2bbc386e7bbbeff815570f5f4efed042f9 Mon Sep 17 00:00:00 2001 From: tallison Date: Mon, 11 May 2026 11:43:31 -0400 Subject: [PATCH 2/7] add file system docs --- .../ROOT/pages/pipes/plugins/filesystem.adoc | 255 ++++++++++++++++++ 1 file changed, 255 insertions(+) create mode 100644 docs/modules/ROOT/pages/pipes/plugins/filesystem.adoc diff --git a/docs/modules/ROOT/pages/pipes/plugins/filesystem.adoc b/docs/modules/ROOT/pages/pipes/plugins/filesystem.adoc new file mode 100644 index 00000000000..85fba5889e2 --- /dev/null +++ b/docs/modules/ROOT/pages/pipes/plugins/filesystem.adoc @@ -0,0 +1,255 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += File System Plugin +:toc: +:toclevels: 3 + +The File System plugin (`tika-pipes-file-system`) is the most common starting point for Tika Pipes. It provides all four interfaces — fetcher, emitter, iterator, and reporter — backed by the local (or mounted) filesystem. + +[cols="2,1,3"] +|=== +|Interface |Component name |Class + +|Fetcher +|`file-system-fetcher` +|`FileSystemFetcher` + +|Emitter +|`file-system-emitter` +|`FileSystemEmitter` + +|Iterator +|`file-system-pipes-iterator` +|`FileSystemPipesIterator` + +|Reporter +|`file-system-reporter` +|`FileSystemStatusReporter` +|=== + +== Complete Pipeline Example + +The example below is the canonical filesystem-to-filesystem integration test config. Tokens like `FETCHER_BASE_PATH`, `EMITTER_BASE_PATH`, and `PLUGINS_PATHS` are placeholders the test harness substitutes; replace them with real paths in your own config. + +[source,json,subs=none] +---- +include::example$pipes-fs-pipeline.json[] +---- + +icon:github[] https://github.com/apache/tika/blob/main/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json[View source on GitHub] + +[#file-system-fetcher] +== File System Fetcher (`file-system-fetcher`) + +Reads files from a local or mounted filesystem. Fetch keys are resolved relative to `basePath`. 
+
+[source,json]
+----
+{
+  "fetchers": {
+    "fsf": {
+      "file-system-fetcher": {
+        "basePath": "/data/input",
+        "extractFileSystemMetadata": true
+      }
+    }
+  }
+}
+----
+
+The outer key (`fsf`) is the fetcher ID — referenced by `pipesIterator.fetcherId` elsewhere in the config.
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`basePath`
+|_required_
+|Base directory for fetch operations. Fetch keys are resolved relative to this path.
+
+|`extractFileSystemMetadata`
+|`false`
+|When `true`, attach file size, created, and modified timestamps to the metadata of each fetched document.
+
+|`allowAbsolutePaths`
+|`false`
+|When `true`, fetch keys may be absolute paths and `basePath` may be omitted. Use sparingly — see <<security-notes>>.
+|===
+
+[#file-system-emitter]
+== File System Emitter (`file-system-emitter`)
+
+Writes parsed results as files under `basePath`. The relative output path is derived from the emit key of each `FetchEmitTuple`.
+
+[source,json]
+----
+{
+  "emitters": {
+    "fse": {
+      "file-system-emitter": {
+        "basePath": "/data/output",
+        "fileExtension": "json",
+        "onExists": "EXCEPTION",
+        "prettyPrint": false
+      }
+    }
+  }
+}
+----
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`basePath`
+|_required_
+|Base output directory. The emit key is resolved relative to this path.
+
+|`fileExtension`
+|`json`
+|Extension appended to each output file. For `CONTENT_ONLY` mode, set this to match the handler type (`txt`, `html`, `md`, `xml`).
+
+|`onExists`
+|`EXCEPTION`
+|Behavior when the output file already exists: `SKIP` (do nothing), `REPLACE` (overwrite), `EXCEPTION` (fail loudly).
+
+|`prettyPrint`
+|`false`
+|Pretty-print JSON output. Has no effect in `CONTENT_ONLY` mode (raw bytes are written).
+|===
+
+[#file-system-iterator]
+== File System Iterator (`file-system-pipes-iterator`)
+
+Recursively walks a directory tree, emitting one `FetchEmitTuple` per file found.
+
+[source,json]
+----
+{
+  "pipes-iterator": {
+    "file-system-pipes-iterator": {
+      "basePath": "/data/input",
+      "countTotal": true,
+      "fetcherId": "fsf",
+      "emitterId": "fse"
+    }
+  }
+}
+----
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`basePath`
+|_required_
+|Root directory to walk.
+
+|`countTotal`
+|`true`
+|If `true`, walks the tree once to count files before processing begins. Enables progress reporting at the cost of an extra scan over the tree.
+
+|`fetcherId` / `emitterId`
+|_required_
+|IDs of the fetcher and emitter to bind to each emitted tuple. See xref:pipes/iterators.adoc[Pipes Iterators] for the shared iterator contract.
+|===
+
+=== Notes
+
+* Walk order is filesystem-dependent and not guaranteed stable across runs.
+* The relative path of each file (from `basePath`) becomes the fetch key, and by default also the emit key.
+* Symbolic links are followed.
+
+[#file-system-reporter]
+== File System Reporter (`file-system-reporter`)
+
+Maintains a JSON status file that summarizes pipeline progress. The reporter writes the file periodically on a background thread; per-record `report()` calls only update in-memory counters.
+
+[source,json]
+----
+{
+  "pipes-reporters": {
+    "file-system-reporter": {
+      "statusFile": "/var/log/tika/status.json",
+      "reportUpdateMs": 1000
+    }
+  }
+}
+----
+
+`pipes-reporters` accepts multiple reporters keyed by type name — see xref:pipes/reporters.adoc[Pipes Reporters] for how multiple reporters compose; a sketch follows below.
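+
+For example, a config that registers the filesystem reporter alongside the OpenSearch
+reporter might look like the sketch below; the `opensearch-reporter` component name and
+its `connection` field are assumptions for illustration, so check that plugin's page for
+its actual fields.
+
+[source,json]
+----
+{
+  "pipes-reporters": {
+    "file-system-reporter": {
+      "statusFile": "/var/log/tika/status.json",
+      "reportUpdateMs": 1000
+    },
+    "opensearch-reporter": {
+      "connection": "https://localhost:9200/tika-status"
+    }
+  }
+}
+----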
+ +=== Configuration + +[cols="1,1,3"] +|=== +|Field |Default |Description + +|`statusFile` +|_required_ +|Path of the JSON status file. The file is created on first write and overwritten in place. + +|`reportUpdateMs` +|_no default_ +|Interval in milliseconds between status-file writes. Typical values: `1000` for a low-overhead heartbeat, `100` for near-real-time updates. There is no built-in default — always set this explicitly. +|=== + +=== Status file schema + +The reporter serializes an `AsyncStatus` object to JSON, containing: + +* `asyncStatus` — current pipeline phase (`STARTED`, `COMPLETED`, `CRASHED`). +* `counts` — map of `RESULT_STATUS` to count (e.g., `PARSE_SUCCESS`, `PARSE_EXCEPTION`, `TIMEOUT`, `OOM`). +* `totalCountResult` — total documents processed and whether the enumeration is complete. +* `timestamp` — when the file was last written. +* `crashMessage` — populated only on fatal pipeline failure. + +The file is rewritten in full on each tick, not appended. + +[#watching] +=== Live status for watching applications + +The reporter is designed to support external "watchers" — UIs, dashboards, or monitoring scripts that poll the status file to display pipeline progress. To use it that way, set `reportUpdateMs` to match your desired refresh rate: + +[source,json] +---- +"reportUpdateMs": 250 +---- + +The watcher polls `statusFile` on its own interval and reads the most recent snapshot. Because the file is rewritten in full with the latest status, watchers do not need to handle partial reads. + +This pattern is used by `tika-gui-v2` to drive its progress UI: the GUI starts a pipeline subprocess, points the reporter at a temp file, and polls that file every few hundred milliseconds. + +Tradeoffs: + +* Smaller `reportUpdateMs` values mean more disk writes. On a fast SSD this is negligible, but on a slow disk (or NFS) the writer thread can become a bottleneck. +* The reporter thread sleeps between writes, so the worst-case staleness of the file is `reportUpdateMs` milliseconds plus serialization time. +* Per-record `report()` calls are cheap (counter increment only). The cost of "watching" is bounded by the periodic write, not by document throughput. + +[#security-notes] +== Security Notes + +* **`basePath` is a sandbox boundary.** The fetcher and emitter reject fetch/emit keys that resolve outside `basePath`. Do not set `allowAbsolutePaths=true` unless the source of fetch keys is fully trusted — an attacker-controlled fetch key could otherwise read arbitrary files. +* **Symlinks are followed.** A symlink under `basePath` pointing outside `basePath` may still be readable. If you need strict containment, do not allow symlinks in your input tree. +* **Output directories are created automatically.** The emitter creates intermediate directories as needed. Make sure the process's umask is appropriate for the data being written. 
From 9cf2de2c3a170cdbbbdaae2b6480b56d81290c94 Mon Sep 17 00:00:00 2001
From: tallison
Date: Mon, 11 May 2026 11:43:39 -0400
Subject: [PATCH 3/7] add file system docs

---
 .../ROOT/examples/pipes-fs-pipeline.json      |   2 +-
 docs/modules/ROOT/nav.adoc                    |   2 +
 .../ROOT/pages/pipes/getting-started.adoc     |   4 +-
 .../ROOT/pages/pipes/plugins/index.adoc       | 133 ++++++++++++++++++
 4 files changed, 139 insertions(+), 2 deletions(-)
 create mode 100644 docs/modules/ROOT/pages/pipes/plugins/index.adoc

diff --git a/docs/modules/ROOT/examples/pipes-fs-pipeline.json b/docs/modules/ROOT/examples/pipes-fs-pipeline.json
index 5a7538b1416..4b71666add9 120000
--- a/docs/modules/ROOT/examples/pipes-fs-pipeline.json
+++ b/docs/modules/ROOT/examples/pipes-fs-pipeline.json
@@ -1 +1 @@
-../../../../tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-pipeline.json
\ No newline at end of file
+../../../../tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc
index 979555022a7..ef16b190dde 100644
--- a/docs/modules/ROOT/nav.adoc
+++ b/docs/modules/ROOT/nav.adoc
@@ -31,6 +31,8 @@
** xref:pipes/unpack-config.adoc[Extracting Embedded Bytes]
** xref:pipes/timeouts.adoc[Timeouts]
** xref:pipes/cpu-sizing.adoc[Forked-JVM CPU Sizing]
+** xref:pipes/plugins/index.adoc[Plugins]
+*** xref:pipes/plugins/filesystem.adoc[File System]
* xref:configuration/index.adoc[Configuration]
** xref:configuration/parsers/pdf-parser.adoc[PDF Parser]
** xref:configuration/parsers/tesseract-ocr-parser.adoc[Tesseract OCR]
diff --git a/docs/modules/ROOT/pages/pipes/getting-started.adoc b/docs/modules/ROOT/pages/pipes/getting-started.adoc
index 6ee6c451482..e52e02f1acd 100644
--- a/docs/modules/ROOT/pages/pipes/getting-started.adoc
+++ b/docs/modules/ROOT/pages/pipes/getting-started.adoc
@@ -64,7 +64,9 @@ pipeline:
----
include::example$pipes-fs-pipeline.json[]
----
-icon:github[] https://github.com/apache/tika/blob/main/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-pipeline.json[View source on GitHub]
+icon:github[] https://github.com/apache/tika/blob/main/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json[View source on GitHub]
+
+NOTE: Values such as `FETCHER_BASE_PATH`, `EMITTER_BASE_PATH`, and `PLUGINS_PATHS` are placeholders that the integration tests substitute at runtime. Replace them with real paths in your own config.

Run it with:

diff --git a/docs/modules/ROOT/pages/pipes/plugins/index.adoc b/docs/modules/ROOT/pages/pipes/plugins/index.adoc
new file mode 100644
index 00000000000..8542fa20343
--- /dev/null
+++ b/docs/modules/ROOT/pages/pipes/plugins/index.adoc
@@ -0,0 +1,133 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. +// + += Pipes Plugins + +Tika Pipes is extensible through plugins. Each plugin lives in its own Maven module and can implement one or more of the four pipes extension points: + +* **Fetcher** — retrieves document bytes from a source. +* **Emitter** — writes parsed results to a destination. +* **Iterator** (`PipesIterator`) — enumerates documents to process as `FetchEmitTuple` records. +* **Reporter** (`PipesReporter`) — records per-document processing status. + +Many plugins implement more than one (e.g., the S3 plugin provides fetcher, emitter, and iterator). The pages below document each plugin once, with one section per implemented interface. + +== Plugin / Interface Matrix + +[cols="2,1,1,1,1"] +|=== +|Plugin |Fetcher |Emitter |Iterator |Reporter + +|xref:pipes/plugins/filesystem.adoc[File System] +|✓ +|✓ +|✓ +|✓ + +|xref:pipes/plugins/s3.adoc[Amazon S3] +|✓ +|✓ +|✓ +|— + +|xref:pipes/plugins/gcs.adoc[Google Cloud Storage] +|✓ +|✓ +|✓ +|— + +|xref:pipes/plugins/azblob.adoc[Azure Blob Storage] +|✓ +|✓ +|✓ +|— + +|xref:pipes/plugins/opensearch.adoc[OpenSearch] +|— +|✓ +|— +|✓ + +|xref:pipes/plugins/elasticsearch.adoc[Elasticsearch] +|— +|✓ +|— +|✓ + +|xref:pipes/plugins/solr.adoc[Solr] +|— +|✓ +|✓ +|— + +|xref:pipes/plugins/jdbc.adoc[JDBC] +|— +|✓ +|✓ +|✓ + +|xref:pipes/plugins/kafka.adoc[Kafka] +|— +|✓ +|✓ +|— + +|xref:pipes/plugins/http.adoc[HTTP] +|✓ +|— +|— +|— + +|xref:pipes/plugins/google-drive.adoc[Google Drive] +|✓ +|— +|— +|— + +|xref:pipes/plugins/microsoft-graph.adoc[Microsoft Graph] +|✓ +|— +|— +|— + +|xref:pipes/plugins/atlassian-jwt.adoc[Atlassian JWT] +|✓ +|— +|— +|— + +|xref:pipes/plugins/csv.adoc[CSV] +|— +|— +|✓ +|— + +|xref:pipes/plugins/json.adoc[JSON] +|— +|— +|✓ +|— +|=== + +== Interface Overviews + +For descriptions of the interfaces themselves — their contracts, the shared concepts (`FetchKey`, `FetchEmitTuple`, `baseConfig`, etc.), and how they fit into a pipeline — see: + +* xref:pipes/fetchers.adoc[Fetchers] +* xref:pipes/emitters.adoc[Emitters] +* xref:pipes/iterators.adoc[Pipes Iterators] +* xref:pipes/reporters.adoc[Pipes Reporters] From e6d9e53c1239aa23b6630a5bd3a5646275e45224 Mon Sep 17 00:00:00 2001 From: tallison Date: Mon, 11 May 2026 16:07:05 -0400 Subject: [PATCH 4/7] add s3 --- .../ROOT/examples/pipes-s3-emitter.json | 1 + .../ROOT/examples/pipes-s3-fetcher.json | 1 + .../ROOT/examples/pipes-s3-iterator.json | 1 + .../ROOT/examples/pipes-s3-pipeline.json | 1 + docs/modules/ROOT/nav.adoc | 1 + .../ROOT/pages/pipes/plugins/index.adoc | 2 +- docs/modules/ROOT/pages/pipes/plugins/s3.adoc | 242 ++++++++++++++++++ .../tika/pipes/s3/ConfigExamplesTest.java | 136 ++++++++++ .../resources/config-examples/s3-emitter.json | 14 + .../resources/config-examples/s3-fetcher.json | 15 ++ .../config-examples/s3-pipeline.json | 49 ++++ .../config-examples/s3-pipes-iterator.json | 13 + 12 files changed, 475 insertions(+), 1 deletion(-) create mode 120000 docs/modules/ROOT/examples/pipes-s3-emitter.json create mode 120000 docs/modules/ROOT/examples/pipes-s3-fetcher.json create mode 120000 docs/modules/ROOT/examples/pipes-s3-iterator.json create mode 120000 docs/modules/ROOT/examples/pipes-s3-pipeline.json create mode 100644 docs/modules/ROOT/pages/pipes/plugins/s3.adoc create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/java/org/apache/tika/pipes/s3/ConfigExamplesTest.java create mode 100644 
tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-emitter.json create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-fetcher.json create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipeline.json create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipes-iterator.json diff --git a/docs/modules/ROOT/examples/pipes-s3-emitter.json b/docs/modules/ROOT/examples/pipes-s3-emitter.json new file mode 120000 index 00000000000..6f05a73ec21 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-s3-emitter.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-emitter.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-s3-fetcher.json b/docs/modules/ROOT/examples/pipes-s3-fetcher.json new file mode 120000 index 00000000000..b24bd4fa27a --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-s3-fetcher.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-fetcher.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-s3-iterator.json b/docs/modules/ROOT/examples/pipes-s3-iterator.json new file mode 120000 index 00000000000..db1b210e827 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-s3-iterator.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipes-iterator.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-s3-pipeline.json b/docs/modules/ROOT/examples/pipes-s3-pipeline.json new file mode 120000 index 00000000000..cc6f573ec2c --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-s3-pipeline.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipeline.json \ No newline at end of file diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc index ef16b190dde..90fce8701d9 100644 --- a/docs/modules/ROOT/nav.adoc +++ b/docs/modules/ROOT/nav.adoc @@ -33,6 +33,7 @@ ** xref:pipes/cpu-sizing.adoc[Forked-JVM CPU Sizing] ** xref:pipes/plugins/index.adoc[Plugins] *** xref:pipes/plugins/filesystem.adoc[File System] +*** xref:pipes/plugins/s3.adoc[Amazon S3] * xref:configuration/index.adoc[Configuration] ** xref:configuration/parsers/pdf-parser.adoc[PDF Parser] ** xref:configuration/parsers/tesseract-ocr-parser.adoc[Tesseract OCR] diff --git a/docs/modules/ROOT/pages/pipes/plugins/index.adoc b/docs/modules/ROOT/pages/pipes/plugins/index.adoc index 8542fa20343..d5173d2032e 100644 --- a/docs/modules/ROOT/pages/pipes/plugins/index.adoc +++ b/docs/modules/ROOT/pages/pipes/plugins/index.adoc @@ -125,7 +125,7 @@ Many plugins implement more than one (e.g., the S3 plugin provides fetcher, emit == Interface Overviews -For descriptions of the interfaces themselves — their contracts, the shared concepts (`FetchKey`, `FetchEmitTuple`, `baseConfig`, etc.), and how they fit into a pipeline — see: +For descriptions of the interfaces themselves — their contracts, the shared concepts (`FetchKey`, `FetchEmitTuple`, `fetcherId`/`emitterId` wiring, etc.), and how they fit into a pipeline — see: * xref:pipes/fetchers.adoc[Fetchers] * xref:pipes/emitters.adoc[Emitters] diff --git a/docs/modules/ROOT/pages/pipes/plugins/s3.adoc b/docs/modules/ROOT/pages/pipes/plugins/s3.adoc new file mode 100644 index 00000000000..90d0960f06e --- 
/dev/null
+++ b/docs/modules/ROOT/pages/pipes/plugins/s3.adoc
@@ -0,0 +1,242 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Amazon S3 Plugin
+:toc:
+:toclevels: 3
+
+The Amazon S3 plugin (`tika-pipes-s3`) provides fetcher, emitter, and iterator interfaces for objects in S3 (or any S3-compatible service such as MinIO).
+
+[cols="2,1,3"]
+|===
+|Interface |Component name |Class
+
+|Fetcher
+|`s3-fetcher`
+|`S3Fetcher`
+
+|Emitter
+|`s3-emitter`
+|`S3Emitter`
+
+|Iterator
+|`s3-pipes-iterator`
+|`S3PipesIterator`
+|===
+
+[#credentials]
+== Credentials
+
+All three components share the same `credentialsProvider` selector:
+
+* `profile` — reads credentials from the local AWS profile named by `profile` (e.g., `default`).
+* `instance` — uses the instance/container role attached to the host (EC2 IAM role, ECS task role, etc.). No additional fields needed.
+* `key_secret` — reads `accessKey` and `secretKey` from the config. Avoid checking these into source control; prefer environment-variable substitution or one of the other providers.
+
+The emitter's `validate()` enforces these values, but the fetcher and iterator do not — they fail later when the AWS SDK tries to resolve credentials.
+
+[#s3-fetcher]
+== S3 Fetcher (`s3-fetcher`)
+
+Reads objects from an S3 bucket. The fetch key is the S3 key under `prefix` (if set).
+
+[source,json]
+----
+include::example$pipes-s3-fetcher.json[]
+----
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`bucket`
+|_required_
+|S3 bucket name.
+
+|`region`
+|_required_
+|AWS region (e.g., `us-east-1`).
+
+|`prefix`
+|_no default_
+|Optional key prefix. Fetch keys are resolved underneath this prefix.
+
+|`credentialsProvider`
+|_required_
+|One of `profile`, `instance`, `key_secret`. See <<credentials>>.
+
+|`profile` / `accessKey` / `secretKey`
+|_conditional_
+|Required by the matching `credentialsProvider`.
+
+|`spoolToTemp`
+|`true`
+|If `true`, the fetched object is spooled to a temp file before being parsed.
+
+|`extractUserMetadata`
+|`true`
+|If `true`, S3 user-metadata is copied into the parsed `Metadata`.
+
+|`maxConnections`
+|`0`
+|Maximum HTTP connections in the S3 client pool. `0` lets the SDK pick a default.
+
+|`maxLength`
+|`-1`
+|Maximum object size, in bytes. `-1` means no limit.
+
+|`endpointConfigurationService`
+|_no default_
+|Custom S3 endpoint, for S3-compatible services such as MinIO or LocalStack.
+
+|`pathStyleAccessEnabled`
+|`false`
+|Force path-style URLs (e.g., `https://endpoint/bucket/key`). Required by some S3-compatible services.
+
+|`throttleSeconds`
+|_no default_
+|Optional array of retry backoff delays, in seconds; the _n_-th consecutive failure sleeps for the _n_-th value.
+|===
+
+[#s3-emitter]
+== S3 Emitter (`s3-emitter`)
+
+Writes parsed results back to an S3 bucket. The emit key (relative to `prefix`) is derived from the `FetchEmitTuple`.
+
+[source,json]
+----
+include::example$pipes-s3-emitter.json[]
+----
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`bucket`
+|_required_
+|Destination S3 bucket name (validated non-blank).
+
+|`region`
+|_required_
+|AWS region (validated non-blank).
+
+|`credentialsProvider`
+|_required_
+|One of `profile`, `instance`, `key_secret` (validated). See <<credentials>>.
+
+|`profile` / `accessKey` / `secretKey`
+|_conditional_
+|Required by the matching `credentialsProvider` (validated).
+
+|`prefix`
+|_no default_
+|Optional key prefix. A trailing `/` is stripped automatically.
+
+|`fileExtension`
+|`json`
+|Extension appended to each emitted key.
+
+|`spoolToTemp`
+|`true`
+|If `true`, output is spooled locally before being uploaded.
+
+|`maxConnections`
+|`50`
+|Maximum HTTP connections in the S3 client pool.
+
+|`endpointConfigurationService`
+|_no default_
+|Custom S3 endpoint, for S3-compatible services.
+
+|`pathStyleAccessEnabled`
+|`false`
+|Force path-style URLs.
+|===
+
+[#s3-iterator]
+== S3 Iterator (`s3-pipes-iterator`)
+
+Lists objects under a bucket/prefix and emits one `FetchEmitTuple` per object found.
+
+[source,json]
+----
+include::example$pipes-s3-iterator.json[]
+----
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`bucket`
+|_required_
+|S3 bucket to enumerate.
+
+|`region`
+|_required_
+|AWS region.
+
+|`prefix`
+|`""`
+|Key prefix to scope the listing.
+
+|`credentialsProvider`
+|_optional_
+|One of `profile`, `instance`, `key_secret`. See <<credentials>>.
+
+|`profile` / `accessKey` / `secretKey` / `endpointConfigurationService`
+|_conditional_
+|Auth fields, mirroring the fetcher and emitter.
+
+|`fileNamePattern`
+|_no default_
+|Optional regex; only keys whose name matches are emitted.
+
+|`maxConnections`
+|`50`
+|Maximum HTTP connections in the S3 client pool.
+
+|`pathStyleAccessEnabled`
+|`false`
+|Force path-style URLs.
+
+|`fetcherId` / `emitterId`
+|_required_
+|IDs of the fetcher and emitter to bind to each emitted tuple. See xref:pipes/iterators.adoc[Pipes Iterators] for the shared iterator contract.
+|===
+
+[#s3-pipeline]
+== Complete Pipeline Example
+
+The example below wires the S3 fetcher, emitter, and iterator into a complete pipeline that lists `s3://my-tika-input/incoming/` and writes results to `s3://my-tika-output/results/`.
+
+[source,json]
+----
+include::example$pipes-s3-pipeline.json[]
+----
+
+[#notes]
+== Notes
+
+* The fetcher, emitter, and iterator each maintain their own S3 client. Auth and endpoint settings need to be configured per component, not globally.
+* The S3 SDK enforces TLS 1.2+ by default, so data is encrypted in transit. For at-rest encryption, configure bucket-level SSE on the AWS side.
+* When using `endpointConfigurationService` against MinIO or LocalStack, you almost always need `pathStyleAccessEnabled: true`; a sketch follows below.
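+
+For example, a fetcher aimed at a local MinIO container might look like the sketch below.
+The endpoint URL and the MinIO default credentials are illustrative, and this assumes
+`endpointConfigurationService` takes a plain URL string; prefer environment-variable
+substitution over hard-coding real keys (see <<credentials>>).
+
+[source,json]
+----
+{
+  "fetchers": {
+    "s3f": {
+      "s3-fetcher": {
+        "bucket": "my-tika-input",
+        "region": "us-east-1",
+        "credentialsProvider": "key_secret",
+        "accessKey": "minioadmin",
+        "secretKey": "minioadmin",
+        "endpointConfigurationService": "http://localhost:9000",
+        "pathStyleAccessEnabled": true
+      }
+    }
+  }
+}
+----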
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/java/org/apache/tika/pipes/s3/ConfigExamplesTest.java b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/java/org/apache/tika/pipes/s3/ConfigExamplesTest.java new file mode 100644 index 00000000000..f248d8194e4 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/java/org/apache/tika/pipes/s3/ConfigExamplesTest.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.s3; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.config.loader.TikaLoader; +import org.apache.tika.pipes.emitter.s3.S3EmitterConfig; +import org.apache.tika.pipes.fetcher.s3.config.S3FetcherConfig; +import org.apache.tika.pipes.iterator.s3.S3PipesIteratorConfig; + +/** + * Validates S3 fetcher/emitter/iterator configuration examples used in documentation. + *

+ * The JSON configuration examples are stored in {@code src/test/resources/config-examples/} + * and are included directly in the AsciiDoc documentation via the {@code include::} directive. + */ +public class ConfigExamplesTest { + + private static final String EXAMPLES_DIR = "/config-examples/"; + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + @TempDir + Path tempDir; + + private String readExample(String resourceName) throws Exception { + try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) { + assertNotNull(is, "Resource not found: " + resourceName); + return new String(is.readAllBytes(), StandardCharsets.UTF_8); + } + } + + private void loadViaTikaLoader(String resourceName) throws Exception { + String json = readExample(resourceName); + Path configFile = tempDir.resolve("tika-config.json"); + Files.writeString(configFile, json, StandardCharsets.UTF_8); + TikaLoader loader = TikaLoader.load(configFile); + assertNotNull(loader, "TikaLoader should not be null for: " + resourceName); + } + + private JsonNode innerComponent(String json, String section, String id, String typeName) + throws Exception { + JsonNode root = OBJECT_MAPPER.readTree(json); + JsonNode sectionNode = root.get(section); + assertNotNull(sectionNode, "Missing section: " + section); + JsonNode idNode = id == null ? sectionNode : sectionNode.get(id); + assertNotNull(idNode, "Missing id: " + id); + JsonNode typed = idNode.get(typeName); + assertNotNull(typed, "Missing type: " + typeName); + return typed; + } + + @Test + public void testS3FetcherConfig() throws Exception { + loadViaTikaLoader("s3-fetcher.json"); + + JsonNode inner = innerComponent(readExample("s3-fetcher.json"), + "fetchers", "s3f", "s3-fetcher"); + S3FetcherConfig config = S3FetcherConfig.load(inner.toString()); + assertEquals("my-tika-input", config.getBucket()); + assertEquals("us-east-1", config.getRegion()); + assertEquals("profile", config.getCredentialsProvider()); + assertEquals("default", config.getProfile()); + } + + @Test + public void testS3EmitterConfig() throws Exception { + loadViaTikaLoader("s3-emitter.json"); + + JsonNode inner = innerComponent(readExample("s3-emitter.json"), + "emitters", "s3e", "s3-emitter"); + S3EmitterConfig config = S3EmitterConfig.load(inner.toString()); + assertEquals("my-tika-output", config.bucket()); + assertEquals("us-east-1", config.region()); + assertEquals("profile", config.credentialsProvider()); + assertEquals("json", config.fileExtension()); + // exercises required-field + credentialsProvider whitelist validation + config.validate(); + } + + @Test + public void testS3IteratorConfig() throws Exception { + loadViaTikaLoader("s3-pipes-iterator.json"); + + JsonNode inner = innerComponent(readExample("s3-pipes-iterator.json"), + "pipes-iterator", null, "s3-pipes-iterator"); + S3PipesIteratorConfig config = S3PipesIteratorConfig.load(inner.toString()); + assertEquals("my-tika-input", config.getBucket()); + assertEquals("us-east-1", config.getRegion()); + assertEquals("s3f", config.getFetcherId()); + assertEquals("s3e", config.getEmitterId()); + } + + @Test + public void testS3PipelineConfig() throws Exception { + loadViaTikaLoader("s3-pipeline.json"); + + String json = readExample("s3-pipeline.json"); + S3FetcherConfig fetcher = S3FetcherConfig.load( + innerComponent(json, "fetchers", "s3f", "s3-fetcher").toString()); + S3EmitterConfig emitter = S3EmitterConfig.load( + innerComponent(json, "emitters", "s3e", "s3-emitter").toString()); + 
S3PipesIteratorConfig iterator = S3PipesIteratorConfig.load( + innerComponent(json, "pipes-iterator", null, "s3-pipes-iterator").toString()); + + emitter.validate(); + assertEquals(fetcher.getBucket(), iterator.getBucket()); + assertEquals("s3f", iterator.getFetcherId()); + assertEquals("s3e", iterator.getEmitterId()); + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-emitter.json b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-emitter.json new file mode 100644 index 00000000000..8cd5557db1c --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-emitter.json @@ -0,0 +1,14 @@ +{ + "emitters": { + "s3e": { + "s3-emitter": { + "bucket": "my-tika-output", + "region": "us-east-1", + "prefix": "results/", + "fileExtension": "json", + "credentialsProvider": "profile", + "profile": "default" + } + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-fetcher.json b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-fetcher.json new file mode 100644 index 00000000000..8047fee2b08 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-fetcher.json @@ -0,0 +1,15 @@ +{ + "fetchers": { + "s3f": { + "s3-fetcher": { + "bucket": "my-tika-input", + "region": "us-east-1", + "prefix": "incoming/", + "credentialsProvider": "profile", + "profile": "default", + "extractUserMetadata": true, + "spoolToTemp": true + } + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipeline.json b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipeline.json new file mode 100644 index 00000000000..1f17aa7081d --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipeline.json @@ -0,0 +1,49 @@ +{ + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + "throwOnWriteLimitReached": true + } + }, + "fetchers": { + "s3f": { + "s3-fetcher": { + "bucket": "my-tika-input", + "region": "us-east-1", + "prefix": "incoming/", + "credentialsProvider": "profile", + "profile": "default", + "extractUserMetadata": true + } + } + }, + "emitters": { + "s3e": { + "s3-emitter": { + "bucket": "my-tika-output", + "region": "us-east-1", + "prefix": "results/", + "fileExtension": "json", + "credentialsProvider": "profile", + "profile": "default" + } + } + }, + "pipes-iterator": { + "s3-pipes-iterator": { + "bucket": "my-tika-input", + "region": "us-east-1", + "prefix": "incoming/", + "credentialsProvider": "profile", + "profile": "default", + "fetcherId": "s3f", + "emitterId": "s3e" + } + }, + "pipes": { + "parseMode": "RMETA", + "onParseException": "EMIT", + "numClients": 4 + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipes-iterator.json b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipes-iterator.json new file mode 100644 index 00000000000..e1fb2e98750 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipes-iterator.json @@ -0,0 +1,13 @@ +{ + "pipes-iterator": { + "s3-pipes-iterator": { + "bucket": "my-tika-input", + "region": "us-east-1", + "prefix": "incoming/", + "credentialsProvider": "profile", + "profile": "default", + "fetcherId": "s3f", + "emitterId": 
"s3e" + } + } +} From cbc65c9bb88b85763660bb6d556a3dd87e5601d9 Mon Sep 17 00:00:00 2001 From: tallison Date: Mon, 11 May 2026 16:11:28 -0400 Subject: [PATCH 5/7] gcs --- .../ROOT/examples/pipes-gcs-emitter.json | 1 + .../ROOT/examples/pipes-gcs-fetcher.json | 1 + .../ROOT/examples/pipes-gcs-iterator.json | 1 + .../ROOT/examples/pipes-gcs-pipeline.json | 1 + docs/modules/ROOT/nav.adoc | 1 + .../modules/ROOT/pages/pipes/plugins/gcs.adoc | 166 ++++++++++++++++++ .../tika/pipes/gcs/ConfigExamplesTest.java | 133 ++++++++++++++ .../config-examples/gcs-emitter.json | 12 ++ .../config-examples/gcs-fetcher.json | 12 ++ .../config-examples/gcs-pipeline.json | 42 +++++ .../config-examples/gcs-pipes-iterator.json | 11 ++ 11 files changed, 381 insertions(+) create mode 120000 docs/modules/ROOT/examples/pipes-gcs-emitter.json create mode 120000 docs/modules/ROOT/examples/pipes-gcs-fetcher.json create mode 120000 docs/modules/ROOT/examples/pipes-gcs-iterator.json create mode 120000 docs/modules/ROOT/examples/pipes-gcs-pipeline.json create mode 100644 docs/modules/ROOT/pages/pipes/plugins/gcs.adoc create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/java/org/apache/tika/pipes/gcs/ConfigExamplesTest.java create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-emitter.json create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-fetcher.json create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipeline.json create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipes-iterator.json diff --git a/docs/modules/ROOT/examples/pipes-gcs-emitter.json b/docs/modules/ROOT/examples/pipes-gcs-emitter.json new file mode 120000 index 00000000000..48c994f74ad --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-gcs-emitter.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-emitter.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-gcs-fetcher.json b/docs/modules/ROOT/examples/pipes-gcs-fetcher.json new file mode 120000 index 00000000000..8b390e310c0 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-gcs-fetcher.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-fetcher.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-gcs-iterator.json b/docs/modules/ROOT/examples/pipes-gcs-iterator.json new file mode 120000 index 00000000000..d4f6b6b9347 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-gcs-iterator.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipes-iterator.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-gcs-pipeline.json b/docs/modules/ROOT/examples/pipes-gcs-pipeline.json new file mode 120000 index 00000000000..621bad767e0 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-gcs-pipeline.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipeline.json \ No newline at end of file diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc index 90fce8701d9..e5e2a096244 100644 --- a/docs/modules/ROOT/nav.adoc +++ b/docs/modules/ROOT/nav.adoc @@ -34,6 +34,7 @@ ** xref:pipes/plugins/index.adoc[Plugins] *** xref:pipes/plugins/filesystem.adoc[File System] 
*** xref:pipes/plugins/s3.adoc[Amazon S3] +*** xref:pipes/plugins/gcs.adoc[Google Cloud Storage] * xref:configuration/index.adoc[Configuration] ** xref:configuration/parsers/pdf-parser.adoc[PDF Parser] ** xref:configuration/parsers/tesseract-ocr-parser.adoc[Tesseract OCR] diff --git a/docs/modules/ROOT/pages/pipes/plugins/gcs.adoc b/docs/modules/ROOT/pages/pipes/plugins/gcs.adoc new file mode 100644 index 00000000000..d639580d0f7 --- /dev/null +++ b/docs/modules/ROOT/pages/pipes/plugins/gcs.adoc @@ -0,0 +1,166 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Google Cloud Storage Plugin +:toc: +:toclevels: 3 + +The Google Cloud Storage plugin (`tika-pipes-gcs`) provides fetcher, emitter, and iterator interfaces for objects in GCS buckets. + +[cols="2,1,3"] +|=== +|Interface |Component name |Class + +|Fetcher +|`gcs-fetcher` +|`GCSFetcher` + +|Emitter +|`gcs-emitter` +|`GCSEmitter` + +|Iterator +|`gcs-pipes-iterator` +|`GCSPipesIterator` +|=== + +[#credentials] +== Credentials + +The GCS plugin relies on Google's Application Default Credentials chain — there are no credential fields in the JSON config itself. Set credentials by: + +* Running on a GCP service (GCE/GKE/Cloud Run) — uses the attached service account automatically. +* Setting the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of a service-account JSON key. +* Running `gcloud auth application-default login` for local development. + +The `projectId` field in each component selects which GCP project to bill the API calls against; the service account or user must have storage access to the named bucket. + +[#gcs-fetcher] +== GCS Fetcher (`gcs-fetcher`) + +Reads objects from a GCS bucket. The fetch key is the object name. + +[source,json] +---- +include::example$pipes-gcs-fetcher.json[] +---- + +=== Configuration + +[cols="1,1,3"] +|=== +|Field |Default |Description + +|`projectId` +|_required_ +|GCP project ID for billing/authentication. + +|`bucket` +|_required_ +|GCS bucket name. + +|`spoolToTemp` +|`true` +|If `true`, the fetched object is spooled to a temp file before parsing. + +|`extractUserMetadata` +|`true` +|If `true`, GCS custom metadata is copied into the parsed `Metadata`. +|=== + +[#gcs-emitter] +== GCS Emitter (`gcs-emitter`) + +Writes parsed results to a GCS bucket. The emit key (relative to `prefix`) is derived from the `FetchEmitTuple`. + +[source,json] +---- +include::example$pipes-gcs-emitter.json[] +---- + +=== Configuration + +[cols="1,1,3"] +|=== +|Field |Default |Description + +|`projectId` +|_required_ +|GCP project ID (validated non-blank). + +|`bucket` +|_required_ +|Destination GCS bucket (validated non-blank). + +|`prefix` +|_no default_ +|Optional object-name prefix. A trailing `/` is stripped automatically. 
+ +|`fileExtension` +|`json` +|Extension appended to each emitted object name. +|=== + +[#gcs-iterator] +== GCS Iterator (`gcs-pipes-iterator`) + +Lists objects under a bucket/prefix and emits one `FetchEmitTuple` per object. + +[source,json] +---- +include::example$pipes-gcs-iterator.json[] +---- + +=== Configuration + +[cols="1,1,3"] +|=== +|Field |Default |Description + +|`bucket` +|_required_ +|GCS bucket to enumerate. + +|`projectId` +|`""` +|GCP project ID for the listing API call. + +|`prefix` +|`""` +|Object-name prefix to scope the listing. + +|`fetcherId` / `emitterId` +|_required_ +|IDs of the fetcher and emitter to bind to each emitted tuple. See xref:pipes/iterators.adoc[Pipes Iterators] for the shared iterator contract. +|=== + +[#gcs-pipeline] +== Complete Pipeline Example + +The example below wires the GCS fetcher, emitter, and iterator together for a bucket-to-bucket pipeline. + +[source,json] +---- +include::example$pipes-gcs-pipeline.json[] +---- + +[#notes] +== Notes + +* The GCS plugin uses the official `google-cloud-storage` SDK. Set `GOOGLE_APPLICATION_CREDENTIALS` (or rely on workload identity / metadata server) to authenticate. +* Each component creates its own `Storage` client. Heavy throughput should be balanced against your project's per-second request quota. +* Unlike S3, there is no `path-style` toggle — GCS uses a single global endpoint. diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/java/org/apache/tika/pipes/gcs/ConfigExamplesTest.java b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/java/org/apache/tika/pipes/gcs/ConfigExamplesTest.java new file mode 100644 index 00000000000..7cfc1f3fb16 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/java/org/apache/tika/pipes/gcs/ConfigExamplesTest.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.gcs; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.config.loader.TikaLoader; +import org.apache.tika.pipes.emitter.gcs.GCSEmitterConfig; +import org.apache.tika.pipes.fetcher.gcs.config.GCSFetcherConfig; +import org.apache.tika.pipes.iterator.gcs.GCSPipesIteratorConfig; + +/** + * Validates GCS fetcher/emitter/iterator configuration examples used in documentation. + *
+ * The JSON configuration examples are stored in {@code src/test/resources/config-examples/} + * and are included directly in the AsciiDoc documentation via the {@code include::} directive. + */ +public class ConfigExamplesTest { + + private static final String EXAMPLES_DIR = "/config-examples/"; + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + @TempDir + Path tempDir; + + private String readExample(String resourceName) throws Exception { + try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) { + assertNotNull(is, "Resource not found: " + resourceName); + return new String(is.readAllBytes(), StandardCharsets.UTF_8); + } + } + + private void loadViaTikaLoader(String resourceName) throws Exception { + String json = readExample(resourceName); + Path configFile = tempDir.resolve("tika-config.json"); + Files.writeString(configFile, json, StandardCharsets.UTF_8); + TikaLoader loader = TikaLoader.load(configFile); + assertNotNull(loader, "TikaLoader should not be null for: " + resourceName); + } + + private JsonNode innerComponent(String json, String section, String id, String typeName) + throws Exception { + JsonNode root = OBJECT_MAPPER.readTree(json); + JsonNode sectionNode = root.get(section); + assertNotNull(sectionNode, "Missing section: " + section); + JsonNode idNode = id == null ? sectionNode : sectionNode.get(id); + assertNotNull(idNode, "Missing id: " + id); + JsonNode typed = idNode.get(typeName); + assertNotNull(typed, "Missing type: " + typeName); + return typed; + } + + @Test + public void testGCSFetcherConfig() throws Exception { + loadViaTikaLoader("gcs-fetcher.json"); + + JsonNode inner = innerComponent(readExample("gcs-fetcher.json"), + "fetchers", "gcsf", "gcs-fetcher"); + GCSFetcherConfig config = GCSFetcherConfig.load(inner.toString()); + assertEquals("my-gcp-project", config.getProjectId()); + assertEquals("my-tika-input", config.getBucket()); + } + + @Test + public void testGCSEmitterConfig() throws Exception { + loadViaTikaLoader("gcs-emitter.json"); + + JsonNode inner = innerComponent(readExample("gcs-emitter.json"), + "emitters", "gcse", "gcs-emitter"); + GCSEmitterConfig config = GCSEmitterConfig.load(inner.toString()); + assertEquals("my-gcp-project", config.projectId()); + assertEquals("my-tika-output", config.bucket()); + assertEquals("json", config.fileExtension()); + config.validate(); + assertEquals("results", config.getNormalizedPrefix()); + } + + @Test + public void testGCSIteratorConfig() throws Exception { + loadViaTikaLoader("gcs-pipes-iterator.json"); + + JsonNode inner = innerComponent(readExample("gcs-pipes-iterator.json"), + "pipes-iterator", null, "gcs-pipes-iterator"); + GCSPipesIteratorConfig config = GCSPipesIteratorConfig.load(inner.toString()); + assertEquals("my-gcp-project", config.getProjectId()); + assertEquals("my-tika-input", config.getBucket()); + assertEquals("gcsf", config.getFetcherId()); + assertEquals("gcse", config.getEmitterId()); + } + + @Test + public void testGCSPipelineConfig() throws Exception { + loadViaTikaLoader("gcs-pipeline.json"); + + String json = readExample("gcs-pipeline.json"); + GCSFetcherConfig fetcher = GCSFetcherConfig.load( + innerComponent(json, "fetchers", "gcsf", "gcs-fetcher").toString()); + GCSEmitterConfig emitter = GCSEmitterConfig.load( + innerComponent(json, "emitters", "gcse", "gcs-emitter").toString()); + GCSPipesIteratorConfig iterator = GCSPipesIteratorConfig.load( + innerComponent(json, "pipes-iterator", null, "gcs-pipes-iterator").toString()); 
+ + emitter.validate(); + assertEquals(fetcher.getBucket(), iterator.getBucket()); + assertEquals("gcsf", iterator.getFetcherId()); + assertEquals("gcse", iterator.getEmitterId()); + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-emitter.json b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-emitter.json new file mode 100644 index 00000000000..6ba06037924 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-emitter.json @@ -0,0 +1,12 @@ +{ + "emitters": { + "gcse": { + "gcs-emitter": { + "projectId": "my-gcp-project", + "bucket": "my-tika-output", + "prefix": "results/", + "fileExtension": "json" + } + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-fetcher.json b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-fetcher.json new file mode 100644 index 00000000000..89ab85eed3b --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-fetcher.json @@ -0,0 +1,12 @@ +{ + "fetchers": { + "gcsf": { + "gcs-fetcher": { + "projectId": "my-gcp-project", + "bucket": "my-tika-input", + "extractUserMetadata": true, + "spoolToTemp": true + } + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipeline.json b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipeline.json new file mode 100644 index 00000000000..8c483e51049 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipeline.json @@ -0,0 +1,42 @@ +{ + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + "throwOnWriteLimitReached": true + } + }, + "fetchers": { + "gcsf": { + "gcs-fetcher": { + "projectId": "my-gcp-project", + "bucket": "my-tika-input", + "extractUserMetadata": true + } + } + }, + "emitters": { + "gcse": { + "gcs-emitter": { + "projectId": "my-gcp-project", + "bucket": "my-tika-output", + "prefix": "results/", + "fileExtension": "json" + } + } + }, + "pipes-iterator": { + "gcs-pipes-iterator": { + "projectId": "my-gcp-project", + "bucket": "my-tika-input", + "prefix": "incoming/", + "fetcherId": "gcsf", + "emitterId": "gcse" + } + }, + "pipes": { + "parseMode": "RMETA", + "onParseException": "EMIT", + "numClients": 4 + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipes-iterator.json b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipes-iterator.json new file mode 100644 index 00000000000..756e087848b --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipes-iterator.json @@ -0,0 +1,11 @@ +{ + "pipes-iterator": { + "gcs-pipes-iterator": { + "projectId": "my-gcp-project", + "bucket": "my-tika-input", + "prefix": "incoming/", + "fetcherId": "gcsf", + "emitterId": "gcse" + } + } +} From 06270d3ae2d0b2f31e173f3fcbdb5376497d9432 Mon Sep 17 00:00:00 2001 From: tallison Date: Mon, 11 May 2026 16:17:42 -0400 Subject: [PATCH 6/7] azblob --- .../ROOT/examples/pipes-azblob-emitter.json | 1 + .../ROOT/examples/pipes-azblob-fetcher.json | 1 + .../ROOT/examples/pipes-azblob-iterator.json | 1 + .../ROOT/examples/pipes-azblob-pipeline.json | 1 + docs/modules/ROOT/nav.adoc | 1 + .../ROOT/pages/pipes/plugins/azblob.adoc | 185 
++++++++++++++++++ .../tika/pipes/azblob/ConfigExamplesTest.java | 134 +++++++++++++ .../config-examples/az-blob-emitter.json | 14 ++ .../config-examples/az-blob-fetcher.json | 13 ++ .../config-examples/az-blob-pipeline.json | 45 +++++ .../az-blob-pipes-iterator.json | 13 ++ 11 files changed, 409 insertions(+) create mode 120000 docs/modules/ROOT/examples/pipes-azblob-emitter.json create mode 120000 docs/modules/ROOT/examples/pipes-azblob-fetcher.json create mode 120000 docs/modules/ROOT/examples/pipes-azblob-iterator.json create mode 120000 docs/modules/ROOT/examples/pipes-azblob-pipeline.json create mode 100644 docs/modules/ROOT/pages/pipes/plugins/azblob.adoc create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/java/org/apache/tika/pipes/azblob/ConfigExamplesTest.java create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-emitter.json create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-fetcher.json create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipeline.json create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipes-iterator.json diff --git a/docs/modules/ROOT/examples/pipes-azblob-emitter.json b/docs/modules/ROOT/examples/pipes-azblob-emitter.json new file mode 120000 index 00000000000..8213f434fa9 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-azblob-emitter.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-emitter.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-azblob-fetcher.json b/docs/modules/ROOT/examples/pipes-azblob-fetcher.json new file mode 120000 index 00000000000..c7d8ce2d52a --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-azblob-fetcher.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-fetcher.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-azblob-iterator.json b/docs/modules/ROOT/examples/pipes-azblob-iterator.json new file mode 120000 index 00000000000..bc68d45fb08 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-azblob-iterator.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipes-iterator.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-azblob-pipeline.json b/docs/modules/ROOT/examples/pipes-azblob-pipeline.json new file mode 120000 index 00000000000..1e3c9dc8602 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-azblob-pipeline.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipeline.json \ No newline at end of file diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc index e5e2a096244..e72c1d637bb 100644 --- a/docs/modules/ROOT/nav.adoc +++ b/docs/modules/ROOT/nav.adoc @@ -35,6 +35,7 @@ *** xref:pipes/plugins/filesystem.adoc[File System] *** xref:pipes/plugins/s3.adoc[Amazon S3] *** xref:pipes/plugins/gcs.adoc[Google Cloud Storage] +*** xref:pipes/plugins/azblob.adoc[Azure Blob Storage] * xref:configuration/index.adoc[Configuration] ** xref:configuration/parsers/pdf-parser.adoc[PDF Parser] ** xref:configuration/parsers/tesseract-ocr-parser.adoc[Tesseract OCR] diff --git 
a/docs/modules/ROOT/pages/pipes/plugins/azblob.adoc b/docs/modules/ROOT/pages/pipes/plugins/azblob.adoc new file mode 100644 index 00000000000..1e462b0f705 --- /dev/null +++ b/docs/modules/ROOT/pages/pipes/plugins/azblob.adoc @@ -0,0 +1,185 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Azure Blob Storage Plugin +:toc: +:toclevels: 3 + +The Azure Blob Storage plugin (`tika-pipes-az-blob`) provides fetcher, emitter, and iterator interfaces for blobs in Azure Storage containers. + +[cols="2,1,3"] +|=== +|Interface |Component name |Class + +|Fetcher +|`az-blob-fetcher` +|`AZBlobFetcher` + +|Emitter +|`az-blob-emitter` +|`AZBlobEmitter` + +|Iterator +|`az-blob-pipes-iterator` +|`AZBlobPipesIterator` +|=== + +[#credentials] +== Credentials + +All three components authenticate with a SAS (shared-access-signature) token. There are no other auth modes — managed identity, account keys, and AD-based auth are not currently exposed. + +* `endpoint` — base URL of the storage account, e.g., `https://myaccount.blob.core.windows.net`. +* `sasToken` — the URL query-string portion of a generated SAS, without a leading `?`. Permissions in the token must match the operations the component will perform (read for fetchers/iterators, read+write for emitters). + +The emitter's `validate()` enforces that `sasToken`, `endpoint`, and `container` are all non-blank, but does not parse the SAS itself — invalid or expired tokens fail later when the Azure SDK makes a request. + +[#az-blob-fetcher] +== Azure Blob Fetcher (`az-blob-fetcher`) + +Reads blobs from an Azure Storage container. The fetch key is the blob name. + +[source,json] +---- +include::example$pipes-azblob-fetcher.json[] +---- + +=== Configuration + +[cols="1,1,3"] +|=== +|Field |Default |Description + +|`endpoint` +|_required_ +|Storage account URL. + +|`container` +|_required_ +|Container name. + +|`sasToken` +|_required_ +|SAS token granting read access to the container. + +|`spoolToTemp` +|`true` +|If `true`, the fetched blob is spooled to a temp file before parsing. + +|`extractUserMetadata` +|`true` +|If `true`, blob user-metadata is copied into the parsed `Metadata`. +|=== + +[#az-blob-emitter] +== Azure Blob Emitter (`az-blob-emitter`) + +Writes parsed results to an Azure Storage container. The emit key (relative to `prefix`) is derived from the `FetchEmitTuple`. + +[source,json] +---- +include::example$pipes-azblob-emitter.json[] +---- + +=== Configuration + +[cols="1,1,3"] +|=== +|Field |Default |Description + +|`endpoint` +|_required_ +|Storage account URL (validated non-blank). + +|`container` +|_required_ +|Destination container name (validated non-blank). + +|`sasToken` +|_required_ +|SAS token granting read+write access (validated non-blank). 
+ +|`prefix` +|_no default_ +|Optional blob-name prefix. A trailing `/` is stripped automatically. + +|`fileExtension` +|`json` +|Extension appended to each emitted blob name. + +|`overwriteExisting` +|`false` +|If `true`, an existing blob with the same name is overwritten; otherwise the emit fails. +|=== + +[#az-blob-iterator] +== Azure Blob Iterator (`az-blob-pipes-iterator`) + +Lists blobs under a container/prefix and emits one `FetchEmitTuple` per blob. + +[source,json] +---- +include::example$pipes-azblob-iterator.json[] +---- + +=== Configuration + +[cols="1,1,3"] +|=== +|Field |Default |Description + +|`endpoint` +|_required_ +|Storage account URL. + +|`container` +|_required_ +|Container to enumerate. + +|`sasToken` +|_required_ +|SAS token granting list+read access. + +|`prefix` +|`""` +|Blob-name prefix to scope the listing. + +|`timeoutMillis` +|`360000` +|Per-request timeout, in milliseconds (6 minutes by default). + +|`fetcherId` / `emitterId` +|_required_ +|IDs of the fetcher and emitter to bind to each emitted tuple. See xref:pipes/iterators.adoc[Pipes Iterators] for the shared iterator contract. +|=== + +[#az-blob-pipeline] +== Complete Pipeline Example + +The example below wires the Azure Blob fetcher, emitter, and iterator together into a container-to-container pipeline. + +[source,json] +---- +include::example$pipes-azblob-pipeline.json[] +---- + +[#notes] +== Notes + +* SAS tokens have an expiration baked in. For long-running pipelines, rotate the SAS or use a token that outlives the pipeline window. +* Avoid checking real SAS tokens into source control — the strings in the examples above are placeholders. +* Each component creates its own `BlobServiceClient`. The Azure SDK pools HTTP connections per client. diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/java/org/apache/tika/pipes/azblob/ConfigExamplesTest.java b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/java/org/apache/tika/pipes/azblob/ConfigExamplesTest.java new file mode 100644 index 00000000000..0a083f608a8 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/java/org/apache/tika/pipes/azblob/ConfigExamplesTest.java @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.pipes.azblob; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.config.loader.TikaLoader; +import org.apache.tika.pipes.emitter.azblob.AZBlobEmitterConfig; +import org.apache.tika.pipes.fetcher.azblob.config.AZBlobFetcherConfig; +import org.apache.tika.pipes.iterator.azblob.AZBlobPipesIteratorConfig; + +/** + * Validates Azure Blob fetcher/emitter/iterator configuration examples used in documentation. + *
+ * The JSON configuration examples are stored in {@code src/test/resources/config-examples/} + * and are included directly in the AsciiDoc documentation via the {@code include::} directive. + */ +public class ConfigExamplesTest { + + private static final String EXAMPLES_DIR = "/config-examples/"; + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + @TempDir + Path tempDir; + + private String readExample(String resourceName) throws Exception { + try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) { + assertNotNull(is, "Resource not found: " + resourceName); + return new String(is.readAllBytes(), StandardCharsets.UTF_8); + } + } + + private void loadViaTikaLoader(String resourceName) throws Exception { + String json = readExample(resourceName); + Path configFile = tempDir.resolve("tika-config.json"); + Files.writeString(configFile, json, StandardCharsets.UTF_8); + TikaLoader loader = TikaLoader.load(configFile); + assertNotNull(loader, "TikaLoader should not be null for: " + resourceName); + } + + private JsonNode innerComponent(String json, String section, String id, String typeName) + throws Exception { + JsonNode root = OBJECT_MAPPER.readTree(json); + JsonNode sectionNode = root.get(section); + assertNotNull(sectionNode, "Missing section: " + section); + JsonNode idNode = id == null ? sectionNode : sectionNode.get(id); + assertNotNull(idNode, "Missing id: " + id); + JsonNode typed = idNode.get(typeName); + assertNotNull(typed, "Missing type: " + typeName); + return typed; + } + + @Test + public void testAZBlobFetcherConfig() throws Exception { + loadViaTikaLoader("az-blob-fetcher.json"); + + JsonNode inner = innerComponent(readExample("az-blob-fetcher.json"), + "fetchers", "azf", "az-blob-fetcher"); + AZBlobFetcherConfig config = AZBlobFetcherConfig.load(inner.toString()); + assertEquals("tika-input", config.getContainer()); + assertEquals("https://myaccount.blob.core.windows.net", config.getEndpoint()); + assertNotNull(config.getSasToken()); + } + + @Test + public void testAZBlobEmitterConfig() throws Exception { + loadViaTikaLoader("az-blob-emitter.json"); + + JsonNode inner = innerComponent(readExample("az-blob-emitter.json"), + "emitters", "aze", "az-blob-emitter"); + AZBlobEmitterConfig config = AZBlobEmitterConfig.load(inner.toString()); + assertEquals("tika-output", config.container()); + assertEquals("json", config.fileExtension()); + config.validate(); + assertEquals("results", config.getNormalizedPrefix()); + } + + @Test + public void testAZBlobIteratorConfig() throws Exception { + loadViaTikaLoader("az-blob-pipes-iterator.json"); + + JsonNode inner = innerComponent(readExample("az-blob-pipes-iterator.json"), + "pipes-iterator", null, "az-blob-pipes-iterator"); + AZBlobPipesIteratorConfig config = AZBlobPipesIteratorConfig.load(inner.toString()); + assertEquals("tika-input", config.getContainer()); + assertEquals("incoming/", config.getPrefix()); + assertEquals(360000L, config.getTimeoutMillis()); + assertEquals("azf", config.getFetcherId()); + assertEquals("aze", config.getEmitterId()); + } + + @Test + public void testAZBlobPipelineConfig() throws Exception { + loadViaTikaLoader("az-blob-pipeline.json"); + + String json = readExample("az-blob-pipeline.json"); + AZBlobFetcherConfig fetcher = AZBlobFetcherConfig.load( + innerComponent(json, "fetchers", "azf", "az-blob-fetcher").toString()); + AZBlobEmitterConfig emitter = AZBlobEmitterConfig.load( + innerComponent(json, "emitters", "aze", "az-blob-emitter").toString()); + 
AZBlobPipesIteratorConfig iterator = AZBlobPipesIteratorConfig.load( + innerComponent(json, "pipes-iterator", null, "az-blob-pipes-iterator").toString()); + + emitter.validate(); + assertEquals(fetcher.getContainer(), iterator.getContainer()); + assertEquals("azf", iterator.getFetcherId()); + assertEquals("aze", iterator.getEmitterId()); + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-emitter.json b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-emitter.json new file mode 100644 index 00000000000..9d102868c9f --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-emitter.json @@ -0,0 +1,14 @@ +{ + "emitters": { + "aze": { + "az-blob-emitter": { + "endpoint": "https://myaccount.blob.core.windows.net", + "container": "tika-output", + "sasToken": "sv=2024-11-04&ss=b&srt=sco&sp=rwl&se=2030-01-01T00:00:00Z&st=2024-01-01T00:00:00Z&spr=https&sig=REDACTED", + "prefix": "results/", + "fileExtension": "json", + "overwriteExisting": false + } + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-fetcher.json b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-fetcher.json new file mode 100644 index 00000000000..aebdcedf93f --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-fetcher.json @@ -0,0 +1,13 @@ +{ + "fetchers": { + "azf": { + "az-blob-fetcher": { + "endpoint": "https://myaccount.blob.core.windows.net", + "container": "tika-input", + "sasToken": "sv=2024-11-04&ss=b&srt=sco&sp=rl&se=2030-01-01T00:00:00Z&st=2024-01-01T00:00:00Z&spr=https&sig=REDACTED", + "extractUserMetadata": true, + "spoolToTemp": true + } + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipeline.json b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipeline.json new file mode 100644 index 00000000000..65181a1a57d --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipeline.json @@ -0,0 +1,45 @@ +{ + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + "throwOnWriteLimitReached": true + } + }, + "fetchers": { + "azf": { + "az-blob-fetcher": { + "endpoint": "https://myaccount.blob.core.windows.net", + "container": "tika-input", + "sasToken": "sv=2024-11-04&ss=b&srt=sco&sp=rl&se=2030-01-01T00:00:00Z&st=2024-01-01T00:00:00Z&spr=https&sig=REDACTED", + "extractUserMetadata": true + } + } + }, + "emitters": { + "aze": { + "az-blob-emitter": { + "endpoint": "https://myaccount.blob.core.windows.net", + "container": "tika-output", + "sasToken": "sv=2024-11-04&ss=b&srt=sco&sp=rwl&se=2030-01-01T00:00:00Z&st=2024-01-01T00:00:00Z&spr=https&sig=REDACTED", + "prefix": "results/", + "fileExtension": "json" + } + } + }, + "pipes-iterator": { + "az-blob-pipes-iterator": { + "endpoint": "https://myaccount.blob.core.windows.net", + "container": "tika-input", + "sasToken": "sv=2024-11-04&ss=b&srt=sco&sp=rl&se=2030-01-01T00:00:00Z&st=2024-01-01T00:00:00Z&spr=https&sig=REDACTED", + "prefix": "incoming/", + "fetcherId": "azf", + "emitterId": "aze" + } + }, + "pipes": { + "parseMode": "RMETA", + "onParseException": "EMIT", + "numClients": 4 + } +} diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipes-iterator.json b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipes-iterator.json new file mode 100644 index 00000000000..e2875fa92fb --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipes-iterator.json @@ -0,0 +1,13 @@ +{ + "pipes-iterator": { + "az-blob-pipes-iterator": { + "endpoint": "https://myaccount.blob.core.windows.net", + "container": "tika-input", + "sasToken": "sv=2024-11-04&ss=b&srt=sco&sp=rl&se=2030-01-01T00:00:00Z&st=2024-01-01T00:00:00Z&spr=https&sig=REDACTED", + "prefix": "incoming/", + "timeoutMillis": 360000, + "fetcherId": "azf", + "emitterId": "aze" + } + } +} From e8b98f8e4951c96e8ed41bb0b8f15c89dc98132a Mon Sep 17 00:00:00 2001 From: tallison Date: Mon, 11 May 2026 21:00:14 -0400 Subject: [PATCH 7/7] pipes docs updates --- .../examples/pipes-atlassian-jwt-fetcher.json | 1 + .../ROOT/examples/pipes-config-template.json | 1 + .../ROOT/examples/pipes-csv-iterator.json | 1 + .../examples/pipes-elasticsearch-emitter.json | 1 + .../pipes-elasticsearch-pipeline.json | 1 + .../pipes-elasticsearch-reporter.json | 1 + .../modules/ROOT/examples/pipes-emit-all.json | 1 + .../ROOT/examples/pipes-fs-emitter.json | 1 - .../ROOT/examples/pipes-fs-fetcher.json | 1 - .../examples/pipes-google-drive-fetcher.json | 1 + .../ROOT/examples/pipes-http-fetcher.json | 1 + .../ROOT/examples/pipes-jdbc-emitter.json | 1 + .../ROOT/examples/pipes-jdbc-iterator.json | 1 + .../ROOT/examples/pipes-jdbc-pipeline.json | 1 + .../ROOT/examples/pipes-jdbc-reporter.json | 1 + .../ROOT/examples/pipes-json-iterator.json | 1 + .../ROOT/examples/pipes-kafka-emitter.json | 1 + .../ROOT/examples/pipes-kafka-iterator.json | 1 + .../ROOT/examples/pipes-kafka-pipeline.json | 1 + .../pipes-microsoft-graph-fetcher.json | 1 + .../examples/pipes-opensearch-emitter.json | 1 + .../examples/pipes-opensearch-pipeline.json | 1 + .../examples/pipes-opensearch-reporter.json | 1 + .../ROOT/examples/pipes-shared-server.json | 1 + .../ROOT/examples/pipes-solr-emitter-zk.json | 1 + .../ROOT/examples/pipes-solr-emitter.json | 1 + .../ROOT/examples/pipes-solr-iterator.json | 1 + .../ROOT/examples/pipes-solr-pipeline.json | 1 + docs/modules/ROOT/nav.adoc | 11 + .../ROOT/pages/pipes/configuration.adoc | 51 ++++ docs/modules/ROOT/pages/pipes/cpu-sizing.adoc | 17 ++ docs/modules/ROOT/pages/pipes/emitters.adoc | 245 +++++----------- docs/modules/ROOT/pages/pipes/fetchers.adoc | 264 ++++-------------- docs/modules/ROOT/pages/pipes/iterators.adoc | 230 ++++----------- .../pages/pipes/plugins/atlassian-jwt.adoc | 121 ++++++++ .../modules/ROOT/pages/pipes/plugins/csv.adoc | 75 +++++ .../pages/pipes/plugins/elasticsearch.adoc | 196 +++++++++++++ .../pages/pipes/plugins/google-drive.adoc | 79 ++++++ .../ROOT/pages/pipes/plugins/http.adoc | 132 +++++++++ .../ROOT/pages/pipes/plugins/jdbc.adoc | 241 ++++++++++++++++ .../ROOT/pages/pipes/plugins/json.adoc | 63 +++++ .../ROOT/pages/pipes/plugins/kafka.adoc | 213 ++++++++++++++ .../pages/pipes/plugins/microsoft-graph.adoc | 85 ++++++ .../ROOT/pages/pipes/plugins/opensearch.adoc | 176 ++++++++++++ .../ROOT/pages/pipes/plugins/solr.adoc | 202 ++++++++++++++ docs/modules/ROOT/pages/pipes/reporters.adoc | 99 +++---- .../ROOT/pages/pipes/shared-server-mode.adoc | 2 + .../atlassianjwt/ConfigExamplesTest.java | 69 +++++ .../atlassian-jwt-fetcher.json | 19 ++ 
.../config/tika-config-az-blob-fetcher.xml | 30 -- .../resources/config/tika-config-az-blob.xml | 28 -- .../tika/pipes/csv/ConfigExamplesTest.java | 70 +++++ .../config-examples/csv-pipes-iterator.json | 12 + .../tika/pipes/es/ConfigExamplesTest.java | 126 +++++++++ .../resources/config-examples/es-emitter.json | 19 ++ .../config-examples/es-pipeline.json | 60 ++++ .../config-examples/es-reporter.json | 15 + .../test/resources/config/tika-config-gcs.xml | 26 -- .../pipes/googledrive/ConfigExamplesTest.java | 70 +++++ .../config-examples/google-drive-fetcher.json | 13 + .../tika/pipes/http/ConfigExamplesTest.java | 70 +++++ .../config-examples/http-fetcher.json | 21 ++ .../jdbc/JDBCPipesReporterConfig.java | 27 +- .../tika/pipes/jdbc/ConfigExamplesTest.java | 150 ++++++++++ .../config-examples/jdbc-emitter.json | 22 ++ .../config-examples/jdbc-pipeline.json | 56 ++++ .../config-examples/jdbc-pipes-iterator.json | 15 + .../config-examples/jdbc-reporter.json | 12 + .../tika-config-jdbc-emitter-attachments.xml | 53 ---- ...ika-config-jdbc-emitter-existing-table.xml | 42 --- .../tika-config-jdbc-emitter-multivalued.xml | 45 --- .../tika-config-jdbc-emitter-trunc.xml | 44 --- .../configs/tika-config-jdbc-emitter.xml | 54 ---- .../tika/pipes/json/ConfigExamplesTest.java | 67 +++++ .../config-examples/json-pipes-iterator.json | 9 + .../tika/pipes/kafka/ConfigExamplesTest.java | 119 ++++++++ .../config-examples/kafka-emitter.json | 19 ++ .../config-examples/kafka-pipeline.json | 43 +++ .../config-examples/kafka-pipes-iterator.json | 14 + .../microsoftgraph/ConfigExamplesTest.java | 72 +++++ .../microsoft-graph-fetcher.json | 15 + .../pipes/opensearch/ConfigExamplesTest.java | 123 ++++++++ .../config-examples/opensearch-emitter.json | 21 ++ .../config-examples/opensearch-pipeline.json | 64 +++++ .../config-examples/opensearch-reporter.json | 17 ++ .../resources/tika-config-simple-emitter.xml | 41 --- .../tika/pipes/solr/ConfigExamplesTest.java | 134 +++++++++ .../config-examples/solr-emitter-zk.json | 15 + .../config-examples/solr-emitter.json | 17 ++ .../config-examples/solr-pipeline.json | 42 +++ .../config-examples/solr-pipes-iterator.json | 15 + .../resources/tika-config-simple-emitter.xml | 48 ---- 92 files changed, 3560 insertions(+), 1033 deletions(-) create mode 120000 docs/modules/ROOT/examples/pipes-atlassian-jwt-fetcher.json create mode 120000 docs/modules/ROOT/examples/pipes-config-template.json create mode 120000 docs/modules/ROOT/examples/pipes-csv-iterator.json create mode 120000 docs/modules/ROOT/examples/pipes-elasticsearch-emitter.json create mode 120000 docs/modules/ROOT/examples/pipes-elasticsearch-pipeline.json create mode 120000 docs/modules/ROOT/examples/pipes-elasticsearch-reporter.json create mode 120000 docs/modules/ROOT/examples/pipes-emit-all.json delete mode 120000 docs/modules/ROOT/examples/pipes-fs-emitter.json delete mode 120000 docs/modules/ROOT/examples/pipes-fs-fetcher.json create mode 120000 docs/modules/ROOT/examples/pipes-google-drive-fetcher.json create mode 120000 docs/modules/ROOT/examples/pipes-http-fetcher.json create mode 120000 docs/modules/ROOT/examples/pipes-jdbc-emitter.json create mode 120000 docs/modules/ROOT/examples/pipes-jdbc-iterator.json create mode 120000 docs/modules/ROOT/examples/pipes-jdbc-pipeline.json create mode 120000 docs/modules/ROOT/examples/pipes-jdbc-reporter.json create mode 120000 docs/modules/ROOT/examples/pipes-json-iterator.json create mode 120000 docs/modules/ROOT/examples/pipes-kafka-emitter.json create mode 120000 
docs/modules/ROOT/examples/pipes-kafka-iterator.json create mode 120000 docs/modules/ROOT/examples/pipes-kafka-pipeline.json create mode 120000 docs/modules/ROOT/examples/pipes-microsoft-graph-fetcher.json create mode 120000 docs/modules/ROOT/examples/pipes-opensearch-emitter.json create mode 120000 docs/modules/ROOT/examples/pipes-opensearch-pipeline.json create mode 120000 docs/modules/ROOT/examples/pipes-opensearch-reporter.json create mode 120000 docs/modules/ROOT/examples/pipes-shared-server.json create mode 120000 docs/modules/ROOT/examples/pipes-solr-emitter-zk.json create mode 120000 docs/modules/ROOT/examples/pipes-solr-emitter.json create mode 120000 docs/modules/ROOT/examples/pipes-solr-iterator.json create mode 120000 docs/modules/ROOT/examples/pipes-solr-pipeline.json create mode 100644 docs/modules/ROOT/pages/pipes/plugins/atlassian-jwt.adoc create mode 100644 docs/modules/ROOT/pages/pipes/plugins/csv.adoc create mode 100644 docs/modules/ROOT/pages/pipes/plugins/elasticsearch.adoc create mode 100644 docs/modules/ROOT/pages/pipes/plugins/google-drive.adoc create mode 100644 docs/modules/ROOT/pages/pipes/plugins/http.adoc create mode 100644 docs/modules/ROOT/pages/pipes/plugins/jdbc.adoc create mode 100644 docs/modules/ROOT/pages/pipes/plugins/json.adoc create mode 100644 docs/modules/ROOT/pages/pipes/plugins/kafka.adoc create mode 100644 docs/modules/ROOT/pages/pipes/plugins/microsoft-graph.adoc create mode 100644 docs/modules/ROOT/pages/pipes/plugins/opensearch.adoc create mode 100644 docs/modules/ROOT/pages/pipes/plugins/solr.adoc create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-atlassian-jwt/src/test/java/org/apache/tika/pipes/atlassianjwt/ConfigExamplesTest.java create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-atlassian-jwt/src/test/resources/config-examples/atlassian-jwt-fetcher.json delete mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config/tika-config-az-blob-fetcher.xml delete mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config/tika-config-az-blob.xml create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/test/java/org/apache/tika/pipes/csv/ConfigExamplesTest.java create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/test/resources/config-examples/csv-pipes-iterator.json create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-es/src/test/java/org/apache/tika/pipes/es/ConfigExamplesTest.java create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-es/src/test/resources/config-examples/es-emitter.json create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-es/src/test/resources/config-examples/es-pipeline.json create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-es/src/test/resources/config-examples/es-reporter.json delete mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config/tika-config-gcs.xml create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-google-drive/src/test/java/org/apache/tika/pipes/googledrive/ConfigExamplesTest.java create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-google-drive/src/test/resources/config-examples/google-drive-fetcher.json create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-http/src/test/java/org/apache/tika/pipes/http/ConfigExamplesTest.java create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-http/src/test/resources/config-examples/http-fetcher.json create mode 100644 
tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/java/org/apache/tika/pipes/jdbc/ConfigExamplesTest.java create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/config-examples/jdbc-emitter.json create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/config-examples/jdbc-pipeline.json create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/config-examples/jdbc-pipes-iterator.json create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/config-examples/jdbc-reporter.json delete mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/configs/tika-config-jdbc-emitter-attachments.xml delete mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/configs/tika-config-jdbc-emitter-existing-table.xml delete mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/configs/tika-config-jdbc-emitter-multivalued.xml delete mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/configs/tika-config-jdbc-emitter-trunc.xml delete mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/configs/tika-config-jdbc-emitter.xml create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/java/org/apache/tika/pipes/json/ConfigExamplesTest.java create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/config-examples/json-pipes-iterator.json create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/java/org/apache/tika/pipes/kafka/ConfigExamplesTest.java create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/resources/config-examples/kafka-emitter.json create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/resources/config-examples/kafka-pipeline.json create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/resources/config-examples/kafka-pipes-iterator.json create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-microsoft-graph/src/test/java/org/apache/tika/pipes/microsoftgraph/ConfigExamplesTest.java create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-microsoft-graph/src/test/resources/config-examples/microsoft-graph-fetcher.json create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-opensearch/src/test/java/org/apache/tika/pipes/opensearch/ConfigExamplesTest.java create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-opensearch/src/test/resources/config-examples/opensearch-emitter.json create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-opensearch/src/test/resources/config-examples/opensearch-pipeline.json create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-opensearch/src/test/resources/config-examples/opensearch-reporter.json delete mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-opensearch/src/test/resources/tika-config-simple-emitter.xml create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/java/org/apache/tika/pipes/solr/ConfigExamplesTest.java create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/resources/config-examples/solr-emitter-zk.json create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/resources/config-examples/solr-emitter.json create mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/resources/config-examples/solr-pipeline.json create mode 100644 
tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/resources/config-examples/solr-pipes-iterator.json delete mode 100644 tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/resources/tika-config-simple-emitter.xml diff --git a/docs/modules/ROOT/examples/pipes-atlassian-jwt-fetcher.json b/docs/modules/ROOT/examples/pipes-atlassian-jwt-fetcher.json new file mode 120000 index 00000000000..8f2871640e4 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-atlassian-jwt-fetcher.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-atlassian-jwt/src/test/resources/config-examples/atlassian-jwt-fetcher.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-config-template.json b/docs/modules/ROOT/examples/pipes-config-template.json new file mode 120000 index 00000000000..ae8c7de24eb --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-config-template.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-async-cli/src/main/resources/config-template.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-csv-iterator.json b/docs/modules/ROOT/examples/pipes-csv-iterator.json new file mode 120000 index 00000000000..11bdc189cb4 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-csv-iterator.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/test/resources/config-examples/csv-pipes-iterator.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-elasticsearch-emitter.json b/docs/modules/ROOT/examples/pipes-elasticsearch-emitter.json new file mode 120000 index 00000000000..2b48ca80802 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-elasticsearch-emitter.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-es/src/test/resources/config-examples/es-emitter.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-elasticsearch-pipeline.json b/docs/modules/ROOT/examples/pipes-elasticsearch-pipeline.json new file mode 120000 index 00000000000..2fefff94957 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-elasticsearch-pipeline.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-es/src/test/resources/config-examples/es-pipeline.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-elasticsearch-reporter.json b/docs/modules/ROOT/examples/pipes-elasticsearch-reporter.json new file mode 120000 index 00000000000..36117c95a8a --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-elasticsearch-reporter.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-es/src/test/resources/config-examples/es-reporter.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-emit-all.json b/docs/modules/ROOT/examples/pipes-emit-all.json new file mode 120000 index 00000000000..5a5ba03d860 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-emit-all.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-emit-all.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-fs-emitter.json b/docs/modules/ROOT/examples/pipes-fs-emitter.json deleted file mode 120000 index a9321db9ebd..00000000000 --- a/docs/modules/ROOT/examples/pipes-fs-emitter.json +++ /dev/null @@ -1 +0,0 @@ -../../../../tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-emitter.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-fs-fetcher.json 
b/docs/modules/ROOT/examples/pipes-fs-fetcher.json deleted file mode 120000 index faef8e27a1e..00000000000 --- a/docs/modules/ROOT/examples/pipes-fs-fetcher.json +++ /dev/null @@ -1 +0,0 @@ -../../../../tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-fetcher.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-google-drive-fetcher.json b/docs/modules/ROOT/examples/pipes-google-drive-fetcher.json new file mode 120000 index 00000000000..d8afdd781de --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-google-drive-fetcher.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-google-drive/src/test/resources/config-examples/google-drive-fetcher.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-http-fetcher.json b/docs/modules/ROOT/examples/pipes-http-fetcher.json new file mode 120000 index 00000000000..51a6d0387e5 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-http-fetcher.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-http/src/test/resources/config-examples/http-fetcher.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-jdbc-emitter.json b/docs/modules/ROOT/examples/pipes-jdbc-emitter.json new file mode 120000 index 00000000000..878458e60f2 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-jdbc-emitter.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/config-examples/jdbc-emitter.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-jdbc-iterator.json b/docs/modules/ROOT/examples/pipes-jdbc-iterator.json new file mode 120000 index 00000000000..74eddd76010 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-jdbc-iterator.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/config-examples/jdbc-pipes-iterator.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-jdbc-pipeline.json b/docs/modules/ROOT/examples/pipes-jdbc-pipeline.json new file mode 120000 index 00000000000..e3ae2bc1d7a --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-jdbc-pipeline.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/config-examples/jdbc-pipeline.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-jdbc-reporter.json b/docs/modules/ROOT/examples/pipes-jdbc-reporter.json new file mode 120000 index 00000000000..53d25aa2b6a --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-jdbc-reporter.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/config-examples/jdbc-reporter.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-json-iterator.json b/docs/modules/ROOT/examples/pipes-json-iterator.json new file mode 120000 index 00000000000..ef848ecda3f --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-json-iterator.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/config-examples/json-pipes-iterator.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-kafka-emitter.json b/docs/modules/ROOT/examples/pipes-kafka-emitter.json new file mode 120000 index 00000000000..b3a84ca8c42 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-kafka-emitter.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/resources/config-examples/kafka-emitter.json \ No newline 
at end of file diff --git a/docs/modules/ROOT/examples/pipes-kafka-iterator.json b/docs/modules/ROOT/examples/pipes-kafka-iterator.json new file mode 120000 index 00000000000..6a35e7d2b72 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-kafka-iterator.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/resources/config-examples/kafka-pipes-iterator.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-kafka-pipeline.json b/docs/modules/ROOT/examples/pipes-kafka-pipeline.json new file mode 120000 index 00000000000..cdbbc2c980e --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-kafka-pipeline.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/resources/config-examples/kafka-pipeline.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-microsoft-graph-fetcher.json b/docs/modules/ROOT/examples/pipes-microsoft-graph-fetcher.json new file mode 120000 index 00000000000..a69990c9873 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-microsoft-graph-fetcher.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-microsoft-graph/src/test/resources/config-examples/microsoft-graph-fetcher.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-opensearch-emitter.json b/docs/modules/ROOT/examples/pipes-opensearch-emitter.json new file mode 120000 index 00000000000..6cf72fc6104 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-opensearch-emitter.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-opensearch/src/test/resources/config-examples/opensearch-emitter.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-opensearch-pipeline.json b/docs/modules/ROOT/examples/pipes-opensearch-pipeline.json new file mode 120000 index 00000000000..ba12f014085 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-opensearch-pipeline.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-opensearch/src/test/resources/config-examples/opensearch-pipeline.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-opensearch-reporter.json b/docs/modules/ROOT/examples/pipes-opensearch-reporter.json new file mode 120000 index 00000000000..22bd5cbc926 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-opensearch-reporter.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-opensearch/src/test/resources/config-examples/opensearch-reporter.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-shared-server.json b/docs/modules/ROOT/examples/pipes-shared-server.json new file mode 120000 index 00000000000..e6d0a634c49 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-shared-server.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-shared-server.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-solr-emitter-zk.json b/docs/modules/ROOT/examples/pipes-solr-emitter-zk.json new file mode 120000 index 00000000000..2af060b6063 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-solr-emitter-zk.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/resources/config-examples/solr-emitter-zk.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-solr-emitter.json b/docs/modules/ROOT/examples/pipes-solr-emitter.json new file mode 120000 index 00000000000..80aa2572035 --- /dev/null +++ 
b/docs/modules/ROOT/examples/pipes-solr-emitter.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/resources/config-examples/solr-emitter.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-solr-iterator.json b/docs/modules/ROOT/examples/pipes-solr-iterator.json new file mode 120000 index 00000000000..e2b7beabd5f --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-solr-iterator.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/resources/config-examples/solr-pipes-iterator.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-solr-pipeline.json b/docs/modules/ROOT/examples/pipes-solr-pipeline.json new file mode 120000 index 00000000000..480ab0bf79a --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-solr-pipeline.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/resources/config-examples/solr-pipeline.json \ No newline at end of file diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc index e72c1d637bb..9f4e70609de 100644 --- a/docs/modules/ROOT/nav.adoc +++ b/docs/modules/ROOT/nav.adoc @@ -36,6 +36,17 @@ *** xref:pipes/plugins/s3.adoc[Amazon S3] *** xref:pipes/plugins/gcs.adoc[Google Cloud Storage] *** xref:pipes/plugins/azblob.adoc[Azure Blob Storage] +*** xref:pipes/plugins/opensearch.adoc[OpenSearch] +*** xref:pipes/plugins/elasticsearch.adoc[Elasticsearch] +*** xref:pipes/plugins/solr.adoc[Apache Solr] +*** xref:pipes/plugins/jdbc.adoc[JDBC] +*** xref:pipes/plugins/kafka.adoc[Apache Kafka] +*** xref:pipes/plugins/http.adoc[HTTP] +*** xref:pipes/plugins/google-drive.adoc[Google Drive] +*** xref:pipes/plugins/microsoft-graph.adoc[Microsoft Graph] +*** xref:pipes/plugins/atlassian-jwt.adoc[Atlassian JWT] +*** xref:pipes/plugins/csv.adoc[CSV] +*** xref:pipes/plugins/json.adoc[JSON] * xref:configuration/index.adoc[Configuration] ** xref:configuration/parsers/pdf-parser.adoc[PDF Parser] ** xref:configuration/parsers/tesseract-ocr-parser.adoc[Tesseract OCR] diff --git a/docs/modules/ROOT/pages/pipes/configuration.adoc b/docs/modules/ROOT/pages/pipes/configuration.adoc index e9c75ab0603..f6b3d5c2b64 100644 --- a/docs/modules/ROOT/pages/pipes/configuration.adoc +++ b/docs/modules/ROOT/pages/pipes/configuration.adoc @@ -150,3 +150,54 @@ These settings control how parsed results are batched before sending to emitters |=== See xref:pipes/shared-server-mode.adoc[Shared Server Mode] for details. + +[#complete-examples] +== Complete examples + +Worked-out end-to-end configs from the test tree. Each is loaded by an automated test, so the syntax stays current. + +[#fs-pipeline] +=== Filesystem-to-filesystem pipeline + +[source,json,subs=none] +---- +include::example$pipes-fs-pipeline.json[] +---- + +icon:github[] https://github.com/apache/tika/blob/main/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json[View source on GitHub] + +Tokens (`FETCHER_BASE_PATH`, `EMITTER_BASE_PATH`, `PLUGINS_PATHS`) are substituted by the test harness — replace them with real paths in production configs. 
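+
+A filled-in sketch of the same shape, with illustrative paths and component IDs standing in for the harness tokens (the values below are placeholders, not taken from the test config):
+
+[source,json]
+----
+{
+  "fetchers": {
+    "fsf": {
+      "file-system-fetcher": { "basePath": "/data/input" }
+    }
+  },
+  "emitters": {
+    "fse": {
+      "file-system-emitter": { "basePath": "/data/output", "fileExtension": "json" }
+    }
+  },
+  "pipes-iterator": {
+    "file-system-pipes-iterator": {
+      "basePath": "/data/input",
+      "fetcherId": "fsf",
+      "emitterId": "fse"
+    }
+  }
+}
+----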
+ +[#emit-all] +=== Emit-all variant + +[source,json,subs=none] +---- +include::example$pipes-emit-all.json[] +---- + +icon:github[] https://github.com/apache/tika/blob/main/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-emit-all.json[View source on GitHub] + +[#shared-server-example] +=== Shared-server (YOLO) mode + +[source,json,subs=none] +---- +include::example$pipes-shared-server.json[] +---- + +icon:github[] https://github.com/apache/tika/blob/main/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-shared-server.json[View source on GitHub] + +See xref:pipes/shared-server-mode.adoc[Shared Server Mode] for the trade-offs. + +[#config-template] +=== `tika-async-cli` config template + +[source,json,subs=none] +---- +include::example$pipes-config-template.json[] +---- + +icon:github[] https://github.com/apache/tika/blob/main/tika-pipes/tika-async-cli/src/main/resources/config-template.json[View source on GitHub] + +For per-plugin pipeline examples (S3, OpenSearch, JDBC, Kafka, etc.), see the relevant page under xref:pipes/plugins/index.adoc[Plugins]. diff --git a/docs/modules/ROOT/pages/pipes/cpu-sizing.adoc b/docs/modules/ROOT/pages/pipes/cpu-sizing.adoc index e81a843853f..997ea159b18 100644 --- a/docs/modules/ROOT/pages/pipes/cpu-sizing.adoc +++ b/docs/modules/ROOT/pages/pipes/cpu-sizing.adoc @@ -129,6 +129,23 @@ When Tika sees an explicit `-XX:ActiveProcessorCount` in `forkedJvmArgs`, it respects your value and skips the auto-injection — the sizing summary will report `autoCap=user-set in forkedJvmArgs`. +[#heap-per-worker] +== Heap per worker — rule of thumb + +A reasonable starting point is **~2 GB of heap per forked worker** (passed via `-Xmx2g` in `forkedJvmArgs`). The number falls out of three independent constraints any of which can dominate: + +* **Worst-case PDF parsing.** A handful of pathological PDFs in any reasonably large corpus will allocate hundreds of MB of intermediate object data per document — large image streams, deeply nested form fields, big embedded fonts. Smaller heaps OOM on those documents; larger heaps just let GC clean up between docs. +* **Embedded-document explosion.** A zip-bomb-shaped office document with thousands of embedded objects multiplies per-doc allocation by the embedding count. The `maxEmbeddedResources` setting caps the count, but each retained object still lives in the heap until the whole tree finishes parsing. +* **GC headroom.** G1GC behaves poorly above ~85% occupancy. A `-Xmx2g` worker comfortably handles documents that allocate up to ~1.5 GB of live data; below that you start trading throughput for memory. + +This is a default — not a tuning recommendation. To right-size for your specific corpus: + +. Measure peak per-worker live-heap with `-Xlog:gc*` (look at the post-GC working set, not the peak before GC). +. Pick `-Xmx` ≈ `1.5 × peakLiveHeap` to leave GC headroom. +. Re-measure under your real concurrency. Embedded-doc-heavy formats (PowerPoint, complex Word) shift this number up; flat text or PDF-text-only shifts it down. + +The pod-level heap budget is `numClients × per-worker-Xmx + parent-overhead`. On a 16 GB node running `numClients=4`, that's about `4 × 2 GB + 1 GB ≈ 9 GB` — comfortably below the node limit, leaving room for kernel, IO buffers, and a non-saturated pod. 
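+
+As a concrete sketch of that budget, the sizing above might be expressed in config like this. Whether `forkedJvmArgs` sits alongside `numClients` under `pipes` depends on your config layout, and the GC-log path is illustrative; treat this as a sketch, not a canonical template:
+
+[source,json]
+----
+{
+  "pipes": {
+    "numClients": 4,
+    "forkedJvmArgs": ["-Xmx2g", "-Xlog:gc*:file=/tmp/tika-worker-gc.log"]
+  }
+}
+----
+
+That is 4 workers × 2 GB of heap plus roughly 1 GB of parent overhead, the ≈ 9 GB figure computed above.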
+ == Container & cgroup behavior The formula uses `Runtime.availableProcessors()` for the host CPU count, diff --git a/docs/modules/ROOT/pages/pipes/emitters.adoc b/docs/modules/ROOT/pages/pipes/emitters.adoc index 3feeb8ebf3c..3fa494b4378 100644 --- a/docs/modules/ROOT/pages/pipes/emitters.adoc +++ b/docs/modules/ROOT/pages/pipes/emitters.adoc @@ -16,205 +16,94 @@ // = Emitters +:toc: -Emitters write parsed results to a destination. Each emitter is identified by -its component name and an `id` that is referenced by the pipes iterator. +An *emitter* writes parse results to a destination — a file on disk, a row in a database, a document in a search index, a message on a queue, etc. -== File System Emitter (`file-system-emitter`) +[#contract] +== The Emitter Contract -Writes parsed metadata as JSON files to a local or mounted filesystem. +Each emitter implements `Emitter#emit(EmitData emitData)`, where `EmitData` carries the emit key, the parsed `Metadata`, and (for content-emitting strategies) the extracted content. -**Module:** `tika-pipes-file-system` +The emit key is supplied by the iterator on each `FetchEmitTuple` and tells the emitter where to put the result. Its shape depends on the emitter: -[source,json,subs=none] ----- -include::example$pipes-fs-emitter.json[] ----- - -[cols="1,1,3"] -|=== -|Field |Default |Description - -|`basePath` -|_required_ -|Base output directory. - -|`fileExtension` -|`json` -|Extension for output files. - -|`onExists` -|`EXCEPTION` -|Behavior when output file exists: `SKIP`, `REPLACE`, `EXCEPTION`. - -|`prettyPrint` -|`false` -|Pretty-print JSON output. -|=== - -== Elasticsearch Emitter (`es-emitter`) - -Sends parsed documents to Elasticsearch via the `_bulk` API. Uses plain HTTP -- -no dependency on the ES Java client. - -**Module:** `tika-pipes-es` - -[cols="1,1,3"] -|=== -|Field |Default |Description - -|`esUrl` -|_required_ -|Full URL including index (e.g., `https://localhost:9200/my-index`). - -|`idField` -|`_id` -|Metadata field used as the document `_id`. - -|`apiKey` -|_none_ -|Base64-encoded `id:api_key` for authentication. - -|`attachmentStrategy` -|`SEPARATE_DOCUMENTS` -|`SEPARATE_DOCUMENTS` or `PARENT_CHILD`. - -|`updateStrategy` -|`OVERWRITE` -|`OVERWRITE` (full replace) or `UPSERT` (field-level merge). - -|`embeddedFileFieldName` -|`embedded` -|Join-field name for `PARENT_CHILD` mode. -|=== - -== OpenSearch Emitter (`opensearch-emitter`) - -Sends documents to OpenSearch. Configured identically to the ES emitter -but uses `openSearchUrl` instead of `esUrl`. - -**Module:** `tika-pipes-opensearch` - -== S3 Emitter (`s3-emitter`) - -Writes parsed metadata as JSON objects to Amazon S3. - -**Module:** `tika-pipes-s3` +* file-system / S3 / GCS / Azure Blob — a key/path relative to `basePath` or `prefix`. +* OpenSearch / Elasticsearch / Solr — the `_id` field value, taken from the metadata field named by the emitter's `idField`. +* JDBC — the value bound to the first `?` placeholder in the `insert` template. +* Kafka — the Kafka record key. -[cols="1,1,3"] -|=== -|Field |Default |Description - -|`bucket` -|_required_ -|S3 bucket name. - -|`region` -|_required_ -|AWS region. - -|`prefix` -|_none_ -|S3 key prefix for output objects. - -|`credentialsProvider` -|`profile` -|Credentials type: `profile`, `static`, `instance`. - -|`fileExtension` -|`json` -|File extension for output keys. -|=== - -== GCS Emitter (`gcs-emitter`) - -Writes parsed metadata to Google Cloud Storage. 
- -**Module:** `tika-pipes-gcs` - -== Azure Blob Emitter (`az-blob-emitter`) - -Writes parsed metadata to Azure Blob Storage. - -**Module:** `tika-pipes-az-blob` - -== Solr Emitter (`solr-emitter`) +Emitters are intended to be safe under concurrent use; the pipeline's worker pool may call `emit()` from many threads. -Indexes parsed documents into Apache Solr. +[#wiring] +== Wiring Emitters Into a Pipeline -**Module:** `tika-pipes-solr` +Emitters live under the top-level `emitters` key. Each emitter gets an ID (the outer map key) and a type-name (the inner map key); the iterator references the ID through its `emitterId` field. -[cols="1,1,3"] -|=== -|Field |Default |Description - -|`solrCollection` -|_required_ -|Solr collection name. - -|`solrUrls` -|_required_ -|List of Solr URLs. +[source,json] +---- +{ + "emitters": { + "output": { + "file-system-emitter": { + "basePath": "/data/output", + "fileExtension": "json" + } + } + }, + "pipes-iterator": { + "file-system-pipes-iterator": { + "basePath": "/data/input", + "fetcherId": "...", + "emitterId": "output" + } + } +} +---- -|`idField` -|`id` -|Field name for document ID. +A pipeline may declare multiple emitters and choose between them at iterator-config time. Within a single iterator, each emitted `FetchEmitTuple` carries exactly one emitter ID. -|`commitWithin` -|`-1` -|Milliseconds before auto-commit (-1 = server default). +[#plugins] +== Available Emitters -|`attachmentStrategy` -|`SEPARATE_DOCUMENTS` -|How to handle embedded documents. +[cols="2,2,3"] |=== +|Plugin |Component name |Notes -== JDBC Emitter (`jdbc-emitter`) +|xref:pipes/plugins/filesystem.adoc[File System] +|`file-system-emitter` +|Local / mounted filesystem. -Writes parsed metadata to a SQL database via JDBC. +|xref:pipes/plugins/s3.adoc[Amazon S3] +|`s3-emitter` +|S3 or S3-compatible. -**Module:** `tika-pipes-jdbc` +|xref:pipes/plugins/gcs.adoc[Google Cloud Storage] +|`gcs-emitter` +|GCS via ADC. -[cols="1,1,3"] -|=== -|Field |Default |Description +|xref:pipes/plugins/azblob.adoc[Azure Blob Storage] +|`az-blob-emitter` +|SAS-token auth. -|`connection` -|_required_ -|JDBC connection string. +|xref:pipes/plugins/opensearch.adoc[OpenSearch] +|`opensearch-emitter` +|REST-based bulk indexing. -|`insert` -|_required_ -|SQL INSERT statement with `?` placeholders. +|xref:pipes/plugins/elasticsearch.adoc[Elasticsearch] +|`es-emitter` +|REST-based bulk indexing; ApiKey or basic auth. -|`keys` -|_required_ -|Ordered list of metadata keys to bind to placeholders. -|=== +|xref:pipes/plugins/solr.adoc[Apache Solr] +|`solr-emitter` +|SolrCloud (URLs or ZooKeeper). -== Kafka Emitter (`kafka-emitter`) +|xref:pipes/plugins/jdbc.adoc[JDBC] +|`jdbc-emitter` +|Any RDBMS with a JDBC driver. -Sends parsed metadata as messages to Apache Kafka. - -**Module:** `tika-pipes-kafka` - -[cols="1,1,3"] +|xref:pipes/plugins/kafka.adoc[Apache Kafka] +|`kafka-emitter` +|Standard Kafka producer. |=== -|Field |Default |Description - -|`topic` -|_required_ -|Kafka topic name. -|`bootstrapServers` -|_required_ -|Kafka broker addresses. - -|`acks` -|`all` -|Acknowledgment requirement. - -|`lingerMs` -|`0` -|Batch wait time in milliseconds. -|=== +For the full plugin / interface matrix, see xref:pipes/plugins/index.adoc[Plugins]. 
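+
+To illustrate the multiple-emitter wiring described above, here is a sketch that declares two file-system emitters and routes this pipeline's output to one of them (IDs and paths are placeholders):
+
+[source,json]
+----
+{
+  "emitters": {
+    "json-out": {
+      "file-system-emitter": { "basePath": "/data/output-json" }
+    },
+    "archive-out": {
+      "file-system-emitter": { "basePath": "/archive/output" }
+    }
+  },
+  "pipes-iterator": {
+    "file-system-pipes-iterator": {
+      "basePath": "/data/input",
+      "fetcherId": "...",
+      "emitterId": "json-out"
+    }
+  }
+}
+----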
diff --git a/docs/modules/ROOT/pages/pipes/fetchers.adoc b/docs/modules/ROOT/pages/pipes/fetchers.adoc index eff355d0dea..96beaf47495 100644 --- a/docs/modules/ROOT/pages/pipes/fetchers.adoc +++ b/docs/modules/ROOT/pages/pipes/fetchers.adoc @@ -16,230 +16,82 @@ // = Fetchers +:toc: -Fetchers retrieve document bytes from a source. Each fetcher is identified by -its component name and an `id` that is referenced by the pipes iterator. +A *fetcher* retrieves the bytes of a document from a source — a local filesystem, an S3 bucket, an HTTP URL, etc. — and returns them as an `InputStream` to the parser. -== File System Fetcher (`file-system-fetcher`) +[#contract] +== The Fetcher Contract -Reads files from a local or mounted filesystem. +Each fetcher implements `Fetcher#fetch(String fetchKey, Metadata metadata, ParseContext parseContext)` and returns an `InputStream` for the named document. The shape of the fetch key depends on the fetcher: for the file-system fetcher it is a path relative to `basePath`; for the S3 fetcher it is an object key relative to `prefix`; for the HTTP fetcher it is the URL itself. -**Module:** `tika-pipes-file-system` +Fetchers are stateless from the pipeline's perspective — every `fetch()` call resolves the key independently, so iterators are free to parallelize fetches. -[source,json,subs=none] ----- -include::example$pipes-fs-fetcher.json[] ----- - -[cols="1,1,3"] -|=== -|Field |Default |Description - -|`basePath` -|_required_ -|Base directory. Fetch keys are resolved relative to this path. - -|`extractFileSystemMetadata` -|`false` -|Extract file created/modified timestamps and size into metadata. - -|`allowAbsolutePaths` -|`false` -|Allow absolute fetch keys when `basePath` is not set. -|=== - -== S3 Fetcher (`s3-fetcher`) - -Fetches objects from Amazon S3. - -**Module:** `tika-pipes-s3` - -[cols="1,1,3"] -|=== -|Field |Default |Description - -|`bucket` -|_required_ -|S3 bucket name. - -|`region` -|_required_ -|AWS region (e.g., `us-east-1`). - -|`credentialsProvider` -|`profile` -|Credentials type: `profile`, `static`, `instance`. - -|`profile` -|`default` -|AWS profile name (when using `profile` credentials). - -|`accessKey` / `secretKey` -|_none_ -|Static credentials (when using `static` credentials). - -|`prefix` -|_none_ -|S3 key prefix. - -|`spoolToTemp` -|`false` -|Spool object to a temp file before parsing. - -|`extractUserMetadata` -|`false` -|Extract S3 user metadata. - -|`maxLength` -|_unlimited_ -|Maximum object size to fetch. -|=== - -== HTTP Fetcher (`http-fetcher`) - -Fetches documents from HTTP/HTTPS URLs. - -**Module:** `tika-pipes-http` - -[cols="1,1,3"] -|=== -|Field |Default |Description - -|`userName` -|_none_ -|Basic auth username. - -|`password` -|_none_ -|Basic auth password. - -|`connectTimeoutMillis` -|`30000` -|Connection timeout. - -|`socketTimeoutMillis` -|`120000` -|Socket read timeout. +[#wiring] +== Wiring Fetchers Into a Pipeline -|`maxConnections` -|`200` -|Maximum concurrent connections. +Fetchers live under the top-level `fetchers` key. Each fetcher gets an ID (the outer map key) and a type-name (the inner map key); the iterator then references the ID through its `fetcherId` field. -|`userAgent` -|_default_ -|HTTP User-Agent header. -|=== - -== GCS Fetcher (`gcs-fetcher`) - -Fetches objects from Google Cloud Storage. - -**Module:** `tika-pipes-gcs` - -[cols="1,1,3"] -|=== -|Field |Default |Description - -|`projectId` -|_required_ -|GCP project ID. - -|`bucket` -|_required_ -|GCS bucket name. - -|`prefix` -|_none_ -|Key prefix. 
- -|`spoolToTemp` -|`false` -|Spool to temp file before parsing. - -|`extractUserMetadata` -|`false` -|Extract GCS user metadata. -|=== - -== Azure Blob Fetcher (`az-blob-fetcher`) - -Fetches blobs from Azure Blob Storage. - -**Module:** `tika-pipes-az-blob` - -[cols="1,1,3"] -|=== -|Field |Default |Description - -|`sasToken` -|_required_ -|Shared Access Signature token. - -|`endpoint` -|_required_ -|Azure storage endpoint URL. - -|`container` -|_required_ -|Container name. - -|`prefix` -|_none_ -|Blob prefix. - -|`extractUserMetadata` -|`false` -|Extract Azure user metadata. -|=== - -== Google Drive Fetcher (`google-drive-fetcher`) +[source,json] +---- +{ + "fetchers": { + "primary": { + "file-system-fetcher": { + "basePath": "/data/input" + } + } + }, + "pipes-iterator": { + "file-system-pipes-iterator": { + "basePath": "/data/input", + "fetcherId": "primary", + "emitterId": "..." + } + } +} +---- -Fetches files from Google Drive via the Drive API. +A single pipes config may declare multiple fetchers with different IDs and use them in different iterators or pipelines. -**Module:** `tika-pipes-google-drive` +[#plugins] +== Available Fetchers -[cols="1,1,3"] +[cols="2,2,3"] |=== -|Field |Default |Description +|Plugin |Component name |Notes -|`serviceAccountCredentialsPath` -|_required_ -|Path to GCP service account JSON key file. - -|`impersonatedUser` -|_none_ -|User email to impersonate (for domain-wide delegation). -|=== +|xref:pipes/plugins/filesystem.adoc[File System] +|`file-system-fetcher` +|Local / mounted filesystem. -== Microsoft Graph Fetcher (`microsoft-graph-fetcher`) +|xref:pipes/plugins/s3.adoc[Amazon S3] +|`s3-fetcher` +|S3 or S3-compatible (MinIO, LocalStack). -Fetches files from Microsoft 365 (OneDrive, SharePoint) via the Graph API. +|xref:pipes/plugins/gcs.adoc[Google Cloud Storage] +|`gcs-fetcher` +|GCS via Application Default Credentials. -**Module:** `tika-pipes-microsoft-graph` +|xref:pipes/plugins/azblob.adoc[Azure Blob Storage] +|`az-blob-fetcher` +|SAS-token auth. -== Atlassian JWT Fetcher (`atlassian-jwt-fetcher`) +|xref:pipes/plugins/http.adoc[HTTP] +|`http-fetcher` +|HTTP(S) with basic / JWT auth. -Fetches content from Atlassian products using JWT authentication. +|xref:pipes/plugins/google-drive.adoc[Google Drive] +|`google-drive-fetcher` +|Drive API with service-account auth. -**Module:** `tika-pipes-atlassian-jwt` +|xref:pipes/plugins/microsoft-graph.adoc[Microsoft Graph] +|`microsoft-graph-fetcher` +|OneDrive / SharePoint via Graph. -[cols="1,1,3"] +|xref:pipes/plugins/atlassian-jwt.adoc[Atlassian JWT] +|`atlassian-jwt-fetcher` +|Atlassian Connect (Jira/Confluence Cloud). |=== -|Field |Default |Description -|`sharedSecret` -|_required_ -|JWT shared secret. - -|`issuer` -|_required_ -|JWT issuer / app key. - -|`connectTimeoutMillis` -|`30000` -|Connection timeout. - -|`socketTimeoutMillis` -|`120000` -|Socket read timeout. -|=== +For the full plugin / interface matrix, see xref:pipes/plugins/index.adoc[Plugins]. diff --git a/docs/modules/ROOT/pages/pipes/iterators.adoc b/docs/modules/ROOT/pages/pipes/iterators.adoc index dc433bb4928..a3e3bc7292b 100644 --- a/docs/modules/ROOT/pages/pipes/iterators.adoc +++ b/docs/modules/ROOT/pages/pipes/iterators.adoc @@ -16,197 +16,85 @@ // = Pipes Iterators +:toc: -Pipes iterators enumerate the documents to be processed. Each iterator -produces fetch/emit tuples that the pipeline consumes. +A *pipes iterator* enumerates the documents to be processed. 
It emits one `FetchEmitTuple` per document; the pipeline workers then call the bound fetcher (to get the bytes), the parser, and the bound emitter (to write the result). -All iterators share a `baseConfig` block that specifies which fetcher and emitter -to use: +[#contract] +== The Iterator Contract -[source,json] ----- -"baseConfig": { - "fetcherId": "my-fetcher-id", - "emitterId": "my-emitter-id" -} ----- - -== File System Iterator (`file-system-pipes-iterator`) - -Recursively walks a directory tree. - -**Module:** `tika-pipes-file-system` - -[cols="1,1,3"] -|=== -|Field |Default |Description - -|`basePath` -|_required_ -|Directory to walk. - -|`countTotal` -|`false` -|Count total files before processing (enables progress reporting). - -|`baseConfig` -|_required_ -|Fetcher/emitter IDs. -|=== - -== S3 Iterator (`s3-pipes-iterator`) - -Lists objects in an S3 bucket. - -**Module:** `tika-pipes-s3` - -[cols="1,1,3"] -|=== -|Field |Default |Description - -|`bucket` -|_required_ -|S3 bucket name. - -|`region` -|_required_ -|AWS region. - -|`prefix` -|_none_ -|Key prefix to filter objects. - -|`credentialsProvider` -|`profile` -|Credentials type. - -|`baseConfig` -|_required_ -|Fetcher/emitter IDs. -|=== - -== GCS Iterator (`gcs-pipes-iterator`) - -Lists objects in a Google Cloud Storage bucket. - -**Module:** `tika-pipes-gcs` - -== Azure Blob Iterator (`az-blob-pipes-iterator`) - -Lists blobs in an Azure Blob Storage container. +A `PipesIterator` produces a stream of `FetchEmitTuple` records. Each tuple carries: -**Module:** `tika-pipes-az-blob` +* the *fetch key* — passed to the fetcher to retrieve the document bytes +* the *emit key* — passed to the emitter to decide where to write results +* an optional `id` and arbitrary metadata fields -== CSV Iterator (`csv-pipes-iterator`) +The iterator runs on its own thread; the pipeline reads tuples as fast as the worker pool can keep up. -Reads rows from a CSV file to generate fetch/emit tuples. +[#wiring] +== Wiring an Iterator Into a Pipeline -**Module:** `tika-pipes-csv` +The iterator lives under the singular top-level `pipes-iterator` key. The inner map key is the iterator's component name. `fetcherId` and `emitterId` are *flat fields* on the iterator config — they are not wrapped in a `baseConfig` block. -[cols="1,1,3"] -|=== -|Field |Default |Description - -|`csvPath` -|_required_ -|Path to the CSV file. - -|`fetchKeyColumn` -|_required_ -|Column name containing the fetch key (file path, S3 key, etc.). - -|`emitKeyColumn` -|_none_ -|Column name for the emit key. If omitted, uses the fetch key. - -|`baseConfig` -|_required_ -|Fetcher/emitter IDs. -|=== - -== JDBC Iterator (`jdbc-pipes-iterator`) - -Executes a SQL query and uses each row as a fetch/emit tuple. - -**Module:** `tika-pipes-jdbc` - -[cols="1,1,3"] -|=== -|Field |Default |Description - -|`connection` -|_required_ -|JDBC connection string. - -|`select` -|_required_ -|SQL SELECT query. +[source,json] +---- +{ + "fetchers": { "fsf": { "file-system-fetcher": { "basePath": "/data/in" } } }, + "emitters": { "fse": { "file-system-emitter": { "basePath": "/data/out" } } }, + "pipes-iterator": { + "file-system-pipes-iterator": { + "basePath": "/data/in", + "fetcherId": "fsf", + "emitterId": "fse" + } + } +} +---- -|`fetchKeyColumn` -|_required_ -|Column containing the fetch key. +Only one iterator is active per pipeline. To process multiple sources in parallel, run multiple pipelines. -|`idColumn` -|_none_ -|Column containing the document ID. 
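+
+To make the tuple shape from <<contract,the contract above>> concrete, here is a sketch of a single work item as the JSON iterator would read it from a JSONL file (field names follow that plugin's convention; values are placeholders):
+
+[source,json]
+----
+{
+  "id": "doc-0001",
+  "fetchKey": "reports/q3.pdf",
+  "emitKey": "reports/q3.pdf.json"
+}
+----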
+[#plugins]
+== Available Iterators
+
+[cols="2,2,3"]
+|===
+|Plugin |Component name |Notes

-== Solr Iterator (`solr-pipes-iterator`)
-
-Queries a Solr collection and uses each document as a fetch/emit tuple.
-
-**Module:** `tika-pipes-solr`
-
-== JSON Iterator (`json-pipes-iterator`)
-
-Reads an array of objects from a JSON file.
+|xref:pipes/plugins/filesystem.adoc[File System]
+|`file-system-pipes-iterator`
+|Recursively walks a directory tree.

-**Module:** `tika-pipes-json`
+|xref:pipes/plugins/s3.adoc[Amazon S3]
+|`s3-pipes-iterator`
+|Lists S3 objects under a prefix.

-[cols="1,1,3"]
-|===
-|Field |Default |Description
+|xref:pipes/plugins/gcs.adoc[Google Cloud Storage]
+|`gcs-pipes-iterator`
+|Lists GCS objects under a prefix.

-|`jsonPath`
-|_required_
-|Path to the JSON file.
+|xref:pipes/plugins/azblob.adoc[Azure Blob Storage]
+|`az-blob-pipes-iterator`
+|Lists blobs under a prefix.

-|`baseConfig`
-|_required_
-|Fetcher/emitter IDs.
-|===
+|xref:pipes/plugins/solr.adoc[Apache Solr]
+|`solr-pipes-iterator`
+|Queries a Solr collection (useful for re-parsing).

-== Kafka Iterator (`kafka-pipes-iterator`)
+|xref:pipes/plugins/jdbc.adoc[JDBC]
+|`jdbc-pipes-iterator`
+|Walks rows from a SELECT query.

-Consumes messages from a Kafka topic as fetch/emit tuples.
+|xref:pipes/plugins/kafka.adoc[Apache Kafka]
+|`kafka-pipes-iterator`
+|Consumes fetch-request messages from a topic.

-**Module:** `tika-pipes-kafka`
+|xref:pipes/plugins/csv.adoc[CSV]
+|`csv-pipes-iterator`
+|Reads work items from a CSV file.

-[cols="1,1,3"]
+|xref:pipes/plugins/json.adoc[JSON]
+|`json-pipes-iterator`
+|Reads work items from a JSON-lines file.
+|===
-|Field |Default |Description
-
-|`topic`
-|_required_
-|Kafka topic.
-
-|`bootstrapServers`
-|_required_
-|Kafka broker addresses.

-|`groupId`
-|_required_
-|Consumer group ID.
-
-|`autoOffsetReset`
-|`earliest`
-|Where to start reading: `earliest` or `latest`.
-
-|`baseConfig`
-|_required_
-|Fetcher/emitter IDs.
-|===
+For the full plugin / interface matrix, see xref:pipes/plugins/index.adoc[Plugins].
diff --git a/docs/modules/ROOT/pages/pipes/plugins/atlassian-jwt.adoc b/docs/modules/ROOT/pages/pipes/plugins/atlassian-jwt.adoc
new file mode 100644
index 00000000000..30c9b9be446
--- /dev/null
+++ b/docs/modules/ROOT/pages/pipes/plugins/atlassian-jwt.adoc
@@ -0,0 +1,121 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Atlassian JWT Plugin
+:toc:
+:toclevels: 3
+
+The Atlassian JWT plugin (`tika-pipes-atlassian-jwt`) provides a fetcher specifically for Atlassian Connect endpoints (Jira Cloud, Confluence Cloud) that require an Atlassian-style JWT bearer token. It is fetcher-only — pair it with an emitter and an iterator from other plugins.
+ +For generic HTTP fetching, use the xref:pipes/plugins/http.adoc[HTTP plugin] instead. + +[cols="2,1,3"] +|=== +|Interface |Component name |Class + +|Fetcher +|`atlassian-jwt-fetcher` +|`AtlassianJwtFetcher` +|=== + +[#atlassian-jwt-fetcher] +== Atlassian JWT Fetcher (`atlassian-jwt-fetcher`) + +Fetches resources from an Atlassian Cloud endpoint, generating a fresh JWT for each request signed with the app's shared secret. + +[source,json] +---- +include::example$pipes-atlassian-jwt-fetcher.json[] +---- + +=== Configuration + +[cols="1,1,3"] +|=== +|Field |Default |Description + +|`issuer` +|_required_ +|Atlassian Connect app key (the `iss` claim). + +|`sharedSecret` +|_required_ +|Shared secret from the app installation, used as the JWT signing key. + +|`subject` +|_optional_ +|JWT `sub` claim — typically an account ID for user-context calls. + +|`jwtExpiresInSeconds` +|`3600` +|JWT validity window. Each request gets a freshly-signed token. + +|`userAgent` +|_no default_ +|`User-Agent` header sent on each request. + +|`maxConnections` +|`2000` +|HTTP connection-pool size. + +|`maxConnectionsPerRoute` +|`1000` +|Per-route connection-pool size. + +|`connectTimeoutMillis` +|`120000` +|TCP connect timeout. + +|`socketTimeoutMillis` +|`120000` +|Socket read timeout. + +|`requestTimeoutMillis` +|`120000` +|Connection-manager request timeout. + +|`overallTimeoutMillis` +|`120000` +|Hard cap on total time for a single fetch operation. + +|`maxRedirects` +|`0` +|Maximum number of redirects to follow. + +|`maxSpoolSize` +|`-1` +|Maximum bytes to spool locally. `-1` means no limit. + +|`maxErrMsgSize` +|`10000000` +|Maximum bytes of error response body to capture into the exception. + +|`httpHeaders` +|_empty_ +|Extra HTTP headers, formatted as `"Header: value"` strings (list). + +|`httpRequestHeaders` +|_empty_ +|Structured per-request headers as a `Header → [values]` map. +|=== + +[#notes] +== Notes + +* The JWT is computed per request — the `qsh` claim is derived from the request method and URL, as required by Atlassian Connect. +* `sharedSecret` is sensitive; use environment-variable substitution or external secret stores rather than inlining it in source control. +* For Jira Server / Data Center (not Cloud) endpoints, use the xref:pipes/plugins/http.adoc[HTTP plugin] with basic or token auth instead — those endpoints do not accept Atlassian Connect JWTs. diff --git a/docs/modules/ROOT/pages/pipes/plugins/csv.adoc b/docs/modules/ROOT/pages/pipes/plugins/csv.adoc new file mode 100644 index 00000000000..cd44e14d24b --- /dev/null +++ b/docs/modules/ROOT/pages/pipes/plugins/csv.adoc @@ -0,0 +1,75 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//
+
+= CSV Plugin
+:toc:
+:toclevels: 3
+
+The CSV plugin (`tika-pipes-csv`) provides an iterator that reads work items from a CSV file. It is iterator-only — pair it with a fetcher and emitter.
+
+[cols="2,1,3"]
+|===
+|Interface |Component name |Class
+
+|Iterator
+|`csv-pipes-iterator`
+|`CSVPipesIterator`
+|===
+
+[#csv-iterator]
+== CSV Iterator (`csv-pipes-iterator`)
+
+Reads each row of the CSV as a work item and emits one `FetchEmitTuple` per row.
+
+[source,json]
+----
+include::example$pipes-csv-iterator.json[]
+----
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`csvPath`
+|_required_
+|Path to the CSV file on disk.
+
+|`idColumn`
+|_optional_
+|Column whose value becomes the iterator's row identifier.
+
+|`fetchKeyColumn`
+|_optional_
+|Column whose value becomes the fetch key on each emitted tuple.
+
+|`emitKeyColumn`
+|_optional_
+|Column whose value becomes the emit key on each emitted tuple.
+
+|`fetcherId` / `emitterId`
+|_required_
+|IDs of the fetcher and emitter to bind to each emitted tuple. See xref:pipes/iterators.adoc[Pipes Iterators] for the shared iterator contract.
+|===
+
+[#notes]
+== Notes
+
+* The CSV must have a header row — column names in the config refer to header values, not column indexes.
+* For very large CSV files, the iterator streams rows rather than loading them all into memory.
+* For row-shaped work items in JSONL (one JSON object per line), use the xref:pipes/plugins/json.adoc[JSON iterator] instead.
diff --git a/docs/modules/ROOT/pages/pipes/plugins/elasticsearch.adoc b/docs/modules/ROOT/pages/pipes/plugins/elasticsearch.adoc
new file mode 100644
index 00000000000..110e325bae5
--- /dev/null
+++ b/docs/modules/ROOT/pages/pipes/plugins/elasticsearch.adoc
@@ -0,0 +1,196 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Elasticsearch Plugin
+:toc:
+:toclevels: 3
+
+The Elasticsearch plugin (`tika-pipes-es`) provides an emitter (writes parsed docs to an Elasticsearch index) and a reporter (writes per-document processing status to Elasticsearch).
+
+It mirrors the xref:pipes/plugins/opensearch.adoc[OpenSearch plugin] in structure. The field names differ — `esUrl` instead of `openSearchUrl` — and ES adds an `apiKey` field for ApiKey-based auth in addition to basic auth.
+
+[cols="2,1,3"]
+|===
+|Interface |Component name |Class
+
+|Emitter
+|`es-emitter`
+|`ESEmitter`
+
+|Reporter
+|`es-pipes-reporter`
+|`ESPipesReporter`
+|===
+
+[#credentials]
+== Authentication
+
+Two auth modes are supported, in this priority order:
+
+1. **ApiKey** — set the top-level `apiKey` field to the Base64-encoded `id:api_key` string Elasticsearch generates. Sent as `Authorization: ApiKey <base64>`.
+2. 
**Basic** — leave `apiKey` null/empty and set `userName` + `password` inside `httpClientConfig`. Sent as `Authorization: Basic ...`.
+
+The emitter overrides `toString()` to redact the `apiKey` value, so it does not leak into logs.
+
+[#http-client-config]
+== Shared HTTP Client Settings
+
+Both the emitter and the reporter accept a nested `httpClientConfig` block with these fields:
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`userName` / `password`
+|_optional_
+|Basic-auth credentials. Used only when `apiKey` is unset.
+
+|`authScheme`
+|_optional_
+|Set to `basic` to send credentials preemptively.
+
+|`connectionTimeoutMillis`
+|_no default_
+|HTTP connect timeout, in milliseconds.
+
+|`socketTimeoutMillis`
+|_no default_
+|HTTP socket read timeout, in milliseconds.
+
+|`proxyHost` / `proxyPort`
+|_optional_
+|Optional outbound HTTP proxy.
+|===
+
+[#es-emitter]
+== Elasticsearch Emitter (`es-emitter`)
+
+Writes parsed documents to an Elasticsearch index.
+
+[source,json]
+----
+include::example$pipes-elasticsearch-emitter.json[]
+----
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`esUrl`
+|_required_
+|Full URL of the target Elasticsearch index, e.g., `https://es.example.com:9200/tika-docs`.
+
+|`idField`
+|_required_
+|Field in the emitted JSON document that holds the Elasticsearch `_id`.
+
+|`attachmentStrategy`
+|_no default_
+|How attached/embedded documents are indexed. One of:
+
+* `SEPARATE_DOCUMENTS` — each attachment becomes its own top-level document.
+* `PARENT_CHILD` — attachments are nested under the parent in a parent/child relation.
+
+|`updateStrategy`
+|_no default_
+|How existing documents are handled. One of:
+
+* `OVERWRITE` — replaces an existing document at `_id`.
+* `UPSERT` — merges into an existing document.
+
+|`commitWithin`
+|_no default_
+|Kept for API parity with the OpenSearch emitter. ES does not consume this value.
+
+|`embeddedFileFieldName`
+|_no default_
+|Name of the field used to hold embedded-file content (used by `PARENT_CHILD`).
+
+|`apiKey`
+|_optional_
+|Base64-encoded `id:api_key`. See <<credentials>>.
+
+|`httpClientConfig`
+|_optional_
+|See <<http-client-config>>.
+|===
+
+[#es-reporter]
+== Elasticsearch Reporter (`es-pipes-reporter`)
+
+Writes per-document processing status records to an Elasticsearch index. Useful for building dashboards over pipeline activity.
+
+[source,json]
+----
+include::example$pipes-elasticsearch-reporter.json[]
+----
+
+`pipes-reporters` accepts multiple reporters keyed by type name — see xref:pipes/reporters.adoc[Pipes Reporters] for how multiple reporters compose.
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`esUrl`
+|_required_
+|Full URL of the status index, e.g., `https://es.example.com:9200/tika-status`.
+
+|`includes`
+|_optional_
+|Set of `RESULT_STATUS` names to include (e.g., `PARSE_SUCCESS`, `PARSE_EXCEPTION`). If unset, all are reported.
+
+|`excludes`
+|_optional_
+|Set of `RESULT_STATUS` names to skip. Applied after `includes`.
+
+|`keyPrefix`
+|_optional_
+|Prefix prepended to status field names in the emitted documents.
+
+|`includeRouting`
+|`false`
+|If `true`, include ES routing info in each status record.
+
+|`apiKey`
+|_optional_
+|Base64-encoded `id:api_key`. See <<credentials>>.
+
+|`httpClientConfig`
+|_optional_
+|See <<http-client-config>>.
+|===
+
+[#es-pipeline]
+== Complete Pipeline Example
+
+The example below combines a filesystem iterator/fetcher with the Elasticsearch emitter and reporter — a common pattern for ingesting a directory of documents into ES.
+
+[source,json]
+----
+include::example$pipes-elasticsearch-pipeline.json[]
+----
+
+[#notes]
+== Notes
+
+* The ES plugin's HTTP client is REST-based; it does not depend on the Elasticsearch transport client.
+* For OpenSearch deployments, use the parallel xref:pipes/plugins/opensearch.adoc[OpenSearch plugin] instead — the field names differ (`openSearchUrl` vs. `esUrl`).
+* Don't check real credentials into source control — the `apiKey` and `password` values in the examples above are placeholders.
diff --git a/docs/modules/ROOT/pages/pipes/plugins/google-drive.adoc b/docs/modules/ROOT/pages/pipes/plugins/google-drive.adoc
new file mode 100644
index 00000000000..d7dba7631ad
--- /dev/null
+++ b/docs/modules/ROOT/pages/pipes/plugins/google-drive.adoc
@@ -0,0 +1,79 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Google Drive Plugin
+:toc:
+:toclevels: 3
+
+The Google Drive plugin (`tika-pipes-google-drive`) provides a fetcher that retrieves files from Google Drive. It is fetcher-only — pair it with an emitter and an iterator from other plugins.
+
+[cols="2,1,3"]
+|===
+|Interface |Component name |Class
+
+|Fetcher
+|`google-drive-fetcher`
+|`GoogleDriveFetcher`
+|===
+
+[#google-drive-fetcher]
+== Google Drive Fetcher (`google-drive-fetcher`)
+
+Fetches files from Google Drive by file ID. The fetch key is the Drive file ID.
+
+[source,json]
+----
+include::example$pipes-google-drive-fetcher.json[]
+----
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`applicationName`
+|`tika-pipes`
+|Application name sent to the Google API for logging/quota tracking.
+
+|`serviceAccountKeyBase64`
+|_optional_
+|Base64-encoded service-account JSON key. If absent, the SDK falls back to Application Default Credentials (env var `GOOGLE_APPLICATION_CREDENTIALS` or workload identity).
+
+|`subjectUser`
+|_optional_
+|For domain-wide delegation: the user to impersonate (e.g., `user@example.com`).
+
+|`scopes`
+|_empty_
+|OAuth scopes to request. Typical: `["https://www.googleapis.com/auth/drive.readonly"]`.
+
+|`spoolToTemp`
+|`false`
+|If `true`, files are spooled to a temp file before being parsed.
+
+|`throttleSeconds`
+|_optional_
+|Rate-limit array — consecutive failures sleep for the corresponding number of seconds.
+|===
+
+[#notes]
+== Notes
+
+* The plugin uses Google's official `google-api-services-drive` SDK.
+* For domain-wide delegation, the service account must have been granted that scope in the Google Workspace admin console — Tika config alone is not enough.
+* Service-account credentials in `serviceAccountKeyBase64` are sensitive — use environment-variable substitution or external secret stores rather than checking the encoded JSON into source control.
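+
+As a sketch of how `throttleSeconds` composes with the other fields (the wrapper layout follows the xref:pipes/fetchers.adoc[Fetchers] page; values are illustrative):
+
+[source,json]
+----
+{
+  "fetchers": {
+    "gdrive": {
+      "google-drive-fetcher": {
+        "scopes": ["https://www.googleapis.com/auth/drive.readonly"],
+        "spoolToTemp": true,
+        "throttleSeconds": [30, 120, 600]
+      }
+    }
+  }
+}
+----
+
+Read against the table above: the first consecutive failure sleeps 30 seconds, the second 120, the third 600.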
diff --git a/docs/modules/ROOT/pages/pipes/plugins/http.adoc b/docs/modules/ROOT/pages/pipes/plugins/http.adoc
new file mode 100644
index 00000000000..d60a006243d
--- /dev/null
+++ b/docs/modules/ROOT/pages/pipes/plugins/http.adoc
@@ -0,0 +1,132 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= HTTP Plugin
+:toc:
+:toclevels: 3
+
+The HTTP plugin (`tika-pipes-http`) provides a fetcher that downloads documents over HTTP(S). It is fetcher-only — pair it with an emitter and an iterator from other plugins.
+
+[cols="2,1,3"]
+|===
+|Interface |Component name |Class
+
+|Fetcher
+|`http-fetcher`
+|`HttpFetcher`
+|===
+
+[#http-fetcher]
+== HTTP Fetcher (`http-fetcher`)
+
+Fetches document bytes from an HTTP(S) URL. The fetch key is the URL.
+
+[source,json]
+----
+include::example$pipes-http-fetcher.json[]
+----
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`userName` / `password`
+|_optional_
+|Basic-auth credentials.
+
+|`ntDomain`
+|_optional_
+|NT domain for NTLM auth.
+
+|`authScheme`
+|_optional_
+|Auth scheme hint: `basic`, `digest`, `ntlm`, or unset.
+
+|`proxyHost` / `proxyPort`
+|_optional_
+|Outbound HTTP proxy.
+
+|`userAgent`
+|_no default_
+|`User-Agent` header sent on each request.
+
+|`maxConnections`
+|`2000`
+|HTTP connection-pool size.
+
+|`maxConnectionsPerRoute`
+|`1000`
+|Per-route connection-pool size.
+
+|`connectTimeoutMillis`
+|`120000`
+|TCP connect timeout.
+
+|`socketTimeoutMillis`
+|`120000`
+|Socket read timeout.
+
+|`requestTimeoutMillis`
+|`120000`
+|Connection-manager request timeout.
+
+|`overallTimeoutMillis`
+|`120000`
+|Hard cap on total time for a single fetch operation.
+
+|`maxRedirects`
+|`0`
+|Maximum number of redirects to follow. `0` means follow none.
+
+|`maxSpoolSize`
+|`-1`
+|Maximum bytes to spool locally before failing. `-1` means no limit.
+
+|`maxErrMsgSize`
+|`10000000`
+|Maximum bytes of error response body to capture into the exception.
+
+|`httpHeaders`
+|_empty_
+|Extra HTTP headers, formatted as `"Header: value"` strings (list).
+
+|`httpRequestHeaders`
+|_empty_
+|Structured per-request headers as a `Header → [values]` map. Used when a header has multiple values.
+
+|`jwtIssuer` / `jwtSubject` / `jwtExpiresInSeconds`
+|_optional_
+|JWT claims, for endpoints that accept JWT-bearer auth.
+
+|`jwtSecret`
+|_optional_
+|HMAC secret for symmetric-key JWT signing.
+
+|`jwtPrivateKeyBase64`
+|_optional_
+|Base64-encoded private key for asymmetric (RSA/ECDSA) JWT signing. Mutually exclusive with `jwtSecret`.
+|===
+
+[#notes]
+== Notes
+
+* Both basic auth and JWT auth may be configured at the same time, but only one will apply per request (JWT takes precedence when present).
+* For zero-redirect crawling, leave `maxRedirects` at `0`. 
The fetcher returns the redirect response as-is so the caller can decide what to do. +* `overallTimeoutMillis` is enforced by the fetcher itself, not the HTTP client — it covers slow drains and zombie connections that the lower-level timeouts may miss. +* For Atlassian Cloud endpoints that require an Atlassian Connect JWT, use the dedicated xref:pipes/plugins/atlassian-jwt.adoc[Atlassian JWT fetcher] instead — it has the correct claim layout baked in. diff --git a/docs/modules/ROOT/pages/pipes/plugins/jdbc.adoc b/docs/modules/ROOT/pages/pipes/plugins/jdbc.adoc new file mode 100644 index 00000000000..c1c8747b5f3 --- /dev/null +++ b/docs/modules/ROOT/pages/pipes/plugins/jdbc.adoc @@ -0,0 +1,241 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += JDBC Plugin +:toc: +:toclevels: 3 + +The JDBC plugin (`tika-pipes-jdbc`) provides emitter, iterator, and reporter interfaces for relational databases. The plugin is JDBC-driver-agnostic: any database with a working JDBC driver on the plugin's classpath should work. + +[cols="2,1,3"] +|=== +|Interface |Component name |Class + +|Emitter +|`jdbc-emitter` +|`JDBCEmitter` + +|Iterator +|`jdbc-pipes-iterator` +|`JDBCPipesIterator` + +|Reporter +|`jdbc-reporter` +|`JDBCPipesReporter` +|=== + +[#drivers] +== JDBC Drivers + +The plugin does not bundle drivers. Drop the JDBC driver JAR for your database into the plugin's `lib/` directory alongside `tika-pipes-jdbc.jar` so the plugin class loader can find it. Tested drivers include H2, PostgreSQL, MySQL, SQLite, and SQL Server. + +[#jdbc-emitter] +== JDBC Emitter (`jdbc-emitter`) + +Writes parsed documents into a relational table. The emitter uses a prepared statement built from the `insert` template; the emit key is always the first bound parameter, followed by one parameter per entry in `keys`. + +[source,json] +---- +include::example$pipes-jdbc-emitter.json[] +---- + +=== Configuration + +[cols="1,1,3"] +|=== +|Field |Default |Description + +|`connection` +|_required_ +|JDBC connection URL (validated non-blank). Example: `jdbc:postgresql://db.example.com:5432/tika`. + +|`insert` +|_required_ +|Prepared-statement `INSERT` template. Must use `?` placeholders. The first placeholder receives the emit key; subsequent placeholders receive values from `keys` in order. + +|`createTable` +|_optional_ +|DDL executed once at startup. Use this to create the destination table if it does not already exist. + +|`alterTable` +|_optional_ +|DDL executed once at startup, after `createTable`. Use for indexes or migrations. + +|`postConnection` +|_optional_ +|SQL executed every time a new connection is opened (e.g., pragma statements for SQLite). + +|`maxRetries` +|`0` +|Number of times to retry a failed insert before giving up. 
+
+|`maxStringLength`
+|`64000`
+|String values longer than this are truncated. Set to `-1` to disable.
+
+|`keys`
+|_required_
+|Ordered map of metadata-field-name → SQL-type. Types: `string`, `int`, `long`, `bigint`, `boolean`, `timestamp`. The order matters — it must match the order of `?` placeholders in `insert`.
+
+|`attachmentStrategy`
+|`FIRST_ONLY`
+|How embedded documents are written. One of:
+
+* `FIRST_ONLY` — only the parent document is inserted; attachments are dropped.
+* `ALL` — every document (parent and attachments) gets its own row.
+
+|`multivaluedFieldStrategy`
+|`CONCATENATE`
+|How multi-valued metadata fields are handled. One of:
+
+* `FIRST_ONLY` — keep only the first value.
+* `CONCATENATE` — join values with `multivaluedFieldDelimiter`.
+
+|`multivaluedFieldDelimiter`
+|`", "`
+|Separator used by `CONCATENATE`.
+|===
+
+[#jdbc-iterator]
+== JDBC Iterator (`jdbc-pipes-iterator`)
+
+Walks rows returned by a SELECT statement, emitting one `FetchEmitTuple` per row.
+
+[source,json]
+----
+include::example$pipes-jdbc-iterator.json[]
+----
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`connection`
+|_required_
+|JDBC connection URL.
+
+|`select`
+|_required_
+|SELECT statement to enumerate.
+
+|`idColumn`
+|_optional_
+|Column whose value becomes the iterator's row identifier.
+
+|`fetchKeyColumn`
+|_optional_
+|Column whose value becomes the fetch key on each emitted tuple.
+
+|`emitKeyColumn`
+|_optional_
+|Column whose value becomes the emit key on each emitted tuple.
+
+|`fetchKeyRangeStartColumn` / `fetchKeyRangeEndColumn`
+|_optional_
+|Columns for range-based fetch keys (advanced).
+
+|`fetchSize`
+|`-1`
+|JDBC `fetchSize` hint. `-1` lets the driver choose.
+
+|`queryTimeoutSeconds`
+|`-1`
+|JDBC statement timeout. `-1` means no timeout.
+
+|`fetcherId` / `emitterId`
+|_required_
+|IDs of the fetcher and emitter to bind to each emitted tuple. See xref:pipes/iterators.adoc[Pipes Iterators] for the shared iterator contract.
+|===
+
+[#jdbc-reporter]
+== JDBC Reporter (`jdbc-reporter`)
+
+Writes per-document processing status to a SQL table. Records are buffered in memory and flushed periodically.
+
+[source,json]
+----
+include::example$pipes-jdbc-reporter.json[]
+----
+
+`pipes-reporters` accepts multiple reporters keyed by type name — see xref:pipes/reporters.adoc[Pipes Reporters] for how multiple reporters compose.
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`connectionString`
+|_required_
+|JDBC connection URL for the status database.
+
+|`includes`
+|_empty (all reported)_
+|Set of `RESULT_STATUS` names to include (e.g., `PARSE_SUCCESS`, `PARSE_EXCEPTION`).
+
+|`excludes`
+|_empty_
+|Set of `RESULT_STATUS` names to skip. Applied after `includes`.
+
+|`tableName`
+|`tika_status`
+|Status table name.
+
+|`createTable`
+|`true`
+|If `true`, drop the existing status table (if any) and recreate it on startup. Set to `false` to preserve an existing table.
+
+|`reportSql`
+|_no default_
+|Custom prepared-statement template for inserting/updating status rows. If unset, the reporter uses `insert into <tableName> (id, status, timestamp) values (?,?,?)`. Coordinate with `reportVariables` when overriding.
+
+|`postConnectionSql`
+|_no default_
+|SQL executed each time a connection is opened (e.g., SQLite pragmas).
+
+|`reportVariables`
+|_empty_
+|Names of the variables to bind to each `?` placeholder in `reportSql`, in order. Available names: `id`, `status`, `timestamp`. Only needed when overriding `reportSql`.
+ +|`reportWithinMs` +|`10000` +|Milliseconds between batched flushes from the in-memory cache to the database. + +|`cacheSize` +|`100` +|Maximum in-memory cache size before a flush is forced. +|=== + +[#jdbc-pipeline] +== Complete Pipeline Example + +The example below combines a JDBC iterator (reading work items from one table), a filesystem fetcher (reading the actual document bytes), a JDBC emitter (writing parsed metadata to a results table), and a JDBC reporter (recording per-document outcomes). + +[source,json] +---- +include::example$pipes-jdbc-pipeline.json[] +---- + +[#notes] +== Notes + +* H2 (`jdbc:h2:mem:...`) is convenient for testing — no setup required — but the schema is lost when the process exits. +* The emitter's `keys` map preserves insertion order (it's a `LinkedHashMap` in Java). When writing the JSON, list the keys in the same order as the `?` placeholders in `insert`. +* For high-throughput inserts, point `maxRetries` at a small positive number so transient connection failures don't drop documents. +* Bind variables are typed by the SQL type declared in `keys`, not by the metadata value's Java type. Mismatches between SQL type and column type cause inserts to fail — coordinate `createTable` with `keys`. diff --git a/docs/modules/ROOT/pages/pipes/plugins/json.adoc b/docs/modules/ROOT/pages/pipes/plugins/json.adoc new file mode 100644 index 00000000000..18ff976609a --- /dev/null +++ b/docs/modules/ROOT/pages/pipes/plugins/json.adoc @@ -0,0 +1,63 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += JSON Plugin +:toc: +:toclevels: 3 + +The JSON plugin (`tika-pipes-json`) provides an iterator that reads work items from a JSON-lines file (one JSON object per line). It is iterator-only — pair it with a fetcher and emitter. + +[cols="2,1,3"] +|=== +|Interface |Component name |Class + +|Iterator +|`json-pipes-iterator` +|`JsonPipesIterator` +|=== + +[#json-iterator] +== JSON Iterator (`json-pipes-iterator`) + +Reads each line of a JSONL file as a work item and emits one `FetchEmitTuple` per object. + +[source,json] +---- +include::example$pipes-json-iterator.json[] +---- + +=== Configuration + +[cols="1,1,3"] +|=== +|Field |Default |Description + +|`jsonPath` +|_required_ +|Path to the JSONL file on disk. + +|`fetcherId` / `emitterId` +|_required_ +|IDs of the fetcher and emitter to bind to each emitted tuple. See xref:pipes/iterators.adoc[Pipes Iterators] for the shared iterator contract. +|=== + +[#notes] +== Notes + +* The file format is JSON-lines (also called NDJSON) — one valid JSON object per line, no surrounding array brackets. +* Each line's JSON object is parsed and its fields become the metadata of the emitted `FetchEmitTuple`. 
The keys used for fetch / emit identification come from the tuple-shaped fields (e.g., `fetchKey`, `emitKey`). +* For columnar work items in a CSV, use the xref:pipes/plugins/csv.adoc[CSV iterator] instead. diff --git a/docs/modules/ROOT/pages/pipes/plugins/kafka.adoc b/docs/modules/ROOT/pages/pipes/plugins/kafka.adoc new file mode 100644 index 00000000000..f1da41fbe83 --- /dev/null +++ b/docs/modules/ROOT/pages/pipes/plugins/kafka.adoc @@ -0,0 +1,213 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Apache Kafka Plugin +:toc: +:toclevels: 3 + +The Apache Kafka plugin (`tika-pipes-kafka`) provides an emitter (publishes parsed documents to a Kafka topic) and an iterator (consumes fetch requests from a Kafka topic). + +[cols="2,1,3"] +|=== +|Interface |Component name |Class + +|Emitter +|`kafka-emitter` +|`KafkaEmitter` + +|Iterator +|`kafka-pipes-iterator` +|`KafkaPipesIterator` +|=== + +[#kafka-emitter] +== Kafka Emitter (`kafka-emitter`) + +Publishes each parsed document as a record to a Kafka topic. + +[source,json] +---- +include::example$pipes-kafka-emitter.json[] +---- + +=== Configuration + +Most fields map directly to standard Kafka producer settings; the defaults listed here match Kafka's own defaults unless noted. + +[cols="1,1,3"] +|=== +|Field |Default |Description + +|`topic` +|_required_ +|Kafka topic to publish to (validated non-blank). + +|`bootstrapServers` +|_required_ +|Comma-separated `host:port` list of Kafka brokers (validated non-blank). + +|`acks` +|`all` +|Producer acks setting: `0`, `1`, or `all`. + +|`lingerMs` +|`5000` +|Producer linger in milliseconds. + +|`batchSize` +|`16384` +|Producer batch size in bytes. + +|`bufferMemory` +|`33554432` +|Producer buffer memory in bytes (32 MiB). + +|`compressionType` +|`none` +|One of `none`, `gzip`, `snappy`, `lz4`, `zstd`. + +|`connectionsMaxIdleMs` +|`540000` +|Producer connection idle timeout. + +|`deliveryTimeoutMs` +|`120000` +|End-to-end delivery timeout. + +|`enableIdempotence` +|`false` +|Enable the idempotent producer. Requires `acks=all` and `maxInFlightRequestsPerConnection<=5`. + +|`interceptorClasses` +|_no default_ +|Comma-separated list of producer interceptor class names. + +|`maxBlockMs` +|`60000` +|How long the producer blocks on `send()` when the buffer is full. + +|`maxInFlightRequestsPerConnection` +|`5` +|In-flight requests per connection. + +|`maxRequestSize` +|`1048576` +|Maximum request size in bytes (1 MiB). + +|`metadataMaxAgeMs` +|`300000` +|Metadata refresh interval. + +|`requestTimeoutMs` +|`30000` +|Request timeout. + +|`retries` +|`2147483647` +|Producer retries. Default is `Integer.MAX_VALUE`; capped by `deliveryTimeoutMs`. + +|`retryBackoffMs` +|`100` +|Backoff between retries. 
+ +|`transactionTimeoutMs` +|`60000` +|Transaction timeout (only meaningful with `transactionalId`). + +|`transactionalId` +|_no default_ +|Set to enable transactional producer. + +|`clientId` +|_no default_ +|`client.id` to send with each request. + +|`keySerializer` / `valueSerializer` +|_no default_ +|Fully-qualified serializer class names. Leave unset to use the plugin's defaults (string keys, JSON values). +|=== + +[#kafka-iterator] +== Kafka Iterator (`kafka-pipes-iterator`) + +Consumes fetch-request messages from a Kafka topic and emits one `FetchEmitTuple` per message. Useful for building event-driven pipelines where some upstream system pushes work to a queue. + +[source,json] +---- +include::example$pipes-kafka-iterator.json[] +---- + +=== Configuration + +[cols="1,1,3"] +|=== +|Field |Default |Description + +|`topic` +|_required_ +|Kafka topic to consume from. + +|`bootstrapServers` +|_required_ +|Broker list. + +|`groupId` +|_optional_ +|Kafka consumer group ID. Strongly recommended in production for failover and partition reassignment. + +|`keySerializer` / `valueSerializer` +|_optional_ +|Custom (de)serializer class names. + +|`autoOffsetReset` +|`earliest` +|What to do on first connect: `earliest` or `latest`. + +|`pollDelayMs` +|`100` +|Sleep between `poll()` calls when the topic is idle. + +|`emitMax` +|`-1` +|Maximum tuples to emit. `-1` means unbounded. + +|`groupInitialRebalanceDelayMs` +|`3000` +|Initial rebalance delay for the consumer group. + +|`fetcherId` / `emitterId` +|_required_ +|IDs of the fetcher and emitter to bind to each emitted tuple. See xref:pipes/iterators.adoc[Pipes Iterators] for the shared iterator contract. +|=== + +[#kafka-pipeline] +== Complete Pipeline Example + +The example below wires the Kafka iterator (consuming fetch requests) with a filesystem fetcher and a Kafka emitter (publishing parsed results). Common for stream-processing-style document pipelines. + +[source,json] +---- +include::example$pipes-kafka-pipeline.json[] +---- + +[#notes] +== Notes + +* The Kafka plugin uses the official `kafka-clients` SDK. +* The emitter is fire-and-forget at the Tika level; durability is determined by Kafka's `acks` and broker replication factor, not by Tika. +* For exactly-once semantics, set `enableIdempotence: true` (and ensure `acks: all`); for transactional semantics, also set `transactionalId`. +* The iterator's `groupId` controls partition assignment. Set it explicitly in production — without one, the consumer receives a transient assignment that resets on restart. diff --git a/docs/modules/ROOT/pages/pipes/plugins/microsoft-graph.adoc b/docs/modules/ROOT/pages/pipes/plugins/microsoft-graph.adoc new file mode 100644 index 00000000000..ede02d8f622 --- /dev/null +++ b/docs/modules/ROOT/pages/pipes/plugins/microsoft-graph.adoc @@ -0,0 +1,85 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Microsoft Graph Plugin
+:toc:
+:toclevels: 3
+
+The Microsoft Graph plugin (`tika-pipes-microsoft-graph`) provides a fetcher that retrieves files from OneDrive, SharePoint, and other Graph-accessible sources. It is fetcher-only — pair it with an emitter and an iterator from other plugins.
+
+[cols="2,1,3"]
+|===
+|Interface |Component name |Class
+
+|Fetcher
+|`microsoft-graph-fetcher`
+|`MicrosoftGraphFetcher`
+|===
+
+[#credentials]
+== Credentials
+
+The fetcher authenticates against Microsoft Entra (Azure AD) using one of two credential modes — set exactly one:
+
+* **Client secret** (`clientSecretCredentialsConfig`) — easiest to set up; client secrets rotate manually.
+* **Client certificate** (`clientCertificateCredentialsConfig`) — for environments that require certificate-based auth.
+
+Both modes need the same two identity fields, `tenantId` and `clientId`, plus either `clientSecret` or `certificate`.
+
+[#microsoft-graph-fetcher]
+== Microsoft Graph Fetcher (`microsoft-graph-fetcher`)
+
+Fetches files via the Microsoft Graph API. The fetch key encodes the Graph object identifier.
+
+[source,json]
+----
+include::example$pipes-microsoft-graph-fetcher.json[]
+----
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`clientSecretCredentialsConfig`
+|_required (XOR)_
+|Nested object with `tenantId`, `clientId`, `clientSecret`. See <<credentials>>.
+
+|`clientCertificateCredentialsConfig`
+|_required (XOR)_
+|Nested object with `tenantId`, `clientId`, `certificate`. See <<credentials>>.
+
+|`scopes`
+|_empty_
+|OAuth scopes to request. Typical: `["https://graph.microsoft.com/.default"]` (application permissions).
+
+|`spoolToTemp`
+|`false`
+|If `true`, files are spooled to a temp file before being parsed.
+
+|`throttleSeconds`
+|_optional_
+|Rate-limit array — consecutive failures sleep for the corresponding number of seconds.
+|===
+
+[#notes]
+== Notes
+
+* The plugin uses the official `microsoft-graph` SDK.
+* For most service-to-service workflows, use application permissions (`https://graph.microsoft.com/.default` scope) — delegated permissions require an interactive flow that the fetcher does not support.
+* Client secrets are sensitive — use environment-variable substitution or external secret stores rather than inlining them in source control.
diff --git a/docs/modules/ROOT/pages/pipes/plugins/opensearch.adoc b/docs/modules/ROOT/pages/pipes/plugins/opensearch.adoc
new file mode 100644
index 00000000000..34931251198
--- /dev/null
+++ b/docs/modules/ROOT/pages/pipes/plugins/opensearch.adoc
@@ -0,0 +1,176 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= OpenSearch Plugin
+:toc:
+:toclevels: 3
+
+The OpenSearch plugin (`tika-pipes-opensearch`) provides an emitter (writes parsed docs to an OpenSearch index) and a reporter (writes per-document processing status to OpenSearch).
+
+[cols="2,1,3"]
+|===
+|Interface |Component name |Class
+
+|Emitter
+|`opensearch-emitter`
+|`OpenSearchEmitter`
+
+|Reporter
+|`opensearch-pipes-reporter`
+|`OpenSearchPipesReporter`
+|===
+
+[#http-client-config]
+== Shared HTTP Client Settings
+
+Both the emitter and the reporter accept a nested `httpClientConfig` block with these fields:
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`userName` / `password`
+|_optional_
+|Basic-auth credentials. Omit both for an anonymous client.
+
+|`authScheme`
+|_optional_
+|Set to `basic` to send credentials preemptively.
+
+|`connectionTimeoutMillis`
+|_no default_
+|HTTP connect timeout, in milliseconds.
+
+|`socketTimeoutMillis`
+|_no default_
+|HTTP socket read timeout, in milliseconds.
+
+|`proxyHost` / `proxyPort`
+|_optional_
+|Optional outbound HTTP proxy.
+|===
+
+[#opensearch-emitter]
+== OpenSearch Emitter (`opensearch-emitter`)
+
+Writes parsed documents to an OpenSearch index.
+
+[source,json]
+----
+include::example$pipes-opensearch-emitter.json[]
+----
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`openSearchUrl`
+|_required_
+|Full URL of the target OpenSearch index, e.g., `https://opensearch.example.com:9200/tika-docs`.
+
+|`idField`
+|_required_
+|Field in the emitted JSON document that holds the OpenSearch `_id`.
+
+|`attachmentStrategy`
+|_no default_
+|How attached/embedded documents are indexed. One of:
+
+* `SEPARATE_DOCUMENTS` — each attachment becomes its own top-level document.
+* `PARENT_CHILD` — attachments are nested under the parent in a parent/child relation.
+
+|`updateStrategy`
+|_no default_
+|How existing documents are handled. One of:
+
+* `OVERWRITE` — replaces an existing document at `_id`.
+* `UPSERT` — merges into an existing document.
+
+|`commitWithin`
+|_no default_
+|Maximum delay before the index refresh becomes visible, in milliseconds (passed to OpenSearch's `refresh` semantics).
+
+|`embeddedFileFieldName`
+|_no default_
+|Name of the field used to hold embedded-file content (used by `PARENT_CHILD`).
+
+|`httpClientConfig`
+|_optional_
+|See <<http-client-config>>.
+|===
+
+[#opensearch-reporter]
+== OpenSearch Reporter (`opensearch-pipes-reporter`)
+
+Writes per-document processing status records to an OpenSearch index. Useful for building dashboards over pipeline activity.
+
+[source,json]
+----
+include::example$pipes-opensearch-reporter.json[]
+----
+
+`pipes-reporters` accepts multiple reporters keyed by type name — see xref:pipes/reporters.adoc[Pipes Reporters] for how multiple reporters compose.
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`openSearchUrl`
+|_required_
+|Full URL of the status index, e.g., `https://opensearch.example.com:9200/tika-status`.
+
+|`includes`
+|_optional_
+|Set of `RESULT_STATUS` names to include (e.g., `PARSE_SUCCESS`, `PARSE_EXCEPTION`). If unset, all are reported.
+
+|`excludes`
+|_optional_
+|Set of `RESULT_STATUS` names to skip. Applied after `includes`.
+
+|`keyPrefix`
+|_optional_
+|Prefix prepended to status field names in the emitted documents.
+
+|`includeRouting`
+|`false`
+|If `true`, include OpenSearch routing info in each status record.
+
+|`httpClientConfig`
+|_optional_
+|See <<http-client-config>>.
+
+[#opensearch-reporter]
+== OpenSearch Reporter (`opensearch-pipes-reporter`)
+
+Writes per-document processing status records to an OpenSearch index. Useful for building dashboards over pipeline activity.
+
+[source,json]
+----
+include::example$pipes-opensearch-reporter.json[]
+----
+
+`pipes-reporters` accepts multiple reporters keyed by type name — see xref:pipes/reporters.adoc[Pipes Reporters] for how multiple reporters compose.
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`openSearchUrl`
+|_required_
+|Full URL of the status index, e.g., `https://opensearch.example.com:9200/tika-status`.
+
+|`includes`
+|_optional_
+|Set of `RESULT_STATUS` names to include (e.g., `PARSE_SUCCESS`, `PARSE_EXCEPTION`). If unset, all are reported.
+
+|`excludes`
+|_optional_
+|Set of `RESULT_STATUS` names to skip. Applied after `includes`.
+
+|`keyPrefix`
+|_optional_
+|Prefix prepended to status field names in the emitted documents.
+
+|`includeRouting`
+|`false`
+|If `true`, include OpenSearch routing info in each status record.
+
+|`httpClientConfig`
+|_optional_
+|See <<http-client-config>>.
+|===
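+
+=== Example: status filtering
+
+A minimal sketch of a reporter that records only successes and terminal failures. The
+URL is a placeholder; the status names are the `RESULT_STATUS` values used elsewhere in
+this documentation.
+
+[source,json]
+----
+{
+  "pipes-reporters": {
+    "opensearch-pipes-reporter": {
+      "openSearchUrl": "https://opensearch.example.com:9200/tika-status",
+      "includes": ["PARSE_SUCCESS", "PARSE_EXCEPTION", "OOM", "TIMEOUT"],
+      "keyPrefix": "tika_",
+      "includeRouting": false
+    }
+  }
+}
+----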
+
+[#opensearch-pipeline]
+== Complete Pipeline Example
+
+The example below combines a filesystem iterator/fetcher with the OpenSearch emitter and reporter — a common pattern for ingesting a directory of documents into an OpenSearch index.
+
+[source,json]
+----
+include::example$pipes-opensearch-pipeline.json[]
+----
+
+[#notes]
+== Notes
+
+* The OpenSearch plugin's HTTP client is REST-based; it does not depend on the OpenSearch transport client.
+* For Elasticsearch deployments, use the parallel xref:pipes/plugins/elasticsearch.adoc[Elasticsearch plugin] instead — the field names differ (`esUrl` vs. `openSearchUrl`) and ES adds API-key auth.
+* Don't check real credentials into source control — the `password` values in the examples above are placeholders.
diff --git a/docs/modules/ROOT/pages/pipes/plugins/solr.adoc b/docs/modules/ROOT/pages/pipes/plugins/solr.adoc
new file mode 100644
index 00000000000..808e2bcdb63
--- /dev/null
+++ b/docs/modules/ROOT/pages/pipes/plugins/solr.adoc
@@ -0,0 +1,202 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Apache Solr Plugin
+:toc:
+:toclevels: 3
+
+The Apache Solr plugin (`tika-pipes-solr`) provides an emitter (writes parsed docs to a Solr collection) and an iterator (enumerates documents already in Solr for re-processing).
+
+[cols="2,1,3"]
+|===
+|Interface |Component name |Class
+
+|Emitter
+|`solr-emitter`
+|`SolrEmitter`
+
+|Iterator
+|`solr-pipes-iterator`
+|`SolrPipesIterator`
+|===
+
+[#connection-modes]
+== Connection Modes
+
+Both components support two ways of locating a Solr cluster — pick exactly one:
+
+* **Direct URLs (`solrUrls`)** — list one or more node URLs. Use this for standalone Solr or for SolrCloud when you want to bypass ZooKeeper for routing.
+* **ZooKeeper (`solrZkHosts` + `solrZkChroot`)** — list the ZooKeeper ensemble; Solr discovers nodes via ZK. Use this for SolrCloud deployments.
+
+The emitter's `validate()` enforces the XOR: setting neither or both throws a `TikaConfigException`.
+
+[#solr-emitter]
+== Solr Emitter (`solr-emitter`)
+
+Writes parsed documents to a Solr collection.
+
+[source,json]
+----
+include::example$pipes-solr-emitter.json[]
+----
+
+For SolrCloud with ZooKeeper-based routing, use `solrZkHosts` (and optionally `solrZkChroot`) instead of `solrUrls`:
+
+[source,json]
+----
+include::example$pipes-solr-emitter-zk.json[]
+----
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`solrCollection`
+|_required_
+|Solr collection (or core) name (validated non-blank).
+
+|`solrUrls`
+|_required (XOR)_
+|List of node URLs, e.g., `["http://solr1.example.com:8983/solr"]`. Mutually exclusive with `solrZkHosts`.
+
+|`solrZkHosts`
+|_required (XOR)_
+|List of ZooKeeper hosts, e.g., `["zk1.example.com:2181"]`. Mutually exclusive with `solrUrls`.
+
+|`solrZkChroot`
+|_optional_
+|ZooKeeper chroot, when using `solrZkHosts`.
+
+|`idField`
+|`id`
+|Field in the emitted JSON document that holds the Solr document id (the collection's unique key).
+
+|`commitWithin`
+|`1000`
+|Solr `commitWithin` value, in milliseconds.
+
+|`connectionTimeoutMillis`
+|`10000`
+|HTTP connect timeout.
+
+|`socketTimeoutMillis`
+|`60000`
+|HTTP socket read timeout.
+
+|`attachmentStrategy`
+|`PARENT_CHILD`
+|How attached/embedded documents are indexed. One of:
+
+* `SEPARATE_DOCUMENTS` — each attachment becomes its own top-level document.
+* `PARENT_CHILD` — attachments are nested under the parent.
+
+|`updateStrategy`
+|`ADD`
+|How existing documents are handled. One of:
+
+* `ADD` — replaces any existing document with the same id.
+* `UPDATE_MUST_EXIST` — fails if no document exists with that id.
+* `UPDATE_MUST_NOT_EXIST` — fails if a document already exists with that id.
+
+|`embeddedFileFieldName`
+|`embedded`
+|Field name used to hold embedded-file content (used by `PARENT_CHILD`).
+
+|`userName` / `password` / `authScheme`
+|_optional_
+|HTTP basic auth credentials.
+
+|`proxyHost` / `proxyPort`
+|_optional_
+|Optional outbound HTTP proxy.
+|===
+
+[#solr-iterator]
+== Solr Iterator (`solr-pipes-iterator`)
+
+Enumerates documents already in a Solr collection and emits one `FetchEmitTuple` per matching document. Useful for re-parsing existing documents — e.g., after a parser bug fix or a Tika upgrade.
+
+[source,json]
+----
+include::example$pipes-solr-iterator.json[]
+----
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`solrCollection`
+|_required_
+|Solr collection to iterate.
+
+|`solrUrls` / `solrZkHosts` / `solrZkChroot`
+|_required (XOR)_
+|Connection mode — see <<connection-modes>>.
+
+|`filters`
+|_empty_
+|List of Solr filter queries to scope the iteration (e.g., `["status:NEEDS_REPARSE"]`).
+
+|`idField`
+|_no default_
+|Solr field used as the iterator's row identifier.
+
+|`parsingIdField` / `failCountField` / `sizeFieldName` / `additionalFields`
+|_optional_
+|Extra Solr fields surfaced into the `FetchEmitTuple` metadata. Advanced; usually unset.
+
+|`rows`
+|`5000`
+|Page size for the underlying Solr query.
+
+|`connectionTimeoutMillis`
+|`10000`
+|HTTP connect timeout.
+
+|`socketTimeoutMillis`
+|`60000`
+|HTTP socket read timeout.
+
+|`userName` / `password` / `authScheme` / `proxyHost` / `proxyPort`
+|_optional_
+|Same as the emitter.
+
+|`fetcherId` / `emitterId`
+|_required_
+|IDs of the fetcher and emitter to bind to each emitted tuple. See xref:pipes/iterators.adoc[Pipes Iterators] for the shared iterator contract.
+|===
+
+[#solr-pipeline]
+== Complete Pipeline Example
+
+The example below combines a filesystem iterator/fetcher with the Solr emitter — the common pattern for ingesting a directory of documents into Solr.
+
+[source,json]
+----
+include::example$pipes-solr-pipeline.json[]
+----
+
+[#notes]
+== Notes
+
+* The Solr plugin uses SolrJ (`solr-solrj`). HTTP/2 transport is used when available.
+* For re-parsing workflows, point a `solr-pipes-iterator` at the same collection a `solr-emitter` writes to, but use `UPDATE_MUST_EXIST` on the emitter to avoid creating phantom rows; a sketch follows in <<reparse-example>>.
+* `commitWithin` is a soft guarantee — Solr may delay commits under load. If you need firmer control over commit timing, configure auto-commit on the Solr side and leave `commitWithin` at its default.
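+
+[#reparse-example]
+== Re-parse Workflow Sketch
+
+The sketch below illustrates the pairing recommended in the notes: the iterator selects
+flagged documents from the collection and the emitter writes back with
+`UPDATE_MUST_EXIST`, so nothing new is created. It is illustrative only: the collection
+name, URLs, filter query, and component ids (`se`, plus an externally defined fetcher
+`fsf` that supplies the original bytes) are placeholders.
+
+[source,json]
+----
+{
+  "pipes-iterator": {
+    "solr-pipes-iterator": {
+      "solrCollection": "tika-docs",
+      "solrUrls": ["http://solr1.example.com:8983/solr"],
+      "filters": ["status:NEEDS_REPARSE"],
+      "idField": "id",
+      "fetcherId": "fsf",
+      "emitterId": "se"
+    }
+  },
+  "emitters": {
+    "se": {
+      "solr-emitter": {
+        "solrCollection": "tika-docs",
+        "solrUrls": ["http://solr1.example.com:8983/solr"],
+        "updateStrategy": "UPDATE_MUST_EXIST"
+      }
+    }
+  }
+}
+----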
diff --git a/docs/modules/ROOT/pages/pipes/reporters.adoc b/docs/modules/ROOT/pages/pipes/reporters.adoc index 3994ede95e2..01bc05e6043 100644 --- a/docs/modules/ROOT/pages/pipes/reporters.adoc +++ b/docs/modules/ROOT/pages/pipes/reporters.adoc @@ -16,78 +16,59 @@ // = Pipes Reporters +:toc: -Reporters track the processing status of each document in the pipeline. -They record whether a parse succeeded, failed, or timed out, along with -timing information. +A *pipes reporter* records per-document processing status — success, parse exception, timeout, OOM — as the pipeline runs. Reporters are observational; they do not gate parsing or emission. -== File System Reporter (`file-system-reporter`) +[#contract] +== The Reporter Contract -Writes a JSON status file that is updated periodically. +Each reporter implements `PipesReporter#report(FetchEmitTuple t, PipesResult result, long elapsed)` and gets called once per processed document. Reporters typically buffer status records in memory and flush them on a background thread, so per-document calls stay cheap. -**Module:** `tika-pipes-file-system` +[#wiring] +== Wiring Reporters Into a Pipeline -[cols="1,1,3"] -|=== -|Field |Default |Description - -|`statusFile` -|_required_ -|Path to the JSON status file. - -|`reportUpdateMs` -|`1000` -|How often to update the status file (milliseconds). -|=== - -== JDBC Reporter (`jdbc-reporter`) - -Writes per-document status to a SQL database table. - -**Module:** `tika-pipes-jdbc` +Reporters live under the plural top-level `pipes-reporters` key. The keys inside that block are reporter type-names; multiple reporters may run together. -[cols="1,1,3"] -|=== -|Field |Default |Description +[source,json] +---- +{ + "pipes-reporters": { + "file-system-reporter": { + "statusFile": "/var/log/tika/status.json", + "reportUpdateMs": 1000 + }, + "jdbc-reporter": { + "connectionString": "jdbc:h2:mem:reports;DB_CLOSE_DELAY=-1" + } + } +} +---- -|`connectionString` -|_required_ -|JDBC connection string. +Each entry's outer key is the reporter's component name — there is no separate ID layer because reporters do not get referenced by other components. -|`tableName` -|_required_ -|Table name for status records. +[#plugins] +== Available Reporters -|`createTable` -|`false` -|Auto-create the table if it does not exist. +[cols="2,2,3"] |=== +|Plugin |Component name |Notes -== Elasticsearch Reporter (`es-pipes-reporter`) +|xref:pipes/plugins/filesystem.adoc[File System] +|`file-system-reporter` +|Writes a JSON status file periodically. Pair with an external watcher — see xref:pipes/plugins/filesystem.adoc#watching[Live status for watching applications]. -Writes per-document parse status back into the Elasticsearch index via upsert. +|xref:pipes/plugins/jdbc.adoc[JDBC] +|`jdbc-reporter` +|Writes per-doc status rows to a SQL table. -**Module:** `tika-pipes-es` +|xref:pipes/plugins/opensearch.adoc[OpenSearch] +|`opensearch-pipes-reporter` +|Writes per-doc status records to an OpenSearch index. -[cols="1,1,3"] +|xref:pipes/plugins/elasticsearch.adoc[Elasticsearch] +|`es-pipes-reporter` +|Writes per-doc status records to an Elasticsearch index. |=== -|Field |Default |Description - -|`esUrl` -|_required_ -|Elasticsearch endpoint (including index). - -|`keyPrefix` -|`tika_` -|Prefix for status fields (e.g., `tika_parse_status`). - -|`includeRouting` -|`false` -|Include routing in upsert requests. -|=== - -== OpenSearch Reporter (`opensearch-pipes-reporter`) - -Same as the ES reporter but for OpenSearch. 
Uses `openSearchUrl` instead of `esUrl`. -**Module:** `tika-pipes-opensearch` +For the full plugin / interface matrix, see xref:pipes/plugins/index.adoc[Plugins]. diff --git a/docs/modules/ROOT/pages/pipes/shared-server-mode.adoc b/docs/modules/ROOT/pages/pipes/shared-server-mode.adoc index d956acb5fac..e95180ecc72 100644 --- a/docs/modules/ROOT/pages/pipes/shared-server-mode.adoc +++ b/docs/modules/ROOT/pages/pipes/shared-server-mode.adoc @@ -101,6 +101,8 @@ When using shared mode, size the JVM heap for worst-case concurrent load: In per-client mode, the same workload would use 4 x 500MB = 2GB total, but distributed across 4 isolated JVMs where one OOM only affects one request. +For the per-worker `-Xmx` rule of thumb that informs both modes, see xref:pipes/cpu-sizing.adoc#heap-per-worker[Heap per worker]. + == Recovery Behavior When a fatal error occurs (OOM, timeout, or crash): diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-atlassian-jwt/src/test/java/org/apache/tika/pipes/atlassianjwt/ConfigExamplesTest.java b/tika-pipes/tika-pipes-plugins/tika-pipes-atlassian-jwt/src/test/java/org/apache/tika/pipes/atlassianjwt/ConfigExamplesTest.java new file mode 100644 index 00000000000..e1802d01029 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-atlassian-jwt/src/test/java/org/apache/tika/pipes/atlassianjwt/ConfigExamplesTest.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.atlassianjwt; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.config.loader.TikaLoader; +import org.apache.tika.pipes.fetcher.atlassianjwt.config.AtlassianJwtFetcherConfig; + +/** + * Validates Atlassian JWT fetcher configuration examples used in documentation. 
+ */ +public class ConfigExamplesTest { + + private static final String EXAMPLES_DIR = "/config-examples/"; + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + @TempDir + Path tempDir; + + private String readExample(String resourceName) throws Exception { + try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) { + assertNotNull(is, "Resource not found: " + resourceName); + return new String(is.readAllBytes(), StandardCharsets.UTF_8); + } + } + + @Test + public void testAtlassianJwtFetcherConfig() throws Exception { + String json = readExample("atlassian-jwt-fetcher.json"); + Path configFile = tempDir.resolve("tika-config.json"); + Files.writeString(configFile, json, StandardCharsets.UTF_8); + assertNotNull(TikaLoader.load(configFile)); + + JsonNode inner = OBJECT_MAPPER.readTree(json) + .get("fetchers").get("ajwt").get("atlassian-jwt-fetcher"); + AtlassianJwtFetcherConfig config = AtlassianJwtFetcherConfig.load(inner.toString()); + assertEquals("tika-pipes-app-key", config.getIssuer()); + assertNotNull(config.getSharedSecret()); + assertEquals("service-account@example.com", config.getSubject()); + assertEquals(Integer.valueOf(3600), config.getJwtExpiresInSeconds()); + assertEquals(Integer.valueOf(30000), config.getConnectTimeoutMillis()); + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-atlassian-jwt/src/test/resources/config-examples/atlassian-jwt-fetcher.json b/tika-pipes/tika-pipes-plugins/tika-pipes-atlassian-jwt/src/test/resources/config-examples/atlassian-jwt-fetcher.json new file mode 100644 index 00000000000..01c165a61ec --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-atlassian-jwt/src/test/resources/config-examples/atlassian-jwt-fetcher.json @@ -0,0 +1,19 @@ +{ + "fetchers": { + "ajwt": { + "atlassian-jwt-fetcher": { + "issuer": "tika-pipes-app-key", + "sharedSecret": "REDACTED_SHARED_SECRET", + "subject": "service-account@example.com", + "jwtExpiresInSeconds": 3600, + "maxConnections": 2000, + "maxConnectionsPerRoute": 1000, + "connectTimeoutMillis": 30000, + "socketTimeoutMillis": 60000, + "requestTimeoutMillis": 60000, + "overallTimeoutMillis": 120000, + "userAgent": "tika-pipes/1.0" + } + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config/tika-config-az-blob-fetcher.xml b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config/tika-config-az-blob-fetcher.xml deleted file mode 100644 index 2aa6ba9a533..00000000000 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config/tika-config-az-blob-fetcher.xml +++ /dev/null @@ -1,30 +0,0 @@ - - - - - - az-blob - - - - - - - \ No newline at end of file diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config/tika-config-az-blob.xml b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config/tika-config-az-blob.xml deleted file mode 100644 index c1ba42b07b0..00000000000 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config/tika-config-az-blob.xml +++ /dev/null @@ -1,28 +0,0 @@ - - - - - - az-blob - - - - - - - \ No newline at end of file diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/test/java/org/apache/tika/pipes/csv/ConfigExamplesTest.java b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/test/java/org/apache/tika/pipes/csv/ConfigExamplesTest.java new file mode 100644 index 00000000000..75ca4429186 --- /dev/null +++ 
b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/test/java/org/apache/tika/pipes/csv/ConfigExamplesTest.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.csv; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.config.loader.TikaLoader; +import org.apache.tika.pipes.iterator.csv.CSVPipesIteratorConfig; + +/** + * Validates CSV iterator configuration example used in documentation. + */ +public class ConfigExamplesTest { + + private static final String EXAMPLES_DIR = "/config-examples/"; + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + @TempDir + Path tempDir; + + private String readExample(String resourceName) throws Exception { + try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) { + assertNotNull(is, "Resource not found: " + resourceName); + return new String(is.readAllBytes(), StandardCharsets.UTF_8); + } + } + + @Test + public void testCsvIteratorConfig() throws Exception { + String json = readExample("csv-pipes-iterator.json"); + Path configFile = tempDir.resolve("tika-config.json"); + Files.writeString(configFile, json, StandardCharsets.UTF_8); + assertNotNull(TikaLoader.load(configFile)); + + JsonNode inner = OBJECT_MAPPER.readTree(json) + .get("pipes-iterator").get("csv-pipes-iterator"); + CSVPipesIteratorConfig config = CSVPipesIteratorConfig.load(inner.toString()); + assertNotNull(config.getCsvPath()); + assertEquals("doc_id", config.getIdColumn()); + assertEquals("source_path", config.getFetchKeyColumn()); + assertEquals("output_path", config.getEmitKeyColumn()); + assertEquals("fsf", config.getFetcherId()); + assertEquals("fse", config.getEmitterId()); + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/test/resources/config-examples/csv-pipes-iterator.json b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/test/resources/config-examples/csv-pipes-iterator.json new file mode 100644 index 00000000000..92abb5fb7b4 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/test/resources/config-examples/csv-pipes-iterator.json @@ -0,0 +1,12 @@ +{ + "pipes-iterator": { + "csv-pipes-iterator": { + "csvPath": "/data/work-items.csv", + "idColumn": "doc_id", + "fetchKeyColumn": "source_path", + "emitKeyColumn": "output_path", + "fetcherId": "fsf", + "emitterId": "fse" + } + } +} diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-es/src/test/java/org/apache/tika/pipes/es/ConfigExamplesTest.java b/tika-pipes/tika-pipes-plugins/tika-pipes-es/src/test/java/org/apache/tika/pipes/es/ConfigExamplesTest.java new file mode 100644 index 00000000000..b1be5faa4be --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-es/src/test/java/org/apache/tika/pipes/es/ConfigExamplesTest.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.es; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.config.loader.TikaLoader; +import org.apache.tika.pipes.emitter.es.ESEmitterConfig; +import org.apache.tika.pipes.reporter.es.ESReporterConfig; + +/** + * Validates Elasticsearch emitter/reporter configuration examples used in documentation. + *

+ * The JSON configuration examples are stored in {@code src/test/resources/config-examples/} + * and are included directly in the AsciiDoc documentation via the {@code include::} directive. + */ +public class ConfigExamplesTest { + + private static final String EXAMPLES_DIR = "/config-examples/"; + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + @TempDir + Path tempDir; + + private String readExample(String resourceName) throws Exception { + try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) { + assertNotNull(is, "Resource not found: " + resourceName); + return new String(is.readAllBytes(), StandardCharsets.UTF_8); + } + } + + private void loadViaTikaLoader(String resourceName) throws Exception { + String json = readExample(resourceName); + Path configFile = tempDir.resolve("tika-config.json"); + Files.writeString(configFile, json, StandardCharsets.UTF_8); + TikaLoader loader = TikaLoader.load(configFile); + assertNotNull(loader, "TikaLoader should not be null for: " + resourceName); + } + + private JsonNode innerComponent(String json, String section, String id, String typeName) + throws Exception { + JsonNode root = OBJECT_MAPPER.readTree(json); + JsonNode sectionNode = root.get(section); + assertNotNull(sectionNode, "Missing section: " + section); + JsonNode idNode = id == null ? sectionNode : sectionNode.get(id); + assertNotNull(idNode, "Missing id: " + id); + JsonNode typed = idNode.get(typeName); + assertNotNull(typed, "Missing type: " + typeName); + return typed; + } + + @Test + public void testESEmitterConfig() throws Exception { + loadViaTikaLoader("es-emitter.json"); + + JsonNode inner = innerComponent(readExample("es-emitter.json"), + "emitters", "ese", "es-emitter"); + ESEmitterConfig config = ESEmitterConfig.load(inner.toString()); + assertEquals("doc_id", config.idField()); + assertEquals(ESEmitterConfig.AttachmentStrategy.PARENT_CHILD, + config.attachmentStrategy()); + assertEquals(ESEmitterConfig.UpdateStrategy.OVERWRITE, config.updateStrategy()); + assertEquals(1000, config.commitWithin()); + assertNotNull(config.apiKey()); + assertNotNull(config.httpClientConfig()); + // The toString override redacts the apiKey value + assertFalse(config.toString().contains(config.apiKey()), + "apiKey must not appear in toString() output"); + } + + @Test + public void testESReporterConfig() throws Exception { + loadViaTikaLoader("es-reporter.json"); + + JsonNode inner = innerComponent(readExample("es-reporter.json"), + "pipes-reporters", null, "es-pipes-reporter"); + ESReporterConfig config = ESReporterConfig.load(inner.toString()); + assertTrue(config.esUrl().contains("tika-status")); + assertEquals("tika_", config.keyPrefix()); + assertTrue(config.includeRouting()); + assertNotNull(config.includes()); + assertTrue(config.includes().contains("PARSE_SUCCESS")); + assertNotNull(config.httpClientConfig()); + } + + @Test + public void testESPipelineConfig() throws Exception { + loadViaTikaLoader("es-pipeline.json"); + + String json = readExample("es-pipeline.json"); + ESEmitterConfig emitter = ESEmitterConfig.load( + innerComponent(json, "emitters", "ese", "es-emitter").toString()); + ESReporterConfig reporter = ESReporterConfig.load( + innerComponent(json, "pipes-reporters", null, "es-pipes-reporter").toString()); + + assertEquals("doc_id", emitter.idField()); + assertNotNull(reporter.httpClientConfig()); + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-es/src/test/resources/config-examples/es-emitter.json 
b/tika-pipes/tika-pipes-plugins/tika-pipes-es/src/test/resources/config-examples/es-emitter.json new file mode 100644 index 00000000000..e0f341cd6c3 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-es/src/test/resources/config-examples/es-emitter.json @@ -0,0 +1,19 @@ +{ + "emitters": { + "ese": { + "es-emitter": { + "esUrl": "https://es.example.com:9200/tika-docs", + "idField": "doc_id", + "attachmentStrategy": "PARENT_CHILD", + "updateStrategy": "OVERWRITE", + "commitWithin": 1000, + "embeddedFileFieldName": "embedded", + "apiKey": "REDACTED_BASE64_ID_AND_KEY", + "httpClientConfig": { + "connectionTimeoutMillis": 10000, + "socketTimeoutMillis": 60000 + } + } + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-es/src/test/resources/config-examples/es-pipeline.json b/tika-pipes/tika-pipes-plugins/tika-pipes-es/src/test/resources/config-examples/es-pipeline.json new file mode 100644 index 00000000000..1dce6405d21 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-es/src/test/resources/config-examples/es-pipeline.json @@ -0,0 +1,60 @@ +{ + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + "throwOnWriteLimitReached": true + } + }, + "fetchers": { + "fsf": { + "file-system-fetcher": { + "basePath": "/data/input", + "extractFileSystemMetadata": false + } + } + }, + "emitters": { + "ese": { + "es-emitter": { + "esUrl": "https://es.example.com:9200/tika-docs", + "idField": "doc_id", + "attachmentStrategy": "PARENT_CHILD", + "updateStrategy": "OVERWRITE", + "commitWithin": 1000, + "embeddedFileFieldName": "embedded", + "apiKey": "REDACTED_BASE64_ID_AND_KEY", + "httpClientConfig": { + "connectionTimeoutMillis": 10000, + "socketTimeoutMillis": 60000 + } + } + } + }, + "pipes-iterator": { + "file-system-pipes-iterator": { + "basePath": "/data/input", + "countTotal": true, + "fetcherId": "fsf", + "emitterId": "ese" + } + }, + "pipes-reporters": { + "es-pipes-reporter": { + "esUrl": "https://es.example.com:9200/tika-status", + "includes": ["PARSE_SUCCESS", "PARSE_EXCEPTION", "OOM", "TIMEOUT"], + "keyPrefix": "tika_", + "includeRouting": true, + "apiKey": "REDACTED_BASE64_ID_AND_KEY", + "httpClientConfig": { + "connectionTimeoutMillis": 10000, + "socketTimeoutMillis": 60000 + } + } + }, + "pipes": { + "parseMode": "RMETA", + "onParseException": "EMIT", + "numClients": 4 + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-es/src/test/resources/config-examples/es-reporter.json b/tika-pipes/tika-pipes-plugins/tika-pipes-es/src/test/resources/config-examples/es-reporter.json new file mode 100644 index 00000000000..4761705035c --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-es/src/test/resources/config-examples/es-reporter.json @@ -0,0 +1,15 @@ +{ + "pipes-reporters": { + "es-pipes-reporter": { + "esUrl": "https://es.example.com:9200/tika-status", + "includes": ["PARSE_SUCCESS", "PARSE_EXCEPTION", "OOM", "TIMEOUT"], + "keyPrefix": "tika_", + "includeRouting": true, + "apiKey": "REDACTED_BASE64_ID_AND_KEY", + "httpClientConfig": { + "connectionTimeoutMillis": 10000, + "socketTimeoutMillis": 60000 + } + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config/tika-config-gcs.xml b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config/tika-config-gcs.xml deleted file mode 100644 index 2ea06761e51..00000000000 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config/tika-config-gcs.xml +++ /dev/null @@ -1,26 +0,0 @@ - - - - - - 
gcs - My First Project - tika-tallison-test-bucket - - - \ No newline at end of file diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-google-drive/src/test/java/org/apache/tika/pipes/googledrive/ConfigExamplesTest.java b/tika-pipes/tika-pipes-plugins/tika-pipes-google-drive/src/test/java/org/apache/tika/pipes/googledrive/ConfigExamplesTest.java new file mode 100644 index 00000000000..7ee99ebca28 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-google-drive/src/test/java/org/apache/tika/pipes/googledrive/ConfigExamplesTest.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.googledrive; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.config.loader.TikaLoader; +import org.apache.tika.pipes.fetcher.googledrive.config.GoogleDriveFetcherConfig; + +/** + * Validates Google Drive fetcher configuration examples used in documentation. 
+ */ +public class ConfigExamplesTest { + + private static final String EXAMPLES_DIR = "/config-examples/"; + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + @TempDir + Path tempDir; + + private String readExample(String resourceName) throws Exception { + try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) { + assertNotNull(is, "Resource not found: " + resourceName); + return new String(is.readAllBytes(), StandardCharsets.UTF_8); + } + } + + @Test + public void testGoogleDriveFetcherConfig() throws Exception { + String json = readExample("google-drive-fetcher.json"); + Path configFile = tempDir.resolve("tika-config.json"); + Files.writeString(configFile, json, StandardCharsets.UTF_8); + assertNotNull(TikaLoader.load(configFile)); + + JsonNode inner = OBJECT_MAPPER.readTree(json) + .get("fetchers").get("gdf").get("google-drive-fetcher"); + GoogleDriveFetcherConfig config = GoogleDriveFetcherConfig.load(inner.toString()); + assertEquals("tika-pipes", config.getApplicationName()); + assertEquals("user@example.com", config.getSubjectUser()); + assertNotNull(config.getServiceAccountKeyBase64()); + assertTrue(config.getScopes().contains("https://www.googleapis.com/auth/drive.readonly")); + assertTrue(config.isSpoolToTemp()); + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-google-drive/src/test/resources/config-examples/google-drive-fetcher.json b/tika-pipes/tika-pipes-plugins/tika-pipes-google-drive/src/test/resources/config-examples/google-drive-fetcher.json new file mode 100644 index 00000000000..441a54ab16d --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-google-drive/src/test/resources/config-examples/google-drive-fetcher.json @@ -0,0 +1,13 @@ +{ + "fetchers": { + "gdf": { + "google-drive-fetcher": { + "applicationName": "tika-pipes", + "serviceAccountKeyBase64": "REDACTED_BASE64_SERVICE_ACCOUNT_JSON", + "subjectUser": "user@example.com", + "scopes": ["https://www.googleapis.com/auth/drive.readonly"], + "spoolToTemp": true + } + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-http/src/test/java/org/apache/tika/pipes/http/ConfigExamplesTest.java b/tika-pipes/tika-pipes-plugins/tika-pipes-http/src/test/java/org/apache/tika/pipes/http/ConfigExamplesTest.java new file mode 100644 index 00000000000..ff447df2a3c --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-http/src/test/java/org/apache/tika/pipes/http/ConfigExamplesTest.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.pipes.http; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.config.loader.TikaLoader; +import org.apache.tika.pipes.fetcher.http.config.HttpFetcherConfig; + +/** + * Validates HTTP fetcher configuration examples used in documentation. + */ +public class ConfigExamplesTest { + + private static final String EXAMPLES_DIR = "/config-examples/"; + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + @TempDir + Path tempDir; + + private String readExample(String resourceName) throws Exception { + try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) { + assertNotNull(is, "Resource not found: " + resourceName); + return new String(is.readAllBytes(), StandardCharsets.UTF_8); + } + } + + @Test + public void testHttpFetcherConfig() throws Exception { + String json = readExample("http-fetcher.json"); + Path configFile = tempDir.resolve("tika-config.json"); + Files.writeString(configFile, json, StandardCharsets.UTF_8); + assertNotNull(TikaLoader.load(configFile)); + + JsonNode inner = OBJECT_MAPPER.readTree(json) + .get("fetchers").get("httpf").get("http-fetcher"); + HttpFetcherConfig config = HttpFetcherConfig.load(inner.toString()); + assertEquals("tika", config.getUserName()); + assertEquals("basic", config.getAuthScheme()); + assertEquals(Integer.valueOf(30000), config.getConnectTimeoutMillis()); + assertEquals(Integer.valueOf(5), config.getMaxRedirects()); + assertTrue(config.getHttpHeaders().contains("Accept: application/octet-stream")); + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-http/src/test/resources/config-examples/http-fetcher.json b/tika-pipes/tika-pipes-plugins/tika-pipes-http/src/test/resources/config-examples/http-fetcher.json new file mode 100644 index 00000000000..4514e226a8b --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-http/src/test/resources/config-examples/http-fetcher.json @@ -0,0 +1,21 @@ +{ + "fetchers": { + "httpf": { + "http-fetcher": { + "userName": "tika", + "password": "REDACTED", + "authScheme": "basic", + "userAgent": "tika-pipes/1.0", + "maxConnections": 2000, + "maxConnectionsPerRoute": 1000, + "connectTimeoutMillis": 30000, + "socketTimeoutMillis": 60000, + "requestTimeoutMillis": 60000, + "overallTimeoutMillis": 120000, + "maxRedirects": 5, + "maxSpoolSize": -1, + "httpHeaders": ["Accept: application/octet-stream"] + } + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/reporter/jdbc/JDBCPipesReporterConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/reporter/jdbc/JDBCPipesReporterConfig.java index 4c329b0fca7..94a7df3dc18 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/reporter/jdbc/JDBCPipesReporterConfig.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/reporter/jdbc/JDBCPipesReporterConfig.java @@ -64,12 +64,27 @@ public static JDBCPipesReporterConfig load(final String json) } @JsonCreator - public 
JDBCPipesReporterConfig(@JsonProperty("connectionString") String connectionString,
-            @JsonProperty("includes") Set<String> includes,
-            @JsonProperty("excludes") Set<String> excludes) {
-        this(connectionString,
+    public static JDBCPipesReporterConfig fromJson(
+            @JsonProperty("connectionString") String connectionString,
+            @JsonProperty("includes") Set<String> includes,
+            @JsonProperty("excludes") Set<String> excludes,
+            @JsonProperty("reportSql") String reportSql,
+            @JsonProperty("tableName") String tableName,
+            @JsonProperty("createTable") Boolean createTable,
+            @JsonProperty("postConnectionSql") String postConnectionSql,
+            @JsonProperty("reportVariables") List<String> reportVariables,
+            @JsonProperty("reportWithinMs") Long reportWithinMs,
+            @JsonProperty("cacheSize") Integer cacheSize) {
+        return new JDBCPipesReporterConfig(
+                connectionString,
                 includes == null ? Set.of() : includes,
-                excludes == null ? Set.of() : excludes, null, JDBCPipesReporter.TABLE_NAME, true,
-                null, List.of(), JDBCPipesReporter.DEFAULT_REPORT_WITHIN_MS, JDBCPipesReporter.DEFAULT_CACHE_SIZE);
+                excludes == null ? Set.of() : excludes,
+                reportSql,
+                tableName == null ? JDBCPipesReporter.TABLE_NAME : tableName,
+                createTable == null ? true : createTable,
+                postConnectionSql,
+                reportVariables == null ? List.of() : reportVariables,
+                reportWithinMs == null ? JDBCPipesReporter.DEFAULT_REPORT_WITHIN_MS : reportWithinMs,
+                cacheSize == null ? JDBCPipesReporter.DEFAULT_CACHE_SIZE : cacheSize);
+    }
 }
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/java/org/apache/tika/pipes/jdbc/ConfigExamplesTest.java b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/java/org/apache/tika/pipes/jdbc/ConfigExamplesTest.java
new file mode 100644
index 00000000000..05b657362c0
--- /dev/null
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/java/org/apache/tika/pipes/jdbc/ConfigExamplesTest.java
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package org.apache.tika.pipes.jdbc; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.config.loader.TikaLoader; +import org.apache.tika.pipes.emitter.jdbc.JDBCEmitterConfig; +import org.apache.tika.pipes.iterator.jdbc.JDBCPipesIteratorConfig; +import org.apache.tika.pipes.reporter.jdbc.JDBCPipesReporterConfig; + +/** + * Validates JDBC emitter/iterator/reporter configuration examples used in documentation. + *

+ * The JSON configuration examples are stored in {@code src/test/resources/config-examples/} + * and are included directly in the AsciiDoc documentation via the {@code include::} directive. + */ +public class ConfigExamplesTest { + + private static final String EXAMPLES_DIR = "/config-examples/"; + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + @TempDir + Path tempDir; + + private String readExample(String resourceName) throws Exception { + try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) { + assertNotNull(is, "Resource not found: " + resourceName); + return new String(is.readAllBytes(), StandardCharsets.UTF_8); + } + } + + private void loadViaTikaLoader(String resourceName) throws Exception { + String json = readExample(resourceName); + Path configFile = tempDir.resolve("tika-config.json"); + Files.writeString(configFile, json, StandardCharsets.UTF_8); + TikaLoader loader = TikaLoader.load(configFile); + assertNotNull(loader, "TikaLoader should not be null for: " + resourceName); + } + + private JsonNode innerComponent(String json, String section, String id, String typeName) + throws Exception { + JsonNode root = OBJECT_MAPPER.readTree(json); + JsonNode sectionNode = root.get(section); + assertNotNull(sectionNode, "Missing section: " + section); + JsonNode idNode = id == null ? sectionNode : sectionNode.get(id); + assertNotNull(idNode, "Missing id: " + id); + JsonNode typed = idNode.get(typeName); + assertNotNull(typed, "Missing type: " + typeName); + return typed; + } + + @Test + public void testJDBCEmitterConfig() throws Exception { + loadViaTikaLoader("jdbc-emitter.json"); + + JsonNode inner = innerComponent(readExample("jdbc-emitter.json"), + "emitters", "jdbce", "jdbc-emitter"); + JDBCEmitterConfig config = JDBCEmitterConfig.load(inner.toString()); + assertTrue(config.connection().startsWith("jdbc:h2:")); + assertTrue(config.insert().contains("insert into parsed_docs")); + assertNotNull(config.keys()); + assertEquals(4, config.keys().size()); + assertEquals("string", config.keys().get("dc:title")); + assertEquals("timestamp", config.keys().get("dcterms:modified")); + config.validate(); + assertEquals(JDBCEmitterConfig.AttachmentStrategy.FIRST_ONLY, + config.getAttachmentStrategyEnum()); + assertEquals(JDBCEmitterConfig.MultivaluedFieldStrategy.CONCATENATE, + config.getMultivaluedFieldStrategyEnum()); + } + + @Test + public void testJDBCIteratorConfig() throws Exception { + loadViaTikaLoader("jdbc-pipes-iterator.json"); + + JsonNode inner = innerComponent(readExample("jdbc-pipes-iterator.json"), + "pipes-iterator", null, "jdbc-pipes-iterator"); + JDBCPipesIteratorConfig config = JDBCPipesIteratorConfig.load(inner.toString()); + assertTrue(config.getConnection().startsWith("jdbc:h2:")); + assertTrue(config.getSelect().contains("docs_to_parse")); + assertEquals("id", config.getIdColumn()); + assertEquals("source_path", config.getFetchKeyColumn()); + assertEquals("output_path", config.getEmitKeyColumn()); + assertEquals(1000, config.getFetchSize()); + assertEquals(60, config.getQueryTimeoutSeconds()); + assertEquals("fsf", config.getFetcherId()); + assertEquals("jdbce", config.getEmitterId()); + } + + @Test + public void testJDBCReporterConfig() throws Exception { + loadViaTikaLoader("jdbc-reporter.json"); + + JsonNode inner = innerComponent(readExample("jdbc-reporter.json"), + "pipes-reporters", null, "jdbc-reporter"); + JDBCPipesReporterConfig config = JDBCPipesReporterConfig.load(inner.toString()); + 
assertTrue(config.connectionString().startsWith("jdbc:h2:")); + assertNotNull(config.includes()); + assertTrue(config.includes().contains("PARSE_SUCCESS")); + // Fields previously unreachable from JSON — see JDBCPipesReporterConfig.fromJson + assertEquals("tika_reporter_status", config.tableName()); + assertEquals(false, config.createTable()); + assertEquals(5000L, config.reportWithinMs()); + assertEquals(500, config.cacheSize()); + } + + @Test + public void testJDBCPipelineConfig() throws Exception { + loadViaTikaLoader("jdbc-pipeline.json"); + + String json = readExample("jdbc-pipeline.json"); + JDBCEmitterConfig emitter = JDBCEmitterConfig.load( + innerComponent(json, "emitters", "jdbce", "jdbc-emitter").toString()); + emitter.validate(); + JDBCPipesIteratorConfig iterator = JDBCPipesIteratorConfig.load( + innerComponent(json, "pipes-iterator", null, "jdbc-pipes-iterator").toString()); + JDBCPipesReporterConfig reporter = JDBCPipesReporterConfig.load( + innerComponent(json, "pipes-reporters", null, "jdbc-reporter").toString()); + + assertEquals("jdbce", iterator.getEmitterId()); + assertNotNull(reporter.connectionString()); + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/config-examples/jdbc-emitter.json b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/config-examples/jdbc-emitter.json new file mode 100644 index 00000000000..c2e22dcdd24 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/config-examples/jdbc-emitter.json @@ -0,0 +1,22 @@ +{ + "emitters": { + "jdbce": { + "jdbc-emitter": { + "connection": "jdbc:h2:mem:tika;DB_CLOSE_DELAY=-1", + "createTable": "create table parsed_docs (path varchar(512) primary key, title varchar(1024), author varchar(512), content_length bigint, modified timestamp)", + "insert": "insert into parsed_docs (path, title, author, content_length, modified) values (?,?,?,?,?)", + "keys": { + "dc:title": "string", + "dc:creator": "string", + "Content-Length": "long", + "dcterms:modified": "timestamp" + }, + "maxRetries": 0, + "maxStringLength": 64000, + "attachmentStrategy": "FIRST_ONLY", + "multivaluedFieldStrategy": "CONCATENATE", + "multivaluedFieldDelimiter": ", " + } + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/config-examples/jdbc-pipeline.json b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/config-examples/jdbc-pipeline.json new file mode 100644 index 00000000000..15e512e8549 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/config-examples/jdbc-pipeline.json @@ -0,0 +1,56 @@ +{ + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + "throwOnWriteLimitReached": true + } + }, + "fetchers": { + "fsf": { + "file-system-fetcher": { + "basePath": "/data/input", + "extractFileSystemMetadata": false + } + } + }, + "emitters": { + "jdbce": { + "jdbc-emitter": { + "connection": "jdbc:h2:mem:tika;DB_CLOSE_DELAY=-1", + "createTable": "create table parsed_docs (path varchar(512) primary key, title varchar(1024), author varchar(512), content_length bigint, modified timestamp)", + "insert": "insert into parsed_docs (path, title, author, content_length, modified) values (?,?,?,?,?)", + "keys": { + "dc:title": "string", + "dc:creator": "string", + "Content-Length": "long", + "dcterms:modified": "timestamp" + }, + "attachmentStrategy": "FIRST_ONLY", + "multivaluedFieldStrategy": "CONCATENATE" + } + } + }, + "pipes-iterator": { + 
"jdbc-pipes-iterator": { + "connection": "jdbc:h2:mem:tika;DB_CLOSE_DELAY=-1", + "select": "select id, source_path, output_path from docs_to_parse where status = 'PENDING'", + "idColumn": "id", + "fetchKeyColumn": "source_path", + "emitKeyColumn": "output_path", + "fetcherId": "fsf", + "emitterId": "jdbce" + } + }, + "pipes-reporters": { + "jdbc-reporter": { + "connectionString": "jdbc:h2:mem:tika;DB_CLOSE_DELAY=-1", + "includes": ["PARSE_SUCCESS", "PARSE_EXCEPTION", "OOM", "TIMEOUT"] + } + }, + "pipes": { + "parseMode": "RMETA", + "onParseException": "EMIT", + "numClients": 4 + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/config-examples/jdbc-pipes-iterator.json b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/config-examples/jdbc-pipes-iterator.json new file mode 100644 index 00000000000..85003cfbb61 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/config-examples/jdbc-pipes-iterator.json @@ -0,0 +1,15 @@ +{ + "pipes-iterator": { + "jdbc-pipes-iterator": { + "connection": "jdbc:h2:mem:tika;DB_CLOSE_DELAY=-1", + "select": "select id, source_path, output_path from docs_to_parse where status = 'PENDING'", + "idColumn": "id", + "fetchKeyColumn": "source_path", + "emitKeyColumn": "output_path", + "fetchSize": 1000, + "queryTimeoutSeconds": 60, + "fetcherId": "fsf", + "emitterId": "jdbce" + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/config-examples/jdbc-reporter.json b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/config-examples/jdbc-reporter.json new file mode 100644 index 00000000000..ccc3fd6709d --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/config-examples/jdbc-reporter.json @@ -0,0 +1,12 @@ +{ + "pipes-reporters": { + "jdbc-reporter": { + "connectionString": "jdbc:h2:mem:tika;DB_CLOSE_DELAY=-1", + "includes": ["PARSE_SUCCESS", "PARSE_EXCEPTION", "OOM", "TIMEOUT"], + "tableName": "tika_reporter_status", + "createTable": false, + "reportWithinMs": 5000, + "cacheSize": 500 + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/configs/tika-config-jdbc-emitter-attachments.xml b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/configs/tika-config-jdbc-emitter-attachments.xml deleted file mode 100644 index 4bc2d8e875f..00000000000 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/configs/tika-config-jdbc-emitter-attachments.xml +++ /dev/null @@ -1,53 +0,0 @@ - - - - - - jdbc - CONNECTION_STRING - create table test - (path varchar(512) not null, - attachment_num integer not null, - k1 boolean, - k2 varchar(512), - k3 integer, - k4 long); - - alter table test add primary key (path, attachment_num) - - insert into test (path, attachment_num, k1, k2, k3, k4) values (?,?,?,?,?,?); - - - - - - - - - all - - - \ No newline at end of file diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/configs/tika-config-jdbc-emitter-existing-table.xml b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/configs/tika-config-jdbc-emitter-existing-table.xml deleted file mode 100644 index 654b279be7d..00000000000 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/configs/tika-config-jdbc-emitter-existing-table.xml +++ /dev/null @@ -1,42 +0,0 @@ - - - - - - jdbc - CONNECTION_STRING - insert into test (path, k1, k2, k3, k4) values (?,?,?,?,?); - - - - - - - - - first_only - - - \ No newline at end of 
file diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/configs/tika-config-jdbc-emitter-multivalued.xml b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/configs/tika-config-jdbc-emitter-multivalued.xml deleted file mode 100644 index eb966b54a0e..00000000000 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/configs/tika-config-jdbc-emitter-multivalued.xml +++ /dev/null @@ -1,45 +0,0 @@ - - - - - - jdbc - CONNECTION_STRING - create table test - (path varchar(512) primary key, - k1 varchar(512)); - - - insert into test (path, k1) values (?,?); - - - - - - concatenate - , - - - \ No newline at end of file diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/configs/tika-config-jdbc-emitter-trunc.xml b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/configs/tika-config-jdbc-emitter-trunc.xml deleted file mode 100644 index 85eef281c2d..00000000000 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/configs/tika-config-jdbc-emitter-trunc.xml +++ /dev/null @@ -1,44 +0,0 @@ - - - - - - jdbc - CONNECTION_STRING - create table test - (path varchar(512) primary key, - k1 varchar(12)); - - - insert into test (path, k1) values (?,?); - - - - - - first_only - - - \ No newline at end of file diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/configs/tika-config-jdbc-emitter.xml b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/configs/tika-config-jdbc-emitter.xml deleted file mode 100644 index c1a05bdec49..00000000000 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/resources/configs/tika-config-jdbc-emitter.xml +++ /dev/null @@ -1,54 +0,0 @@ - - - - - - jdbc - CONNECTION_STRING - create table test - (path varchar(512) primary key, - k1 boolean, - k2 varchar(512), - k3 integer, - k4 long, - k5 bigint, - k6 timestamp); - - - insert into test (path, k1, k2, k3, k4, k5, k6) values (?,?,?,?,?,?,?); - - - - - - - - - - - first_only - - - \ No newline at end of file diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/java/org/apache/tika/pipes/json/ConfigExamplesTest.java b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/java/org/apache/tika/pipes/json/ConfigExamplesTest.java new file mode 100644 index 00000000000..d96140eae50 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/java/org/apache/tika/pipes/json/ConfigExamplesTest.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.pipes.json; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.config.loader.TikaLoader; +import org.apache.tika.pipes.pipesiterator.json.JsonPipesIteratorConfig; + +/** + * Validates JSON iterator configuration example used in documentation. + */ +public class ConfigExamplesTest { + + private static final String EXAMPLES_DIR = "/config-examples/"; + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + @TempDir + Path tempDir; + + private String readExample(String resourceName) throws Exception { + try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) { + assertNotNull(is, "Resource not found: " + resourceName); + return new String(is.readAllBytes(), StandardCharsets.UTF_8); + } + } + + @Test + public void testJsonIteratorConfig() throws Exception { + String json = readExample("json-pipes-iterator.json"); + Path configFile = tempDir.resolve("tika-config.json"); + Files.writeString(configFile, json, StandardCharsets.UTF_8); + assertNotNull(TikaLoader.load(configFile)); + + JsonNode inner = OBJECT_MAPPER.readTree(json) + .get("pipes-iterator").get("json-pipes-iterator"); + JsonPipesIteratorConfig config = JsonPipesIteratorConfig.load(inner.toString()); + assertNotNull(config.getJsonPath()); + assertEquals("fsf", config.getFetcherId()); + assertEquals("fse", config.getEmitterId()); + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/config-examples/json-pipes-iterator.json b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/config-examples/json-pipes-iterator.json new file mode 100644 index 00000000000..5bc4732bf69 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/config-examples/json-pipes-iterator.json @@ -0,0 +1,9 @@ +{ + "pipes-iterator": { + "json-pipes-iterator": { + "jsonPath": "/data/work-items.jsonl", + "fetcherId": "fsf", + "emitterId": "fse" + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/java/org/apache/tika/pipes/kafka/ConfigExamplesTest.java b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/java/org/apache/tika/pipes/kafka/ConfigExamplesTest.java new file mode 100644 index 00000000000..43c9a4daefb --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/java/org/apache/tika/pipes/kafka/ConfigExamplesTest.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/java/org/apache/tika/pipes/kafka/ConfigExamplesTest.java b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/java/org/apache/tika/pipes/kafka/ConfigExamplesTest.java
new file mode 100644
index 00000000000..43c9a4daefb
--- /dev/null
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/java/org/apache/tika/pipes/kafka/ConfigExamplesTest.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.kafka;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import org.apache.tika.config.loader.TikaLoader;
+import org.apache.tika.pipes.emitter.kafka.KafkaEmitterConfig;
+import org.apache.tika.pipes.iterator.kafka.KafkaPipesIteratorConfig;
+
+/**
+ * Validates Kafka emitter/iterator configuration examples used in documentation.
+ * <p>
+ * The JSON configuration examples are stored in {@code src/test/resources/config-examples/}
+ * and are included directly in the AsciiDoc documentation via the {@code include::} directive.
+ */
+public class ConfigExamplesTest {
+
+    private static final String EXAMPLES_DIR = "/config-examples/";
+    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+    @TempDir
+    Path tempDir;
+
+    private String readExample(String resourceName) throws Exception {
+        try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) {
+            assertNotNull(is, "Resource not found: " + resourceName);
+            return new String(is.readAllBytes(), StandardCharsets.UTF_8);
+        }
+    }
+
+    private void loadViaTikaLoader(String resourceName) throws Exception {
+        String json = readExample(resourceName);
+        Path configFile = tempDir.resolve("tika-config.json");
+        Files.writeString(configFile, json, StandardCharsets.UTF_8);
+        TikaLoader loader = TikaLoader.load(configFile);
+        assertNotNull(loader, "TikaLoader should not be null for: " + resourceName);
+    }
+
+    private JsonNode innerComponent(String json, String section, String id, String typeName)
+            throws Exception {
+        JsonNode root = OBJECT_MAPPER.readTree(json);
+        JsonNode sectionNode = root.get(section);
+        assertNotNull(sectionNode, "Missing section: " + section);
+        JsonNode idNode = id == null ? sectionNode : sectionNode.get(id);
+        assertNotNull(idNode, "Missing id: " + id);
+        JsonNode typed = idNode.get(typeName);
+        assertNotNull(typed, "Missing type: " + typeName);
+        return typed;
+    }
+
+    @Test
+    public void testKafkaEmitterConfig() throws Exception {
+        loadViaTikaLoader("kafka-emitter.json");
+
+        JsonNode inner = innerComponent(readExample("kafka-emitter.json"),
+                "emitters", "kafe", "kafka-emitter");
+        KafkaEmitterConfig config = KafkaEmitterConfig.load(inner.toString());
+        assertEquals("tika-parsed-docs", config.topic());
+        assertTrue(config.bootstrapServers().contains("kafka1.example.com"));
+        assertEquals("all", config.acks());
+        assertEquals("lz4", config.compressionType());
+        assertTrue(config.enableIdempotence());
+        config.validate();
+    }
+
+    @Test
+    public void testKafkaIteratorConfig() throws Exception {
+        loadViaTikaLoader("kafka-pipes-iterator.json");
+
+        JsonNode inner = innerComponent(readExample("kafka-pipes-iterator.json"),
+                "pipes-iterator", null, "kafka-pipes-iterator");
+        KafkaPipesIteratorConfig config = KafkaPipesIteratorConfig.load(inner.toString());
+        assertEquals("tika-fetch-requests", config.getTopic());
+        assertEquals("tika-pipes-iterator", config.getGroupId());
+        assertEquals("earliest", config.getAutoOffsetReset());
+        assertEquals(100, config.getPollDelayMs());
+        assertEquals(-1, config.getEmitMax());
+        assertEquals("fsf", config.getFetcherId());
+        assertEquals("kafe", config.getEmitterId());
+    }
+
+    @Test
+    public void testKafkaPipelineConfig() throws Exception {
+        loadViaTikaLoader("kafka-pipeline.json");
+
+        String json = readExample("kafka-pipeline.json");
+        KafkaEmitterConfig emitter = KafkaEmitterConfig.load(
+                innerComponent(json, "emitters", "kafe", "kafka-emitter").toString());
+        emitter.validate();
+        assertEquals("tika-parsed-docs", emitter.topic());
+    }
+}
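Editor's sketch: as the javadoc above notes, these JSON examples are pulled into the AsciiDoc pages with the `include::` directive rather than being copy-pasted, so the docs can never drift from what the tests validate. A minimal sketch of such an include, assuming an Antora `example$` resource alias has been mapped to the `config-examples` directory (that alias mapping is an assumption):

[source,asciidoc]
----
include::example$config-examples/kafka-emitter.json[]
----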
b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/resources/config-examples/kafka-emitter.json @@ -0,0 +1,19 @@ +{ + "emitters": { + "kafe": { + "kafka-emitter": { + "topic": "tika-parsed-docs", + "bootstrapServers": "kafka1.example.com:9092,kafka2.example.com:9092", + "acks": "all", + "lingerMs": 5000, + "batchSize": 16384, + "compressionType": "lz4", + "enableIdempotence": true, + "maxRequestSize": 1048576, + "requestTimeoutMs": 30000, + "deliveryTimeoutMs": 120000, + "clientId": "tika-pipes-emitter" + } + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/resources/config-examples/kafka-pipeline.json b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/resources/config-examples/kafka-pipeline.json new file mode 100644 index 00000000000..01e5a528d48 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/resources/config-examples/kafka-pipeline.json @@ -0,0 +1,43 @@ +{ + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + "throwOnWriteLimitReached": true + } + }, + "fetchers": { + "fsf": { + "file-system-fetcher": { + "basePath": "/data/input", + "extractFileSystemMetadata": false + } + } + }, + "emitters": { + "kafe": { + "kafka-emitter": { + "topic": "tika-parsed-docs", + "bootstrapServers": "kafka1.example.com:9092", + "acks": "all", + "compressionType": "lz4", + "enableIdempotence": true + } + } + }, + "pipes-iterator": { + "kafka-pipes-iterator": { + "topic": "tika-fetch-requests", + "bootstrapServers": "kafka1.example.com:9092", + "groupId": "tika-pipes-iterator", + "autoOffsetReset": "earliest", + "fetcherId": "fsf", + "emitterId": "kafe" + } + }, + "pipes": { + "parseMode": "RMETA", + "onParseException": "EMIT", + "numClients": 4 + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/resources/config-examples/kafka-pipes-iterator.json b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/resources/config-examples/kafka-pipes-iterator.json new file mode 100644 index 00000000000..5685476084f --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/resources/config-examples/kafka-pipes-iterator.json @@ -0,0 +1,14 @@ +{ + "pipes-iterator": { + "kafka-pipes-iterator": { + "topic": "tika-fetch-requests", + "bootstrapServers": "kafka1.example.com:9092,kafka2.example.com:9092", + "groupId": "tika-pipes-iterator", + "autoOffsetReset": "earliest", + "pollDelayMs": 100, + "emitMax": -1, + "fetcherId": "fsf", + "emitterId": "kafe" + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-microsoft-graph/src/test/java/org/apache/tika/pipes/microsoftgraph/ConfigExamplesTest.java b/tika-pipes/tika-pipes-plugins/tika-pipes-microsoft-graph/src/test/java/org/apache/tika/pipes/microsoftgraph/ConfigExamplesTest.java new file mode 100644 index 00000000000..83159ba65ba --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-microsoft-graph/src/test/java/org/apache/tika/pipes/microsoftgraph/ConfigExamplesTest.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-microsoft-graph/src/test/java/org/apache/tika/pipes/microsoftgraph/ConfigExamplesTest.java b/tika-pipes/tika-pipes-plugins/tika-pipes-microsoft-graph/src/test/java/org/apache/tika/pipes/microsoftgraph/ConfigExamplesTest.java
new file mode 100644
index 00000000000..83159ba65ba
--- /dev/null
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-microsoft-graph/src/test/java/org/apache/tika/pipes/microsoftgraph/ConfigExamplesTest.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.microsoftgraph;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import org.apache.tika.config.loader.TikaLoader;
+import org.apache.tika.pipes.fetchers.microsoftgraph.config.MicrosoftGraphFetcherConfig;
+
+/**
+ * Validates Microsoft Graph fetcher configuration examples used in documentation.
+ */
+public class ConfigExamplesTest {
+
+    private static final String EXAMPLES_DIR = "/config-examples/";
+    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+    @TempDir
+    Path tempDir;
+
+    private String readExample(String resourceName) throws Exception {
+        try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) {
+            assertNotNull(is, "Resource not found: " + resourceName);
+            return new String(is.readAllBytes(), StandardCharsets.UTF_8);
+        }
+    }
+
+    @Test
+    public void testMicrosoftGraphFetcherConfig() throws Exception {
+        String json = readExample("microsoft-graph-fetcher.json");
+        Path configFile = tempDir.resolve("tika-config.json");
+        Files.writeString(configFile, json, StandardCharsets.UTF_8);
+        assertNotNull(TikaLoader.load(configFile));
+
+        JsonNode inner = OBJECT_MAPPER.readTree(json)
+                .get("fetchers").get("msgf").get("microsoft-graph-fetcher");
+        MicrosoftGraphFetcherConfig config = MicrosoftGraphFetcherConfig.load(inner.toString());
+        assertNotNull(config.getClientSecretCredentialsConfig());
+        assertEquals("REDACTED-TENANT-UUID",
+                config.getClientSecretCredentialsConfig().getTenantId());
+        assertEquals("REDACTED-CLIENT-UUID",
+                config.getClientSecretCredentialsConfig().getClientId());
+        assertTrue(config.getScopes().contains("https://graph.microsoft.com/.default"));
+        assertTrue(config.isSpoolToTemp());
+    }
+}
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-microsoft-graph/src/test/resources/config-examples/microsoft-graph-fetcher.json b/tika-pipes/tika-pipes-plugins/tika-pipes-microsoft-graph/src/test/resources/config-examples/microsoft-graph-fetcher.json
new file mode 100644
index 00000000000..5fd5ea9a436
--- /dev/null
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-microsoft-graph/src/test/resources/config-examples/microsoft-graph-fetcher.json
@@ -0,0 +1,15 @@
+{
+  "fetchers": {
+    "msgf": {
+      "microsoft-graph-fetcher": {
+        "clientSecretCredentialsConfig": {
+          "tenantId": "REDACTED-TENANT-UUID",
+          "clientId": "REDACTED-CLIENT-UUID",
+          "clientSecret": "REDACTED"
+        },
+        "scopes": ["https://graph.microsoft.com/.default"],
+        "spoolToTemp": true
+      }
+    }
+  }
+}
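Editor's sketch: the `clientSecretCredentialsConfig` block corresponds to Azure AD's client-credentials flow. For orientation, the equivalent credential built by hand with the `azure-identity` library looks like the fragment below — shown only to clarify what the three config fields map to; the fetcher's actual internal wiring may differ.

[source,java]
----
import com.azure.identity.ClientSecretCredential;
import com.azure.identity.ClientSecretCredentialBuilder;

// Builds the same kind of credential the fetcher derives from
// clientSecretCredentialsConfig (tenantId / clientId / clientSecret).
ClientSecretCredential credential = new ClientSecretCredentialBuilder()
        .tenantId("REDACTED-TENANT-UUID")
        .clientId("REDACTED-CLIENT-UUID")
        .clientSecret("REDACTED")
        .build();
----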
b/tika-pipes/tika-pipes-plugins/tika-pipes-opensearch/src/test/java/org/apache/tika/pipes/opensearch/ConfigExamplesTest.java new file mode 100644 index 00000000000..d0c0a9eefa4 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-opensearch/src/test/java/org/apache/tika/pipes/opensearch/ConfigExamplesTest.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.opensearch; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.config.loader.TikaLoader; +import org.apache.tika.pipes.emitter.opensearch.OpenSearchEmitterConfig; +import org.apache.tika.pipes.reporter.opensearch.OpenSearchReporterConfig; + +/** + * Validates OpenSearch emitter/reporter configuration examples used in documentation. + *

+ * The JSON configuration examples are stored in {@code src/test/resources/config-examples/} + * and are included directly in the AsciiDoc documentation via the {@code include::} directive. + */ +public class ConfigExamplesTest { + + private static final String EXAMPLES_DIR = "/config-examples/"; + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + @TempDir + Path tempDir; + + private String readExample(String resourceName) throws Exception { + try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) { + assertNotNull(is, "Resource not found: " + resourceName); + return new String(is.readAllBytes(), StandardCharsets.UTF_8); + } + } + + private void loadViaTikaLoader(String resourceName) throws Exception { + String json = readExample(resourceName); + Path configFile = tempDir.resolve("tika-config.json"); + Files.writeString(configFile, json, StandardCharsets.UTF_8); + TikaLoader loader = TikaLoader.load(configFile); + assertNotNull(loader, "TikaLoader should not be null for: " + resourceName); + } + + private JsonNode innerComponent(String json, String section, String id, String typeName) + throws Exception { + JsonNode root = OBJECT_MAPPER.readTree(json); + JsonNode sectionNode = root.get(section); + assertNotNull(sectionNode, "Missing section: " + section); + JsonNode idNode = id == null ? sectionNode : sectionNode.get(id); + assertNotNull(idNode, "Missing id: " + id); + JsonNode typed = idNode.get(typeName); + assertNotNull(typed, "Missing type: " + typeName); + return typed; + } + + @Test + public void testOpenSearchEmitterConfig() throws Exception { + loadViaTikaLoader("opensearch-emitter.json"); + + JsonNode inner = innerComponent(readExample("opensearch-emitter.json"), + "emitters", "ose", "opensearch-emitter"); + OpenSearchEmitterConfig config = OpenSearchEmitterConfig.load(inner.toString()); + assertEquals("doc_id", config.idField()); + assertEquals(OpenSearchEmitterConfig.AttachmentStrategy.PARENT_CHILD, + config.attachmentStrategy()); + assertEquals(OpenSearchEmitterConfig.UpdateStrategy.OVERWRITE, + config.updateStrategy()); + assertEquals(1000, config.commitWithin()); + assertNotNull(config.httpClientConfig()); + assertEquals("admin", config.httpClientConfig().userName()); + } + + @Test + public void testOpenSearchReporterConfig() throws Exception { + loadViaTikaLoader("opensearch-reporter.json"); + + JsonNode inner = innerComponent(readExample("opensearch-reporter.json"), + "pipes-reporters", null, "opensearch-pipes-reporter"); + OpenSearchReporterConfig config = OpenSearchReporterConfig.load(inner.toString()); + assertTrue(config.openSearchUrl().contains("tika-status")); + assertEquals("tika_", config.keyPrefix()); + assertTrue(config.includeRouting()); + assertNotNull(config.includes()); + assertTrue(config.includes().contains("PARSE_SUCCESS")); + assertNotNull(config.httpClientConfig()); + } + + @Test + public void testOpenSearchPipelineConfig() throws Exception { + loadViaTikaLoader("opensearch-pipeline.json"); + + String json = readExample("opensearch-pipeline.json"); + OpenSearchEmitterConfig emitter = OpenSearchEmitterConfig.load( + innerComponent(json, "emitters", "ose", "opensearch-emitter").toString()); + OpenSearchReporterConfig reporter = OpenSearchReporterConfig.load( + innerComponent(json, "pipes-reporters", null, "opensearch-pipes-reporter").toString()); + + assertEquals("doc_id", emitter.idField()); + assertNotNull(reporter.httpClientConfig()); + } +} diff --git 
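Editor's sketch: the `PARENT_CHILD` attachment strategy asserted above indexes embedded documents as children of their container, which in OpenSearch requires a `join` field in the target index mapping. A plausible sketch of the body for `PUT /tika-docs` follows; the field name `relation_type` and the `container`/`embedded` relation names are assumptions and must match whatever the emitter actually writes.

[source,json]
----
{
  "mappings": {
    "properties": {
      "relation_type": {
        "type": "join",
        "relations": { "container": "embedded" }
      }
    }
  }
}
----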
a/tika-pipes/tika-pipes-plugins/tika-pipes-opensearch/src/test/resources/config-examples/opensearch-emitter.json b/tika-pipes/tika-pipes-plugins/tika-pipes-opensearch/src/test/resources/config-examples/opensearch-emitter.json new file mode 100644 index 00000000000..0221fbfd617 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-opensearch/src/test/resources/config-examples/opensearch-emitter.json @@ -0,0 +1,21 @@ +{ + "emitters": { + "ose": { + "opensearch-emitter": { + "openSearchUrl": "https://opensearch.example.com:9200/tika-docs", + "idField": "doc_id", + "attachmentStrategy": "PARENT_CHILD", + "updateStrategy": "OVERWRITE", + "commitWithin": 1000, + "embeddedFileFieldName": "embedded", + "httpClientConfig": { + "userName": "admin", + "password": "REDACTED", + "authScheme": "basic", + "connectionTimeoutMillis": 10000, + "socketTimeoutMillis": 60000 + } + } + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-opensearch/src/test/resources/config-examples/opensearch-pipeline.json b/tika-pipes/tika-pipes-plugins/tika-pipes-opensearch/src/test/resources/config-examples/opensearch-pipeline.json new file mode 100644 index 00000000000..1f196caf88b --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-opensearch/src/test/resources/config-examples/opensearch-pipeline.json @@ -0,0 +1,64 @@ +{ + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + "throwOnWriteLimitReached": true + } + }, + "fetchers": { + "fsf": { + "file-system-fetcher": { + "basePath": "/data/input", + "extractFileSystemMetadata": false + } + } + }, + "emitters": { + "ose": { + "opensearch-emitter": { + "openSearchUrl": "https://opensearch.example.com:9200/tika-docs", + "idField": "doc_id", + "attachmentStrategy": "PARENT_CHILD", + "updateStrategy": "OVERWRITE", + "commitWithin": 1000, + "embeddedFileFieldName": "embedded", + "httpClientConfig": { + "userName": "admin", + "password": "REDACTED", + "authScheme": "basic", + "connectionTimeoutMillis": 10000, + "socketTimeoutMillis": 60000 + } + } + } + }, + "pipes-iterator": { + "file-system-pipes-iterator": { + "basePath": "/data/input", + "countTotal": true, + "fetcherId": "fsf", + "emitterId": "ose" + } + }, + "pipes-reporters": { + "opensearch-pipes-reporter": { + "openSearchUrl": "https://opensearch.example.com:9200/tika-status", + "includes": ["PARSE_SUCCESS", "PARSE_EXCEPTION", "OOM", "TIMEOUT"], + "keyPrefix": "tika_", + "includeRouting": true, + "httpClientConfig": { + "userName": "admin", + "password": "REDACTED", + "authScheme": "basic", + "connectionTimeoutMillis": 10000, + "socketTimeoutMillis": 60000 + } + } + }, + "pipes": { + "parseMode": "RMETA", + "onParseException": "EMIT", + "numClients": 4 + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-opensearch/src/test/resources/config-examples/opensearch-reporter.json b/tika-pipes/tika-pipes-plugins/tika-pipes-opensearch/src/test/resources/config-examples/opensearch-reporter.json new file mode 100644 index 00000000000..dcce3ce10bf --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-opensearch/src/test/resources/config-examples/opensearch-reporter.json @@ -0,0 +1,17 @@ +{ + "pipes-reporters": { + "opensearch-pipes-reporter": { + "openSearchUrl": "https://opensearch.example.com:9200/tika-status", + "includes": ["PARSE_SUCCESS", "PARSE_EXCEPTION", "OOM", "TIMEOUT"], + "keyPrefix": "tika_", + "includeRouting": true, + "httpClientConfig": { + "userName": "admin", + "password": "REDACTED", + "authScheme": "basic", + 
"connectionTimeoutMillis": 10000, + "socketTimeoutMillis": 60000 + } + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-opensearch/src/test/resources/tika-config-simple-emitter.xml b/tika-pipes/tika-pipes-plugins/tika-pipes-opensearch/src/test/resources/tika-config-simple-emitter.xml deleted file mode 100644 index f6530a9e928..00000000000 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-opensearch/src/test/resources/tika-config-simple-emitter.xml +++ /dev/null @@ -1,41 +0,0 @@ - - - - - - - - - - - - - - - - - opensearch1 - http://localhost:9200/tika-test - concatenate-content - content - 10 - - - \ No newline at end of file diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/java/org/apache/tika/pipes/solr/ConfigExamplesTest.java b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/java/org/apache/tika/pipes/solr/ConfigExamplesTest.java new file mode 100644 index 00000000000..65d06c37cc9 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/java/org/apache/tika/pipes/solr/ConfigExamplesTest.java @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.solr; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.config.loader.TikaLoader; +import org.apache.tika.pipes.emitter.solr.SolrEmitterConfig; +import org.apache.tika.pipes.iterator.solr.SolrPipesIteratorConfig; + +/** + * Validates Solr emitter/iterator configuration examples used in documentation. + *

+ * The JSON configuration examples are stored in {@code src/test/resources/config-examples/} + * and are included directly in the AsciiDoc documentation via the {@code include::} directive. + */ +public class ConfigExamplesTest { + + private static final String EXAMPLES_DIR = "/config-examples/"; + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + @TempDir + Path tempDir; + + private String readExample(String resourceName) throws Exception { + try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) { + assertNotNull(is, "Resource not found: " + resourceName); + return new String(is.readAllBytes(), StandardCharsets.UTF_8); + } + } + + private void loadViaTikaLoader(String resourceName) throws Exception { + String json = readExample(resourceName); + Path configFile = tempDir.resolve("tika-config.json"); + Files.writeString(configFile, json, StandardCharsets.UTF_8); + TikaLoader loader = TikaLoader.load(configFile); + assertNotNull(loader, "TikaLoader should not be null for: " + resourceName); + } + + private JsonNode innerComponent(String json, String section, String id, String typeName) + throws Exception { + JsonNode root = OBJECT_MAPPER.readTree(json); + JsonNode sectionNode = root.get(section); + assertNotNull(sectionNode, "Missing section: " + section); + JsonNode idNode = id == null ? sectionNode : sectionNode.get(id); + assertNotNull(idNode, "Missing id: " + id); + JsonNode typed = idNode.get(typeName); + assertNotNull(typed, "Missing type: " + typeName); + return typed; + } + + @Test + public void testSolrEmitterUrlsConfig() throws Exception { + loadViaTikaLoader("solr-emitter.json"); + + JsonNode inner = innerComponent(readExample("solr-emitter.json"), + "emitters", "solre", "solr-emitter"); + SolrEmitterConfig config = SolrEmitterConfig.load(inner.toString()); + assertEquals("tika-docs", config.solrCollection()); + assertNotNull(config.solrUrls()); + assertEquals(2, config.solrUrls().size()); + assertTrue(config.solrZkHosts() == null || config.solrZkHosts().isEmpty()); + config.validate(); + assertEquals(SolrEmitterConfig.AttachmentStrategy.PARENT_CHILD, + config.getAttachmentStrategyEnum()); + assertEquals(SolrEmitterConfig.UpdateStrategy.ADD, config.getUpdateStrategyEnum()); + } + + @Test + public void testSolrEmitterZkConfig() throws Exception { + loadViaTikaLoader("solr-emitter-zk.json"); + + JsonNode inner = innerComponent(readExample("solr-emitter-zk.json"), + "emitters", "solre", "solr-emitter"); + SolrEmitterConfig config = SolrEmitterConfig.load(inner.toString()); + assertEquals("tika-docs", config.solrCollection()); + assertNotNull(config.solrZkHosts()); + assertEquals(3, config.solrZkHosts().size()); + assertEquals("/solr", config.solrZkChroot()); + assertTrue(config.solrUrls() == null || config.solrUrls().isEmpty()); + config.validate(); + } + + @Test + public void testSolrIteratorConfig() throws Exception { + loadViaTikaLoader("solr-pipes-iterator.json"); + + JsonNode inner = innerComponent(readExample("solr-pipes-iterator.json"), + "pipes-iterator", null, "solr-pipes-iterator"); + SolrPipesIteratorConfig config = SolrPipesIteratorConfig.load(inner.toString()); + assertEquals("tika-docs", config.getSolrCollection()); + assertEquals(5000, config.getRows()); + assertTrue(config.getFilters().contains("status:NEEDS_REPARSE")); + assertEquals("fsf", config.getFetcherId()); + assertEquals("solre", config.getEmitterId()); + } + + @Test + public void testSolrPipelineConfig() throws Exception { + 
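Editor's sketch: with `keyPrefix` set to `tika_`, the reporter writes status fields into the `tika-status` index under prefixed names; `tika_parse_status` below is an assumed field name for illustration only. A quick way to check for parse failures after a run:

[source,console]
----
curl -u admin:REDACTED \
  "https://opensearch.example.com:9200/tika-status/_search?q=tika_parse_status:PARSE_EXCEPTION&size=10"
----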
loadViaTikaLoader("solr-pipeline.json"); + + String json = readExample("solr-pipeline.json"); + SolrEmitterConfig emitter = SolrEmitterConfig.load( + innerComponent(json, "emitters", "solre", "solr-emitter").toString()); + emitter.validate(); + assertEquals("tika-docs", emitter.solrCollection()); + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/resources/config-examples/solr-emitter-zk.json b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/resources/config-examples/solr-emitter-zk.json new file mode 100644 index 00000000000..5cfac99011c --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/resources/config-examples/solr-emitter-zk.json @@ -0,0 +1,15 @@ +{ + "emitters": { + "solre": { + "solr-emitter": { + "solrCollection": "tika-docs", + "solrZkHosts": ["zk1.example.com:2181", "zk2.example.com:2181", "zk3.example.com:2181"], + "solrZkChroot": "/solr", + "idField": "id", + "commitWithin": 1000, + "attachmentStrategy": "PARENT_CHILD", + "updateStrategy": "ADD" + } + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/resources/config-examples/solr-emitter.json b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/resources/config-examples/solr-emitter.json new file mode 100644 index 00000000000..fb88fd79c13 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/resources/config-examples/solr-emitter.json @@ -0,0 +1,17 @@ +{ + "emitters": { + "solre": { + "solr-emitter": { + "solrCollection": "tika-docs", + "solrUrls": ["http://solr1.example.com:8983/solr", "http://solr2.example.com:8983/solr"], + "idField": "id", + "commitWithin": 1000, + "attachmentStrategy": "PARENT_CHILD", + "updateStrategy": "ADD", + "embeddedFileFieldName": "embedded", + "connectionTimeoutMillis": 10000, + "socketTimeoutMillis": 60000 + } + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/resources/config-examples/solr-pipeline.json b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/resources/config-examples/solr-pipeline.json new file mode 100644 index 00000000000..21a01c9c030 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/resources/config-examples/solr-pipeline.json @@ -0,0 +1,42 @@ +{ + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + "throwOnWriteLimitReached": true + } + }, + "fetchers": { + "fsf": { + "file-system-fetcher": { + "basePath": "/data/input", + "extractFileSystemMetadata": false + } + } + }, + "emitters": { + "solre": { + "solr-emitter": { + "solrCollection": "tika-docs", + "solrUrls": ["http://solr1.example.com:8983/solr"], + "idField": "id", + "commitWithin": 1000, + "attachmentStrategy": "PARENT_CHILD", + "updateStrategy": "ADD" + } + } + }, + "pipes-iterator": { + "file-system-pipes-iterator": { + "basePath": "/data/input", + "countTotal": true, + "fetcherId": "fsf", + "emitterId": "solre" + } + }, + "pipes": { + "parseMode": "RMETA", + "onParseException": "EMIT", + "numClients": 4 + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/resources/config-examples/solr-pipes-iterator.json b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/resources/config-examples/solr-pipes-iterator.json new file mode 100644 index 00000000000..f6daace233a --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/resources/config-examples/solr-pipes-iterator.json @@ -0,0 +1,15 @@ +{ + "pipes-iterator": { + "solr-pipes-iterator": { + "solrCollection": 
"tika-docs", + "solrUrls": ["http://solr1.example.com:8983/solr"], + "filters": ["status:NEEDS_REPARSE"], + "idField": "id", + "rows": 5000, + "connectionTimeoutMillis": 10000, + "socketTimeoutMillis": 60000, + "fetcherId": "fsf", + "emitterId": "solre" + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/resources/tika-config-simple-emitter.xml b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/resources/tika-config-simple-emitter.xml deleted file mode 100644 index 5b14a54415a..00000000000 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/resources/tika-config-simple-emitter.xml +++ /dev/null @@ -1,48 +0,0 @@ - - - - - - - - - - - - - - - - - solr1 - http://localhost:8983/solr/tika-test - concatenate-content - content - 10 - - - solr2 - http://localhost:8983/solr/tika-test - parent-child - content - 10 - - - \ No newline at end of file