From c994c9aee165804401f9668de3263206cb443ee0 Mon Sep 17 00:00:00 2001 From: tallison Date: Mon, 11 May 2026 16:57:28 -0400 Subject: [PATCH 1/5] TIKA-4725 - temporarily disable automatic release, many updates for tika-server for parity with work in tika-docker repo today --- .github/workflows/docker-release.yml | 27 +- .github/workflows/docker-snapshot.yml | 4 +- docs/modules/ROOT/nav.adoc | 1 + .../integration-testing/run-uat-script.adoc | 124 +++++++++ .../maintainers/release-guides/docker.adoc | 196 +++++++++----- tika-server/docker-build/CHANGES.md | 74 +++++ tika-server/docker-build/README.md | 254 ++++++++++++++++++ .../docker-compose-tika-customocr.yml | 39 +++ .../docker-compose-tika-grobid.yml | 45 ++++ .../docker-compose-tika-vision.yml | 62 +++++ tika-server/docker-build/docker-tool.sh | 51 +++- tika-server/docker-build/full/Dockerfile | 59 ++-- .../docker-build/full/Dockerfile.snapshot | 12 +- tika-server/docker-build/minimal/Dockerfile | 52 ++-- .../docker-build/minimal/Dockerfile.snapshot | 12 +- .../customocr/tika-config-inline.json | 11 + .../customocr/tika-config-inline.xml | 31 --- .../customocr/tika-config-rendered.json | 16 ++ .../customocr/tika-config-rendered.xml | 38 --- .../sample-configs/grobid/tika-config.json | 10 + .../sample-configs/grobid/tika-config.xml | 24 -- .../sample-configs/ner/run_tika_server.sh | 62 ----- .../sample-configs/ner/tika-config.xml | 28 -- .../vision/inception-rest-caption.xml | 32 --- .../vision/inception-rest-video.xml | 32 --- .../sample-configs/vision/inception-rest.xml | 32 --- .../sample-configs/vision/vlm-claude.json | 18 ++ .../sample-configs/vision/vlm-gemini.json | 17 ++ .../sample-configs/vision/vlm-openai.json | 19 ++ 29 files changed, 968 insertions(+), 414 deletions(-) create mode 100644 docs/modules/ROOT/pages/advanced/integration-testing/run-uat-script.adoc create mode 100644 tika-server/docker-build/CHANGES.md create mode 100644 tika-server/docker-build/README.md create mode 100644 
tika-server/docker-build/docker-compose-tika-customocr.yml create mode 100644 tika-server/docker-build/docker-compose-tika-grobid.yml create mode 100644 tika-server/docker-build/docker-compose-tika-vision.yml create mode 100644 tika-server/docker-build/sample-configs/customocr/tika-config-inline.json delete mode 100644 tika-server/docker-build/sample-configs/customocr/tika-config-inline.xml create mode 100644 tika-server/docker-build/sample-configs/customocr/tika-config-rendered.json delete mode 100644 tika-server/docker-build/sample-configs/customocr/tika-config-rendered.xml create mode 100644 tika-server/docker-build/sample-configs/grobid/tika-config.json delete mode 100644 tika-server/docker-build/sample-configs/grobid/tika-config.xml delete mode 100755 tika-server/docker-build/sample-configs/ner/run_tika_server.sh delete mode 100644 tika-server/docker-build/sample-configs/ner/tika-config.xml delete mode 100644 tika-server/docker-build/sample-configs/vision/inception-rest-caption.xml delete mode 100644 tika-server/docker-build/sample-configs/vision/inception-rest-video.xml delete mode 100644 tika-server/docker-build/sample-configs/vision/inception-rest.xml create mode 100644 tika-server/docker-build/sample-configs/vision/vlm-claude.json create mode 100644 tika-server/docker-build/sample-configs/vision/vlm-gemini.json create mode 100644 tika-server/docker-build/sample-configs/vision/vlm-openai.json diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml index dadf630887a..54f18f5fd75 100644 --- a/.github/workflows/docker-release.yml +++ b/.github/workflows/docker-release.yml @@ -17,10 +17,17 @@ name: Docker release - tika-server and tika-grpc +# Auto-trigger on tag push is disabled (TIKA-4725). The official tika-docker +# images on Docker Hub (apache/tika) are published from the apache/tika-docker +# repository using its own Dockerfiles and tagging conventions. 
When this +# workflow ran on the 4.0.0-alpha-1 source tag it pushed an image built from +# the stale Dockerfiles under tika-server/docker-build/ to +# apache/tika:4.0.0-alpha-1, which collided with the tika-docker-managed tag +# and ran with the pre-4.x bare-jar entrypoint (broken plugin loading). Re-enable +# only after the in-repo Dockerfiles are kept in sync with (or replaced by a +# pointer to) apache/tika-docker. on: - push: - tags: - - '[0-9]+.[0-9]+.[0-9]+*' + workflow_dispatch: jobs: release-tika-server: @@ -52,25 +59,27 @@ jobs: uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6.19.2 with: file: tika-server/docker-build/minimal/Dockerfile - platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/s390x + platforms: linux/amd64,linux/arm64,linux/s390x push: true build-args: | TIKA_VERSION=${{ steps.version.outputs.tag }} + # :latest is intentionally NOT pushed. It stays on 3.x (published from + # the external apache/tika-docker repo) until 4.0.0 GA, at which point + # add `apache/tika:latest` back here. tags: | apache/tika:${{ steps.version.outputs.tag }} - apache/tika:latest - name: Build and push tika-server full uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6.19.2 with: file: tika-server/docker-build/full/Dockerfile - platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/s390x + platforms: linux/amd64,linux/arm64,linux/s390x push: true build-args: | TIKA_VERSION=${{ steps.version.outputs.tag }} + # :latest-full stays on 3.x until 4.0.0 GA; see note above. tags: | apache/tika:${{ steps.version.outputs.tag }}-full - apache/tika:latest-full release-tika-grpc: runs-on: ubuntu-latest @@ -152,6 +161,10 @@ jobs: push: true build-args: | VERSION=${{ steps.version.outputs.tag }} + # apache/tika-grpc is new in 4.x with no prior `:latest` to protect, so + # we track latest from the start. Unlike apache/tika (the server image) + # where :latest stays on 3.x until 4.0.0 GA, the grpc image has no 3.x + # incumbent. 
tags: | apache/tika-grpc:${{ steps.version.outputs.tag }} apache/tika-grpc:latest diff --git a/.github/workflows/docker-snapshot.yml b/.github/workflows/docker-snapshot.yml index cb82d05592a..1b17355ad43 100644 --- a/.github/workflows/docker-snapshot.yml +++ b/.github/workflows/docker-snapshot.yml @@ -105,7 +105,7 @@ jobs: uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6.19.2 with: context: target/tika-server-minimal-docker - platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/s390x + platforms: linux/amd64,linux/arm64,linux/s390x push: true build-args: | TIKA_VERSION=${{ steps.version.outputs.tika_version }} @@ -157,7 +157,7 @@ jobs: uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6.19.2 with: context: target/tika-server-full-docker - platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/s390x + platforms: linux/amd64,linux/arm64,linux/s390x push: true build-args: | TIKA_VERSION=${{ steps.version.outputs.tika_version }} diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc index 979555022a7..b333e25fc68 100644 --- a/docs/modules/ROOT/nav.adoc +++ b/docs/modules/ROOT/nav.adoc @@ -55,6 +55,7 @@ ** xref:advanced/spooling.adoc[Spooling] ** xref:advanced/embedded-documents.adoc[Embedded Document Metadata] ** xref:advanced/local-vlm-server.adoc[Running a Local VLM Server] +** xref:advanced/integration-testing/run-uat-script.adoc[Tika-Server REST UAT Script] * xref:developers/index.adoc[Developers] ** xref:developers/serialization.adoc[Serialization and Configuration] * xref:faq.adoc[FAQ] diff --git a/docs/modules/ROOT/pages/advanced/integration-testing/run-uat-script.adoc b/docs/modules/ROOT/pages/advanced/integration-testing/run-uat-script.adoc new file mode 100644 index 00000000000..1e3365cd524 --- /dev/null +++ b/docs/modules/ROOT/pages/advanced/integration-testing/run-uat-script.adoc @@ -0,0 +1,124 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor 
license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Tika-Server REST UAT Script + +A portable shell script that exercises the tika-server REST surface against an +already-running server. The same script is used as the docker image smoke +test, the e2e integration test, and as part of the source-release +verification. + +== Where it lives + +[source] +---- +release-tools/uat/ +├── run-uat.sh # the script +└── test-files/ + ├── testPDF.pdf + ├── testHTML.html + └── test_recursive_embedded.docx +---- + +== What it covers + +Roughly 25 REST endpoint checks across the default-mode endpoints, header +behavior, and error handling — the same surface enumerated in the manual +walkthrough at xref:advanced/integration-testing/tika-server.adoc[Tika-Server +Integration Testing], translated to bash + curl assertions. 
+ +Coverage includes: + +* `/version`, `/parsers`, `/detectors`, `/mime-types` (introspection) +* `/detect/stream` (mime detection) +* `/tika`, `/tika/text`, `/tika/xml`, `/tika/json` (parse) +* `/meta`, `/meta/{field}` (metadata) +* `/rmeta`, `/rmeta/text` (recursive metadata) +* `/unpack/all` (embedded extraction; verifies the response is a valid zip) +* `/language/stream` +* `/meta/form`, `/rmeta/form` (multipart variants) +* `enableUnsecureFeatures=false` gating: `/meta/config`, `/rmeta/config`, + `/tika/config` all return 403 +* `X-Tika-OCRskipOcr` header, `Content-Disposition` filename +* 404 / 405 error handling + +Two checks (T18d, T27) are currently disabled with inline comments pointing +at tika-core behavior anomalies that need fixing — re-enable them when those +land. + +== Running it + +The script takes a URL pointing at a running tika-server. It does *not* start +or stop the server itself. + +[source,bash] +---- +release-tools/uat/run-uat.sh [host] +# default host: http://localhost:9998 +---- + +Exit code: `0` on all-pass, `1` on any failure. Failed checks print the +expected pattern and a truncated response body. + +=== Against the unpacked bin.zip distribution + +[source,bash] +---- +unzip tika-server-standard--bin.zip -d /tmp/tika-server-dist +cd /tmp/tika-server-dist +java -jar tika-server.jar -p 9998 -h localhost & +sleep 12 +~/path/to/tika/release-tools/uat/run-uat.sh +---- + +=== Against the Docker image + +The `docker-tool.sh test-uat` subcommand wraps starting the container, waiting +for `/version`, running the UAT, and stopping the container: + +[source,bash] +---- +cd tika-server/docker-build +./docker-tool.sh test-uat +---- + +=== As part of the e2e tests (CI) + +The Maven module `tika-e2e-tests/tika-server` unpacks the bin.zip, forks +`java -jar tika-server.jar`, and invokes this script via +`org.apache.tika.server.e2e.RunUatSmokeTest`. 
The CI workflow +`.github/workflows/main-jdk17-build.yml` runs this automatically on every PR +via `mvn -pl tika-e2e-tests -am clean verify -Pe2e`. + +== When to use it + +* *Pre-vote release verification.* Unpack + `tika-server-standard--bin.zip` from `dist/dev` and run the UAT + against it. Catches packaging regressions before the vote thread starts. +* *Pre-publish docker verification.* Run via `docker-tool.sh test-uat` after + building a new image and before tagging it for release. +* *Local development sanity check.* When changing anything in + `tika-server-core` or the bin.zip assembly descriptor, run the UAT against + the build output to confirm you didn't regress endpoint behavior. +* *Adding new endpoints.* When a new REST endpoint lands, add a corresponding + check to the script so future regressions get caught. + +== Platform notes + +The script is bash + curl + unzip. It's skipped automatically on Windows by +the e2e test (no bash). On Linux/macOS it runs as-is. No external dependencies +beyond the standard tooling. diff --git a/docs/modules/ROOT/pages/maintainers/release-guides/docker.adoc b/docs/modules/ROOT/pages/maintainers/release-guides/docker.adoc index a8f2f8cbc72..c699f00e4a4 100644 --- a/docs/modules/ROOT/pages/maintainers/release-guides/docker.adoc +++ b/docs/modules/ROOT/pages/maintainers/release-guides/docker.adoc @@ -17,117 +17,179 @@ = Releasing Tika Docker Images -This guide covers the process for releasing Apache Tika Docker images. +This guide covers releasing the official Apache Tika Docker images +(`apache/tika` and `apache/tika-grpc` on Docker Hub). -== Prerequisites +== Where the Dockerfiles live -=== DockerHub Access +Starting with 4.0.0-alpha-1, the Dockerfiles and the GitHub Actions workflow +that publishes them live in this repository: -You need permissions on the `apache/tika` repository on DockerHub. To obtain access, -create an INFRA JIRA ticket with the "Docker" label. 
+* `tika-server/docker-build/{minimal,full}/Dockerfile` — `apache/tika` (server) release builds +* `tika-server/docker-build/{minimal,full}/Dockerfile.snapshot` — nightly snapshot builds +* `tika-grpc/docker-build/Dockerfile` — `apache/tika-grpc` release builds +* `.github/workflows/docker-release.yml` — the release publishing workflow +* `.github/workflows/docker-snapshot.yml` — the snapshot publishing workflow (auto on push to `main`) -=== Repository Access +NOTE: The legacy https://github.com/apache/tika-docker[apache/tika-docker] +repository is still used for 3.x patch releases — see <<3x-patches>> below. New +4.x work happens here. -Clone the tika-docker repository: +== Image types -[source,bash] ----- -git clone https://github.com/apache/tika-docker -cd tika-docker ----- +minimal:: +Apache Tika server with base dependencies (Java + the unpacked +`tika-server-standard-bin.zip`). -== Image Types +full:: +Adds Tesseract OCR, GDAL, ImageMagick, and Microsoft fonts. -The tika-docker repository produces two types of images: +`apache/tika-grpc`:: +The gRPC server packaged with parser-package jars and pipes plugin zips. -Minimal:: -Apache Tika with base dependencies (Java only) +== Prerequisites -Full:: -Apache Tika plus Tesseract OCR and GDAL +* You have committer permission on `apache/tika` (the GitHub repo). The Docker + release workflow is gated to maintainers via the standard repo permission + model — no separate Docker Hub credential is needed at trigger time; Docker + Hub auth is held by the workflow as a secret. +* The Tika release vote has passed and the artifacts have been moved from + `dist/dev` to `dist/release` (i.e., the bin.zip and parser-package jars are + already on `dlcdn.apache.org`/`downloads.apache.org`). The workflow + downloads those artifacts during the build, so they must be live first. +* The release tag (e.g. `4.0.0-alpha-1`) exists in the repo. `release:perform` + creates it during the upstream release. 
-== Helper Tools +== Release process -docker-tool.sh:: -Automates building, testing, and publishing Docker images +=== Step 1: Verify the upstream artifacts are live -republish-images.sh:: -Legacy script for batch republishing images +[source,bash] +---- +curl -sLI https://downloads.apache.org/tika/<version>/tika-server-standard-<version>-bin.zip \ + | head -1 +---- -NOTE: The repository also contains Docker Compose files for advanced scenarios -(Vision, Grobid, OCR, NER), but these are not used for official releases. +If you get a 200, you're ready. If 404, the SVN move from dist/dev to +dist/release hasn't propagated yet — wait a few minutes. -== Release Process +=== Step 2: Trigger the Docker release workflow -=== Step 1: Update README +The workflow is intentionally `workflow_dispatch`-only — it won't auto-fire on +tag push. Trigger it manually after the vote, against the release tag. -Update the "Available Tags" section in `README.md` to include the new version. +*Via the GitHub UI:* -=== Step 2: Update Version +. Open https://github.com/apache/tika/actions +. Select *Docker release - tika-server and tika-grpc* in the left sidebar +. Click *Run workflow* (top-right) +. Under *Use workflow from*, switch from the default branch to *Tags* and pick + the release tag (e.g. `4.0.0-alpha-1`) +. Click *Run workflow* -Increment the TAG version in the `.env` file. +*Via the `gh` CLI:* -=== Step 3: Update Changelog +[source,bash] +---- +gh workflow run docker-release.yml --ref <tag> +# e.g. +gh workflow run docker-release.yml --ref 4.0.0-alpha-1 +---- -Update `CHANGES.md` with release information and date. +The `--ref` argument selects the git ref to check out. The workflow extracts +the tag name from `GITHUB_REF` (`refs/tags/<tag>`) and uses it for both the +`TIKA_VERSION` build-arg and the published Docker tag. 
-=== Step 4: Test Locally +=== Step 3: Watch the run -Test the release locally before publishing: +A successful run takes ~30–45 minutes (multi-arch builds across `linux/amd64`, +`linux/arm64`, `linux/s390x` are slow under qemu emulation, especially the +full image). -[source,bash] ----- -./docker-tool.sh build -./docker-tool.sh test ----- +* GitHub UI: the Actions run page streams logs. +* CLI: `gh run watch` will tail the latest run. + +The workflow does three things: -=== Step 5: Commit Changes +. Builds and pushes `apache/tika:<tag>` (minimal, multi-arch). +. Builds and pushes `apache/tika:<tag>-full` (full, multi-arch). +. Builds and pushes `apache/tika-grpc:<tag>` (multi-arch). -Commit all changes: +=== Step 4: Verify the published images [source,bash] ---- -git add README.md .env CHANGES.md -git commit -m "Prepare for Docker release " -git push +# Confirm the manifest landed: +curl -sL "https://hub.docker.com/v2/repositories/apache/tika/tags/<tag>/" \ + | python3 -c "import sys,json;d=json.load(sys.stdin);print(d.get('tag_last_pushed'), d.get('digest'))" + +# Smoke-test the image locally: +docker pull apache/tika:<tag> +docker run --rm -d --name tika-uat -p 127.0.0.1:9998:9998 apache/tika:<tag> +sleep 12 +curl -s http://localhost:9998/version +docker rm -f tika-uat ---- -=== Step 6: Build and Publish - -Build and publish the images using the docker-tool script. - -Example for version 3.1.0.0 based on Tika 3.1.0: +For a deeper smoke test that exercises the full REST surface, run the +xref:advanced/integration-testing/run-uat-script.adoc[REST UAT script] +(the same one tied into the e2e tests): [source,bash] ---- -# Build the images -./docker-tool.sh build 3.1.0.0 3.1.0 +release-tools/uat/run-uat.sh http://localhost:9998 +---- -# Test the images -./docker-tool.sh test 3.1.0.0 +Both `apache/tika:<tag>` and `apache/tika:<tag>-full` should pass. 
-# Publish to DockerHub -./docker-tool.sh publish 3.1.0.0 3.1.0 ----- +== `:latest` tag policy + +The `apache/tika:latest` and `apache/tika:latest-full` tags currently still +point at the 3.x stable image (the `latest`-tagged 3.3.0 image published from +the external apache/tika-docker repo). -NOTE: Multi-architecture building takes time. The publish step automatically -updates the `-latest` tag on DockerHub. +The release workflow *deliberately does not push `:latest`* for 4.x +alpha/beta/RC builds — those tags stay on 3.x until 4.0.0 GA. When 4.0.0 GA +ships, edit `docker-release.yml` to re-add `apache/tika:latest` and +`apache/tika:latest-full` to the tag lists. -=== Step 7: Tag the Release +`apache/tika-grpc:latest` *is* pushed on every 4.x release — the grpc image is +new in 4.x and has no 3.x incumbent to protect. -Create and push a git tag for the release: +[[3x-patches]] +== 3.x patch releases (legacy path) + +Until 4.0.0 GA, any 3.x patch release (e.g. a 3.3.0.1 with a CVE fix) is +still published from the legacy https://github.com/apache/tika-docker[apache/tika-docker] +repository using its `docker-tool.sh`: [source,bash] ---- -git tag -a 3.1.0.0 -m "New release for 3.1.0.0" +git clone https://github.com/apache/tika-docker +cd tika-docker + +# Edit README.md (Available Tags), CHANGES.md, .env (TAG=...) +# Then commit + push + +./docker-tool.sh build <docker-tag> <tika-version> +./docker-tool.sh test <docker-tag> +./docker-tool.sh publish <docker-tag> <tika-version> + +git tag -a <docker-tag> -m "New release for <docker-tag>" git push --tags ---- -== Post-Release +Use the 3.x convention `<tika-version>.<docker-revision>` (e.g. +`3.3.0.1` for the first Docker rebuild on top of Tika 3.3.0). 4.x releases +drop that scheme and publish bare `<tika-version>` only. 
+ +== Post-release -After publishing the Docker images: +After the workflow completes: -* Verify the images are available on DockerHub at https://hub.docker.com/r/apache/tika -* Test pulling and running the new images -* Update the main Tika website if needed -* Proceed to release the link:helm.html[Helm charts] if applicable +* Verify both images on https://hub.docker.com/r/apache/tika and + https://hub.docker.com/r/apache/tika-grpc. +* Test pulling and running the new images from a clean machine. +* If applicable, proceed to xref:maintainers/release-guides/helm.adoc[release the Helm charts]. +* Update news/announcement copy on the main Tika website if it references the + Docker images. diff --git a/tika-server/docker-build/CHANGES.md b/tika-server/docker-build/CHANGES.md new file mode 100644 index 00000000000..eb6ce314a26 --- /dev/null +++ b/tika-server/docker-build/CHANGES.md @@ -0,0 +1,74 @@ +# Changes + +As of 2.5.0.1, we started adding a digit for Docker versions. Going forward, we'll include +a four digit version, where the first three are the Tika version and the last one is the docker version. +As of 2.5.0.2, we started tagging release commits in our github repo. + +* 4.0.0-alpha-1.0 (9 May 2026) + * First 4.0.0-alpha-1 release (preview; not tagged `latest`) + * Dropped `linux/arm/v7` from the published platforms. 32-bit ARM emulated + builds on Ubuntu 26.04 (resolute) hit a qemu chown-overflow in + `update-notifier-common`'s postinst, which is pulled in by + `ttf-mscorefonts-installer`. `linux/arm64/v8` covers modern ARM. 
+ +* 3.3.0.0 (23 Mar 2026) + * First 3.3.0 release + +* 3.2.3.0 (15 Sep 2025) + * First 3.2.3 release + +* 3.2.2.0 (8 Aug 2025) + * First 3.2.2 release + +* 3.2.1.0 (9 Jul 2025) + * First 3.2.1 release + +* 3.2.0.0 (2 Jun 2025) + * First 3.2.0 release + * Update base to plucky + * Add Japanese language pack for tesseract + * Add ImageMagick + +* 3.1.0.0 (31 Jan 2025) + * First 3.1.0 release + * Update base to oracular + +* 3.0.0.0 (21 Oct 2024) + * First 3.x stable release + * Bump jre to 21 + +* 2.9.2.1 (21 May 2024) + * Updated to noble + * First multi-arch release + +* 2.9.2.0 (10 October 2023) + * Initial release for Tika 2.9.2 + +* 2.9.1.0 (10 October 2023) + * Initial release for Tika 2.9.1 + +* 2.9.0.0 (28 August 2023) + * Initial release for Tika 2.9.0 + +* 2.8.0.0 (15 May 2023) + * Initial release for Tika 2.8.0 + + +* 2.7.0.1 (27 March 2023) + * More efficient build process and final image size via @stumpylog on [pr#17](https://github.com/apache/tika-docker/pull/17). + +* 2.7.0.0 (6 Feb 2023) + * Initial release for Tika 2.7.0 + +* 2.6.0.1 (10 November 2022) + * Update operating system against OpenSSL CVE (TIKA-3926). 
+ +* 2.6.0.0 (7 November 2022) + * Initial release for Tika 2.6.0 + +* 2.5.0.2 (31 October 2022) + * Fixed root-user regression caused by differences in Docker behavior based on the build system's OS (TIKA-3912) + * Added tika-extras/ directory to pick up extra jars via mounted drive or for those using our image as a base image (TIKA-3907) + +* 2.5.0.1 (27 October 2022) + * Update to latest jammy to avoid recent CVEs (TIKA-3906) \ No newline at end of file diff --git a/tika-server/docker-build/README.md b/tika-server/docker-build/README.md new file mode 100644 index 00000000000..05b874a075e --- /dev/null +++ b/tika-server/docker-build/README.md @@ -0,0 +1,254 @@ +# tika-docker + +This repo is used to create convenience Docker images for Apache Tika Server published as [apache/tika](https://hub.docker.com/r/apache/tika) on DockerHub by the [Apache Tika](http://tika.apache.org) Dev team + +The images create a functional Apache Tika Server instance that contains the latest Ubuntu running the appropriate version's server on Port 9998 using Java 8 (until version 1.20), Java 11 (1.21 and 1.24.1), Java 14 (until 1.27/2.0.0), Java 16 (for 2.1.0), and Java 17 LTS for newer versions. + +There is a minimal version, which contains only Apache Tika and its core dependencies, and a full version, which also includes dependencies for the GDAL and Tesseract OCR parsers. To balance showing functionality versus the size of the full image, this file by default installs the language packs for the following languages: +* English +* French +* German +* Italian +* Spanish +* Japanese + +To install more languages, set the build argument `LANGUAGES` or include your own custom packs using an ADD command. + +## Available Tags + +Below are the most recent tags. The `latest` tags track the 3.x stable line; +4.x preview releases are published as version-specific tags only. 
+- `latest`, `3.3.0.0`: Apache Tika Server 3.3.0.0 (Minimal) +- `latest-full`, `3.3.0.0-full`: Apache Tika Server 3.3.0.0 (Full) +- `4.0.0-alpha-1.0`: Apache Tika Server 4.0.0-alpha-1.0 (Minimal, 4.x preview) +- `4.0.0-alpha-1.0-full`: Apache Tika Server 4.0.0-alpha-1.0 (Full, 4.x preview) +- `3.3.0.0`, `3.3.0.0`: Apache Tika Server 3.3.0.0 (Minimal) +- `3.3.0.0`, `3.3.0.0-full`: Apache Tika Server 3.3.0.0 (Full) +- `3.2.3.0`, `3.2.3.0`: Apache Tika Server 3.2.3.0 (Minimal) +- `3.2.3.0`, `3.2.3.0-full`: Apache Tika Server 3.2.3.0 (Full) +- `3.2.2.0`, `3.2.2.0`: Apache Tika Server 3.2.2.0 (Minimal) +- `3.2.2.0`, `3.2.2.0-full`: Apache Tika Server 3.2.2.0 (Full) +- `3.2.1.0`, `3.2.1.0`: Apache Tika Server 3.2.1.0 (Minimal) +- `3.2.1.0`, `3.2.1.0-full`: Apache Tika Server 3.2.1.0 (Full) +- `3.2.0.0`, `3.2.0.0`: Apache Tika Server 3.2.0.0 (Minimal) +- `3.2.0.0`, `3.2.0.0-full`: Apache Tika Server 3.2.0.0 (Full) +- `3.1.0.0`, `3.1.0.0`: Apache Tika Server 3.1.0.0 (Minimal) +- `3.1.0.0`, `3.1.0.0-full`: Apache Tika Server 3.1.0.0 (Full) +- `3.0.0.0`, `3.0.0.0`: Apache Tika Server 3.0.0.0 (Minimal) +- `3.0.0.0`, `3.0.0.0-full`: Apache Tika Server 3.0.0.0 (Full) +- `3.0.0.0-BETA2`, `3.0.0.0-BETA2`: Apache Tika Server 3.0.0.0-BETA2 (Minimal) +- `3.0.0.0-BETA2`, `3.0.0.0-BETA2-full`: Apache Tika Server 3.0.0.0-BETA2 (Full) +- `2.9.2.1`, `2.9.2.1`: Apache Tika Server 2.9.2.1 (Minimal) +- `2.9.2.1`, `2.9.2.1-full`: Apache Tika Server 2.9.2.1 (Full) +- `2.9.2.0`, `2.9.2.0`: Apache Tika Server 2.9.2.0 (Minimal) +- `2.9.2.0`, `2.9.2.0-full`: Apache Tika Server 2.9.2.0 (Full) +- `2.9.1.0`, `2.9.1.0`: Apache Tika Server 2.9.1.0 (Minimal) +- `2.9.1.0`, `2.9.1.0-full`: Apache Tika Server 2.9.1.0 (Full) +- `2.9.0.0`, `2.9.0.0`: Apache Tika Server 2.9.0.0 (Minimal) +- `2.9.0.0`, `2.9.0.0-full`: Apache Tika Server 2.9.0.0 (Full) +- `2.8.0.0`, `2.8.0.0`: Apache Tika Server 2.8.0.0 (Minimal) +- `2.8.0.0`, `2.8.0.0-full`: Apache Tika Server 2.8.0.0 (Full) +- `2.7.0.1`, `2.7.0.1`: Apache 
Tika Server 2.7.0.1 (Minimal) +- `2.7.0.1`, `2.7.0.1-full`: Apache Tika Server 2.7.0.1 (Full) +- `2.7.0.0`, `2.7.0.0`: Apache Tika Server 2.7.0.0 (Minimal) +- `2.7.0.0`, `2.7.0.0-full`: Apache Tika Server 2.7.0.0 (Full) +- `2.6.0.1`: Apache Tika Server 2.6.0.1 (Minimal) +- `2.6.0.1-full`: Apache Tika Server 2.6.0.1 (Full) +- `2.6.0.0`: Apache Tika Server 2.6.0.0 (Minimal) +- `2.6.0.0-full`: Apache Tika Server 2.6.0.0 (Full) +- `2.5.0.2`: Apache Tika Server 2.5.0.2 (Minimal) +- `2.5.0.2-full`: Apache Tika Server 2.5.0.2 (Full) +- `2.5.0.1`: Apache Tika Server 2.5.0.1 (Minimal) +- `2.5.0.1-full`: Apache Tika Server 2.5.0.1 (Full) +- `2.5.0`: Apache Tika Server 2.5.0 (Minimal) +- `2.5.0-full`: Apache Tika Server 2.5.0 (Full) +- `2.4.1`: Apache Tika Server 2.4.1 (Minimal) +- `2.4.1-full`: Apache Tika Server 2.4.1 (Full) +- `2.4.0`: Apache Tika Server 2.4.0 (Minimal) +- `2.4.0-full`: Apache Tika Server 2.4.0 (Full) +- `2.3.0`: Apache Tika Server 2.3.0 (Minimal) +- `2.3.0-full`: Apache Tika Server 2.3.0 (Full) +- `2.2.1`: Apache Tika Server 2.2.1 (Minimal) +- `2.2.1-full`: Apache Tika Server 2.2.1 (Full) + +Below are the most recent 1.x series tags. **Note** that as of 30 September 2022, the 1.x branch is no longer supported. + +- `1.28.5`: Apache Tika Server 1.28.5 (Minimal) +- `1.28.5-full`: Apache Tika Server 1.28.5 (Full) +- `1.28.4`: Apache Tika Server 1.28.4 (Minimal) +- `1.28.4-full`: Apache Tika Server 1.28.4 (Full) +- `1.28.3`: Apache Tika Server 1.28.3 (Minimal) +- `1.28.3-full`: Apache Tika Server 1.28.3 (Full) +- `1.28.2`: Apache Tika Server 1.28.2 (Minimal) +- `1.28.2-full`: Apache Tika Server 1.28.2 (Full) +- `1.28.1`: Apache Tika Server 1.28.1 (Minimal) +- `1.28.1-full`: Apache Tika Server 1.28.1 (Full) + +You can see a full set of tags for historical versions [here](https://hub.docker.com/r/apache/tika/tags?page=1&ordering=last_updated). 
+ +## 4.x Preview Notes + +The `4.0.0-alpha-1.0` images are a preview of the upcoming Tika 4.x line and are +not tagged `latest`. + +Tika 4.x changed the `tika-server-standard` packaging: the published jar is now +a thin top-level jar that resolves its dependencies from a sibling `lib/` +directory. The 4.x image therefore ships the unpacked `tika-server-standard-bin.zip` +distribution under `/opt/tika-server/` (containing `tika-server.jar`, `lib/`, +and `plugins/`) instead of a single fat jar. + +The standard REST endpoints (`/tika`, `/rmeta`, `/unpack`, `/detect`, etc.) +work as in 3.x — they spool the request body to a temp file internally via +`TikaInputStream` and do not require any pipes plugin. + +Pipes-mode endpoints (`/pipes`, `/async`) require pf4j plugins. The +`tika-pipes-file-system` plugin is **bundled** under +`/opt/tika-server/plugins/tika-pipes-file-system/` (it ships inside the +upstream `tika-server-standard-bin.zip`). Other pipes plugins +(`tika-pipes-http`, `tika-pipes-s3`, etc.) are not currently bundled in the +preview image; mount them into `/opt/tika-server/plugins/` if you need them. +Bundling additional common plugins is planned for `4.0.0-beta-1.0`. + +## Supported Platforms + +The Docker images are published as multi-platform images supporting the following architectures: + +- `linux/amd64` - 64-bit x86 processors (Intel/AMD) +- `linux/arm64/v8` - 64-bit ARM processors (Apple Silicon, AWS Graviton, etc.) +- `linux/s390x` - IBM System z mainframes + +NOTE: `linux/arm/v7` was published for 3.x but dropped starting with `4.0.0-alpha-1.0`. +If you need 32-bit ARM, pin to a 3.x tag. The drop was driven by a qemu/dpkg +emulation bug that broke font-package installation on the Ubuntu 26.04 base. + +Docker will automatically pull the correct image for your platform when you use `docker pull` or `docker run`. 
+ +## Usage + +### Default + +You can pull down the version you would like using: + + docker pull apache/tika:<tag> + +Then to run the container, execute the following command: + + docker run -d -p 127.0.0.1:9998:9998 apache/tika:<tag> + +Where <tag> is the DockerHub tag corresponding to the Apache Tika Server version - e.g. 1.23, 1.22, 1.23-full, 1.22-full. + +NOTE: The latest and latest-full tags are explicitly set to the latest released version when they are published. + +NOTE: In the example above, we recommend binding the server to localhost because Docker alters iptables and may expose +your tika-server to the internet. If you are confident that your tika-server is on an isolated network +you can simply run: + + docker run -d -p 9998:9998 apache/tika:<tag> + +### Custom Config + +From version 1.25 and 1.25-full of the image it is now easier to override the defaults and pass parameters to the running instance. + +So for example if you wish to disable the OCR parser in the full image you could write a custom configuration: + +``` +cat <<EOT >> tika-config.json +{ + "parsers": [ + { "default-parser": {} }, + { "tesseract-ocr-parser": { "skipOcr": true } } + ] +} +EOT +``` +Then by mounting this custom configuration as a volume, you could pass the command line parameter to load it + + docker run -d -p 127.0.0.1:9998:9998 -v `pwd`/tika-config.json:/tika-config.json apache/tika:<tag>-full -c /tika-config.json + +NOTE: Tika 4.x replaced the XML `tika-config.xml` format with JSON +`tika-config.json` (see TIKA-4544). The JSON form above is what 4.x images +expect; if you're pinned to 2.x / 3.x tags, keep using the XML format. + +You can see more configuration examples on the +[Tika website](https://tika.apache.org/) and in the canonical samples under +`tika-server/tika-server-core/src/test/resources/config-examples/` in the +source tree. 
+ +As of 2.5.0.2, if you'd like to add extra jars from your local `my-jars` directory to Tika's classpath, mount to `/tika-extras` like so: + + docker run -d -p 127.0.0.1:9998:9998 -v `pwd`/my-jars:/tika-extras apache/tika:2.5.0.2-full + +You may want to do this to add optional components, such as the tika-eval metadata filter, or optional +dependencies such as jai-imageio-jpeg2000 (check license compatibility first!). + +### Docker Compose Examples + +There are a number of sample Docker Compose files included in the repos to allow you to test some different scenarios. + +These files use docker-compose 3.x series and include: + +* docker-compose-tika-vision.yml - Vision-Language Model parsing example (OpenAI-compatible / Claude / Gemini) +* docker-compose-tika-grobid.yml - Grobid REST parsing example +* docker-compose-tika-customocr.yml - Tesseract OCR example with custom configuration + +The Docker Compose files and configurations (sourced from _sample-configs_ directory) all have comments in them so you can try different options, or use them as a base to create your own custom configuration. + +**N.B.** You will want to create an environment variable (used in some bash scripts) matching the version of tika-docker you want to work with in the docker compositions e.g. `export TAG=1.26`. Similarly you should also consult `.env` which is used in the docker-compose `.yml` files. + +You can install docker-compose from [here](https://docs.docker.com/compose/install/). + +## Building + +To build the image from scratch, simply invoke: + + docker build -t 'apache/tika' github.com/apache/tika-docker + +You can then use the following command (using the name you allocated in the build command as part of -t option): + + docker run -d -p 127.0.0.1:9998:9998 apache/tika + +## More Information + +For more information on Apache Tika Server, go to the [Apache Tika Server documentation](https://cwiki.apache.org/confluence/display/TIKA/TikaServer).
+ +For more information on Apache Tika, go to the official [Apache Tika](http://tika.apache.org) project website. + +To meet up with others using Apache Tika, consider coming to one of the [Apache Tika Virtual Meetups](https://www.meetup.com/apache-tika-community/). + +For more information on the Apache Software Foundation, go to the [Apache Software Foundation](http://apache.org) website. + +For a full list of changes as of 2.5.0.1, visit [CHANGES.md](CHANGES.md). + +For our current release process, visit [tika-docker Release Process](https://cwiki.apache.org/confluence/display/TIKA/Release+Process+for+tika-docker) + +## Authors + +Apache Tika Dev Team (dev@tika.apache.org) + +## Contributors + +There have been a range of [contributors](https://github.com/apache/tika-docker/graphs/contributors) on GitHub and via suggestions, including: + +- [@grossws](https://github.com/grossws) +- [@arjunyel](https://github.com/arjunyel) +- [@mpdude](https://github.com/mpdude) +- [@laszlocsontosuw](https://github.com/laszlocsontosuw) +- [@tallisonapache](https://github.com/tballison) + +## License + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +## Disclaimer + +It is worth noting that whilst these Docker images download the binary JARs published by the Apache Tika Team on the Apache Software Foundation distribution sites, only the source release of an Apache Software Foundation project is an official release artefact. 
See [Release Distribution Policy](https://www.apache.org/dev/release-distribution.html) for more details. diff --git a/tika-server/docker-build/docker-compose-tika-customocr.yml b/tika-server/docker-build/docker-compose-tika-customocr.yml new file mode 100644 index 00000000000..29cf667a216 --- /dev/null +++ b/tika-server/docker-build/docker-compose-tika-customocr.yml @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +version: "3.8" +services: + + ## Apache Tika Server + tika: + image: apache/tika:${TAG}-full + # Override default so we can add the /customocr dir on the classpath + # (for the bundled TesseractOCRConfig.properties). The 4.x image layout + # places the thin server jar at /opt/tika-server/tika-server.jar and its + # deps at /opt/tika-server/lib/*. working_dir=/opt/tika-server matters for + # tika-server's plugin-roots fallback (see TikaServerProcess#resolveDefaultPluginsDir). 
+ entrypoint: [ "/bin/sh", "-c", "exec java -cp \"/customocr:/opt/tika-server/tika-server.jar:/opt/tika-server/lib/*:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $$0 $$@"] + working_dir: /opt/tika-server + # Kept command as example but could be added to entrypoint too + command: -c /tika-config.json + restart: on-failure + ports: + - "9998:9998" + volumes: + # Choose the configuration you want, or add your own custom one + # - ./sample-configs/customocr/tika-config-inline.json:/tika-config.json + - ./sample-configs/customocr/tika-config-rendered.json:/tika-config.json + + diff --git a/tika-server/docker-build/docker-compose-tika-grobid.yml b/tika-server/docker-build/docker-compose-tika-grobid.yml new file mode 100644 index 00000000000..add5d2744f8 --- /dev/null +++ b/tika-server/docker-build/docker-compose-tika-grobid.yml @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +version: "3.8" +services: + + ## Apache Tika Server + tika: + image: apache/tika:${TAG}-full + # Override default so we can add the /grobid dir on the classpath + # (for the bundled GrobidExtractor.properties). 
The 4.x image layout + # places the thin server jar at /opt/tika-server/tika-server.jar and its + # deps at /opt/tika-server/lib/*. working_dir=/opt/tika-server matters for + # tika-server's plugin-roots fallback. + entrypoint: [ "/bin/sh", "-c", "exec java -cp \"/grobid:/opt/tika-server/tika-server.jar:/opt/tika-server/lib/*:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $$0 $$@"] + working_dir: /opt/tika-server + # Kept command as example but could be added to entrypoint too + command: -c /grobid/tika-config.json + restart: on-failure + ports: + - "9998:9998" + volumes: + - ./sample-configs/grobid:/grobid + depends_on: + - grobid + + ## Grobid Service + grobid: + image: lfoppiano/grobid:0.6.1 + ports: + - "8070:8070" + - "8071:8071" + diff --git a/tika-server/docker-build/docker-compose-tika-vision.yml b/tika-server/docker-build/docker-compose-tika-vision.yml new file mode 100644 index 00000000000..da01d03a277 --- /dev/null +++ b/tika-server/docker-build/docker-compose-tika-vision.yml @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Vision-Language Model parsing for tika-server (Tika 4.x). 
+# +# The pre-4.x inception-rest / Im2txt / inception-video services and the +# org.apache.tika.parser.recognition.ObjectRecognitionParser they served +# have been removed (TIKA-4499 / TIKA-4500). The 4.x replacement is a +# family of VLM parsers (OpenAI-compatible, Anthropic Claude, Google +# Gemini). See: +# +# docs/modules/ROOT/pages/configuration/parsers/vlm-parsers.adoc +# +# This compose demonstrates the OpenAI-compatible variant pointing at a +# locally-hosted Ollama instance. To use a different VLM: +# - Swap the mounted tika-config.* for vlm-claude.json or vlm-gemini.json +# and pass the relevant API key via env (ANTHROPIC_API_KEY / +# GEMINI_API_KEY). +# - Drop the vlm-server service block below and the tika service's depends_on entry for it. + +services: + + ## Apache Tika Server + tika: + image: apache/tika:latest-full + command: -c /tika-config.json + restart: on-failure + ports: + - "9998:9998" + volumes: + - ./sample-configs/vision/vlm-openai.json:/tika-config.json + # - ./sample-configs/vision/vlm-claude.json:/tika-config.json + # - ./sample-configs/vision/vlm-gemini.json:/tika-config.json + depends_on: + - vlm-server + + ## Local OpenAI-compatible VLM endpoint. + ## Replace with vLLM, your own FastAPI wrapper, or remove and point + ## baseUrl in vlm-openai.json at OpenAI's real API. + vlm-server: + image: ollama/ollama:latest + ports: + - "8000:11434" + # Volumes for pulled models. Uncomment and pull a vision-capable model + # (e.g. `docker compose -f docker-compose-tika-vision.yml exec vlm-server ollama pull llava`) before first use. + # volumes: + # - ollama-models:/root/.ollama + +# volumes: +# ollama-models: diff --git a/tika-server/docker-build/docker-tool.sh b/tika-server/docker-build/docker-tool.sh index 2a82b5fa349..db05dddf2ec 100755 --- a/tika-server/docker-build/docker-tool.sh +++ b/tika-server/docker-build/docker-tool.sh @@ -36,6 +36,8 @@ while getopts ":h" opt; do echo " docker-tool.sh -h Display this help message." echo " docker-tool.sh build Builds images for ." echo " docker-tool.sh test Tests images for ."
+ echo " docker-tool.sh test-uat Runs the tika-server REST UAT against images for ." + echo " Requires an executable release-tools/uat/run-uat.sh at the root of this tika checkout." echo " docker-tool.sh publish Builds multi-arch images for and pushes to Docker Hub." exit 0 ;; @@ -98,6 +100,35 @@ test_docker_image() { stop_test_container "$container_name" } +test_docker_image_uat() { + container_name=$1 + image=$image_name:$1 + uat_script=$2 + + docker run -d --name "$container_name" -p 127.0.0.1:9998:9998 "$image" \ + || die "couldn't start $image" + + # Poll /version up to 30 times (2s curl timeout, 1s pause) until the server responds. + for i in $(seq 1 30); do + if curl -fsS --max-time 2 http://localhost:9998/version >/dev/null 2>&1; then + break + fi + sleep 1 + done + + if "$uat_script" http://localhost:9998; then + echo "$(tput setaf 2)Image: $image - UAT passed$(tput sgr0)" + stop_test_container "$container_name" + else + echo "$(tput setaf 1)Image: $image - UAT failed$(tput sgr0)" + echo "--- last 40 lines of container log ---" + docker logs --tail 40 "$container_name" || true + echo "--- end log ---" + stop_test_container "$container_name" + exit 1 + fi +} + shift $((OPTIND -1)) subcommand=$1; shift tika_docker_version=$1; shift @@ -118,13 +149,25 @@ case "$subcommand" in test_docker_image "${tika_docker_version}-full" true ;; + test-uat) + # Run the tika-server REST UAT (release-tools/uat/run-uat.sh, two levels + # up from this script in the tika repo) against both images. + repo_root="$(cd "$(dirname "$0")/../.." && pwd)" + uat_script="${repo_root}/release-tools/uat/run-uat.sh" + if [[ !
-x "$uat_script" ]]; then + die "UAT script not found or not executable: $uat_script" + fi + test_docker_image_uat ${tika_docker_version} "$uat_script" + test_docker_image_uat "${tika_docker_version}-full" "$uat_script" + ;; + publish) docker buildx create --use --name tika-builder || die "couldn't create builder" # Build multi-arch with buildx and push - docker buildx build --platform linux/arm/v7,linux/arm64/v8,linux/amd64,linux/s390x --output "type=image,push=true" \ - --tag ${image_name}:latest --tag ${image_name}:${tika_docker_version} --build-arg TIKA_VERSION=${tika_version} --no-cache --builder tika-builder minimal || stop_and_die "couldn't build multi-arch minimal" - docker buildx build --platform linux/arm/v7,linux/arm64/v8,linux/amd64,linux/s390x --output "type=image,push=true" \ - --tag ${image_name}:latest-full --tag ${image_name}:${tika_docker_version}-full --build-arg TIKA_VERSION=${tika_version} --no-cache --builder tika-builder full || stop_and_die "couldn't build multi-arch full" + docker buildx build --platform linux/arm64/v8,linux/amd64,linux/s390x --output "type=image,push=true" \ + --tag ${image_name}:${tika_docker_version} --build-arg TIKA_VERSION=${tika_version} --no-cache --builder tika-builder minimal || stop_and_die "couldn't build multi-arch minimal" + docker buildx build --platform linux/arm64/v8,linux/amd64,linux/s390x --output "type=image,push=true" \ + --tag ${image_name}:${tika_docker_version}-full --build-arg TIKA_VERSION=${tika_version} --no-cache --builder tika-builder full || stop_and_die "couldn't build multi-arch full" docker buildx rm tika-builder || die "couldn't stop builder -- make sure to stop the builder manually! 
" ;; diff --git a/tika-server/docker-build/full/Dockerfile b/tika-server/docker-build/full/Dockerfile index 1b918390f62..7c77e4a0482 100644 --- a/tika-server/docker-build/full/Dockerfile +++ b/tika-server/docker-build/full/Dockerfile @@ -15,36 +15,42 @@ # the subsequent stages -- see TIKA-3912 ARG UID_GID="35002:35002" -FROM ubuntu:plucky AS base +FROM ubuntu:resolute AS base FROM base AS fetch_tika ARG TIKA_VERSION ARG CHECK_SIG=true -ENV NEAREST_TIKA_SERVER_URL="https://dlcdn.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar" \ - ARCHIVE_TIKA_SERVER_URL="https://archive.apache.org/dist/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar" \ - BACKUP_TIKA_SERVER_URL="https://downloads.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar" \ - DEFAULT_TIKA_SERVER_ASC_URL="https://downloads.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar.asc" \ - ARCHIVE_TIKA_SERVER_ASC_URL="https://archive.apache.org/dist/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar.asc" \ +ENV TIKA_SERVER_ARCHIVE="tika-server-standard-${TIKA_VERSION}-bin.zip" \ + NEAREST_TIKA_SERVER_URL="https://dlcdn.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}-bin.zip" \ + ARCHIVE_TIKA_SERVER_URL="https://archive.apache.org/dist/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}-bin.zip" \ + BACKUP_TIKA_SERVER_URL="https://downloads.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}-bin.zip" \ + DEFAULT_TIKA_SERVER_ASC_URL="https://downloads.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}-bin.zip.asc" \ + ARCHIVE_TIKA_SERVER_ASC_URL="https://archive.apache.org/dist/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}-bin.zip.asc" \ TIKA_VERSION=$TIKA_VERSION -RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get -y install gnupg2 wget ca-certificates \ +# 4.x publishes tika-server as a bin.zip distribution. 
The thin top-level +# tika-server.jar uses its manifest Class-Path to resolve the jars under lib/, +# and tika-server reads pf4j plugins from the plugins/ directory next to it. +RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get -y install gnupg2 wget ca-certificates unzip \ && wget -t 10 --max-redirect 1 --retry-connrefused -qO- https://downloads.apache.org/tika/KEYS | gpg --import \ - && wget -t 10 --max-redirect 1 --retry-connrefused $NEAREST_TIKA_SERVER_URL -O /tika-server-standard-${TIKA_VERSION}.jar || rm /tika-server-standard-${TIKA_VERSION}.jar \ - && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || wget $ARCHIVE_TIKA_SERVER_URL -O /tika-server-standard-${TIKA_VERSION}.jar || rm /tika-server-standard-${TIKA_VERSION}.jar \ - && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || wget $BACKUP_TIKA_SERVER_URL -O /tika-server-standard-${TIKA_VERSION}.jar || rm /tika-server-standard-${TIKA_VERSION}.jar \ - && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || exit 1 \ - && wget -t 10 --max-redirect 1 --retry-connrefused $DEFAULT_TIKA_SERVER_ASC_URL -O /tika-server-standard-${TIKA_VERSION}.jar.asc || rm /tika-server-standard-${TIKA_VERSION}.jar.asc \ - && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar.asc ]" || wget $ARCHIVE_TIKA_SERVER_ASC_URL -O /tika-server-standard-${TIKA_VERSION}.jar.asc || rm /tika-server-standard-${TIKA_VERSION}.jar.asc \ - && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar.asc ]" || exit 1 \ - && gpg --verify /tika-server-standard-${TIKA_VERSION}.jar.asc /tika-server-standard-${TIKA_VERSION}.jar - -#RUN if [ "$CHECK_SIG" = "true" ] ; then gpg --verify /tika-server-standard-${TIKA_VERSION}.jar.asc /tika-server-standard-${TIKA_VERSION}.jar; fi + && wget -t 10 --max-redirect 1 --retry-connrefused $NEAREST_TIKA_SERVER_URL -O /${TIKA_SERVER_ARCHIVE} || rm /${TIKA_SERVER_ARCHIVE} \ + && sh -c "[ -f /${TIKA_SERVER_ARCHIVE} ]" || wget $ARCHIVE_TIKA_SERVER_URL -O /${TIKA_SERVER_ARCHIVE} || rm 
/${TIKA_SERVER_ARCHIVE} \ + && sh -c "[ -f /${TIKA_SERVER_ARCHIVE} ]" || wget $BACKUP_TIKA_SERVER_URL -O /${TIKA_SERVER_ARCHIVE} || rm /${TIKA_SERVER_ARCHIVE} \ + && sh -c "[ -f /${TIKA_SERVER_ARCHIVE} ]" || exit 1 \ + && wget -t 10 --max-redirect 1 --retry-connrefused $DEFAULT_TIKA_SERVER_ASC_URL -O /${TIKA_SERVER_ARCHIVE}.asc || rm /${TIKA_SERVER_ARCHIVE}.asc \ + && sh -c "[ -f /${TIKA_SERVER_ARCHIVE}.asc ]" || wget $ARCHIVE_TIKA_SERVER_ASC_URL -O /${TIKA_SERVER_ARCHIVE}.asc || rm /${TIKA_SERVER_ARCHIVE}.asc \ + && sh -c "[ -f /${TIKA_SERVER_ARCHIVE}.asc ]" || exit 1 \ + && gpg --verify /${TIKA_SERVER_ARCHIVE}.asc /${TIKA_SERVER_ARCHIVE} \ + && mkdir -p /opt/tika-server \ + && unzip -q /${TIKA_SERVER_ARCHIVE} -d /opt/tika-server \ + && rm /${TIKA_SERVER_ARCHIVE} /${TIKA_SERVER_ARCHIVE}.asc FROM base AS runtime ARG UID_GID -ARG JRE='openjdk-21-jre-headless' +ARG JRE='openjdk-25-jre-headless' +ARG LANGUAGES='eng ita fra spa deu jpn' RUN set -eux \ && apt-get update \ && apt-get install --yes --no-install-recommends gnupg2 software-properties-common \ @@ -53,12 +59,7 @@ RUN set -eux \ gdal-bin \ imagemagick \ tesseract-ocr \ - tesseract-ocr-eng \ - tesseract-ocr-ita \ - tesseract-ocr-fra \ - tesseract-ocr-spa \ - tesseract-ocr-deu \ - tesseract-ocr-jpn \ + $(printf 'tesseract-ocr-%s ' $LANGUAGES) \ && echo ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true | debconf-set-selections \ && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \ xfonts-utils \ @@ -72,11 +73,17 @@ RUN set -eux \ ARG TIKA_VERSION ENV TIKA_VERSION=$TIKA_VERSION -COPY --from=fetch_tika /tika-server-standard-${TIKA_VERSION}.jar /tika-server-standard-${TIKA_VERSION}.jar +COPY --from=fetch_tika /opt/tika-server /opt/tika-server +# WORKDIR sets the CWD so tika-server's plugin-root fallback resolves +# `plugins/` relative to /opt/tika-server (its `getCodeSource()` returns a +# lib/* path, not the top-level jar, so the "next-to-jar" resolution 
misses). +WORKDIR /opt/tika-server USER $UID_GID EXPOSE 9998 -ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/tika-server-standard-${TIKA_VERSION}.jar:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] +# Classpath includes the thin server jar, its lib/ deps, and any user-mounted /tika-extras/. +# tika-server auto-discovers pf4j plugins from /opt/tika-server/plugins/. +ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/opt/tika-server/tika-server.jar:/opt/tika-server/lib/*:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] LABEL maintainer="Apache Tika Developers dev@tika.apache.org" diff --git a/tika-server/docker-build/full/Dockerfile.snapshot b/tika-server/docker-build/full/Dockerfile.snapshot index 4f655005e63..03bcc08e41a 100644 --- a/tika-server/docker-build/full/Dockerfile.snapshot +++ b/tika-server/docker-build/full/Dockerfile.snapshot @@ -15,10 +15,10 @@ ARG UID_GID="35002:35002" -FROM ubuntu:plucky AS runtime +FROM ubuntu:resolute AS runtime ARG UID_GID ARG TIKA_VERSION -ARG JRE='openjdk-21-jre-headless' +ARG JRE='openjdk-25-jre-headless' RUN set -eux \ && apt-get update \ && apt-get install --yes --no-install-recommends gnupg2 software-properties-common \ @@ -44,9 +44,13 @@ RUN set -eux \ && apt-get clean -y \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* ENV TIKA_VERSION=$TIKA_VERSION -COPY tika-server/ /tika-server/ +# Snapshot workflow tars the bin distribution into /tika-server/, so +# this COPY lands the thin jar + lib/ + plugins/ at /opt/tika-server/, matching +# the release-variant Dockerfile. 
+COPY tika-server/ /opt/tika-server/ +WORKDIR /opt/tika-server USER $UID_GID EXPOSE 9998 -ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/tika-server/tika-server.jar:/tika-server/lib/*:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] +ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/opt/tika-server/tika-server.jar:/opt/tika-server/lib/*:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] LABEL maintainer="Apache Tika Developers dev@tika.apache.org" diff --git a/tika-server/docker-build/minimal/Dockerfile b/tika-server/docker-build/minimal/Dockerfile index 1c5195920a4..af641a491d8 100644 --- a/tika-server/docker-build/minimal/Dockerfile +++ b/tika-server/docker-build/minimal/Dockerfile @@ -16,44 +16,48 @@ # the subsequent stages -- see TIKA-3912 ARG UID_GID="35002:35002" -FROM ubuntu:plucky AS base +FROM ubuntu:resolute AS base FROM base AS fetch_tika ARG TIKA_VERSION ARG CHECK_SIG=true -ENV NEAREST_TIKA_SERVER_URL="https://dlcdn.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar" \ - ARCHIVE_TIKA_SERVER_URL="https://archive.apache.org/dist/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar" \ - BACKUP_TIKA_SERVER_URL="https://downloads.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar" \ - DEFAULT_TIKA_SERVER_ASC_URL="https://downloads.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar.asc" \ - ARCHIVE_TIKA_SERVER_ASC_URL="https://archive.apache.org/dist/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar.asc" \ +ENV TIKA_SERVER_ARCHIVE="tika-server-standard-${TIKA_VERSION}-bin.zip" \ + NEAREST_TIKA_SERVER_URL="https://dlcdn.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}-bin.zip" \ + ARCHIVE_TIKA_SERVER_URL="https://archive.apache.org/dist/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}-bin.zip" \ + 
BACKUP_TIKA_SERVER_URL="https://downloads.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}-bin.zip" \ + DEFAULT_TIKA_SERVER_ASC_URL="https://downloads.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}-bin.zip.asc" \ + ARCHIVE_TIKA_SERVER_ASC_URL="https://archive.apache.org/dist/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}-bin.zip.asc" \ TIKA_VERSION=$TIKA_VERSION +# 4.x publishes tika-server as a bin.zip distribution. The thin top-level +# tika-server.jar uses its manifest Class-Path to resolve the jars under lib/, +# and tika-server reads pf4j plugins from the plugins/ directory next to it. RUN set -eux \ && apt-get update \ && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \ gnupg2 \ wget \ ca-certificates \ + unzip \ && wget -t 10 --max-redirect 1 --retry-connrefused -qO- https://downloads.apache.org/tika/KEYS | gpg --import \ - && wget -t 10 --max-redirect 1 --retry-connrefused $NEAREST_TIKA_SERVER_URL -O /tika-server-standard-${TIKA_VERSION}.jar || rm /tika-server-standard-${TIKA_VERSION}.jar \ - && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || wget $ARCHIVE_TIKA_SERVER_URL -O /tika-server-standard-${TIKA_VERSION}.jar || rm /tika-server-standard-${TIKA_VERSION}.jar \ - && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || wget $BACKUP_TIKA_SERVER_URL -O /tika-server-standard-${TIKA_VERSION}.jar || rm /tika-server-standard-${TIKA_VERSION}.jar \ - && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || exit 1 \ - && wget -t 10 --max-redirect 1 --retry-connrefused $DEFAULT_TIKA_SERVER_ASC_URL -O /tika-server-standard-${TIKA_VERSION}.jar.asc || rm /tika-server-standard-${TIKA_VERSION}.jar.asc \ - && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar.asc ]" || wget $ARCHIVE_TIKA_SERVER_ASC_URL -O /tika-server-standard-${TIKA_VERSION}.jar.asc || rm /tika-server-standard-${TIKA_VERSION}.jar.asc \ - && sh -c "[ -f 
/tika-server-standard-${TIKA_VERSION}.jar.asc ]" || exit 1 \ - && gpg --verify /tika-server-standard-${TIKA_VERSION}.jar.asc /tika-server-standard-${TIKA_VERSION}.jar - -# this used to work, but I'm getting "ERROR: failed to solve: failed to prepare $data as $data2: invalid argument" -# when trying to build 2.9.2.0 -#RUN if [ "$CHECK_SIG" = "true" ] ; then gpg --verify /tika-server-standard-${TIKA_VERSION}.jar.asc /tika-server-standard-${TIKA_VERSION}.jar; fi + && wget -t 10 --max-redirect 1 --retry-connrefused $NEAREST_TIKA_SERVER_URL -O /${TIKA_SERVER_ARCHIVE} || rm /${TIKA_SERVER_ARCHIVE} \ + && sh -c "[ -f /${TIKA_SERVER_ARCHIVE} ]" || wget $ARCHIVE_TIKA_SERVER_URL -O /${TIKA_SERVER_ARCHIVE} || rm /${TIKA_SERVER_ARCHIVE} \ + && sh -c "[ -f /${TIKA_SERVER_ARCHIVE} ]" || wget $BACKUP_TIKA_SERVER_URL -O /${TIKA_SERVER_ARCHIVE} || rm /${TIKA_SERVER_ARCHIVE} \ + && sh -c "[ -f /${TIKA_SERVER_ARCHIVE} ]" || exit 1 \ + && wget -t 10 --max-redirect 1 --retry-connrefused $DEFAULT_TIKA_SERVER_ASC_URL -O /${TIKA_SERVER_ARCHIVE}.asc || rm /${TIKA_SERVER_ARCHIVE}.asc \ + && sh -c "[ -f /${TIKA_SERVER_ARCHIVE}.asc ]" || wget $ARCHIVE_TIKA_SERVER_ASC_URL -O /${TIKA_SERVER_ARCHIVE}.asc || rm /${TIKA_SERVER_ARCHIVE}.asc \ + && sh -c "[ -f /${TIKA_SERVER_ARCHIVE}.asc ]" || exit 1 \ + && gpg --verify /${TIKA_SERVER_ARCHIVE}.asc /${TIKA_SERVER_ARCHIVE} \ + && mkdir -p /opt/tika-server \ + && unzip -q /${TIKA_SERVER_ARCHIVE} -d /opt/tika-server \ + && rm /${TIKA_SERVER_ARCHIVE} /${TIKA_SERVER_ARCHIVE}.asc FROM base AS runtime # must reference uid_gid ARG UID_GID -ARG JRE='openjdk-21-jre-headless' +ARG JRE='openjdk-25-jre-headless' RUN set -eux \ && apt-get update \ && apt-get install --yes --no-install-recommends \ @@ -62,9 +66,15 @@ RUN set -eux \ && apt-get clean -y && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* ARG TIKA_VERSION ENV TIKA_VERSION=$TIKA_VERSION -COPY --from=fetch_tika /tika-server-standard-${TIKA_VERSION}.jar /tika-server-standard-${TIKA_VERSION}.jar +COPY 
--from=fetch_tika /opt/tika-server /opt/tika-server +# WORKDIR sets the CWD so tika-server's plugin-root fallback resolves +# `plugins/` relative to /opt/tika-server (its `getCodeSource()` returns a +# lib/* path, not the top-level jar, so the "next-to-jar" resolution misses). +WORKDIR /opt/tika-server USER $UID_GID EXPOSE 9998 -ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/tika-server-standard-${TIKA_VERSION}.jar:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] +# Classpath includes the thin server jar, its lib/ deps, and any user-mounted /tika-extras/. +# tika-server auto-discovers pf4j plugins from /opt/tika-server/plugins/. +ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/opt/tika-server/tika-server.jar:/opt/tika-server/lib/*:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] LABEL maintainer="Apache Tika Developers dev@tika.apache.org" diff --git a/tika-server/docker-build/minimal/Dockerfile.snapshot b/tika-server/docker-build/minimal/Dockerfile.snapshot index d701dfee68b..873ee64f560 100644 --- a/tika-server/docker-build/minimal/Dockerfile.snapshot +++ b/tika-server/docker-build/minimal/Dockerfile.snapshot @@ -15,10 +15,10 @@ ARG UID_GID="35002:35002" -FROM ubuntu:plucky AS runtime +FROM ubuntu:resolute AS runtime ARG UID_GID ARG TIKA_VERSION -ARG JRE='openjdk-21-jre-headless' +ARG JRE='openjdk-25-jre-headless' RUN set -eux \ && apt-get update \ && apt-get install --yes --no-install-recommends \ @@ -26,9 +26,13 @@ RUN set -eux \ ca-certificates \ && apt-get clean -y && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* ENV TIKA_VERSION=$TIKA_VERSION -COPY tika-server/ /tika-server/ +# Snapshot workflow tars the bin distribution into /tika-server/, so +# this COPY lands the thin jar + lib/ + plugins/ at /opt/tika-server/, matching +# the release-variant Dockerfile. 
+COPY tika-server/ /opt/tika-server/ +WORKDIR /opt/tika-server USER $UID_GID EXPOSE 9998 -ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/tika-server/tika-server.jar:/tika-server/lib/*:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] +ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/opt/tika-server/tika-server.jar:/opt/tika-server/lib/*:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] LABEL maintainer="Apache Tika Developers dev@tika.apache.org" diff --git a/tika-server/docker-build/sample-configs/customocr/tika-config-inline.json b/tika-server/docker-build/sample-configs/customocr/tika-config-inline.json new file mode 100644 index 00000000000..055e72c9ba8 --- /dev/null +++ b/tika-server/docker-build/sample-configs/customocr/tika-config-inline.json @@ -0,0 +1,11 @@ +{ + "_comment": "Extract inline images from PDF and OCR them with Tesseract.", + "parsers": [ + { "tesseract-ocr-parser": {} }, + { + "pdf-parser": { + "extractInlineImages": true + } + } + ] +} diff --git a/tika-server/docker-build/sample-configs/customocr/tika-config-inline.xml b/tika-server/docker-build/sample-configs/customocr/tika-config-inline.xml deleted file mode 100644 index 1c9b613033a..00000000000 --- a/tika-server/docker-build/sample-configs/customocr/tika-config-inline.xml +++ /dev/null @@ -1,31 +0,0 @@ - - - - - - - - - - - true - - - - - diff --git a/tika-server/docker-build/sample-configs/customocr/tika-config-rendered.json b/tika-server/docker-build/sample-configs/customocr/tika-config-rendered.json new file mode 100644 index 00000000000..45f3d3bf72b --- /dev/null +++ b/tika-server/docker-build/sample-configs/customocr/tika-config-rendered.json @@ -0,0 +1,16 @@ +{ + "_comment": [ + "Render each PDF page as an image and run Tesseract on it.", + "ocrStrategy options: no_ocr, ocr_only, ocr_and_text, auto." 
+ ], + "parsers": [ + { "tesseract-ocr-parser": {} }, + { + "pdf-parser": { + "ocrStrategy": "ocr_only", + "ocrImageType": "rgb", + "ocrDPI": 100 + } + } + ] +} diff --git a/tika-server/docker-build/sample-configs/customocr/tika-config-rendered.xml b/tika-server/docker-build/sample-configs/customocr/tika-config-rendered.xml deleted file mode 100644 index bcd86669963..00000000000 --- a/tika-server/docker-build/sample-configs/customocr/tika-config-rendered.xml +++ /dev/null @@ -1,38 +0,0 @@ - - - - - - - - - - - - ocr_only - rgb - 100 - - - - - diff --git a/tika-server/docker-build/sample-configs/grobid/tika-config.json b/tika-server/docker-build/sample-configs/grobid/tika-config.json new file mode 100644 index 00000000000..943ec19528c --- /dev/null +++ b/tika-server/docker-build/sample-configs/grobid/tika-config.json @@ -0,0 +1,10 @@ +{ + "_comment": "Route PDFs through GROBID (via JournalParser) for journal-article extraction.", + "parsers": [ + { + "journal-parser": { + "_mime-include": ["application/pdf"] + } + } + ] +} diff --git a/tika-server/docker-build/sample-configs/grobid/tika-config.xml b/tika-server/docker-build/sample-configs/grobid/tika-config.xml deleted file mode 100644 index 5b4aad9c725..00000000000 --- a/tika-server/docker-build/sample-configs/grobid/tika-config.xml +++ /dev/null @@ -1,24 +0,0 @@ - - - - - - application/pdf - - - diff --git a/tika-server/docker-build/sample-configs/ner/run_tika_server.sh b/tika-server/docker-build/sample-configs/ner/run_tika_server.sh deleted file mode 100755 index fb447be4cfe..00000000000 --- a/tika-server/docker-build/sample-configs/ner/run_tika_server.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/bash - -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -############################################################################# -# See https://cwiki.apache.org/confluence/display/TIKA/TikaAndNER for details -# on how to configure additional NER libraries -############################################################################# - -# ------------------------------------ -# Download OpenNLP Models to classpath -# ------------------------------------ - -OPENNLP_LOCATION="/ner/org/apache/tika/parser/ner/opennlp" -URL="http://opennlp.sourceforge.net/models-1.5" - -mkdir -p $OPENNLP_LOCATION -if [ "$(ls -A $OPENNLP_LOCATION/*.bin)" ]; then - echo "OpenNLP models directory has files, so skipping fetch"; -else - echo "No OpenNLP models found, so fetching them" - wget "$URL/en-ner-person.bin" -O $OPENNLP_LOCATION/ner-person.bin - wget "$URL/en-ner-location.bin" -O $OPENNLP_LOCATION/ner-location.bin - wget "$URL/en-ner-organization.bin" -O $OPENNLP_LOCATION/ner-organization.bin; - wget "$URL/en-ner-date.bin" -O $OPENNLP_LOCATION/ner-date.bin - wget "$URL/en-ner-time.bin" -O $OPENNLP_LOCATION/ner-time.bin - wget "$URL/en-ner-percentage.bin" -O $OPENNLP_LOCATION/ner-percentage.bin - wget "$URL/en-ner-money.bin" -O $OPENNLP_LOCATION/ner-money.bin -fi - -# -------------------------------------------- -# Create RexExp Example for Email on classpath -# -------------------------------------------- -REGEXP_LOCATION="/ner/org/apache/tika/parser/ner/regex" 
-mkdir -p $REGEXP_LOCATION -echo "EMAIL=(?:[a-z0-9!#$%&'*+/=?^_\`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_\`{|}~-]+)*|\"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])" > $REGEXP_LOCATION/ner-regex.txt - - -# ------------------- -# Now run Tika Server -# ------------------- - -# Can be a single implementation or comma seperated list for multiple for "ner.impl.class" property -RECOGNISERS=org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser,org.apache.tika.parser.ner.regex.RegexNERecogniser -# Set classpath to the Tika Server JAR and the /ner folder so it has the configuration and models from above -CLASSPATH="/ner:/tika-server-standard-${TIKA_VERSION}.jar:/tika-extras/*" -# Run the server with the custom configuration ner.impl.class property and custom /ner/tika-config.xml -exec java -Dner.impl.class=$RECOGNISERS -cp $CLASSPATH org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 -c /ner/tika-config.xml \ No newline at end of file diff --git a/tika-server/docker-build/sample-configs/ner/tika-config.xml b/tika-server/docker-build/sample-configs/ner/tika-config.xml deleted file mode 100644 index 65d5774c22f..00000000000 --- a/tika-server/docker-build/sample-configs/ner/tika-config.xml +++ /dev/null @@ -1,28 +0,0 @@ - - - - - - application/pdf - text/plain - text/html - application/xhtml+xml - - - - diff --git a/tika-server/docker-build/sample-configs/vision/inception-rest-caption.xml b/tika-server/docker-build/sample-configs/vision/inception-rest-caption.xml deleted file mode 100644 index c70c207b281..00000000000 --- a/tika-server/docker-build/sample-configs/vision/inception-rest-caption.xml +++ /dev/null @@ -1,32 +0,0 @@ - - - - - - image/jpeg - image/png - image/gif 
- - http://inception-caption:8764/inception/v3 - 5 - 15 - org.apache.tika.parser.captioning.tf.TensorflowRESTCaptioner - - - - \ No newline at end of file diff --git a/tika-server/docker-build/sample-configs/vision/inception-rest-video.xml b/tika-server/docker-build/sample-configs/vision/inception-rest-video.xml deleted file mode 100644 index f6a4e6a938c..00000000000 --- a/tika-server/docker-build/sample-configs/vision/inception-rest-video.xml +++ /dev/null @@ -1,32 +0,0 @@ - - - - - - video/mp4 - video/quicktime - - http://inception-video:8764/inception/v4 - 4 - 0.015 - fixed - org.apache.tika.parser.recognition.tf.TensorflowRESTVideoRecogniser - - - - \ No newline at end of file diff --git a/tika-server/docker-build/sample-configs/vision/inception-rest.xml b/tika-server/docker-build/sample-configs/vision/inception-rest.xml deleted file mode 100644 index caa64685952..00000000000 --- a/tika-server/docker-build/sample-configs/vision/inception-rest.xml +++ /dev/null @@ -1,32 +0,0 @@ - - - - - - image/jpeg - image/png - image/gif - - http://inception-rest:8764/inception/v4 - 2 - 0.015 - org.apache.tika.parser.recognition.tf.TensorflowRESTRecogniser - - - - diff --git a/tika-server/docker-build/sample-configs/vision/vlm-claude.json b/tika-server/docker-build/sample-configs/vision/vlm-claude.json new file mode 100644 index 00000000000..e233516b615 --- /dev/null +++ b/tika-server/docker-build/sample-configs/vision/vlm-claude.json @@ -0,0 +1,18 @@ +{ + "_comment": [ + "Vision-Language Model parsing via Anthropic's Claude API.", + "Claude can handle OCR images and PDFs natively (no rasterization needed).", + "Set apiKey to your Anthropic API key — DO NOT commit a real key.", + "Prefer passing it via the ANTHROPIC_API_KEY env var and substituting it", + "at container start, e.g. via an entrypoint shim or sidecar that templates", + "this file. See docs: configuration/parsers/vlm-parsers." 
+ ], + "parsers": [ + { + "claude-vlm-parser": { + "apiKey": "${ANTHROPIC_API_KEY}", + "model": "claude-sonnet-4-20250514" + } + } + ] +} diff --git a/tika-server/docker-build/sample-configs/vision/vlm-gemini.json b/tika-server/docker-build/sample-configs/vision/vlm-gemini.json new file mode 100644 index 00000000000..4c33e69f3cd --- /dev/null +++ b/tika-server/docker-build/sample-configs/vision/vlm-gemini.json @@ -0,0 +1,17 @@ +{ + "_comment": [ + "Vision-Language Model parsing via Google's Gemini generateContent API.", + "Gemini can handle OCR images and PDFs natively (no rasterization needed).", + "Set apiKey to your Google AI Studio API key — DO NOT commit a real key.", + "Prefer GEMINI_API_KEY env var + a templating entrypoint, similar to the", + "Claude config. See docs: configuration/parsers/vlm-parsers." + ], + "parsers": [ + { + "gemini-vlm-parser": { + "apiKey": "${GEMINI_API_KEY}", + "model": "gemini-2.5-flash" + } + } + ] +} diff --git a/tika-server/docker-build/sample-configs/vision/vlm-openai.json b/tika-server/docker-build/sample-configs/vision/vlm-openai.json new file mode 100644 index 00000000000..2a4b675ddb3 --- /dev/null +++ b/tika-server/docker-build/sample-configs/vision/vlm-openai.json @@ -0,0 +1,19 @@ +{ + "_comment": [ + "Vision-Language Model parsing via an OpenAI-compatible endpoint.", + "Works with self-hosted backends (vLLM, Ollama, a local FastAPI wrapper)", + "or against OpenAI's own chat-completions API. Set baseUrl to wherever", + "the OpenAI-compatible endpoint is reachable from the tika container.", + "If the endpoint requires authentication, also set apiKey.", + "See docs: configuration/parsers/vlm-parsers." 
+ ], + "parsers": [ + { + "openai-vlm-parser": { + "baseUrl": "http://vlm-server:8000", + "model": "jinaai/jina-vlm", + "timeoutSeconds": 300 + } + } + ] +} From eedcf016abfc9a5b18a6176ebdc8675db693a7ae Mon Sep 17 00:00:00 2001 From: tallison Date: Mon, 11 May 2026 17:09:30 -0400 Subject: [PATCH 2/5] TIKA-4725 - update semver and processes --- .github/workflows/docker-release.yml | 96 +++++++++++++------ .../maintainers/release-guides/docker.adoc | 53 ++++++++-- tika-server/docker-build/CHANGES.md | 37 +++++-- tika-server/docker-build/README.md | 33 +++++-- tika-server/docker-build/docker-tool.sh | 40 +++++++- 5 files changed, 201 insertions(+), 58 deletions(-) diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml index 54f18f5fd75..7abd73d34f2 100644 --- a/.github/workflows/docker-release.yml +++ b/.github/workflows/docker-release.yml @@ -28,6 +28,14 @@ name: Docker release - tika-server and tika-grpc # pointer to) apache/tika-docker. on: workflow_dispatch: + inputs: + tag: + description: 'Tika release tag (e.g. 4.0.0-alpha-1). Must already exist as a git tag.' + required: true + build_number: + description: 'Docker build number for this Tika tag (1 for first build, increment on rebuilds).' + required: true + default: '1' jobs: release-tika-server: @@ -36,12 +44,42 @@ jobs: steps: - uses: actions/checkout@v6 - - - name: Extract version from tag - id: version + with: + ref: ${{ inputs.tag }} + + # Compute the tag set for each image. Three tags per image at minimum: + # apache/tika: (mutable; moves on each rebuild) + # apache/tika:- (immutable; one per rebuild) + # apache/tika:latest (only for non-prerelease tags) + # The grpc image always pushes :latest (no 3.x incumbent to protect). 
+ - name: Compute tags + id: tags run: | - TAG_NAME="${GITHUB_REF#refs/tags/}" - echo "tag=${TAG_NAME}" >> "$GITHUB_OUTPUT" + tag='${{ inputs.tag }}' + build='${{ inputs.build_number }}' + minimal="apache/tika:${tag} + apache/tika:${tag}-${build}" + full="apache/tika:${tag}-full + apache/tika:${tag}-${build}-full" + grpc="apache/tika-grpc:${tag} + apache/tika-grpc:${tag}-${build} + apache/tika-grpc:latest" + case "$tag" in + *-alpha*|*-BETA*|*-RC*) + echo "Prerelease tag $tag — skipping :latest for apache/tika." + ;; + *) + minimal="${minimal} + apache/tika:latest" + full="${full} + apache/tika:latest-full" + ;; + esac + { + echo "minimal<> "$GITHUB_OUTPUT" - name: Set up Docker Buildx uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0 @@ -62,12 +100,8 @@ jobs: platforms: linux/amd64,linux/arm64,linux/s390x push: true build-args: | - TIKA_VERSION=${{ steps.version.outputs.tag }} - # :latest is intentionally NOT pushed. It stays on 3.x (published from - # the external apache/tika-docker repo) until 4.0.0 GA, at which point - # add `apache/tika:latest` back here. - tags: | - apache/tika:${{ steps.version.outputs.tag }} + TIKA_VERSION=${{ inputs.tag }} + tags: ${{ steps.tags.outputs.minimal }} - name: Build and push tika-server full uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6.19.2 @@ -76,10 +110,8 @@ jobs: platforms: linux/amd64,linux/arm64,linux/s390x push: true build-args: | - TIKA_VERSION=${{ steps.version.outputs.tag }} - # :latest-full stays on 3.x until 4.0.0 GA; see note above. 
- tags: | - apache/tika:${{ steps.version.outputs.tag }}-full + TIKA_VERSION=${{ inputs.tag }} + tags: ${{ steps.tags.outputs.full }} release-tika-grpc: runs-on: ubuntu-latest @@ -87,12 +119,8 @@ jobs: steps: - uses: actions/checkout@v6 - - - name: Extract version from tag - id: version - run: | - TAG_NAME="${GITHUB_REF#refs/tags/}" - echo "tag=${TAG_NAME}" >> "$GITHUB_OUTPUT" + with: + ref: ${{ inputs.tag }} - name: Set up JDK 17 uses: actions/setup-java@v5 @@ -116,9 +144,22 @@ jobs: username: ${{ secrets.DOCKERHUB_USER }} password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Compute grpc tags + id: grpc_tags + run: | + tag='${{ inputs.tag }}' + build='${{ inputs.build_number }}' + { + echo "tags<> "$GITHUB_OUTPUT" + - name: Prepare tika-grpc Docker build context run: | - TIKA_VERSION="${{ steps.version.outputs.tag }}" + TIKA_VERSION='${{ inputs.tag }}' OUT_DIR=target/tika-grpc-docker mkdir -p "${OUT_DIR}/libs/tika-grpc" "${OUT_DIR}/plugins" "${OUT_DIR}/config" "${OUT_DIR}/bin" @@ -160,11 +201,8 @@ jobs: platforms: linux/amd64,linux/arm64 push: true build-args: | - VERSION=${{ steps.version.outputs.tag }} + VERSION=${{ inputs.tag }} # apache/tika-grpc is new in 4.x with no prior `:latest` to protect, so - # we track latest from the start. Unlike apache/tika (the server image) - # where :latest stays on 3.x until 4.0.0 GA, the grpc image has no 3.x - # incumbent. - tags: | - apache/tika-grpc:${{ steps.version.outputs.tag }} - apache/tika-grpc:latest + # we track latest from the start (unlike apache/tika the server image, + # whose :latest stays on 3.x until 4.0.0 GA). 
+ tags: ${{ steps.grpc_tags.outputs.tags }} diff --git a/docs/modules/ROOT/pages/maintainers/release-guides/docker.adoc b/docs/modules/ROOT/pages/maintainers/release-guides/docker.adoc index c699f00e4a4..d02ea87c5a1 100644 --- a/docs/modules/ROOT/pages/maintainers/release-guides/docker.adoc +++ b/docs/modules/ROOT/pages/maintainers/release-guides/docker.adoc @@ -78,27 +78,64 @@ dist/release hasn't propagated yet — wait a few minutes. The workflow is intentionally `workflow_dispatch`-only — it won't auto-fire on tag push. Trigger it manually after the vote, against the release tag. +The workflow takes two inputs: + +`tag`:: +The Tika release tag, e.g. `4.0.0-alpha-1`. Must already exist as a git tag +(created by `release:prepare`). The workflow checks out at this ref and uses +it for both the `TIKA_VERSION` build-arg and the published Docker tag. + +`build_number`:: +The Docker build number for this Tika tag. *Use `1` for the initial publish.* +Increment when re-publishing the same Tika version with Docker-only changes +(CVE fixes in the base image, refreshed apt packages, etc.). Each rebuild +publishes an immutable `-` tag alongside the rolling `` tag. + *Via the GitHub UI:* . Open https://github.com/apache/tika/actions . Select *Docker release - tika-server and tika-grpc* in the left sidebar . Click *Run workflow* (top-right) -. Under *Use workflow from*, switch from the default branch to *Tags* and pick - the release tag (e.g. `4.0.0-alpha-1`) +. Fill in `tag` (e.g. `4.0.0-alpha-1`) and `build_number` (e.g. `1`) . Click *Run workflow* *Via the `gh` CLI:* [source,bash] ---- -gh workflow run docker-release.yml --ref -# e.g. -gh workflow run docker-release.yml --ref 4.0.0-alpha-1 +gh workflow run docker-release.yml \ + -f tag=4.0.0-alpha-1 \ + -f build_number=1 ---- -The `--ref` argument selects the git ref to check out. 
The workflow extracts -the tag name from `GITHUB_REF` (`refs/tags/`) and uses it for both the -`TIKA_VERSION` build-arg and the published Docker tag. +=== Tag scheme + +Each workflow run publishes three tags per image, all pointing at the same +manifest digest: + +[cols="1,3,1", options="header"] +|=== +|Tag |Meaning |Moves on rebuild? + +|`apache/tika:` +|Mutable rolling tag for this Tika version (e.g. `apache/tika:4.0.0-alpha-1`). +|Yes — retagged to the new digest + +|`apache/tika:-` +|Immutable build pin (e.g. `apache/tika:4.0.0-alpha-1-1` for the first build). +Pin by this if you need stability across rebuilds. +|No — never reassigned + +|`apache/tika:latest` +|Mutable rolling tag for the newest stable Tika release. Pushed only for +non-prerelease tags (i.e., no `-alpha`, `-BETA`, `-RC`). Stays on 3.x until +4.0.0 GA. +|Yes — for stable releases only +|=== + +The `-full` variants (`-full`, `--full`, `latest-full`) follow +the same scheme. `apache/tika-grpc` also publishes the three-tag pattern, but +its `:latest` is pushed unconditionally (no 3.x incumbent to protect). === Step 3: Watch the run diff --git a/tika-server/docker-build/CHANGES.md b/tika-server/docker-build/CHANGES.md index eb6ce314a26..5d59f27c942 100644 --- a/tika-server/docker-build/CHANGES.md +++ b/tika-server/docker-build/CHANGES.md @@ -1,15 +1,34 @@ # Changes -As of 2.5.0.1, we started adding a digit for Docker versions. Going forward, we'll include -a four digit version, where the first three are the Tika version and the last one is the docker version. -As of 2.5.0.2, we started tagging release commits in our github repo. - -* 4.0.0-alpha-1.0 (9 May 2026) - * First 4.0.0-alpha-1 release (preview; not tagged `latest`) +Tag convention: +* 2.5.0.1 through 4.0.0-alpha-1.0 used `.` + (e.g. `3.3.0.0`, `4.0.0-alpha-1.0`). Each rebuild bumped the last digit. 
+* Starting with **4.0.0-alpha-1 (rebuild 1)**, we publish three tags per image: + - `` — rolling, moves on each rebuild + - `-` — immutable, never reassigned (`N=1,2,3,...`) + - `latest` — rolling, newest stable only (prereleases never displace it) + +The legacy 3.x patch flow in the external `apache/tika-docker` repo still uses +the `.N` convention until 4.0.0 GA. + +* 4.0.0-alpha-1 (11 May 2026, rebuild 1) + * Tag scheme changed to `` + `-` + `latest`. + * Migrated build out of the external `apache/tika-docker` repo into + `tika-server/docker-build/` in `apache/tika`. + * Switched server packaging to the unpacked `tika-server-standard-bin.zip` + (`/opt/tika-server/`). Bundles the `tika-pipes-file-system` plugin from + the upstream bin.zip. Pipes-mode endpoints (`/pipes`, `/async`) with + other fetchers/emitters need plugins mounted into + `/opt/tika-server/plugins/`. + * Upgraded base to Ubuntu 26.04 (resolute) and JRE to OpenJDK 25. * Dropped `linux/arm/v7` from the published platforms. 32-bit ARM emulated - builds on Ubuntu 26.04 (resolute) hit a qemu chown-overflow in - `update-notifier-common`'s postinst, which is pulled in by - `ttf-mscorefonts-installer`. `linux/arm64/v8` covers modern ARM. + builds on resolute hit a qemu chown-overflow in `update-notifier-common`'s + postinst (pulled in by `ttf-mscorefonts-installer`). `linux/arm64/v8` + covers modern ARM. + +* 4.0.0-alpha-1.0 (9 May 2026) — frozen legacy tag + * First 4.0.0-alpha-1 release using the old `.N` convention. Retagged + afterward so `4.0.0-alpha-1` (no `.0`) points at the same digest. * 3.3.0.0 (23 Mar 2026) * First 3.3.0 release diff --git a/tika-server/docker-build/README.md b/tika-server/docker-build/README.md index 05b874a075e..4d4beccedfe 100644 --- a/tika-server/docker-build/README.md +++ b/tika-server/docker-build/README.md @@ -16,12 +16,26 @@ To install more languages, set the build argument `LANGUAGES` or include your ow ## Available Tags -Below are the most recent tags. 
The `latest` tags track the 3.x stable line; -4.x preview releases are published as version-specific tags only. -- `latest`, `3.3.0.0`: Apache Tika Server 3.3.0.0 (Minimal) -- `latest-full`, `3.3.0.0-full`: Apache Tika Server 3.3.0.0 (Full) -- `4.0.0-alpha-1.0`: Apache Tika Server 4.0.0-alpha-1.0 (Minimal, 4.x preview) -- `4.0.0-alpha-1.0-full`: Apache Tika Server 4.0.0-alpha-1.0 (Full, 4.x preview) +Each 4.x release publishes three tags per image, all pointing at the same +manifest digest: + +- `apache/tika:` — mutable, rolls forward on Docker-only rebuilds for the same Tika version. +- `apache/tika:-` — immutable, never reassigned. Pin to this if you want stability across rebuilds. `N=1` is the initial build; `N=2,3,...` for subsequent rebuilds (CVE fixes, base-image refresh, etc.). +- `apache/tika:latest` — rolling pointer to the newest **stable** release. Stays on 3.x until 4.0.0 GA; preview tags (`-alpha`, `-BETA`, `-RC`) do **not** displace it. + +(Same scheme applies to the `-full` variants and to `apache/tika-grpc`, with +the caveat that `apache/tika-grpc:latest` always tracks the newest 4.x release +since there's no 3.x incumbent.) + +Most recent tags: +- `latest`, `latest-full`: Apache Tika Server 3.3.0 (currently — moves to 4.0.0 at GA) +- `4.0.0-alpha-1`, `4.0.0-alpha-1-1`: Apache Tika Server 4.0.0-alpha-1 (Minimal, 4.x preview) +- `4.0.0-alpha-1-full`, `4.0.0-alpha-1-1-full`: Apache Tika Server 4.0.0-alpha-1 (Full, 4.x preview) + +Legacy 3.x and earlier tags use the `.` +convention (e.g. `3.3.0.0`, `3.2.3.0`). Those tags are immutable and still +pullable. 
+ - `3.3.0.0`, `3.3.0.0`: Apache Tika Server 3.3.0.0 (Minimal) - `3.3.0.0`, `3.3.0.0-full`: Apache Tika Server 3.3.0.0 (Full) - `3.2.3.0`, `3.2.3.0`: Apache Tika Server 3.2.3.0 (Minimal) @@ -88,8 +102,11 @@ You can see a full set of tags for historical versions [here](https://hub.docker ## 4.x Preview Notes -The `4.0.0-alpha-1.0` images are a preview of the upcoming Tika 4.x line and are -not tagged `latest`. +The `4.0.0-alpha-1` images are a preview of the upcoming Tika 4.x line and are +not tagged `latest`. Tag scheme is `` (rolling) plus +`-` (immutable) — see Available Tags above. The legacy `.N` +suffix (`4.0.0-alpha-1.0`) is retained as a frozen pointer to the first build +but is no longer the active convention. Tika 4.x changed the `tika-server-standard` packaging: the published jar is now a thin top-level jar that resolves its dependencies from a sibling `lib/` diff --git a/tika-server/docker-build/docker-tool.sh b/tika-server/docker-build/docker-tool.sh index db05dddf2ec..45c762a30a3 100755 --- a/tika-server/docker-build/docker-tool.sh +++ b/tika-server/docker-build/docker-tool.sh @@ -38,7 +38,9 @@ while getopts ":h" opt; do echo " docker-tool.sh test Tests images for ." echo " docker-tool.sh test-uat Runs the tika-server REST UAT against images for ." echo " Requires TIKA_MAIN env var or sibling tika-main checkout (../tika-main)." - echo " docker-tool.sh publish Builds multi-arch images for and pushes to Docker Hub." + echo " docker-tool.sh publish Builds multi-arch images and pushes three tags per image:" + echo " (mutable), - (immutable)," + echo " and latest (for non-prerelease tags only)." exit 0 ;; \? ) @@ -162,12 +164,42 @@ case "$subcommand" in ;; publish) + # publish + # Tag scheme: + # apache/tika: (mutable; moves on each rebuild) + # apache/tika:- (immutable; one per rebuild) + # apache/tika:latest (only for non-prerelease tags; tracks newest stable) + # (plus the matching -full variants for the full image). 
+ publish_tika_version=$tika_docker_version # first positional arg + publish_build_number=$tika_version # second positional arg + if [[ -z "$publish_tika_version" || -z "$publish_build_number" ]]; then + die "Usage: $0 publish " + fi + # Only move :latest for non-prerelease tags. Preview releases never displace + # the latest-stable pointer. + push_latest=true + case "$publish_tika_version" in + *-alpha*|*-BETA*|*-RC*|*-SNAPSHOT*) push_latest=false ;; + esac + + minimal_tags=( --tag "${image_name}:${publish_tika_version}" \ + --tag "${image_name}:${publish_tika_version}-${publish_build_number}" ) + full_tags=( --tag "${image_name}:${publish_tika_version}-full" \ + --tag "${image_name}:${publish_tika_version}-${publish_build_number}-full" ) + if $push_latest; then + minimal_tags+=( --tag "${image_name}:latest" ) + full_tags+=( --tag "${image_name}:latest-full" ) + else + echo "Skipping :latest for prerelease tag: $publish_tika_version" + fi + docker buildx create --use --name tika-builder || die "couldn't create builder" - # Build multi-arch with buildx and push docker buildx build --platform linux/arm64/v8,linux/amd64,linux/s390x --output "type=image,push=true" \ - --tag ${image_name}:${tika_docker_version} --build-arg TIKA_VERSION=${tika_version} --no-cache --builder tika-builder minimal || stop_and_die "couldn't build multi-arch minimal" + "${minimal_tags[@]}" --build-arg TIKA_VERSION=${publish_tika_version} --no-cache --builder tika-builder minimal \ + || stop_and_die "couldn't build multi-arch minimal" docker buildx build --platform linux/arm64/v8,linux/amd64,linux/s390x --output "type=image,push=true" \ - --tag ${image_name}:${tika_docker_version}-full --build-arg TIKA_VERSION=${tika_version} --no-cache --builder tika-builder full || stop_and_die "couldn't build multi-arch full" + "${full_tags[@]}" --build-arg TIKA_VERSION=${publish_tika_version} --no-cache --builder tika-builder full \ + || stop_and_die "couldn't build multi-arch full" docker buildx rm 
tika-builder || die "couldn't stop builder -- make sure to stop the builder manually! " ;; From cba6367c00525ffe8c7699702f6a8b8da51cf77a Mon Sep 17 00:00:00 2001 From: tallison Date: Mon, 11 May 2026 21:02:32 -0400 Subject: [PATCH 3/5] TIKA-4725 - rat --- tika-parent/pom.xml | 1 + tika-server/docker-build/CHANGES.md | 17 +++++++++++++++++ tika-server/docker-build/README.md | 21 +++++++++++++++++++-- 3 files changed, 37 insertions(+), 2 deletions(-) diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index 89fa761d0f2..2e910308f38 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -1670,6 +1670,7 @@ nbactions.xml **/test-documents/** **/target/** + **/sample-configs/**/*.json diff --git a/tika-server/docker-build/CHANGES.md b/tika-server/docker-build/CHANGES.md index 5d59f27c942..3515981c9d3 100644 --- a/tika-server/docker-build/CHANGES.md +++ b/tika-server/docker-build/CHANGES.md @@ -1,3 +1,20 @@ + + # Changes Tag convention: diff --git a/tika-server/docker-build/README.md b/tika-server/docker-build/README.md index 4d4beccedfe..ac95456e194 100644 --- a/tika-server/docker-build/README.md +++ b/tika-server/docker-build/README.md @@ -1,6 +1,23 @@ -# tika-docker + + +# Apache Tika Docker images + +These files build the official Docker images for Apache Tika Server published as [apache/tika](https://hub.docker.com/r/apache/tika) on DockerHub by the [Apache Tika](http://tika.apache.org) Dev team. The images create a functional Apache Tika Server instance that contains the latest Ubuntu running the appropriate version's server on Port 9998 using Java 8 (until version 1.20), Java 11 (1.21 and 1.24.1), Java 14 (until 1.27/2.0.0), Java 16 (for 2.1.0), and Java 17 LTS for newer versions. 
From 5276ba37cc86c139ef981feab901f502492c98da Mon Sep 17 00:00:00 2001 From: tallison Date: Tue, 12 May 2026 08:28:33 -0400 Subject: [PATCH 4/5] TIKA-4725 - refactor to automatically release when we add the release tag --- .github/workflows/docker-release.yml | 74 ++++++++++---- .../maintainers/release-guides/docker.adoc | 97 +++++++++++++++++-- .../maintainers/release-guides/tika.adoc | 45 ++++++++- 3 files changed, 182 insertions(+), 34 deletions(-) diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml index 7abd73d34f2..2cef38f5aa8 100644 --- a/.github/workflows/docker-release.yml +++ b/.github/workflows/docker-release.yml @@ -17,16 +17,22 @@ name: Docker release - tika-server and tika-grpc -# Auto-trigger on tag push is disabled (TIKA-4725). The official tika-docker -# images on Docker Hub (apache/tika) are published from the apache/tika-docker -# repository using its own Dockerfiles and tagging conventions. When this -# workflow ran on the 4.0.0-alpha-1 source tag it pushed an image built from -# the stale Dockerfiles under tika-server/docker-build/ to -# apache/tika:4.0.0-alpha-1, which collided with the tika-docker-managed tag -# and ran with the pre-4.x bare-jar entrypoint (broken plugin loading). Re-enable -# only after the in-repo Dockerfiles are kept in sync with (or replaced by a -# pointer to) apache/tika-docker. +# Auto-trigger only on GA-style version tags (digit.digit.digit with no +# hyphen). Prerelease tags (`4.0.0-rc1`, `4.0.0-alpha-1`, `4.0.0-BETA`, +# `4.0.0-SNAPSHOT`) and branch-style tags (`branch_4x`) intentionally do not +# fire — the convention is: +# release:prepare creates `X.Y.Z-rcN` for the vote (workflow stays silent), +# vote passes, the release manager pushes a separate `X.Y.Z` tag pointing +# at the same commit, and *that* push triggers this workflow. +# Manual rebuilds (CVE in base image, plugin refresh) use workflow_dispatch +# with an explicit build_number. 
on: + push: + tags: + - '[0-9]*.[0-9]*.[0-9]*' # 4.0.0, 10.20.30, etc. + # `tags` and `tags-ignore` cannot be combined for one event; exclude with `!` patterns (order matters). + - '!*-*' # anything with a hyphen is a prerelease/non-GA tag + - '!*_*' # anything with an underscore is a branch/non-version tag workflow_dispatch: inputs: tag: @@ -36,6 +42,16 @@ on: description: 'Docker build number for this Tika tag (1 for first build, increment on rebuilds).' required: true default: '1' + source_ref: + description: 'Git ref to build from. Defaults to `tag`. Override only for Dockerfile-update rebuilds where the source has changed since the original tag was cut.' + required: false + +# Resolve the effective tag and build number from either trigger source. +# `inputs.*` is populated only by workflow_dispatch; on a tag push, fall +# back to the tag's short name (e.g. `4.0.0`) and build_number=1. +env: + TAG: ${{ inputs.tag || github.ref_name }} + BUILD: ${{ inputs.build_number || '1' }} jobs: release-tika-server: @@ -45,7 +61,8 @@ jobs: steps: - uses: actions/checkout@v6 with: - ref: ${{ inputs.tag }} + ref: ${{ inputs.source_ref || env.TAG }} + fetch-depth: 0 # full history so we can push a provenance tag at the end # Compute the tag set for each image. Three tags per image at minimum: # apache/tika: (mutable; moves on each rebuild) @@ -55,8 +72,8 @@ jobs: - name: Compute tags id: tags run: | - tag='${{ inputs.tag }}' - build='${{ inputs.build_number }}' + tag='${{ env.TAG }}' + build='${{ env.BUILD }}' minimal="apache/tika:${tag} apache/tika:${tag}-${build}" full="apache/tika:${tag}-full @@ -64,8 +81,11 @@ jobs: grpc="apache/tika-grpc:${tag} apache/tika-grpc:${tag}-${build} apache/tika-grpc:latest" + # Any hyphen in the tag = prerelease (alpha/beta/rc/SNAPSHOT/etc., + # in any case). Mirrors the `'!*-*'` exclusion on the + # auto-trigger so manual workflow_dispatch behaves the same way. case "$tag" in - *-alpha*|*-BETA*|*-RC*) + *-*) echo "Prerelease tag $tag — skipping :latest for apache/tika."
;; *) @@ -100,7 +120,7 @@ jobs: platforms: linux/amd64,linux/arm64,linux/s390x push: true build-args: | - TIKA_VERSION=${{ inputs.tag }} + TIKA_VERSION=${{ env.TAG }} tags: ${{ steps.tags.outputs.minimal }} - name: Build and push tika-server full @@ -110,9 +130,23 @@ jobs: platforms: linux/amd64,linux/arm64,linux/s390x push: true build-args: | - TIKA_VERSION=${{ inputs.tag }} + TIKA_VERSION=${{ env.TAG }} tags: ${{ steps.tags.outputs.full }} + # After a successful publish, push a `-` git tag for + # provenance. Skipped on build_number=1 because the original `` already + # marks the source state of build 1. Lives in the server job (not the grpc + # job) to avoid both jobs racing to push the same tag. + - name: Push provenance git tag + if: ${{ env.BUILD != '1' }} + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + git tag "${TAG}-${BUILD}" + git push origin "${TAG}-${BUILD}" + release-tika-grpc: runs-on: ubuntu-latest timeout-minutes: 120 @@ -120,7 +154,7 @@ jobs: steps: - uses: actions/checkout@v6 with: - ref: ${{ inputs.tag }} + ref: ${{ inputs.source_ref || env.TAG }} - name: Set up JDK 17 uses: actions/setup-java@v5 @@ -147,8 +181,8 @@ jobs: - name: Compute grpc tags id: grpc_tags run: | - tag='${{ inputs.tag }}' - build='${{ inputs.build_number }}' + tag='${{ env.TAG }}' + build='${{ env.BUILD }}' { echo "tags<-` tag alongside the rolling `` tag. +The Docker build number for this Tika tag. Use `1` for the initial publish; +increment when re-publishing the same Tika version with Docker-only changes +(CVE fixes in the base image, refreshed apt packages, etc.). See +<> below for the full rebuild flow. + +`source_ref`:: +Optional. Git ref to build from. Defaults to the value of `tag`. 
Override only +for Docker-only rebuilds where the Dockerfile or other build inputs have +changed since the original `tag` was cut — for example, when you've made +Dockerfile updates on `main` after the GA release and want build 2 to pick +them up. *Via the GitHub UI:* @@ -137,6 +160,60 @@ The `-full` variants (`-full`, `--full`, `latest-full`) follow the same scheme. `apache/tika-grpc` also publishes the three-tag pattern, but its `:latest` is pushed unconditionally (no 3.x incumbent to protect). +[[republishing]] +=== Re-publishing an existing Tika version (Docker-only rebuild) + +When the Tika source hasn't changed but you need a new Docker image — base +image CVE, refreshed apt packages, Dockerfile fix — bump `build_number` +instead of cutting a new Tika version. + +The Tika git tag (e.g. `4.0.0`) stays put. The `-` suffix in +`apache/tika:4.0.0-2` is a *Docker Hub tag only*, never a git tag pushed by +hand. The workflow auto-creates a `4.0.0-2` git tag at the same SHA it built +from for provenance. + +*Case 1: pure base-image refresh* (no Dockerfile changes — `FROM ubuntu:resolute` +just picks up newer upstream layers). + +[source,bash] +---- +gh workflow run docker-release.yml \ + -f tag=4.0.0 \ + -f build_number=2 +---- + +`source_ref` defaults to the `tag`, so the workflow checks out at the +original `4.0.0` source state. + +*Case 2: Dockerfile changes since the original release.* Land the +Dockerfile changes on `main` first (or on a branch). Then point the +workflow at that ref: + +[source,bash] +---- +gh workflow run docker-release.yml \ + -f tag=4.0.0 \ + -f build_number=2 \ + -f source_ref=main +---- + +In either case, the workflow: + +. Builds from `inputs.source_ref` (or the original `tag` if unset). +. Publishes `apache/tika:4.0.0-2` (immutable), retags `apache/tika:4.0.0` + and `apache/tika:latest` to the new digest, plus the matching `-full` and + `tika-grpc` tags. +. Pushes a git tag `4.0.0-2` pointing at the source SHA used. 
The + `tags-ignore: ['*-*']` rule means this provenance tag does not re-trigger + the workflow. + +Six months later, `git show 4.0.0-2` shows the exact source state for that +build and `docker pull apache/tika:4.0.0-2` returns the image built from it. + +NOTE: The provenance-tag step runs only when `build_number != 1`. The +initial build's source state is already marked by the original Tika git tag +(e.g. `4.0.0`); no need to duplicate it as `4.0.0-1`. + === Step 3: Watch the run A successful run takes ~30–45 minutes (multi-arch builds across `linux/amd64`, diff --git a/docs/modules/ROOT/pages/maintainers/release-guides/tika.adoc b/docs/modules/ROOT/pages/maintainers/release-guides/tika.adoc index 7d65fe86372..286d4ed6f87 100644 --- a/docs/modules/ROOT/pages/maintainers/release-guides/tika.adoc +++ b/docs/modules/ROOT/pages/maintainers/release-guides/tika.adoc @@ -121,6 +121,20 @@ This will prompt you to confirm: * The SCM tag name * The next development version +[IMPORTANT] +==== +*Always enter `X.Y.Z-rcN` as the SCM tag name — never the bare `X.Y.Z`.* + +Pushing a tag of the form `X.Y.Z` (digit.digit.digit, no hyphen) immediately +auto-triggers the Docker release workflow, which publishes images to Docker +Hub and moves `apache/tika:latest`. If you enter bare `X.Y.Z` here, those +images go live *before the vote has even started.* + +For the first vote: enter `X.Y.Z-rc1`. For a second RC: `X.Y.Z-rc2`. Etc. The +bare `X.Y.Z` tag is created later in Step 12 after the vote passes, and +that push is what triggers the Docker release. +==== + === Step 8: Perform the Release Execute the Maven release perform goal: @@ -231,13 +245,27 @@ Upon successful vote (at least 3 +1 votes from PMC members): . Release the Nexus staging repository (click "Release" button) . 
Move artifacts from dev to release distribution: - ++ [source,bash] ---- svn mv https://dist.apache.org/repos/dist/dev/tika/X.Y.Z \ https://dist.apache.org/repos/dist/release/tika/X.Y.Z \ -m "Release Apache Tika X.Y.Z" ---- +. Create the GA git tag from the winning RC and push it. *This auto-triggers + the Docker release workflow* (see xref:maintainers/release-guides/docker.adoc[Releasing + Tika Docker Images]): ++ +[source,bash] +---- +git tag X.Y.Z X.Y.Z-rcN # point GA tag at the same commit as the winning RC +git push origin X.Y.Z +---- ++ +For a prerelease (`X.Y.Z-alpha-N`, `X.Y.Z-beta-N`, etc.) where you don't +want `:latest` to move and don't want the workflow to auto-fire, skip this +substep. Trigger the Docker release manually via `workflow_dispatch` instead; +see the Docker guide. [[troubleshooting]] == Troubleshooting `release:perform` @@ -341,10 +369,19 @@ Refresh the website documentation to reflect the new release: === Release Docker and Helm Images -Follow the separate guides for releasing: +For a GA release, the Docker images publish automatically when the +`X.Y.Z` tag is pushed in Step 12 above — no manual step needed. Watch the +"Docker release - tika-server and tika-grpc" workflow run in the Actions +tab to confirm. See xref:maintainers/release-guides/docker.adoc[Releasing +Tika Docker Images] for the tag scheme, verification steps, and how to +publish a manual rebuild (CVE in base image, etc.). + +For a prerelease (`X.Y.Z-alpha-N`, `X.Y.Z-beta-N`, RC variants), the Docker +workflow does *not* auto-fire — trigger it manually via `workflow_dispatch` +per the Docker guide. -* link:docker.html[Docker images] -* link:helm.html[Helm charts] +Helm charts are released separately via xref:maintainers/release-guides/helm.adoc[Releasing +Tika Helm Charts]. 
=== Send Announcements From aaa49b9742172d72c4093944a496110f8061aa6d Mon Sep 17 00:00:00 2001 From: tallison Date: Tue, 12 May 2026 08:45:36 -0400 Subject: [PATCH 5/5] TIKA-4725 - refactor to automatically release when we add the release tag --- .github/workflows/docker-release.yml | 46 +++++++++++++++---- .../maintainers/release-guides/docker.adoc | 7 +++ 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml index 2cef38f5aa8..b5b1b58208a 100644 --- a/.github/workflows/docker-release.yml +++ b/.github/workflows/docker-release.yml @@ -17,22 +17,25 @@ name: Docker release - tika-server and tika-grpc -# Auto-trigger only on GA-style version tags (digit.digit.digit with no -# hyphen). Prerelease tags (`4.0.0-rc1`, `4.0.0-alpha-1`, `4.0.0-BETA`, -# `4.0.0-SNAPSHOT`) and branch-style tags (`branch_4x`) intentionally do not -# fire — the convention is: +# Auto-trigger on tag push for GA-style version tags only. The convention is: # release:prepare creates `X.Y.Z-rcN` for the vote (workflow stays silent), # vote passes, the release manager pushes a separate `X.Y.Z` tag pointing # at the same commit, and *that* push triggers this workflow. # Manual rebuilds (CVE in base image, plugin refresh) use workflow_dispatch # with an explicit build_number. +# +# GH Actions doesn't allow combining `tags` (include) and `tags-ignore` on +# a single event, so the filter is expressed as `tags-ignore` only. Any tag +# without a hyphen or underscore fires the workflow; this rejects prerelease +# tags (`4.0.0-rc1`, `4.0.0-alpha-1`, `4.0.0-BETA`) and branch-style tags +# (`branch_4x`). The `Compute tags` step has a separate prerelease check +# (`*-*`) that gates `:latest` defense-in-depth for the workflow_dispatch +# path where the auto-trigger filter isn't in play. on: push: - tags: - - '[0-9]*.[0-9]*.[0-9]*' # 4.0.0, 10.20.30, etc. 
tags-ignore: - '*-*' # anything with a hyphen is a prerelease/non-GA tag - '*_*' # anything with an underscore is a branch/non-version tag - '*-*' # anything with a hyphen is a prerelease/non-GA tag - '*_*' # anything with an underscore is a branch/non-version tag workflow_dispatch: inputs: tag: @@ -54,7 +57,33 @@ env: BUILD: ${{ inputs.build_number || '1' }} jobs: + # Gating job for push triggers: refuse to publish if the tag isn't shaped + # like a GA version (digit+ . digit+ . digit+). The `tags-ignore` filter at + # the `on:` level already blocks anything with `-` or `_`, but it doesn't + # reject other oddities like `wip`, `foo`, or `test`. This job is the second + # layer that catches those; it reads the tag from $GITHUB_REF_NAME (not an inline expression) so a crafted tag name cannot inject shell code. + # + # workflow_dispatch trigger is permissive — humans pick the tag (which can + # be alpha/beta/RC) — so the strict check is push-only. + validate-tag: + runs-on: ubuntu-latest + steps: + - name: Reject non-GA-style tags on push triggers + run: | + if [[ "${{ github.event_name }}" != "push" ]]; then + echo "workflow_dispatch trigger — skipping strict tag-shape validation." + exit 0 + fi + tag="${GITHUB_REF_NAME}" + if [[ ! "$tag" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + echo "::error title=Non-GA tag::Refusing to publish: tag '$tag' is not a GA-style X.Y.Z." + echo "::error::For prerelease publishes (alpha/BETA/RC), use workflow_dispatch with an explicit tag and build_number." + exit 1 + fi + echo "Tag '$tag' is GA-style. Proceeding."
+ release-tika-server: + needs: validate-tag runs-on: ubuntu-latest timeout-minutes: 60 @@ -148,6 +177,7 @@ jobs: git push origin "${TAG}-${BUILD}" release-tika-grpc: + needs: validate-tag runs-on: ubuntu-latest timeout-minutes: 120 diff --git a/docs/modules/ROOT/pages/maintainers/release-guides/docker.adoc b/docs/modules/ROOT/pages/maintainers/release-guides/docker.adoc index 2e7675d768b..76e6bfc231d 100644 --- a/docs/modules/ROOT/pages/maintainers/release-guides/docker.adoc +++ b/docs/modules/ROOT/pages/maintainers/release-guides/docker.adoc @@ -83,6 +83,13 @@ automatically. Prerelease tags (`4.0.0-rc1`, `4.0.0-alpha-1`, anything with a hyphen) and branch-style tags (`branch_4x`, anything with an underscore) are filtered out by `tags-ignore: ['*-*', '*_*']` and stay silent. +A separate `validate-tag` gating job enforces strict `X.Y.Z` shape on push +triggers (defense-in-depth against odd tag names like `wip` that bypass the +`tags-ignore` filter). It fails fast with a clear error before any build +starts. On `workflow_dispatch` triggers the job still runs but skips the check, because that path is intentionally +permissive — it is used for prerelease publishes where the tag name +won't be GA-shaped. + The standard ASF release flow looks like: . `release:prepare` creates `X.Y.Z-rcN` for the vote → workflow does *not*