diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
new file mode 100644
index 0000000000..b8ae386167
--- /dev/null
+++ b/.git-blame-ignore-revs
@@ -0,0 +1,14 @@
+# Commits listed here are skipped by `git blame` so that mechanical, whole-tree
+# reformats do not obscure the author who actually wrote each line.
+#
+# GitHub honors this file automatically in the web blame view. For `git blame`
+# on the command line, opt in once per clone:
+#
+# git config blame.ignoreRevsFile .git-blame-ignore-revs
+#
+# When adding a new entry, include a one-line comment above the SHA explaining
+# what the commit did and why it should be skipped. Only mechanical reformats
+# belong here -- never use this to hide substantive changes.
+
+# Apply Palantir Java Format to entire codebase (#1761)
+4b75dd524198dea5b789fd383f99ce974510fb1d
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 64fe051ba9..4f1a4eb7f6 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -21,6 +21,8 @@ jobs:
name: Java ${{ matrix.Java }} build and test
steps:
- uses: actions/checkout@v3
+ with:
+ fetch-depth: 0 # full history + tags so palantir/git-version sees the latest release tag
- name: Set up java ${{ matrix.Java }}
uses: actions/setup-java@v3
with:
@@ -49,6 +51,8 @@ jobs:
name: Tests that require external APIs
steps:
- uses: actions/checkout@v3
+ with:
+ fetch-depth: 0 # full history + tags so palantir/git-version sees the latest release tag
- name: Set up java 17
uses: actions/setup-java@v3
with:
@@ -67,11 +71,30 @@ jobs:
with:
name: test-results-external-apis
path: build/reports/tests
+ formatCheck:
+ runs-on: ubuntu-latest
+ name: Java Format Check
+ steps:
+ - uses: actions/checkout@v3
+ with:
+ fetch-depth: 0 # full history + tags so palantir/git-version sees the latest release tag
+ - name: Set up java 17
+ uses: actions/setup-java@v3
+ with:
+ java-version: '17'
+ distribution: 'adopt'
+ cache: gradle
+ - name: Grant execute permission for gradlew
+ run: chmod +x gradlew
+ - name: Verify formatting
+ run: ./gradlew spotlessCheck
spotBugs:
runs-on: ubuntu-latest
name: SpotBugs
steps:
- uses: actions/checkout@v3
+ with:
+ fetch-depth: 0 # full history + tags so palantir/git-version sees the latest release tag
- name: Set up java 17
uses: actions/setup-java@v3
with:
diff --git a/.gitignore b/.gitignore
index 03a8d6d509..606b65033a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,6 @@
htsjdk.iws
.command_tmp
atlassian-ide-plugin.xml
-/htsjdk.version.properties
/test-output/
.DS_Store
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index f00fe8b27e..0000000000
--- a/.travis.yml
+++ /dev/null
@@ -1,56 +0,0 @@
-language: java
-dist: trusty
-sudo: true
-services:
- - docker
-before_cache:
- - rm -f $HOME/.gradle/caches/modules-2/modules-2.lock
-cache:
- directories:
- - $HOME/.gradle/caches/
- - $HOME/.gradle/wrapper/
- - $HOME/.m2
-env:
- global:
- - HTSJDK_SAMTOOLS_BIN=/usr/bin/samtools
-jdk:
- - oraclejdk8
- - openjdk8
- - openjdk11
-matrix:
- fast_finish: true
- allow_failures:
- - env: TEST_TYPE=EXTERNAL_APIS
- - env: TEST_TYPE=FTP
- include:
- - jdk: oraclejdk8
- env: TEST_TYPE=EXTERNAL_APIS
- - jdk: oraclejdk8
- env: TEST_TYPE=FTP
- - jdk: openjdk8
- env: SPOT_BUGS=true
-
-before_install:
- - scripts/install-samtools.sh
- - scripts/htsget-scripts/start-htsget-test-server.sh
-
-script:
- - if [[ $SPOT_BUGS == "true" ]]; then
- ./gradlew spotBugsMain spotBugsTest;
- elif [[ $TEST_TYPE == "FTP" ]]; then
- ./gradlew testFTP jacocoTestReport;
- elif [[ $TEST_TYPE == "EXTERNAL_APIS" ]]; then
- ./gradlew testExternalApis jacocoTestReport;
- else
- ./gradlew test jacocoTestReport;
- fi
-
-after_success:
- - bash <(curl -s https://raw.githubusercontent.com/broadinstitute/codecov-bash-uploader/main/codecov-verified.bash)
- - echo "TRAVIS_BRANCH='$TRAVIS_BRANCH'";
- echo "JAVA_HOME='$JAVA_HOME'";
- if [ "$TRAVIS_BRANCH" == "master" ]; then
- if [[ $JAVA_HOME = *java-8-openjdk* ]]; then
- ./gradlew publish;
- fi;
- fi;
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3472be10fc..83f6c1035b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,132 @@ early infrastructure for a plugin-based codec framework and resource bundles.
---
+## 5.0.0
+
+Major release.
+
+### Headlines
+
+- **CRAM 3.1 write support** (the culmination of the read-side codec work in 4.2.0 and the reader wiring in 4.3.0 — htsjdk can now produce CRAM 3.1 files that are interoperable with samtools/htslib).
+- **CRAM 3.1 is now the default write version** (previously 3.0). On the same input, files written with the new default `NORMAL` profile are roughly 36% smaller and encode 18-20% faster than what htsjdk 4.3 produced with its `FAST` (3.0) default.
+- **Major speed-ups across the BAM and CRAM read/write paths** vs htsjdk 4.3.0. Measured on AWS m8gd / m8id (single thread, 32.7M-read input), the headline wins are: BAM write 50-58% faster, CRAM encode (FAST) 41-47% faster, CRAM read 42-46% faster, BAM read 30-31% faster.
+- **`jlibdeflate` is now the default DEFLATE engine** ([jlibdeflate](https://github.com/fulcrumgenomics/jlibdeflate) wrapping native libdeflate); falls back to the JDK `Deflater`/`Inflater` if the native library cannot be loaded.
+- **Slimmed-down runtime dependency tree** (SRA support removed, Nashorn moved to an opt-in dependency, several stale or misleading dependency declarations cleaned up).
+- **Enforced automatic code formatting** via Palantir Java Format on every build.
+- **Unit test improvements**: pass/fail stats now reported correctly when run via Gradle, and total suite runtime massively reduced (now 2-3 minutes).
+
+### ⚠️ Breaking changes
+
+Consumers should review these before upgrading.
+
+- **SRA support removed.** All `htsjdk.samtools.sra.*` types, `SRAFileReader`, `SRAIterator`,
+ `SRAIndex`, `SamInputResource.of(SRAAccession)`, `SamReader.Type.SRA_TYPE`, and the
+ `InputResource.Type.SRA_ACCESSION` enum value have been deleted. The
+ `gov.nih.nlm.ncbi:ngs-java` dependency (and the `samjdk.sra_libraries_download` system
+ property) are gone. Consumers needing SRA access must use NCBI's tooling or a different
+ library (#1774).
+- **Nashorn is no longer a transitive runtime dependency.** The `JavascriptSamRecordFilter`
+ and `JavascriptVariantFilter` classes still exist but htsjdk no longer ships
+ `org.openjdk.nashorn:nashorn-core` (or its 5 ASM transitives) on consumers' runtime
+ classpath. Consumers who use the JavaScript filter classes must add
+ `org.openjdk.nashorn:nashorn-core:15.7` (or another JSR-223 `"js"` engine) to their own
+ runtime classpath; the no-engine error message names the artifact and prints both Gradle
+ and Maven coordinates (#1775).
+- **`SAMRecord.toString()` now returns the full SAM-format string** for the record (all 11
+ mandatory SAM fields plus tags), replacing the previous minimal summary. The previous
+ output was usually insufficient to debug failures in `println()` calls or test-assertion
+ messages; the new output is the same line you would see in a SAM file. Anything that
+ parses or asserts against the exact old format will need updating (#1762).
+- **CRAM slice headers no longer include the optional content digest tags** (BD/SD/B5/S5/B1/S1).
+ Matches htslib/samtools behavior. Block-level CRC32 (required since CRAM 3.0) still
+ provides data integrity. Technically a wire-format change but with zero known practical
+ impact, since no known tools consume these tags.
+- **Default CRAM version for writing is now 3.1** (was 3.0). CRAM 3.0 readers will not be
+ able to read newly-produced files; pass an explicit version to the writer if you need 3.0
+ output.
+
+### CRAM 3.1 Write Support
+
+- Enable CRAM 3.1 writing with all spec codecs: rANS Nx16, adaptive arithmetic Range coder, FQZComp, Name Tokenisation, and STRIPE
+- Add configurable compression profiles (FAST, NORMAL, SMALL, ARCHIVE) with trial compression for automatic codec selection
+- Implement `TrialCompressor` to replace ad-hoc triple-compression for tags and align trial candidates with htslib
+- Add `GzipCodec` for direct Deflater/Inflater GZIP compression, wired into CRAM as a codec option
+- Strip NM/MD tags on CRAM encode and regenerate on decode, matching htslib behavior
+- Implement attached (same-slice) mate pair resolution
+- Align DataSeries content IDs with htslib for cross-implementation debugging
+- Remove content digest tags (BD/SD/B5/S5/B1/S1) from CRAM slice headers, matching htslib/samtools behavior (see Breaking changes)
+- Default CRAM version for writing is now 3.1 (was 3.0; see Breaking changes)
+- Add `CramConverter` command-line tool for testing and benchmarking CRAM write profiles
+- Add cross-implementation CRAM validation pipeline (`validation/`) for round-tripping against samtools/htslib
+- Add bases-per-slice threshold to bound slice memory when writing long reads
+- Refine `CompressionHeader` map serialization
+- Resolve a pile of in-tree `TODO`s in CRAM structure classes
+
+### CRAM correctness and cross-implementation fixes
+
+These fixes apply to both reading and writing CRAM and substantially improve interoperability with samtools/htslib.
+
+- Fix CRAM `TLEN` computation to match htslib (cross-tool comparisons of the same input now produce matching `TLEN` values)
+- Fix `CIGAR` reconstruction when the sequence is `*` (`CF_UNKNOWN_BASES`)
+- Fix `=`/`X` `CIGAR` op comparison in cross-implementation tests
+- Fix CRAM archive header overflow on large containers
+- Fix crash when reading a CRAM container with no slices
+- Fix unmapped-read query in the hts-specs compliance harness
+- Document the supplementary/secondary read-name resolution limitation in the writer
+
+### Codec and Compression Optimizations
+
+- Refactor and optimize all rANS codecs: byte-array API, backwards-write encoding, and general simplifications
+- Optimize Name Tokeniser encoder: replace regex with hand-written parser; add per-type flags, STRIPE support, stream deduplication, and all-MATCH elimination
+- Optimize FQZComp, Range coder, and rANS encoder hot paths
+- Tune NORMAL profile codec assignments based on empirical compression testing
+
+### Performance
+
+- Integrate [jlibdeflate](https://github.com/fulcrumgenomics/jlibdeflate) for native libdeflate-backed DEFLATE compression and decompression. Used by default; falls back to the JDK Deflater/Inflater if the native library cannot be loaded (#1768)
+- Apply targeted optimizations to the BAM decoding path, yielding a ~6-7% improvement in BAM read performance (#1764)
+- Replace `ByteArrayInputStream`/`ByteArrayOutputStream` with unsynchronized `CRAMByteReader`/`CRAMByteWriter` to eliminate synchronization overhead in CRAM
+- Fuse read base restoration, CIGAR building, and NM/MD computation into a single pass during CRAM decode
+- Cache tag key metadata to eliminate per-record `String` allocation during CRAM decode
+- Pool `RANSNx16Decode` instances in the Name Tokeniser
+- Optimize BAM nibble-to-ASCII base decoding with a bulk lookup table
+
+### Bug fixes
+
+- Fix LTF8 9-byte write bug: wrong bit shift (`>> 28` instead of `>> 24`) corrupted the high byte of large CRAM offsets (#1765)
+- Fix `SamLocusIterator` so that read position is not incorrectly offset (#1758)
+- Fix asymmetric `SamPairUtil.getPairOrientation` on dovetail pairs (#1771)
+- Catch `UnsatisfiedLinkError` when loading the snappy native library so failure to load it does not abort downstream consumers (#1753)
+
+### Build, tooling, and dependency clean-up
+
+- **Code formatting:** apply [Palantir Java Format](https://github.com/palantir/palantir-java-format) to the entire codebase and enforce it on every build via [Spotless](https://github.com/diffplug/spotless). `compileJava` auto-formats source in place; CI separately runs `spotlessCheck` as the enforcement boundary. See `CONTRIBUTING.md` for details, including the `.git-blame-ignore-revs` opt-in for the bulk-format commit (#1761)
+- **Maven Central publishing migrated** from the legacy OSSRH endpoint to the new [Sonatype Central Portal](https://central.sonatype.com), via the [NMCP Gradle plugin](https://github.com/GradleUp/nmcp). Consumer-visible groupId/artifactId/version coordinates are unchanged (#1769)
+- **Snapshot versioning** now embeds the short commit hash (e.g. `5.0.0-23c681a-SNAPSHOT`) so each snapshot is a distinct, pinnable artifact rather than a moving Maven SNAPSHOT (#1772)
+- **Test runner** now correctly reports failures rather than silently skipping them when a `@DataProvider` throws (#1759)
+- **Existing API deprecations** cleaned up across `htsjdk.samtools` and `htsjdk.variant` (#1767)
+- **`commons-logging` direct declaration removed.** htsjdk does not use commons-logging itself; the version pin is now expressed as a Gradle dependency constraint and only kicks in transitively when JEXL pulls it
+- **Nashorn moved to `compileOnly`** — see Breaking changes
+- **`gov.nih.nlm.ncbi:ngs-java` removed** — see Breaking changes (SRA support)
+
+### Compatibility
+
+- Compiled and tested against JDK 17 (CI default), 21, and 24. CI continues to build only on 17. htsjdk's published minimum remains Java 17 (set in 4.0.0)
+
+### Testing and Infrastructure
+
+- Add hts-specs CRAM 3.0 / 3.1 decode-compliance tests, plus FQZComp round-trip tests using hts-specs quality data
+- Add CRAI index query correctness tests and codec round-trip property tests
+- Split CRAM 3.1 fidelity tests into per-profile classes for parallel execution
+- Speed up BCF2 and SeekableStream integration tests; cache test data in CRAM index test classes
+- Reduce `CRAMFileBAIIndexTest` from 4 to 2 slice-size variants, sampling every 200th
+- Downsample the CEUTrio test CRAM from ~654K to ~150K records (47 MB → 11 MB)
+- Reduce memory pressure in unit tests to eliminate OOM failures
+- Fix thread-safety bug in `VariantContextTestProvider` causing non-deterministic test counts
+- Bulk up the JavaScript filter test suites: replace 4 checked-in `.js` fixtures with 46 small inline-script tests covering all three constructors, return-type semantics, bindings, and error paths (#1775)
+
+---
+
## 4.3.0 (2025-05-09)
Completes CRAM 3.1 read support by wiring the codec implementations (added in 4.2.0) into
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000000..e84d014a2e
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,300 @@
+# Contributing to HTSJDK
+
+## Code Style
+
+HTSJDK uses [Palantir Java Format](https://github.com/palantir/palantir-java-format)
+(applied via the [Spotless](https://github.com/diffplug/spotless) Gradle plugin)
+to enforce a single, mechanical code style across the codebase. There are no
+formatting knobs to configure -- the formatter is the style guide.
+
+Formatting is applied automatically as part of `compileJava`: every build
+runs `spotlessJavaApply`, which rewrites any unformatted source in place
+before compiling. In normal use you shouldn't need to invoke the formatter
+yourself -- just build, and your code is formatted. If you want to format
+without compiling, run:
+
+```bash
+./gradlew spotlessApply
+```
+
+CI runs `./gradlew spotlessCheck` (verify-only, no mutation) so a PR with
+unformatted code still fails CI -- the local auto-format is a convenience,
+not the enforcement boundary.
+
+### Git blame and the bulk-format commit
+
+The codebase was reformatted in a single mechanical commit. To keep `git blame`
+useful (so you see the author who actually wrote each line, not the reformat
+commit), the repository ships a `.git-blame-ignore-revs` file. GitHub honors
+it automatically in the web UI; for `git blame` on the command line, opt in
+once per clone:
+
+```bash
+git config blame.ignoreRevsFile .git-blame-ignore-revs
+```
+
+## Building
+
+HTSJDK uses Gradle (via the Gradle wrapper). To build:
+
+```bash
+./gradlew jar
+```
+
+To run tests:
+
+```bash
+./gradlew test
+```
+
+To install to your local Maven repository (e.g. for testing with downstream projects):
+
+```bash
+./gradlew install
+```
+
+## Publishing to Maven Central
+
+HTSJDK is published to Maven Central via the [Sonatype Central Portal](https://central.sonatype.com).
+The build uses the [NMCP Gradle plugin](https://github.com/GradleUp/nmcp) to handle bundle
+creation and upload.
+
+### Prerequisites
+
+#### 1. Sonatype Central Portal Account and Tokens
+
+You need a Sonatype Central Portal account with access to the `com.github.samtools` namespace.
+
+Generate a user token:
+
+1. Log in to https://central.sonatype.com
+2. Go to Account > User Token
+3. Click "Generate User Token"
+4. **Save the username and password immediately** -- they are shown only once
+
+#### 2. Configure Sonatype Credentials
+
+Gradle resolves project properties in this order (highest precedence first):
+
+| Priority | Method | Example |
+|----------|--------|---------|
+| 1 | Command-line `-P` flag | `-PsonatypeUsername=...` |
+| 2 | Environment variable | `ORG_GRADLE_PROJECT_sonatypeUsername=...` |
+| 3 | `~/.gradle/gradle.properties` | `sonatypeUsername=...` |
+
+**Option A: `~/.gradle/gradle.properties`** (recommended for local development)
+
+Add to `~/.gradle/gradle.properties`:
+
+```properties
+sonatypeUsername=
* Example Content Type: Reads
@@ -221,7 +221,8 @@ public interface HtsCodec
* There can be many codecs for a given content type, each representing a different version of an
@@ -26,7 +26,7 @@
*/
public enum HtsContentType {
- //where would a FASTQ codec fit ? in the same category (which implies the same interfaces) ?
+ // where would a FASTQ codec fit ? in the same category (which implies the same interfaces) ?
/**
* Haploid reference content type (see {@link HaploidReferenceFormats} for related formats)
*/
@@ -43,7 +43,7 @@ public enum HtsContentType {
VARIANT_CONTEXTS,
/**
- * Features content type (see {@link htsjdk.beta.plugin.features} for related formats)
+ * Features content type (see {@code htsjdk.beta.plugin.features} for related formats)
*/
FEATURES,
}
diff --git a/src/main/java/htsjdk/beta/plugin/HtsDecoder.java b/src/main/java/htsjdk/beta/plugin/HtsDecoder.java
index 4040f1ce3a..b5d4693a66 100644
--- a/src/main/java/htsjdk/beta/plugin/HtsDecoder.java
+++ b/src/main/java/htsjdk/beta/plugin/HtsDecoder.java
@@ -15,8 +15,7 @@
* @param
@@ -26,7 +27,7 @@ public class HtsCodecRegistry {
* Create a registry. Protected to prevent use outside of the registry package. To create
* a private registry from outside the registry package, use {@link #createPrivateRegistry}.
*/
- protected HtsCodecRegistry() { }
+ protected HtsCodecRegistry() {}
/**
* Add a codec to the registry. If a codec that supports the same (format, version) (determined
@@ -66,15 +67,21 @@ protected HtsCodecRegistry() { }
*
* @return a mutable registry instance for private use
*/
- public synchronized static HtsCodecRegistry createPrivateRegistry() {
+ public static synchronized HtsCodecRegistry createPrivateRegistry() {
final HtsCodecRegistry privateRegistry = new HtsCodecRegistry();
// propagate the codecs from the sourceRegistry to the new registry
- HtsDefaultRegistry.htsDefaultCodecRegistry.getHaploidReferenceResolver().getCodecs()
+ HtsDefaultRegistry.htsDefaultCodecRegistry
+ .getHaploidReferenceResolver()
+ .getCodecs()
+ .forEach(c -> privateRegistry.registerCodec(c));
+ HtsDefaultRegistry.htsDefaultCodecRegistry
+ .getReadsResolver()
+ .getCodecs()
.forEach(c -> privateRegistry.registerCodec(c));
- HtsDefaultRegistry.htsDefaultCodecRegistry.getReadsResolver().getCodecs().
- forEach(c -> privateRegistry.registerCodec(c));
- HtsDefaultRegistry.htsDefaultCodecRegistry.getVariantsResolver().getCodecs()
+ HtsDefaultRegistry.htsDefaultCodecRegistry
+ .getVariantsResolver()
+ .getCodecs()
.forEach(c -> privateRegistry.registerCodec(c));
return privateRegistry;
}
@@ -84,21 +91,25 @@ public synchronized static HtsCodecRegistry createPrivateRegistry() {
*
* @return the {@link HaploidReferenceResolver} for this registry
*/
- public synchronized HaploidReferenceResolver getHaploidReferenceResolver() { return htsHaploidReferenceResolver; }
+ public synchronized HaploidReferenceResolver getHaploidReferenceResolver() {
+ return htsHaploidReferenceResolver;
+ }
/**
* Get the {@link ReadsResolver} for this registry.
*
* @return the {@link ReadsResolver} for this registry
*/
- public synchronized ReadsResolver getReadsResolver() { return htsReadsResolver; }
+ public synchronized ReadsResolver getReadsResolver() {
+ return htsReadsResolver;
+ }
/**
* Get the {@link VariantsResolver} for this registry.
*
* @return the {@link VariantsResolver} for this registry
*/
- public synchronized VariantsResolver getVariantsResolver() { return htsVariantsResolver; }
-
+ public synchronized VariantsResolver getVariantsResolver() {
+ return htsVariantsResolver;
+ }
}
-
diff --git a/src/main/java/htsjdk/beta/plugin/registry/HtsCodecResolver.java b/src/main/java/htsjdk/beta/plugin/registry/HtsCodecResolver.java
index dbfc7d7ed2..cd83db6b4e 100644
--- a/src/main/java/htsjdk/beta/plugin/registry/HtsCodecResolver.java
+++ b/src/main/java/htsjdk/beta/plugin/registry/HtsCodecResolver.java
@@ -1,18 +1,17 @@
package htsjdk.beta.plugin.registry;
+import htsjdk.annotations.InternalAPI;
+import htsjdk.beta.exception.HtsjdkException;
import htsjdk.beta.exception.HtsjdkIOException;
-import htsjdk.beta.plugin.HtsCodec;
-import htsjdk.beta.plugin.HtsVersion;
+import htsjdk.beta.exception.HtsjdkPluginException;
import htsjdk.beta.io.bundle.Bundle;
import htsjdk.beta.io.bundle.BundleResource;
import htsjdk.beta.io.bundle.SignatureStream;
-import htsjdk.beta.exception.HtsjdkException;
-import htsjdk.beta.exception.HtsjdkPluginException;
+import htsjdk.beta.plugin.HtsCodec;
+import htsjdk.beta.plugin.HtsVersion;
import htsjdk.io.IOPath;
import htsjdk.samtools.util.Log;
-import htsjdk.annotations.InternalAPI;
import htsjdk.utils.ValidationUtils;
-
import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
@@ -37,8 +36,8 @@
public class HtsCodecResolver
@@ -30,7 +25,7 @@
* {@link ReadsCodec}s, such as {@link ReadsDecoder}, {@link ReadsEncoder},
* {@link htsjdk.beta.plugin.reads.ReadsDecoderOptions}.
*/
-public class ReadsResolver extends HtsCodecResolver Usage:
+ * If no output is specified, records are read and iterated but not written.
+ */
+public class BamConverter {
+
+ private static final String USAGE = String.join(
+ "\n",
+ "Usage: BamConverter [output]",
+ "",
+ "Read and optionally convert a BAM file.",
+ "",
+ "Arguments:",
+ " input Input BAM file",
+ " output Optional output BAM file (omit to read-only)");
+
+ /**
+ * Entry point. Parses command-line arguments and performs the read/conversion.
+ *
+ * @param args command-line arguments (see USAGE for details)
+ */
+ public static void main(final String[] args) {
+ if (hasFlag(args, "--help") || hasFlag(args, "-h")) {
+ System.out.println(USAGE);
+ System.exit(0);
+ }
+ if (args.length < 1) {
+ System.err.println(USAGE);
+ System.exit(1);
+ }
+
+ final boolean eager = hasFlag(args, "--eager");
+ // Collect positional args (non-flag arguments)
+ final String[] positional =
+ java.util.Arrays.stream(args).filter(a -> !a.startsWith("--")).toArray(String[]::new);
+ if (positional.length < 1) {
+ System.err.println(USAGE);
+ System.exit(1);
+ }
+ final String inputPath = positional[0];
+ final String outputPath = positional.length > 1 ? positional[1] : null;
+
+ if (outputPath != null) {
+ System.err.printf("Converting %s -> %s%s%n", inputPath, outputPath, eager ? " (eager decode)" : "");
+ } else {
+ System.err.printf("Reading %s (no output%s)%n", inputPath, eager ? ", eager decode" : "");
+ }
+
+ final SamReaderFactory readerFactory =
+ SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT);
+
+ long count = 0;
+ final long startTime = System.currentTimeMillis();
+
+ try (final SamReader reader = readerFactory.open(new File(inputPath))) {
+ final SAMFileHeader header = reader.getFileHeader();
+
+ if (outputPath != null) {
+ final SAMFileWriterFactory writerFactory = new SAMFileWriterFactory();
+ try (final SAMFileWriter writer =
+ writerFactory.makeBAMWriter(header, true, new File(outputPath).toPath())) {
+ for (final SAMRecord record : reader) {
+ if (eager) record.eagerDecode();
+ writer.addAlignment(record);
+ count++;
+ if (count % 1_000_000 == 0) {
+ System.err.printf(" ... %,d records%n", count);
+ }
+ }
+ }
+ } else {
+ for (final SAMRecord record : reader) {
+ if (eager) record.eagerDecode();
+ count++;
+ if (count % 1_000_000 == 0) {
+ System.err.printf(" ... %,d records%n", count);
+ }
+ }
+ }
+ } catch (final Exception e) {
+ die("Error: " + e.getMessage());
+ }
+
+ final long elapsed = System.currentTimeMillis() - startTime;
+ final long inputSize = new File(inputPath).length();
+
+ if (outputPath != null) {
+ final long outputSize = new File(outputPath).length();
+ System.err.printf(
+ "Done. %,d records in %.1fs. Input: %,d bytes, Output: %,d bytes (%.1f%%)%n",
+ count,
+ elapsed / 1000.0,
+ inputSize,
+ outputSize,
+ inputSize > 0 ? (100.0 * outputSize / inputSize) : 0);
+ } else {
+ System.err.printf("Done. %,d records in %.1fs. Input: %,d bytes%n", count, elapsed / 1000.0, inputSize);
+ }
+ }
+
+ private static boolean hasFlag(final String[] args, final String flag) {
+ for (final String arg : args) {
+ if (flag.equals(arg)) return true;
+ }
+ return false;
+ }
+
+ private static void die(final String message) {
+ System.err.println("ERROR: " + message);
+ System.err.println();
+ System.err.println(USAGE);
+ System.exit(1);
+ }
+}
diff --git a/src/main/java/htsjdk/samtools/BamFileIoUtils.java b/src/main/java/htsjdk/samtools/BamFileIoUtils.java
index 709c8ed76d..761a22a8f7 100644
--- a/src/main/java/htsjdk/samtools/BamFileIoUtils.java
+++ b/src/main/java/htsjdk/samtools/BamFileIoUtils.java
@@ -13,14 +13,12 @@
import htsjdk.samtools.util.Md5CalculatingOutputStream;
import htsjdk.samtools.util.RuntimeIOException;
import htsjdk.utils.ValidationUtils;
-
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
-import java.nio.file.Paths;
import java.util.List;
public class BamFileIoUtils {
@@ -40,11 +38,15 @@ public static void reheaderBamFile(final SAMFileHeader samFileHeader, final Path
reheaderBamFile(samFileHeader, inputFile, outputFile, true, true);
}
-
/**
* Support File input types for backward compatibility. Use the same method with Path inputs below.
*/
- public static void reheaderBamFile(final SAMFileHeader samFileHeader, final File inputFile, final File outputFile, final boolean createMd5, final boolean createIndex) {
+ public static void reheaderBamFile(
+ final SAMFileHeader samFileHeader,
+ final File inputFile,
+ final File outputFile,
+ final boolean createMd5,
+ final boolean createIndex) {
reheaderBamFile(samFileHeader, IOUtil.toPath(inputFile), IOUtil.toPath(outputFile), createMd5, createIndex);
}
@@ -57,7 +59,12 @@ public static void reheaderBamFile(final SAMFileHeader samFileHeader, final File
* @param createMd5 Whether or not to create an MD5 file for the new BAM
* @param createIndex Whether or not to create an index file for the new BAM
*/
- public static void reheaderBamFile(final SAMFileHeader samFileHeader, final Path inputFile, final Path outputFile, final boolean createMd5, final boolean createIndex) {
+ public static void reheaderBamFile(
+ final SAMFileHeader samFileHeader,
+ final Path inputFile,
+ final Path outputFile,
+ final boolean createMd5,
+ final boolean createIndex) {
ValidationUtils.nonNull(inputFile);
ValidationUtils.nonNull(outputFile);
IOUtil.assertFileIsReadable(inputFile);
@@ -79,7 +86,11 @@ public static void reheaderBamFile(final SAMFileHeader samFileHeader, final Path
}
}
- public static void blockCopyBamFile(final File inputFile, final OutputStream outputStream, final boolean skipHeader, final boolean skipTerminator) {
+ public static void blockCopyBamFile(
+ final File inputFile,
+ final OutputStream outputStream,
+ final boolean skipHeader,
+ final boolean skipTerminator) {
blockCopyBamFile(IOUtil.toPath(inputFile), outputStream, skipHeader, skipTerminator);
}
@@ -91,10 +102,16 @@ public static void blockCopyBamFile(final File inputFile, final OutputStream out
* @param skipHeader If true, the header of the input file will not be copied to the output stream
* @param skipTerminator If true, the terminator block of the input file will not be written to the output stream
*/
- public static void blockCopyBamFile(final Path inputFile, final OutputStream outputStream, final boolean skipHeader, final boolean skipTerminator) {
- try (final SeekablePathStream in = new SeekablePathStream(inputFile)){
- // a) It's good to check that the end of the file is valid and b) we need to know if there's a terminator block and not copy it if skipTerminator is true
- final BlockCompressedInputStream.FileTermination term = BlockCompressedInputStream.checkTermination(inputFile);
+ public static void blockCopyBamFile(
+ final Path inputFile,
+ final OutputStream outputStream,
+ final boolean skipHeader,
+ final boolean skipTerminator) {
+ try (final SeekablePathStream in = new SeekablePathStream(inputFile)) {
+ // a) It's good to check that the end of the file is valid and b) we need to know if there's a terminator
+ // block and not copy it if skipTerminator is true
+ final BlockCompressedInputStream.FileTermination term =
+ BlockCompressedInputStream.checkTermination(inputFile);
if (term == BlockCompressedInputStream.FileTermination.DEFECTIVE)
throw new SAMException(inputFile.toUri() + " does not have a valid GZIP block at the end of the file.");
@@ -109,7 +126,8 @@ public static void blockCopyBamFile(final Path inputFile, final OutputStream out
// If we found the end of the header then write the remainder of this block out as a
// new gzip block and then break out of the while loop (tsato: update this comment)
if (remainingInBlock >= 0) {
- final BlockCompressedOutputStream blockOut = new BlockCompressedOutputStream(outputStream, (Path) null);
+ final BlockCompressedOutputStream blockOut =
+ new BlockCompressedOutputStream(outputStream, (Path) null);
IOUtil.transferByStream(blockIn, blockOut, remainingInBlock);
blockOut.flush();
// Don't close blockOut because closing underlying stream would break everything
@@ -119,7 +137,7 @@ public static void blockCopyBamFile(final Path inputFile, final OutputStream out
blockIn.close(); // tsato: why doesn't IntelliJ say this is unnecessary?
in.seek(pos);
- } catch (IOException e){
+ } catch (IOException e) {
throw new HtsjdkException("Encountered an error.", e);
}
}
@@ -127,8 +145,10 @@ public static void blockCopyBamFile(final Path inputFile, final OutputStream out
// Copy remainder of input stream into output stream
final long currentPos = in.position();
final long length = Files.size(inputFile);
- final long skipLast = ((term == BlockCompressedInputStream.FileTermination.HAS_TERMINATOR_BLOCK) && skipTerminator) ?
- BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length : 0;
+ final long skipLast =
+ ((term == BlockCompressedInputStream.FileTermination.HAS_TERMINATOR_BLOCK) && skipTerminator)
+ ? BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length
+ : 0;
final long bytesToWrite = length - skipLast - currentPos;
IOUtil.transferByStream(in, outputStream, bytesToWrite);
@@ -143,7 +163,8 @@ public static void blockCopyBamFile(final Path inputFile, final OutputStream out
* (often the first block) and re-compress any data remaining in that block into a new block in the output file. Subsequent
* blocks (excluding a terminator block if present) are copied directly from input to output.
*/
- public static void gatherWithBlockCopying(final List
+ * java -cp htsjdk.jar htsjdk.samtools.BamConverter input.bam [output.bam]
+ *
+ *
+ *