Skip to content

Commit 8a4cb98

Browse files
committed
Add bases-per-slice threshold to bound slice memory for long reads
Previously the slice flush decision was based only on a record count threshold (e.g. 100,000 records for the archive profile). For long-read data (PacBio HiFi 10-20kb, ONT), a full slice buffers 1+ GB of quality scores, which then OOMs the FQZComp quality-score encoder. Add a second threshold on accumulated bases per slice, modelled on htslib's dual-threshold rule: flush when either records-per-slice or bases-per-slice is reached. Default bases-per-slice is readsPerSlice * 500, matching htslib. This keeps short-read behaviour unchanged (500bp * 10000 = 5MB fits comfortably) while capping PacBio/ONT slices at ~3,000-5,000 records. - CRAMEncodingStrategy: add basesPerSlice field + get/set, with a derived default when not explicitly set. - SliceFactory.getUpdatedReferenceContext: now takes numberOfBases alongside numberOfSAMRecords; flushes the slice if either threshold is reached. - ContainerFactory: accumulate bases (via SAMRecord.getReadLength) across the active slice; pass through and reset on slice emit.
1 parent eff8590 commit 8a4cb98

5 files changed

Lines changed: 63 additions & 16 deletions

File tree

src/main/java/htsjdk/samtools/cram/build/ContainerFactory.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ public final class ContainerFactory {
7474
private final List<SAMRecord> sliceSAMRecords;
7575

7676
private long globalRecordCounter = 0;
77+
private long accumulatedBases = 0;
7778
private int currentReferenceContextID = ReferenceContext.UNINITIALIZED_REFERENCE_ID;
7879

7980
/**
@@ -112,11 +113,13 @@ public final Container getNextContainer(final SAMRecord samRecord, final long co
112113
final int updatedReferenceContextID = sliceFactory.getUpdatedReferenceContext(
113114
currentReferenceContextID,
114115
nextRecordIndex,
115-
sliceSAMRecords.size());
116+
sliceSAMRecords.size(),
117+
accumulatedBases);
116118

117119
if (shouldEmitSlice(updatedReferenceContextID)) {
118120
sliceFactory.createNewSliceEntry(currentReferenceContextID, sliceSAMRecords);
119121
sliceSAMRecords.clear();
122+
accumulatedBases = 0;
120123

121124
if (shouldEmitContainer(
122125
currentReferenceContextID,
@@ -130,6 +133,7 @@ public final Container getNextContainer(final SAMRecord samRecord, final long co
130133
}
131134

132135
sliceSAMRecords.add(samRecord);
136+
accumulatedBases += samRecord.getReadLength();
133137
return container;
134138
}
135139

src/main/java/htsjdk/samtools/cram/build/SliceFactory.java

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ public final class SliceFactory {
5151

5252
private long sliceRecordCounter;
5353
private final int maxRecordsPerSlice;
54+
private final long maxBasesPerSlice;
5455
private final int minimumSingleReferenceSliceThreshold;
5556
private final boolean coordinateSorted;
5657

@@ -71,6 +72,7 @@ public SliceFactory(
7172
this.cramReferenceRegion = new CRAMReferenceRegion(cramReferenceSource, samFileHeader.getSequenceDictionary());
7273
minimumSingleReferenceSliceThreshold = encodingStrategy.getMinimumSingleReferenceSliceSize();
7374
maxRecordsPerSlice = this.encodingStrategy.getReadsPerSlice();
75+
maxBasesPerSlice = this.encodingStrategy.getBasesPerSlice();
7476
this.coordinateSorted = samFileHeader.getSortOrder() == SAMFileHeader.SortOrder.coordinate;
7577
this.sliceRecordCounter = globalRecordCounter;
7678
cramRecordSliceEntries = new ArrayList<>(this.encodingStrategy.getSlicesPerContainer());
@@ -174,7 +176,12 @@ private final List<CRAMCompressionRecord> convertToCRAMRecords(final List<SAMRec
174176

175177
/**
176178
* Decide if the current records should be flushed based on the current reference context, the reference context
177-
* for the next record to be written, and the number of records seen so far.
179+
* for the next record to be written, the number of records seen so far, and the accumulated number of bases.
180+
*
181+
* A slice is considered "full" when either {@link CRAMEncodingStrategy#getReadsPerSlice} records or
182+
* {@link CRAMEncodingStrategy#getBasesPerSlice} bases have been accumulated -- whichever happens first.
183+
* This matches htslib behaviour and prevents pathologically large slices for long-read data
184+
* (PacBio HiFi, ONT) where 100K records could otherwise buffer >1 GB of quality scores.
178185
*
179186
* Slices with the Multiple Reference flag (-2) set as the sequence ID in the header may contain reads mapped to
180187
* multiple external references, including unmapped reads (placed on these references or unplaced), but multiple
@@ -188,13 +195,17 @@ private final List<CRAMCompressionRecord> convertToCRAMRecords(final List<SAMRec
188195
* reference-differencing. This latter scenario is recommended for unsorted or non-coordinate-sorted data.
189196
*
190197
* @param nextReferenceIndex reference index of the next record to be emitted
198+
* @param numberOfSAMRecords number of SAM records accumulated so far for the current slice
199+
* @param numberOfBases total bases accumulated so far for the current slice
191200
* @return ReferenceContext.UNINITIALIZED_REFERENCE_ID if a current slice should be flushed and
192201
* subsequent records should go into a new slice; otherwise the updated reference context.
193202
*/
194203
public int getUpdatedReferenceContext(
195204
final int currentReferenceContext,
196205
final int nextReferenceIndex,
197-
final int numberOfSAMRecords) {
206+
final int numberOfSAMRecords,
207+
final long numberOfBases) {
208+
final boolean sliceFull = numberOfSAMRecords >= maxRecordsPerSlice || numberOfBases >= maxBasesPerSlice;
198209
switch (currentReferenceContext) {
199210
// uninitialized can go to: unmapped or mapped
200211
case ReferenceContext.UNINITIALIZED_REFERENCE_ID:
@@ -207,9 +218,9 @@ public int getUpdatedReferenceContext(
207218
case ReferenceContext.UNMAPPED_UNPLACED_ID:
208219
if (nextReferenceIndex == currentReferenceContext) {
209220
// still unmapped...
210-
return numberOfSAMRecords < maxRecordsPerSlice ?
211-
ReferenceContext.UNMAPPED_UNPLACED_ID :
212-
ReferenceContext.UNINITIALIZED_REFERENCE_ID;
221+
return sliceFull ?
222+
ReferenceContext.UNINITIALIZED_REFERENCE_ID :
223+
ReferenceContext.UNMAPPED_UNPLACED_ID;
213224
} else if (coordinateSorted) {
214225
// coordinate sorted, and we're going from unmapped to mapped ??
215226
throw new CRAMException("Invalid coord-sorted input - unmapped records must be last");
@@ -218,7 +229,7 @@ public int getUpdatedReferenceContext(
218229
// record into the same slice with the unmapped ones, since there is no index query
219230
// concern since we're not coord sorted anyway (though there is no reference compression
220231
// happening in this container).
221-
return numberOfSAMRecords >= maxRecordsPerSlice ?
232+
return sliceFull ?
222233
ReferenceContext.UNINITIALIZED_REFERENCE_ID :
223234
ReferenceContext.MULTIPLE_REFERENCE_ID;
224235
}
@@ -233,7 +244,7 @@ public int getUpdatedReferenceContext(
233244
ReferenceContext.UNINITIALIZED_REFERENCE_ID; // emit a small mutli-ref
234245
} else {
235246
// multi-ref, not coord sorted
236-
return numberOfSAMRecords >= maxRecordsPerSlice ?
247+
return sliceFull ?
237248
ReferenceContext.UNINITIALIZED_REFERENCE_ID :
238249
ReferenceContext.MULTIPLE_REFERENCE_ID;
239250
}
@@ -243,7 +254,7 @@ public int getUpdatedReferenceContext(
243254
// (currentReferenceContext is an actual reference index, not a sentinel).
244255
if (nextReferenceIndex == currentReferenceContext) {
245256
// still on the same reference contig
246-
return numberOfSAMRecords >= maxRecordsPerSlice ?
257+
return sliceFull ?
247258
ReferenceContext.UNINITIALIZED_REFERENCE_ID :
248259
nextReferenceIndex;
249260
} else {

src/main/java/htsjdk/samtools/cram/structure/CRAMCompressionProfile.java

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,6 @@ public enum CRAMCompressionProfile {
5252
* <p>This profile uses trial compression: multiple codecs are tried per block and the smallest
5353
* result wins. Additional candidates include BZIP2, the Range (arithmetic) coder, and GZIP.
5454
*/
55-
// Note: large slices increase tag dictionary size; CompressionHeader.internalWrite()
56-
// uses a fixed 100KB buffer for the tag dictionary, which may need adjustment for very
57-
// large slice sizes.
5855
ARCHIVE(CramVersions.CRAM_v3_1, 7, 100_000);
5956

6057
private final CRAMVersion cramVersion;

src/main/java/htsjdk/samtools/cram/structure/CRAMEncodingStrategy.java

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,20 @@
4646
public class CRAMEncodingStrategy {
4747
public static final int DEFAULT_MINIMUM_SINGLE_REFERENCE_SLICE_THRESHOLD = 1000;
4848
public static final int DEFAULT_READS_PER_SLICE = 10000;
49+
/**
50+
* Default ratio of bases-per-slice to reads-per-slice. Matches htslib's default rule
51+
* {@code bases_per_slice = seqs_per_slice * 500}. A slice is flushed when either the
52+
* record count or the accumulated base count is reached, preventing individual slices
53+
* from growing pathologically large when input reads are long (PacBio HiFi, ONT).
54+
*/
55+
public static final int DEFAULT_BASES_PER_READ = 500;
4956

5057
private CRAMVersion cramVersion = CramVersions.CRAM_v3_1;
5158
private int gzipCompressionLevel = Defaults.COMPRESSION_LEVEL;
5259
private int minimumSingleReferenceSliceSize = DEFAULT_MINIMUM_SINGLE_REFERENCE_SLICE_THRESHOLD;
5360
private int readsPerSlice = DEFAULT_READS_PER_SLICE;
61+
// 0 means "derive from readsPerSlice at query time". Explicit override via setBasesPerSlice.
62+
private long basesPerSlice = 0;
5463
private int slicesPerContainer = 1;
5564

5665
private EnumMap<DataSeries, CompressorDescriptor> compressorMap;
@@ -146,6 +155,32 @@ public int getMinimumSingleReferenceSliceSize() {
146155
return minimumSingleReferenceSliceSize;
147156
}
148157

158+
/**
159+
* Set the maximum accumulated bases per slice. When the accumulated bases in a slice
160+
* reaches this threshold, the slice is flushed even if {@link #getReadsPerSlice} has
161+
* not been reached. This prevents individual slices from growing pathologically large
162+
* for long-read data (PacBio HiFi, ONT).
163+
*
164+
* <p>Setting a value of 0 reverts to the default ({@link #getReadsPerSlice} {@code *}
165+
* {@link #DEFAULT_BASES_PER_READ}), matching htslib's rule.
166+
*
167+
* @param basesPerSlice maximum bases per slice, or 0 to use the default
168+
* @return this strategy for chaining
169+
*/
170+
public CRAMEncodingStrategy setBasesPerSlice(final long basesPerSlice) {
171+
ValidationUtils.validateArg(basesPerSlice >= 0, "basesPerSlice must be >= 0");
172+
this.basesPerSlice = basesPerSlice;
173+
return this;
174+
}
175+
176+
/**
177+
* @return the bases-per-slice threshold. If no explicit value was set, returns
178+
* {@link #getReadsPerSlice} {@code *} {@link #DEFAULT_BASES_PER_READ} (matching htslib).
179+
*/
180+
public long getBasesPerSlice() {
181+
return basesPerSlice > 0 ? basesPerSlice : (long) readsPerSlice * DEFAULT_BASES_PER_READ;
182+
}
183+
149184
/**
150185
* Set the GZIP compression level used for data series compressed with GZIP.
151186
*

src/test/java/htsjdk/samtools/cram/build/SliceFactoryTest.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ private void testEmitSliceMultiSliceContainer() {
134134
0L);
135135
sliceFactory.createNewSliceEntry(0, CRAMStructureTestHelper.createSAMRecordsMapped(10, 0));
136136
Assert.assertEquals(
137-
sliceFactory.getUpdatedReferenceContext(0, 1, 1),
137+
sliceFactory.getUpdatedReferenceContext(0, 1, 1, 0),
138138
ReferenceContext.UNINITIALIZED_REFERENCE_ID
139139
);
140140
}
@@ -151,7 +151,7 @@ private void testEmitSliceCoordinateSortedPositive(
151151
CRAMStructureTestHelper.SAM_FILE_HEADER,
152152
0L);
153153
Assert.assertEquals(
154-
sliceFactory.getUpdatedReferenceContext(currentReferenceContext, nextReferenceContext, nRecordsSeen),
154+
sliceFactory.getUpdatedReferenceContext(currentReferenceContext, nextReferenceContext, nRecordsSeen, 0),
155155
expectedUpdatedReferenceContext
156156
);
157157
}
@@ -185,7 +185,7 @@ private void testEmitSliceCoordinateSortedNegative(
185185
CRAMStructureTestHelper.REFERENCE_SOURCE,
186186
CRAMStructureTestHelper.SAM_FILE_HEADER,
187187
0L);
188-
sliceFactory.getUpdatedReferenceContext(currentReferenceContext, nextReferenceContext, nRecordsSeen);
188+
sliceFactory.getUpdatedReferenceContext(currentReferenceContext, nextReferenceContext, nRecordsSeen, 0);
189189
}
190190

191191
@DataProvider(name="emitSliceQuerynameSortedPositive")
@@ -236,7 +236,7 @@ private void testEmitSliceQuerynameSortedPositive(
236236
querySortedSAMFileHeader,
237237
0L);
238238
Assert.assertEquals(
239-
sliceFactory.getUpdatedReferenceContext(currentReferenceContext, nextReferenceContext, nRecordsSeen),
239+
sliceFactory.getUpdatedReferenceContext(currentReferenceContext, nextReferenceContext, nRecordsSeen, 0),
240240
expectedUpdatedReferenceContext
241241
);
242242
}

0 commit comments

Comments
 (0)