Skip to content

Commit 6aa602f

Browse files
tfenneclaude
andcommitted
Add cross-implementation CRAM validation pipeline
Snakemake pipeline that round-trips a curated set of public BAM/CRAM files through htsjdk and samtools and verifies output equivalence across all four CRAM 3.1 compression profiles (FAST, NORMAL, SMALL, ARCHIVE) and all four encode/decode combinations. The pipeline was used to validate the CRAM 3.1 write implementation in the previous commit against samtools 1.21 on real-world data spanning five sequencing platforms (Illumina, PacBio HiFi, ONT, Element AVITI, Ultima), seven assay types (WGS, WGBS, RNA-seq, scRNA-seq, exome, Hi-C, amplicon), and two organisms. Pipeline (validation/) ---------------------- - Snakefile: encode original -> CRAM with both htsjdk and samtools, decode each CRAM with both, then compare each decoded BAM against the original. 4 profiles * 4 encoder/decoder combinations = 16 comparisons per sample. - Two sample-sheet formats: samples.tsv - local file paths (3 columns) test_datasets.tsv - remote URLs auto-downloaded (9 columns) Format is auto-detected from header. Reference URLs may be a comma-separated list, with optional `#contig_name` suffix to rename a single-sequence FASTA's header on download (e.g. Lambda phage chrL spike-in for bisulfite samples). - Intermediate files (BAMs and CRAMs) are marked temp() so they are cleaned up as soon as their dependents finish; rule priorities push the DAG to drain depth-first, keeping peak disk bounded. - Per-rule log directives, retry/escalating-memory ladder for the Java jobs (4 GB -> 6 GB -> 8 GB -> 10 GB), and pixi.toml for a reproducible toolchain. CramComparison -------------- A general-purpose record-by-record SAM/BAM/CRAM comparison utility used by the pipeline as the comparator. Tolerates the universal CRAM roundtrip normalisations: - CIGAR =/X collapsed to M before comparison (CRAM stores match/ mismatch in read features, so the =/X distinction is lost on decode in both htsjdk and samtools). - mapQ ignored for unmapped reads (SAM spec leaves it undefined, and both implementations normalise it to 0 on decode). - Auto-generated MD/NM tags stripped when one side is missing them. - Unsigned B-array type differences tolerated (CRAM stores signed). Exit code 0 on both match and mismatch; exit 2 only on actual error, so Snakemake preserves the result file in either case. Co-authored-by: Claude <noreply@anthropic.com>
1 parent e5e7d4c commit 6aa602f

10 files changed

Lines changed: 5379 additions & 0 deletions

File tree

Lines changed: 313 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,313 @@
1+
package htsjdk.samtools.cram;
2+
3+
import htsjdk.samtools.*;
4+
import htsjdk.samtools.util.SequenceUtil;
5+
6+
import java.io.*;
7+
import java.util.*;
8+
9+
/**
10+
* Command-line tool for comparing two SAM/BAM/CRAM files record-by-record.
11+
* Handles known CRAM/SAM differences (auto-generated MD/NM, unsigned B-array types)
12+
* and reports mismatches with clear field-level detail.
13+
*
14+
* <p>Exit codes: 0 = comparison completed (match or mismatch), 2 = error.
15+
*/
16+
public class CramComparison {
17+
18+
private static final String USAGE = String.join("\n",
19+
"Usage: CramComparison <file1> <file2> [options]",
20+
"",
21+
"Compare two SAM/BAM/CRAM files record by record.",
22+
"",
23+
"Options:",
24+
" --reference <path> Reference FASTA (required for CRAM)",
25+
" --output <path> Write results to file (default: stderr only)",
26+
" --lenient Compare only name, flags, position, bases, qualities",
27+
" --max-diffs <N> Stop after N mismatches (default: 10)",
28+
" --ignore-tags <list> Comma-separated tags to skip (e.g. MD,NM)",
29+
" --help Print this help message",
30+
"",
31+
"Exit codes: 0 = comparison completed, 2 = error"
32+
);
33+
34+
// Tags that CRAM auto-generates on decode and may not be in the source
35+
private static final Set<String> CRAM_AUTO_TAGS = Set.of("MD", "NM");
36+
37+
public static void main(final String[] args) {
38+
if (hasFlag(args, "--help") || hasFlag(args, "-h") || args.length < 2) {
39+
System.out.println(USAGE);
40+
System.exit(args.length < 2 ? 2 : 0);
41+
}
42+
43+
try {
44+
System.exit(run(args));
45+
} catch (final Exception e) {
46+
System.err.println("ERROR: " + e.getMessage());
47+
System.exit(2);
48+
}
49+
}
50+
51+
/**
52+
* Run the comparison and return the exit code (0=match, 1=mismatch, 2=error).
53+
* Separated from main() for testability.
54+
*/
55+
public static int run(final String[] args) {
56+
final String file1 = args[0];
57+
final String file2 = args[1];
58+
59+
String referencePath = null;
60+
String outputPath = null;
61+
boolean lenient = false;
62+
int maxDiffs = 10;
63+
final Set<String> ignoreTags = new HashSet<>();
64+
65+
for (int i = 2; i < args.length; i++) {
66+
switch (args[i]) {
67+
case "--reference": case "-r":
68+
referencePath = args[++i];
69+
break;
70+
case "--output": case "-o":
71+
outputPath = args[++i];
72+
break;
73+
case "--lenient":
74+
lenient = true;
75+
break;
76+
case "--max-diffs":
77+
maxDiffs = Integer.parseInt(args[++i]);
78+
break;
79+
case "--ignore-tags":
80+
for (final String tag : args[++i].split(",")) {
81+
ignoreTags.add(tag.trim());
82+
}
83+
break;
84+
default:
85+
System.err.println("Unknown option: " + args[i]);
86+
return 2;
87+
}
88+
}
89+
90+
final SamReaderFactory factory = SamReaderFactory.makeDefault()
91+
.validationStringency(ValidationStringency.SILENT);
92+
if (referencePath != null) {
93+
factory.referenceSequence(new File(referencePath));
94+
}
95+
96+
try (final PrintWriter out = outputPath != null
97+
? new PrintWriter(new BufferedWriter(new FileWriter(outputPath)))
98+
: null) {
99+
return compareFiles(factory, file1, file2, lenient, maxDiffs, ignoreTags, out);
100+
} catch (final IOException e) {
101+
System.err.println("ERROR: " + e.getMessage());
102+
return 2;
103+
}
104+
}
105+
106+
/**
107+
* Compare two files and write results to the output writer (if non-null) and stderr.
108+
*/
109+
private static int compareFiles(final SamReaderFactory factory, final String file1, final String file2,
110+
final boolean lenient, final int maxDiffs, final Set<String> ignoreTags,
111+
final PrintWriter out) {
112+
long recordCount = 0;
113+
int diffCount = 0;
114+
115+
try (final SamReader reader1 = factory.open(new File(file1));
116+
final SamReader reader2 = factory.open(new File(file2))) {
117+
118+
final Iterator<SAMRecord> it1 = reader1.iterator();
119+
final Iterator<SAMRecord> it2 = reader2.iterator();
120+
121+
while (it1.hasNext() && it2.hasNext()) {
122+
final SAMRecord rec1 = it1.next();
123+
final SAMRecord rec2 = it2.next();
124+
recordCount++;
125+
126+
final String diff = lenient
127+
? compareLenient(rec1, rec2)
128+
: compareStrict(rec1, rec2, ignoreTags);
129+
130+
if (diff != null) {
131+
diffCount++;
132+
if (diffCount <= maxDiffs) {
133+
emit(out, "Record %d: %s", recordCount, diff);
134+
emit(out, " file1: %s", rec1.getSAMString().trim());
135+
emit(out, " file2: %s", rec2.getSAMString().trim());
136+
}
137+
}
138+
}
139+
140+
// Check for unequal record counts
141+
if (it1.hasNext() || it2.hasNext()) {
142+
long extra1 = 0, extra2 = 0;
143+
while (it1.hasNext()) { it1.next(); extra1++; }
144+
while (it2.hasNext()) { it2.next(); extra2++; }
145+
emit(out, "FAIL: Record count mismatch: file1 has %d records, file2 has %d records",
146+
recordCount + extra1, recordCount + extra2);
147+
return 0;
148+
}
149+
} catch (final Exception e) {
150+
System.err.println("ERROR: " + e.getMessage());
151+
e.printStackTrace(System.err);
152+
return 2;
153+
}
154+
155+
if (diffCount > 0) {
156+
if (diffCount > maxDiffs) {
157+
emit(out, "FAIL: %d mismatches in %,d records (showing first %d)", diffCount, recordCount, maxDiffs);
158+
} else {
159+
emit(out, "FAIL: %d mismatches in %,d records", diffCount, recordCount);
160+
}
161+
return 0;
162+
}
163+
164+
emit(out, "OK: %,d records match", recordCount);
165+
return 0;
166+
}
167+
168+
/** Write a formatted message to stderr and optionally to an output file. */
169+
private static void emit(final PrintWriter out, final String format, final Object... args) {
170+
final String message = String.format(format, args);
171+
System.err.println(message);
172+
if (out != null) {
173+
out.println(message);
174+
}
175+
}
176+
177+
/** Lenient comparison: only name, flags, position, bases, qualities. */
178+
private static String compareLenient(final SAMRecord a, final SAMRecord b) {
179+
if (!Objects.equals(a.getReadName(), b.getReadName()))
180+
return "readName: " + a.getReadName() + " vs " + b.getReadName();
181+
if (a.getFlags() != b.getFlags())
182+
return "flags: " + a.getFlags() + " vs " + b.getFlags();
183+
if (!Objects.equals(a.getReferenceName(), b.getReferenceName()))
184+
return "ref: " + a.getReferenceName() + " vs " + b.getReferenceName();
185+
if (a.getAlignmentStart() != b.getAlignmentStart())
186+
return "start: " + a.getAlignmentStart() + " vs " + b.getAlignmentStart();
187+
if (a.getAlignmentEnd() != b.getAlignmentEnd())
188+
return "end: " + a.getAlignmentEnd() + " vs " + b.getAlignmentEnd();
189+
if (!Arrays.equals(a.getReadBases(), b.getReadBases()))
190+
return "bases differ";
191+
if (!Arrays.equals(a.getBaseQualities(), b.getBaseQualities()))
192+
return "qualities differ";
193+
return null;
194+
}
195+
196+
/**
197+
* Strict comparison with known CRAM tolerance:
198+
* - Auto-generated MD/NM tags stripped when one side lacks them
199+
* - Unsigned B-array type differences tolerated (CRAM stores as signed)
200+
* - mapQ is ignored for unmapped reads (SAM spec leaves it undefined, and both
201+
* htsjdk and samtools normalize it to 0 on CRAM decode regardless of source)
202+
* - CIGAR =/X operators are normalized to M (CRAM does not preserve the distinction)
203+
*/
204+
private static String compareStrict(final SAMRecord a, final SAMRecord b, final Set<String> ignoreTags) {
205+
// Core fields
206+
final String lenientDiff = compareLenient(a, b);
207+
if (lenientDiff != null) return lenientDiff;
208+
209+
// Only compare mapQ for mapped reads; for unmapped reads the SAM spec leaves
210+
// mapQ undefined and CRAM normalizes it to 0 regardless of the source value.
211+
if (!a.getReadUnmappedFlag() && a.getMappingQuality() != b.getMappingQuality())
212+
return "mapQ: " + a.getMappingQuality() + " vs " + b.getMappingQuality();
213+
// CRAM stores match/mismatch information in "read features" separately from
214+
// the CIGAR operator, so =/X distinction is not preserved through a roundtrip:
215+
// both htsjdk and samtools emit M on decode. Normalize before comparing.
216+
final String cigarA = normalizeCigar(a.getCigarString());
217+
final String cigarB = normalizeCigar(b.getCigarString());
218+
if (!Objects.equals(cigarA, cigarB))
219+
return "cigar: " + a.getCigarString() + " vs " + b.getCigarString();
220+
if (!Objects.equals(a.getMateReferenceName(), b.getMateReferenceName()))
221+
return "mateRef: " + a.getMateReferenceName() + " vs " + b.getMateReferenceName();
222+
if (a.getMateAlignmentStart() != b.getMateAlignmentStart())
223+
return "mateStart: " + a.getMateAlignmentStart() + " vs " + b.getMateAlignmentStart();
224+
if (a.getInferredInsertSize() != b.getInferredInsertSize())
225+
return "tlen: " + a.getInferredInsertSize() + " vs " + b.getInferredInsertSize();
226+
227+
// Compare tags -- build maps, strip auto-generated and ignored tags
228+
final Map<String, Object> tagsA = getTagMap(a);
229+
final Map<String, Object> tagsB = getTagMap(b);
230+
231+
// Remove ignored tags
232+
for (final String tag : ignoreTags) {
233+
tagsA.remove(tag);
234+
tagsB.remove(tag);
235+
}
236+
237+
// Strip CRAM auto-generated tags when the other side doesn't have them
238+
for (final String tag : CRAM_AUTO_TAGS) {
239+
if (tagsA.containsKey(tag) && !tagsB.containsKey(tag)) tagsA.remove(tag);
240+
if (tagsB.containsKey(tag) && !tagsA.containsKey(tag)) tagsB.remove(tag);
241+
}
242+
243+
// Compare tag sets
244+
final Set<String> allTags = new TreeSet<>();
245+
allTags.addAll(tagsA.keySet());
246+
allTags.addAll(tagsB.keySet());
247+
248+
for (final String tag : allTags) {
249+
final Object valA = tagsA.get(tag);
250+
final Object valB = tagsB.get(tag);
251+
if (valA == null) return "tag " + tag + ": missing in file1, present in file2";
252+
if (valB == null) return "tag " + tag + ": present in file1, missing in file2";
253+
if (!tagValuesEqual(valA, valB))
254+
return "tag " + tag + ": values differ";
255+
}
256+
257+
return null;
258+
}
259+
260+
/**
261+
* Collapse =/X/M operators into a single run of M. Non-alignment operators
262+
* (I, D, N, S, H, P) are preserved as-is. Used to tolerate the CRAM
263+
* roundtrip normalization of =/X to M.
264+
*/
265+
static String normalizeCigar(final String cigar) {
266+
if (cigar == null || cigar.isEmpty() || "*".equals(cigar)) return cigar;
267+
final StringBuilder sb = new StringBuilder(cigar.length());
268+
int matchRun = 0;
269+
int i = 0;
270+
while (i < cigar.length()) {
271+
int j = i;
272+
while (j < cigar.length() && Character.isDigit(cigar.charAt(j))) j++;
273+
final int len = Integer.parseInt(cigar.substring(i, j));
274+
final char op = cigar.charAt(j);
275+
if (op == '=' || op == 'X' || op == 'M') {
276+
matchRun += len;
277+
} else {
278+
if (matchRun > 0) {
279+
sb.append(matchRun).append('M');
280+
matchRun = 0;
281+
}
282+
sb.append(len).append(op);
283+
}
284+
i = j + 1;
285+
}
286+
if (matchRun > 0) sb.append(matchRun).append('M');
287+
return sb.toString();
288+
}
289+
290+
private static Map<String, Object> getTagMap(final SAMRecord rec) {
291+
final Map<String, Object> map = new LinkedHashMap<>();
292+
for (final SAMRecord.SAMTagAndValue tv : rec.getAttributes()) {
293+
map.put(tv.tag, tv.value);
294+
}
295+
return map;
296+
}
297+
298+
/** Compare tag values with deep array equality, tolerating signed/unsigned type mismatch. */
299+
private static boolean tagValuesEqual(final Object a, final Object b) {
300+
if (a instanceof byte[] && b instanceof byte[]) return Arrays.equals((byte[]) a, (byte[]) b);
301+
if (a instanceof short[] && b instanceof short[]) return Arrays.equals((short[]) a, (short[]) b);
302+
if (a instanceof int[] && b instanceof int[]) return Arrays.equals((int[]) a, (int[]) b);
303+
if (a instanceof float[] && b instanceof float[]) return Arrays.equals((float[]) a, (float[]) b);
304+
return Objects.equals(a, b);
305+
}
306+
307+
private static boolean hasFlag(final String[] args, final String flag) {
308+
for (final String arg : args) {
309+
if (flag.equals(arg)) return true;
310+
}
311+
return false;
312+
}
313+
}
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
package htsjdk.samtools.cram;
2+
3+
import htsjdk.HtsjdkTest;
4+
import org.testng.Assert;
5+
import org.testng.annotations.Test;
6+
7+
public class CramComparisonTest extends HtsjdkTest {
8+
9+
@Test
10+
public void testNormalizeCigarNullAndEmpty() {
11+
Assert.assertNull(CramComparison.normalizeCigar(null));
12+
Assert.assertEquals(CramComparison.normalizeCigar(""), "");
13+
Assert.assertEquals(CramComparison.normalizeCigar("*"), "*");
14+
}
15+
16+
@Test
17+
public void testNormalizeCigarPureM() {
18+
Assert.assertEquals(CramComparison.normalizeCigar("150M"), "150M");
19+
}
20+
21+
@Test
22+
public void testNormalizeCigarEqualToM() {
23+
Assert.assertEquals(CramComparison.normalizeCigar("150="), "150M");
24+
}
25+
26+
@Test
27+
public void testNormalizeCigarEqualsAndX() {
28+
Assert.assertEquals(CramComparison.normalizeCigar("35=1X5=1X5="), "47M");
29+
}
30+
31+
@Test
32+
public void testNormalizeCigarWithIndelsAndSoftClips() {
33+
Assert.assertEquals(
34+
CramComparison.normalizeCigar("270S31=2I48=1D15=4I1=1I12="),
35+
"270S31M2I48M1D15M4I1M1I12M");
36+
}
37+
38+
@Test
39+
public void testNormalizeCigarLongWithRuns() {
40+
Assert.assertEquals(
41+
CramComparison.normalizeCigar("270S31=2I48=1D15=4I1=1I12=1I38=1D30=1I53=1D23=1D35=1X5=1X5=1X5=1X5=1X5=1X8=1D39=1D19=1D6=1D42=1D28=14955S"),
42+
"270S31M2I48M1D15M4I1M1I12M1I38M1D30M1I53M1D23M1D74M1D39M1D19M1D6M1D42M1D28M14955S");
43+
}
44+
45+
@Test
46+
public void testNormalizeCigarPreservesNonMatchOps() {
47+
Assert.assertEquals(CramComparison.normalizeCigar("100M100N100M"), "100M100N100M");
48+
Assert.assertEquals(CramComparison.normalizeCigar("10S100M10S"), "10S100M10S");
49+
Assert.assertEquals(CramComparison.normalizeCigar("10H100M10H"), "10H100M10H");
50+
}
51+
}

validation/.gitignore

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Snakemake working files
2+
.snakemake/
3+
output/
4+
5+
# Downloaded remote datasets and references
6+
downloads/
7+
8+
# Rule logs
9+
logs/
10+
11+
# Pixi environment
12+
.pixi/

0 commit comments

Comments
 (0)