|
| 1 | +package htsjdk.samtools.cram; |
| 2 | + |
| 3 | +import htsjdk.samtools.*; |
| 4 | +import htsjdk.samtools.util.SequenceUtil; |
| 5 | + |
| 6 | +import java.io.*; |
| 7 | +import java.util.*; |
| 8 | + |
| 9 | +/** |
| 10 | + * Command-line tool for comparing two SAM/BAM/CRAM files record-by-record. |
| 11 | + * Handles known CRAM/SAM differences (auto-generated MD/NM, unsigned B-array types) |
| 12 | + * and reports mismatches with clear field-level detail. |
| 13 | + * |
| 14 | + * <p>Exit codes: 0 = comparison completed (match or mismatch), 2 = error. |
| 15 | + */ |
| 16 | +public class CramComparison { |
| 17 | + |
| 18 | + private static final String USAGE = String.join("\n", |
| 19 | + "Usage: CramComparison <file1> <file2> [options]", |
| 20 | + "", |
| 21 | + "Compare two SAM/BAM/CRAM files record by record.", |
| 22 | + "", |
| 23 | + "Options:", |
| 24 | + " --reference <path> Reference FASTA (required for CRAM)", |
| 25 | + " --output <path> Write results to file (default: stderr only)", |
| 26 | + " --lenient Compare only name, flags, position, bases, qualities", |
| 27 | + " --max-diffs <N> Stop after N mismatches (default: 10)", |
| 28 | + " --ignore-tags <list> Comma-separated tags to skip (e.g. MD,NM)", |
| 29 | + " --help Print this help message", |
| 30 | + "", |
| 31 | + "Exit codes: 0 = comparison completed, 2 = error" |
| 32 | + ); |
| 33 | + |
| 34 | + // Tags that CRAM auto-generates on decode and may not be in the source |
| 35 | + private static final Set<String> CRAM_AUTO_TAGS = Set.of("MD", "NM"); |
| 36 | + |
| 37 | + public static void main(final String[] args) { |
| 38 | + if (hasFlag(args, "--help") || hasFlag(args, "-h") || args.length < 2) { |
| 39 | + System.out.println(USAGE); |
| 40 | + System.exit(args.length < 2 ? 2 : 0); |
| 41 | + } |
| 42 | + |
| 43 | + try { |
| 44 | + System.exit(run(args)); |
| 45 | + } catch (final Exception e) { |
| 46 | + System.err.println("ERROR: " + e.getMessage()); |
| 47 | + System.exit(2); |
| 48 | + } |
| 49 | + } |
| 50 | + |
| 51 | + /** |
| 52 | + * Run the comparison and return the exit code (0=match, 1=mismatch, 2=error). |
| 53 | + * Separated from main() for testability. |
| 54 | + */ |
| 55 | + public static int run(final String[] args) { |
| 56 | + final String file1 = args[0]; |
| 57 | + final String file2 = args[1]; |
| 58 | + |
| 59 | + String referencePath = null; |
| 60 | + String outputPath = null; |
| 61 | + boolean lenient = false; |
| 62 | + int maxDiffs = 10; |
| 63 | + final Set<String> ignoreTags = new HashSet<>(); |
| 64 | + |
| 65 | + for (int i = 2; i < args.length; i++) { |
| 66 | + switch (args[i]) { |
| 67 | + case "--reference": case "-r": |
| 68 | + referencePath = args[++i]; |
| 69 | + break; |
| 70 | + case "--output": case "-o": |
| 71 | + outputPath = args[++i]; |
| 72 | + break; |
| 73 | + case "--lenient": |
| 74 | + lenient = true; |
| 75 | + break; |
| 76 | + case "--max-diffs": |
| 77 | + maxDiffs = Integer.parseInt(args[++i]); |
| 78 | + break; |
| 79 | + case "--ignore-tags": |
| 80 | + for (final String tag : args[++i].split(",")) { |
| 81 | + ignoreTags.add(tag.trim()); |
| 82 | + } |
| 83 | + break; |
| 84 | + default: |
| 85 | + System.err.println("Unknown option: " + args[i]); |
| 86 | + return 2; |
| 87 | + } |
| 88 | + } |
| 89 | + |
| 90 | + final SamReaderFactory factory = SamReaderFactory.makeDefault() |
| 91 | + .validationStringency(ValidationStringency.SILENT); |
| 92 | + if (referencePath != null) { |
| 93 | + factory.referenceSequence(new File(referencePath)); |
| 94 | + } |
| 95 | + |
| 96 | + try (final PrintWriter out = outputPath != null |
| 97 | + ? new PrintWriter(new BufferedWriter(new FileWriter(outputPath))) |
| 98 | + : null) { |
| 99 | + return compareFiles(factory, file1, file2, lenient, maxDiffs, ignoreTags, out); |
| 100 | + } catch (final IOException e) { |
| 101 | + System.err.println("ERROR: " + e.getMessage()); |
| 102 | + return 2; |
| 103 | + } |
| 104 | + } |
| 105 | + |
| 106 | + /** |
| 107 | + * Compare two files and write results to the output writer (if non-null) and stderr. |
| 108 | + */ |
| 109 | + private static int compareFiles(final SamReaderFactory factory, final String file1, final String file2, |
| 110 | + final boolean lenient, final int maxDiffs, final Set<String> ignoreTags, |
| 111 | + final PrintWriter out) { |
| 112 | + long recordCount = 0; |
| 113 | + int diffCount = 0; |
| 114 | + |
| 115 | + try (final SamReader reader1 = factory.open(new File(file1)); |
| 116 | + final SamReader reader2 = factory.open(new File(file2))) { |
| 117 | + |
| 118 | + final Iterator<SAMRecord> it1 = reader1.iterator(); |
| 119 | + final Iterator<SAMRecord> it2 = reader2.iterator(); |
| 120 | + |
| 121 | + while (it1.hasNext() && it2.hasNext()) { |
| 122 | + final SAMRecord rec1 = it1.next(); |
| 123 | + final SAMRecord rec2 = it2.next(); |
| 124 | + recordCount++; |
| 125 | + |
| 126 | + final String diff = lenient |
| 127 | + ? compareLenient(rec1, rec2) |
| 128 | + : compareStrict(rec1, rec2, ignoreTags); |
| 129 | + |
| 130 | + if (diff != null) { |
| 131 | + diffCount++; |
| 132 | + if (diffCount <= maxDiffs) { |
| 133 | + emit(out, "Record %d: %s", recordCount, diff); |
| 134 | + emit(out, " file1: %s", rec1.getSAMString().trim()); |
| 135 | + emit(out, " file2: %s", rec2.getSAMString().trim()); |
| 136 | + } |
| 137 | + } |
| 138 | + } |
| 139 | + |
| 140 | + // Check for unequal record counts |
| 141 | + if (it1.hasNext() || it2.hasNext()) { |
| 142 | + long extra1 = 0, extra2 = 0; |
| 143 | + while (it1.hasNext()) { it1.next(); extra1++; } |
| 144 | + while (it2.hasNext()) { it2.next(); extra2++; } |
| 145 | + emit(out, "FAIL: Record count mismatch: file1 has %d records, file2 has %d records", |
| 146 | + recordCount + extra1, recordCount + extra2); |
| 147 | + return 0; |
| 148 | + } |
| 149 | + } catch (final Exception e) { |
| 150 | + System.err.println("ERROR: " + e.getMessage()); |
| 151 | + e.printStackTrace(System.err); |
| 152 | + return 2; |
| 153 | + } |
| 154 | + |
| 155 | + if (diffCount > 0) { |
| 156 | + if (diffCount > maxDiffs) { |
| 157 | + emit(out, "FAIL: %d mismatches in %,d records (showing first %d)", diffCount, recordCount, maxDiffs); |
| 158 | + } else { |
| 159 | + emit(out, "FAIL: %d mismatches in %,d records", diffCount, recordCount); |
| 160 | + } |
| 161 | + return 0; |
| 162 | + } |
| 163 | + |
| 164 | + emit(out, "OK: %,d records match", recordCount); |
| 165 | + return 0; |
| 166 | + } |
| 167 | + |
| 168 | + /** Write a formatted message to stderr and optionally to an output file. */ |
| 169 | + private static void emit(final PrintWriter out, final String format, final Object... args) { |
| 170 | + final String message = String.format(format, args); |
| 171 | + System.err.println(message); |
| 172 | + if (out != null) { |
| 173 | + out.println(message); |
| 174 | + } |
| 175 | + } |
| 176 | + |
| 177 | + /** Lenient comparison: only name, flags, position, bases, qualities. */ |
| 178 | + private static String compareLenient(final SAMRecord a, final SAMRecord b) { |
| 179 | + if (!Objects.equals(a.getReadName(), b.getReadName())) |
| 180 | + return "readName: " + a.getReadName() + " vs " + b.getReadName(); |
| 181 | + if (a.getFlags() != b.getFlags()) |
| 182 | + return "flags: " + a.getFlags() + " vs " + b.getFlags(); |
| 183 | + if (!Objects.equals(a.getReferenceName(), b.getReferenceName())) |
| 184 | + return "ref: " + a.getReferenceName() + " vs " + b.getReferenceName(); |
| 185 | + if (a.getAlignmentStart() != b.getAlignmentStart()) |
| 186 | + return "start: " + a.getAlignmentStart() + " vs " + b.getAlignmentStart(); |
| 187 | + if (a.getAlignmentEnd() != b.getAlignmentEnd()) |
| 188 | + return "end: " + a.getAlignmentEnd() + " vs " + b.getAlignmentEnd(); |
| 189 | + if (!Arrays.equals(a.getReadBases(), b.getReadBases())) |
| 190 | + return "bases differ"; |
| 191 | + if (!Arrays.equals(a.getBaseQualities(), b.getBaseQualities())) |
| 192 | + return "qualities differ"; |
| 193 | + return null; |
| 194 | + } |
| 195 | + |
| 196 | + /** |
| 197 | + * Strict comparison with known CRAM tolerance: |
| 198 | + * - Auto-generated MD/NM tags stripped when one side lacks them |
| 199 | + * - Unsigned B-array type differences tolerated (CRAM stores as signed) |
| 200 | + * - mapQ is ignored for unmapped reads (SAM spec leaves it undefined, and both |
| 201 | + * htsjdk and samtools normalize it to 0 on CRAM decode regardless of source) |
| 202 | + * - CIGAR =/X operators are normalized to M (CRAM does not preserve the distinction) |
| 203 | + */ |
| 204 | + private static String compareStrict(final SAMRecord a, final SAMRecord b, final Set<String> ignoreTags) { |
| 205 | + // Core fields |
| 206 | + final String lenientDiff = compareLenient(a, b); |
| 207 | + if (lenientDiff != null) return lenientDiff; |
| 208 | + |
| 209 | + // Only compare mapQ for mapped reads; for unmapped reads the SAM spec leaves |
| 210 | + // mapQ undefined and CRAM normalizes it to 0 regardless of the source value. |
| 211 | + if (!a.getReadUnmappedFlag() && a.getMappingQuality() != b.getMappingQuality()) |
| 212 | + return "mapQ: " + a.getMappingQuality() + " vs " + b.getMappingQuality(); |
| 213 | + // CRAM stores match/mismatch information in "read features" separately from |
| 214 | + // the CIGAR operator, so =/X distinction is not preserved through a roundtrip: |
| 215 | + // both htsjdk and samtools emit M on decode. Normalize before comparing. |
| 216 | + final String cigarA = normalizeCigar(a.getCigarString()); |
| 217 | + final String cigarB = normalizeCigar(b.getCigarString()); |
| 218 | + if (!Objects.equals(cigarA, cigarB)) |
| 219 | + return "cigar: " + a.getCigarString() + " vs " + b.getCigarString(); |
| 220 | + if (!Objects.equals(a.getMateReferenceName(), b.getMateReferenceName())) |
| 221 | + return "mateRef: " + a.getMateReferenceName() + " vs " + b.getMateReferenceName(); |
| 222 | + if (a.getMateAlignmentStart() != b.getMateAlignmentStart()) |
| 223 | + return "mateStart: " + a.getMateAlignmentStart() + " vs " + b.getMateAlignmentStart(); |
| 224 | + if (a.getInferredInsertSize() != b.getInferredInsertSize()) |
| 225 | + return "tlen: " + a.getInferredInsertSize() + " vs " + b.getInferredInsertSize(); |
| 226 | + |
| 227 | + // Compare tags -- build maps, strip auto-generated and ignored tags |
| 228 | + final Map<String, Object> tagsA = getTagMap(a); |
| 229 | + final Map<String, Object> tagsB = getTagMap(b); |
| 230 | + |
| 231 | + // Remove ignored tags |
| 232 | + for (final String tag : ignoreTags) { |
| 233 | + tagsA.remove(tag); |
| 234 | + tagsB.remove(tag); |
| 235 | + } |
| 236 | + |
| 237 | + // Strip CRAM auto-generated tags when the other side doesn't have them |
| 238 | + for (final String tag : CRAM_AUTO_TAGS) { |
| 239 | + if (tagsA.containsKey(tag) && !tagsB.containsKey(tag)) tagsA.remove(tag); |
| 240 | + if (tagsB.containsKey(tag) && !tagsA.containsKey(tag)) tagsB.remove(tag); |
| 241 | + } |
| 242 | + |
| 243 | + // Compare tag sets |
| 244 | + final Set<String> allTags = new TreeSet<>(); |
| 245 | + allTags.addAll(tagsA.keySet()); |
| 246 | + allTags.addAll(tagsB.keySet()); |
| 247 | + |
| 248 | + for (final String tag : allTags) { |
| 249 | + final Object valA = tagsA.get(tag); |
| 250 | + final Object valB = tagsB.get(tag); |
| 251 | + if (valA == null) return "tag " + tag + ": missing in file1, present in file2"; |
| 252 | + if (valB == null) return "tag " + tag + ": present in file1, missing in file2"; |
| 253 | + if (!tagValuesEqual(valA, valB)) |
| 254 | + return "tag " + tag + ": values differ"; |
| 255 | + } |
| 256 | + |
| 257 | + return null; |
| 258 | + } |
| 259 | + |
| 260 | + /** |
| 261 | + * Collapse =/X/M operators into a single run of M. Non-alignment operators |
| 262 | + * (I, D, N, S, H, P) are preserved as-is. Used to tolerate the CRAM |
| 263 | + * roundtrip normalization of =/X to M. |
| 264 | + */ |
| 265 | + static String normalizeCigar(final String cigar) { |
| 266 | + if (cigar == null || cigar.isEmpty() || "*".equals(cigar)) return cigar; |
| 267 | + final StringBuilder sb = new StringBuilder(cigar.length()); |
| 268 | + int matchRun = 0; |
| 269 | + int i = 0; |
| 270 | + while (i < cigar.length()) { |
| 271 | + int j = i; |
| 272 | + while (j < cigar.length() && Character.isDigit(cigar.charAt(j))) j++; |
| 273 | + final int len = Integer.parseInt(cigar.substring(i, j)); |
| 274 | + final char op = cigar.charAt(j); |
| 275 | + if (op == '=' || op == 'X' || op == 'M') { |
| 276 | + matchRun += len; |
| 277 | + } else { |
| 278 | + if (matchRun > 0) { |
| 279 | + sb.append(matchRun).append('M'); |
| 280 | + matchRun = 0; |
| 281 | + } |
| 282 | + sb.append(len).append(op); |
| 283 | + } |
| 284 | + i = j + 1; |
| 285 | + } |
| 286 | + if (matchRun > 0) sb.append(matchRun).append('M'); |
| 287 | + return sb.toString(); |
| 288 | + } |
| 289 | + |
| 290 | + private static Map<String, Object> getTagMap(final SAMRecord rec) { |
| 291 | + final Map<String, Object> map = new LinkedHashMap<>(); |
| 292 | + for (final SAMRecord.SAMTagAndValue tv : rec.getAttributes()) { |
| 293 | + map.put(tv.tag, tv.value); |
| 294 | + } |
| 295 | + return map; |
| 296 | + } |
| 297 | + |
| 298 | + /** Compare tag values with deep array equality, tolerating signed/unsigned type mismatch. */ |
| 299 | + private static boolean tagValuesEqual(final Object a, final Object b) { |
| 300 | + if (a instanceof byte[] && b instanceof byte[]) return Arrays.equals((byte[]) a, (byte[]) b); |
| 301 | + if (a instanceof short[] && b instanceof short[]) return Arrays.equals((short[]) a, (short[]) b); |
| 302 | + if (a instanceof int[] && b instanceof int[]) return Arrays.equals((int[]) a, (int[]) b); |
| 303 | + if (a instanceof float[] && b instanceof float[]) return Arrays.equals((float[]) a, (float[]) b); |
| 304 | + return Objects.equals(a, b); |
| 305 | + } |
| 306 | + |
| 307 | + private static boolean hasFlag(final String[] args, final String flag) { |
| 308 | + for (final String arg : args) { |
| 309 | + if (flag.equals(arg)) return true; |
| 310 | + } |
| 311 | + return false; |
| 312 | + } |
| 313 | +} |
0 commit comments