Skip to content

Commit 8cc1e37

Browse files
Yossi Farjoun authored and lbergelson committed
disallowing bad characters in SAMSequenceRecord (reference sequence) names (#1238)
* implementing the spec change in samtools/hts-specs#333
* this disallows a number of characters from reference sequence names; these are characters like ',' which do not appear in standard references and which cause parsing issues if they are allowed
1 parent 6ac7a60 commit 8cc1e37

3 files changed

Lines changed: 48 additions & 17 deletions

File tree

src/main/java/htsjdk/samtools/SAMSequenceRecord.java

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
package htsjdk.samtools;
2525

2626

27+
import htsjdk.variant.variantcontext.VariantContext;
28+
2729
import java.math.BigInteger;
2830
import java.net.URI;
2931
import java.net.URISyntaxException;
@@ -57,23 +59,27 @@ public class SAMSequenceRecord extends AbstractSAMHeaderRecord implements Clonea
5759

5860

5961
/**
60-
* This is not a valid sequence name, because it is reserved in the MRNM field of SAM text format
62+
* This is not a valid sequence name, because it is reserved in the RNEXT field of SAM text format
6163
* to mean "same reference as RNAME field."
6264
*/
63-
public static final String RESERVED_MRNM_SEQUENCE_NAME = "=";
65+
66+
public static final String RESERVED_RNEXT_SEQUENCE_NAME = "=";
67+
68+
/** @deprecated Use {@link #RESERVED_RNEXT_SEQUENCE_NAME} instead. */
69+
@Deprecated
70+
public static final String RESERVED_MRNM_SEQUENCE_NAME = RESERVED_RNEXT_SEQUENCE_NAME;
6471

6572
/**
6673
* The standard tags are stored in text header without type information, because the type of these tags is known.
6774
*/
6875
public static final Set<String> STANDARD_TAGS =
69-
new HashSet<String>(Arrays.asList(SEQUENCE_NAME_TAG, SEQUENCE_LENGTH_TAG, ASSEMBLY_TAG, MD5_TAG, URI_TAG,
70-
SPECIES_TAG));
76+
new HashSet<>(Arrays.asList(SEQUENCE_NAME_TAG, SEQUENCE_LENGTH_TAG, ASSEMBLY_TAG, MD5_TAG, URI_TAG, SPECIES_TAG));
7177

72-
// Split on any whitespace
73-
private static final Pattern SEQUENCE_NAME_SPLITTER = Pattern.compile("\\s");
7478
// These are the chars matched by \\s.
7579
private static final char[] WHITESPACE_CHARS = {' ', '\t', '\n', '\013', '\f', '\r'}; // \013 is vertical tab
7680

81+
private static final Pattern LEGAL_RNAME_PATTERN = Pattern.compile("[0-9A-Za-z!#$%&+./:;?@^_|~-][0-9A-Za-z!#$%&*+./:;=?@^_|~-]*");
82+
7783
/**
7884
* @deprecated Use {@link #SAMSequenceRecord(String, int)} instead.
7985
* sequenceLength is required for the object to be considered valid.
@@ -85,9 +91,6 @@ public SAMSequenceRecord(final String name) {
8591

8692
public SAMSequenceRecord(final String name, final int sequenceLength) {
8793
if (name != null) {
88-
if (SEQUENCE_NAME_SPLITTER.matcher(name).find()) {
89-
throw new SAMException("Sequence name contains invalid character: " + name);
90-
}
9194
validateSequenceName(name);
9295
mSequenceName = name.intern();
9396
} else {
@@ -188,8 +191,8 @@ public final SAMSequenceRecord clone() {
188191
public static String truncateSequenceName(final String sequenceName) {
189192
/*
190193
* Instead of using regex split, do it manually for better performance.
191-
return SEQUENCE_NAME_SPLITTER.split(sequenceName, 2)[0];
192-
*/
194+
*/
195+
193196
int truncateAt = sequenceName.length();
194197
for (final char c : WHITESPACE_CHARS) {
195198
int index = sequenceName.indexOf(c);
@@ -204,8 +207,8 @@ public static String truncateSequenceName(final String sequenceName) {
204207
* Throw an exception if the sequence name is not valid.
205208
*/
206209
public static void validateSequenceName(final String name) {
207-
if (RESERVED_MRNM_SEQUENCE_NAME.equals(name)) {
208-
throw new SAMException("'" + RESERVED_MRNM_SEQUENCE_NAME + "' is not a valid sequence name");
210+
if (!LEGAL_RNAME_PATTERN.matcher(name).useAnchoringBounds(true).matches()) {
211+
throw new SAMException(String.format("Sequence name '%s' doesn't match regex: '%s' ", name, LEGAL_RNAME_PATTERN));
209212
}
210213
}
211214

src/test/java/htsjdk/samtools/SAMSequenceRecordTest.java

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,11 +85,32 @@ public void testIsSameSequence(final SAMSequenceRecord rec1 , final SAMSequenceR
8585
}
8686

8787
@Test
88-
public void testSetAndCheckDescription(){
88+
public void testSetAndCheckDescription() {
8989
final SAMSequenceRecord record = new SAMSequenceRecord("Test", 1000);
9090
Assert.assertNull(record.getDescription());
9191
final String description = "A description.";
9292
record.setDescription(description);
9393
Assert.assertEquals(record.getDescription(), description);
9494
}
95+
96+
@DataProvider
97+
public Object[][] illegalSequenceNames(){
98+
return new Object[][]{
99+
{"space "},
100+
{"comma,"},
101+
{"lbrace["},
102+
{"rbrace]"},
103+
{"slash\\"},
104+
{"smaller<"},
105+
{"bigger>"},
106+
{"lparen("},
107+
{"rparen)"},
108+
{"lbracket{"},
109+
{"rbracket}"}};
110+
}
111+
112+
@Test(dataProvider = "illegalSequenceNames", expectedExceptions = SAMException.class)
113+
public void testIllegalSequenceNames(final String sequenceName){
114+
new SAMSequenceRecord(sequenceName,100);
115+
}
95116
}

src/test/java/htsjdk/samtools/SequenceNameTruncationAndValidationTest.java

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ public class SequenceNameTruncationAndValidationTest extends HtsjdkTest {
4343
@Test(expectedExceptions = {SAMException.class}, dataProvider = "badSequenceNames")
4444
public void testSequenceRecordThrowsWhenInvalid(final String sequenceName) {
4545
new SAMSequenceRecord(sequenceName, 123);
46-
Assert.fail("Should not reach here.");
46+
Assert.fail("Should not reach here. Sequence " + sequenceName + " should have failed.");
4747
}
4848

4949
@DataProvider(name = "badSequenceNames")
@@ -53,7 +53,13 @@ public Object[][] badSequenceNames() {
5353
{"\t"},
5454
{"\n"},
5555
{"="},
56-
{"Hi, Mom!"}
56+
{"Hi: Mom!"},
57+
{"=Hi:Mom!"},
58+
{"Hi:'Mom!"},
59+
{"Hi:\"Mom!"},
60+
{"Hi:)Mom!"},
61+
{"Hi:(Mom!"},
62+
{"Hi,@Mom!"}
5763
};
5864
}
5965

@@ -65,7 +71,8 @@ public void testSequenceRecordPositiveTest(final String sequenceName) {
6571
@DataProvider(name = "goodSequenceNames")
6672
public Object[][] goodSequenceNames() {
6773
return new Object[][]{
68-
{"Hi,@Mom!"}
74+
{"Hi:@Mom!"},
75+
{"Hi:=Mom!"}
6976
};
7077
}
7178

0 commit comments

Comments
 (0)