Skip to content

Commit 68eac0d

Browse files
committed
Added char validation
1 parent d8f14c6 commit 68eac0d

2 files changed

Lines changed: 17 additions & 1 deletion

File tree

python-kalign/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,14 @@ def align(
154154
f"Sequence at index {i} contains invalid control characters"
155155
)
156156

157+
# Reject digits, which cause platform-specific segfaults (only digits are checked here)
158+
invalid_chars = set(char for char in cleaned_seq if char.isdigit())
159+
if invalid_chars:
160+
raise ValueError(
161+
f"Sequence at index {i} contains invalid characters: {sorted(invalid_chars)}. "
162+
f"Sequences should only contain valid biological sequence characters."
163+
)
164+
157165
# Warn about very short sequences (like C CLI warnings)
158166
very_short_sequences = [
159167
i for i, seq in enumerate(sequences) if len(seq.strip()) < 3

tests/python/test_edge_cases.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,15 @@ def test_mixed_case_sequences(self, mixed_case):
5555

5656
def test_large_sequence_count(self):
5757
"""Test with many sequences."""
58-
many_seqs = [f"ATCG{i:02d}ATCG" for i in range(20)]
58+
# Use valid DNA characters only - create variation with different bases
59+
base_patterns = ["ATCG", "TACG", "GATC", "CGTA", "TGCA", "ACGT", "GCTA", "CTAG"]
60+
many_seqs = []
61+
for i in range(20):
62+
pattern = base_patterns[i % len(base_patterns)]
63+
# Add length variation with valid DNA bases
64+
suffix = "A" * (i % 4) # Add 0-3 A's for length variation
65+
many_seqs.append(f"{pattern}{suffix}TG")
66+
5967
aligned = kalign.align(many_seqs, seq_type="dna")
6068

6169
assert len(aligned) == 20

0 commit comments

Comments
 (0)