Skip to content

Commit 085d394

Browse files
committed
Robustified python API
1 parent a5f52a7 commit 085d394

2 files changed

Lines changed: 64 additions & 23 deletions

File tree

python-kalign/__init__.py

Lines changed: 60 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -108,14 +108,48 @@ def align(
108108
<class 'skbio.alignment._tabular_msa.TabularMSA'>
109109
"""
110110

111+
# Input validation - replicate C CLI robustness
111112
if not sequences:
112-
raise ValueError("Empty sequence list provided")
113-
113+
raise ValueError("No sequences were found in the input")
114+
115+
if len(sequences) < 2:
116+
if len(sequences) == 0:
117+
raise ValueError("No sequences were found in the input")
118+
elif len(sequences) == 1:
119+
raise ValueError("Only 1 sequence was found in the input - at least 2 sequences are required for alignment")
120+
114121
if not all(isinstance(seq, str) for seq in sequences):
115122
raise ValueError("All sequences must be strings")
116123

117-
if not all(seq.strip() for seq in sequences):
118-
raise ValueError("All sequences must be non-empty")
124+
# Check for empty or whitespace-only sequences
125+
empty_sequences = []
126+
for i, seq in enumerate(sequences):
127+
if not seq or not seq.strip():
128+
empty_sequences.append(i)
129+
130+
if empty_sequences:
131+
if len(empty_sequences) == 1:
132+
raise ValueError(f"Sequence at index {empty_sequences[0]} is empty or contains only whitespace")
133+
else:
134+
raise ValueError(f"Sequences at indices {empty_sequences} are empty or contain only whitespace")
135+
136+
# Check for valid sequence characters (basic validation)
137+
for i, seq in enumerate(sequences):
138+
# Remove common whitespace and check if anything remains
139+
cleaned_seq = ''.join(seq.split())
140+
if len(cleaned_seq) == 0:
141+
raise ValueError(f"Sequence at index {i} contains only whitespace characters")
142+
143+
# Check for obviously invalid characters (control characters, etc)
144+
if any(ord(char) < 32 for char in cleaned_seq if char not in '\t\n\r'):
145+
raise ValueError(f"Sequence at index {i} contains invalid control characters")
146+
147+
# Warn about very short sequences (like C CLI warnings)
148+
very_short_sequences = [i for i, seq in enumerate(sequences) if len(seq.strip()) < 3]
149+
if very_short_sequences and len(very_short_sequences) > len(sequences) * 0.5:
150+
import warnings
151+
warnings.warn(f"Many sequences are very short (< 3 characters). This may affect alignment quality.",
152+
UserWarning, stacklevel=2)
119153

120154
# Convert string sequence types to integers
121155
seq_type_map = {
@@ -137,21 +171,39 @@ def align(
137171
else:
138172
seq_type_int = seq_type
139173

140-
# Use defaults if not specified
174+
# Parameter validation and defaults
141175
if gap_open is None:
142176
gap_open = -1.0
177+
elif not isinstance(gap_open, (int, float)):
178+
raise ValueError("gap_open must be a number")
179+
elif gap_open > 0:
180+
raise ValueError("gap_open must be negative or zero (penalty values)")
181+
143182
if gap_extend is None:
144183
gap_extend = -1.0
184+
elif not isinstance(gap_extend, (int, float)):
185+
raise ValueError("gap_extend must be a number")
186+
elif gap_extend > 0:
187+
raise ValueError("gap_extend must be negative or zero (penalty values)")
188+
145189
if terminal_gap_extend is None:
146190
terminal_gap_extend = -1.0
191+
elif not isinstance(terminal_gap_extend, (int, float)):
192+
raise ValueError("terminal_gap_extend must be a number")
193+
elif terminal_gap_extend > 0:
194+
raise ValueError("terminal_gap_extend must be negative or zero (penalty values)")
147195

148196
# Handle thread count
149197
if n_threads is None:
150198
n_threads = get_num_threads()
151-
152-
# Validate thread count
153-
if n_threads < 1:
199+
elif not isinstance(n_threads, int):
200+
raise ValueError("n_threads must be an integer")
201+
elif n_threads < 1:
154202
raise ValueError("n_threads must be at least 1")
203+
elif n_threads > 1024: # reasonable upper limit
204+
import warnings
205+
warnings.warn(f"Very high thread count ({n_threads}) may not improve performance and could cause issues.",
206+
UserWarning, stacklevel=2)
155207

156208
# Call the C++ binding for core alignment
157209
try:

tests/python/test_basic_alignment.py

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44

55
import pytest
6+
import sys
67
import kalign
78
from conftest import assert_valid_alignment, assert_alignment_preserves_characters
89

@@ -57,20 +58,8 @@ def test_empty_sequence_list(self):
5758
kalign.align([])
5859

5960
def test_single_sequence(self):
60-
"""Test alignment with single sequence should raise error or handle gracefully."""
61+
"""Test alignment with single sequence should raise error."""
6162
single_seq = ["ATCGATCG"]
6263
# Kalign requires at least 2 sequences for alignment
63-
# The expected behavior is to raise RuntimeError, but we handle platform differences
64-
try:
65-
result = kalign.align(single_seq, seq_type="dna")
66-
# If it somehow succeeds, the result should be sensible
67-
assert isinstance(result, list)
68-
assert len(result) == 1
69-
assert result[0] == single_seq[0] # Should return original sequence
70-
except RuntimeError as e:
71-
# This is the expected behavior
72-
assert "alignment failed" in str(e).lower() or "only 1 sequences found" in str(e).lower()
73-
except Exception as e:
74-
# Log unexpected exceptions but don't fail the test due to platform differences
75-
print(f"Unexpected exception type: {type(e).__name__}: {e}")
76-
pytest.skip(f"Platform-specific handling difference: {type(e).__name__}")
64+
with pytest.raises(ValueError, match="At least 2 sequences are required"):
65+
kalign.align(single_seq, seq_type="dna")

0 commit comments

Comments
 (0)