@@ -108,14 +108,48 @@ def align(
108108 <class 'skbio.alignment._tabular_msa.TabularMSA'>
109109 """
110110
111+ # Input validation - replicate C CLI robustness
111112 if not sequences :
112- raise ValueError ("Empty sequence list provided" )
113-
113+ raise ValueError ("No sequences were found in the input" )
114+
115+ if len (sequences ) < 2 :
116+ if len (sequences ) == 0 :
117+ raise ValueError ("No sequences were found in the input" )
118+ elif len (sequences ) == 1 :
119+ raise ValueError ("Only 1 sequence was found in the input - at least 2 sequences are required for alignment" )
120+
114121 if not all (isinstance (seq , str ) for seq in sequences ):
115122 raise ValueError ("All sequences must be strings" )
116123
117- if not all (seq .strip () for seq in sequences ):
118- raise ValueError ("All sequences must be non-empty" )
124+ # Check for empty or whitespace-only sequences
125+ empty_sequences = []
126+ for i , seq in enumerate (sequences ):
127+ if not seq or not seq .strip ():
128+ empty_sequences .append (i )
129+
130+ if empty_sequences :
131+ if len (empty_sequences ) == 1 :
132+ raise ValueError (f"Sequence at index { empty_sequences [0 ]} is empty or contains only whitespace" )
133+ else :
134+ raise ValueError (f"Sequences at indices { empty_sequences } are empty or contain only whitespace" )
135+
136+ # Check for valid sequence characters (basic validation)
137+ for i , seq in enumerate (sequences ):
138+ # Remove common whitespace and check if anything remains
139+ cleaned_seq = '' .join (seq .split ())
140+ if len (cleaned_seq ) == 0 :
141+ raise ValueError (f"Sequence at index { i } contains only whitespace characters" )
142+
143+ # Check for obviously invalid characters (control characters, etc)
144+ if any (ord (char ) < 32 for char in cleaned_seq if char not in '\t \n \r ' ):
145+ raise ValueError (f"Sequence at index { i } contains invalid control characters" )
146+
147+ # Warn about very short sequences (like C CLI warnings)
148+ very_short_sequences = [i for i , seq in enumerate (sequences ) if len (seq .strip ()) < 3 ]
149+ if very_short_sequences and len (very_short_sequences ) > len (sequences ) * 0.5 :
150+ import warnings
151+ warnings .warn (f"Many sequences are very short (< 3 characters). This may affect alignment quality." ,
152+ UserWarning , stacklevel = 2 )
119153
120154 # Convert string sequence types to integers
121155 seq_type_map = {
@@ -137,21 +171,39 @@ def align(
137171 else :
138172 seq_type_int = seq_type
139173
140- # Use defaults if not specified
174+ # Parameter validation and defaults
141175 if gap_open is None :
142176 gap_open = - 1.0
177+ elif not isinstance (gap_open , (int , float )):
178+ raise ValueError ("gap_open must be a number" )
179+ elif gap_open > 0 :
180+ raise ValueError ("gap_open must be negative or zero (penalty values)" )
181+
143182 if gap_extend is None :
144183 gap_extend = - 1.0
184+ elif not isinstance (gap_extend , (int , float )):
185+ raise ValueError ("gap_extend must be a number" )
186+ elif gap_extend > 0 :
187+ raise ValueError ("gap_extend must be negative or zero (penalty values)" )
188+
145189 if terminal_gap_extend is None :
146190 terminal_gap_extend = - 1.0
191+ elif not isinstance (terminal_gap_extend , (int , float )):
192+ raise ValueError ("terminal_gap_extend must be a number" )
193+ elif terminal_gap_extend > 0 :
194+ raise ValueError ("terminal_gap_extend must be negative or zero (penalty values)" )
147195
148196 # Handle thread count
149197 if n_threads is None :
150198 n_threads = get_num_threads ()
151-
152- # Validate thread count
153- if n_threads < 1 :
199+ elif not isinstance ( n_threads , int ):
200+ raise ValueError ( "n_threads must be an integer" )
201+ elif n_threads < 1 :
154202 raise ValueError ("n_threads must be at least 1" )
203+ elif n_threads > 1024 : # reasonable upper limit
204+ import warnings
205+ warnings .warn (f"Very high thread count ({ n_threads } ) may not improve performance and could cause issues." ,
206+ UserWarning , stacklevel = 2 )
155207
156208 # Call the C++ binding for core alignment
157209 try :
0 commit comments