@@ -769,35 +769,113 @@ def _detect_time_column(self, df) -> str | None:
769769 return c
770770 return None
771771
772- def _rank_dose_candidates (self , df ) -> list [str ]:
773- """Lightweight ranking of dose-like columns."""
772+ def _rank_dose_candidates (self , df : pd .DataFrame ) -> list [str ]:
773+ """Rank DataFrame columns by likelihood of containing dose/concentration data.
774+
775+ This method implements a lightweight scoring system to identify and rank
776+ columns that are most likely to contain dose, concentration, or drug-related
777+ data. The ranking is based on multiple heuristics including column naming
778+ patterns, data types, value ranges, and statistical properties.
779+
780+ Parameters
781+ ----------
782+ df : pd.DataFrame
783+ Input DataFrame containing columns to be evaluated and ranked.
784+ Must contain at least one column with data.
785+
786+ Returns
787+ -------
788+ list[str]
789+ Column names sorted by descending likelihood of containing dose data.
790+ Columns with higher scores appear first. In case of tied scores,
791+ columns with fewer unique values are ranked higher.
792+
793+ Notes
794+ -----
795+ The scoring algorithm considers the following criteria:
796+
797+ - **Name matching** (+2.0 points): Column names containing, as a whole word,
798+ 'dose', 'conc', 'concentration', 'drug', 'compound', 'stim', 'input',
799+ or 'u' followed by one or more digits, e.g. 'u1' (case-insensitive).
800+
801+ - **Numeric data type** (+1.0 points): Columns with integer or float dtype.
802+
803+ - **Reasonable cardinality** (+0.8 points): Columns with 2-30 unique
804+ non-null values, which is typical for dose series.
805+
806+ - **Non-negative values** (+0.3 points): All values are >= 0 when converted
807+ to numeric (typical for doses); unparseable or missing entries count as 0.
808+
809+ - **Monotonic tendency** (+0.2 points): At least 70% of consecutive numeric
810+ differences are non-negative (values mostly non-decreasing), indicating a
811+ potential dose escalation pattern. Requires at least 5 non-null numeric values.
812+
813+ Raises
814+ ------
815+ AttributeError
816+ If df does not have the expected pandas DataFrame interface.
817+
818+ ValueError
819+ If df is empty or contains no valid columns for evaluation.
820+
821+ See Also
822+ --------
823+ pandas.DataFrame.nunique : Count unique values in each column
824+ pandas.to_numeric : Convert argument to numeric type
825+ numpy.diff : Calculate discrete differences along array
826+
827+ Warning
828+ -------
829+ This function uses broad exception handling to ensure robustness when
830+ processing diverse data types. Individual column evaluation errors are
831+ silently ignored to prevent failure on edge cases like mixed data types
832+ or missing values.
833+ """
834+ # Compile pattern for dose-related column names
774835 patt = re .compile (
775836 r"\b(dose|conc|concentration|drug|compound|stim|input|u\d+)\b" ,
776837 re .IGNORECASE ,
777838 )
778- scores = {}
779- for col in df .columns : # noqa: B007
839+
840+ scores : dict [str , float ] = {}
841+
842+ for col in df .columns :
780843 s = 0.0
844+
845+ # Score based on column name pattern matching
781846 if patt .search (col or "" ):
782847 s += 2.0
848+
783849 try :
784- if df [col ].dtype .kind in "if" :
850+ # Score based on data type (numeric preferred)
851+ if df [col ].dtype .kind in "if" : # integer or float
785852 s += 1.0
853+
854+ # Score based on reasonable number of unique values
786855 uniq = df [col ].nunique (dropna = True )
787- if 2 <= uniq <= 30 :
856+ if 2 <= uniq <= 30 : # Reasonable range for dose series?
788857 s += 0.8
858+
859+ # Score based on non-negative values (typical for doses)
789860 if np .all (
790861 pd .to_numeric (df [col ], errors = "coerce" ).fillna (0 ) >= 0
791862 ):
792863 s += 0.3
864+
865+ # Score based on monotonic tendency (dose escalation pattern)
793866 ser = pd .to_numeric (df [col ], errors = "coerce" ).dropna ()
794867 if len (ser ) >= 5 :
795868 diffs = np .diff (ser .values )
796- if np .mean (diffs >= 0 ) >= 0.7 :
869+ if np .mean (diffs >= 0 ) >= 0.7 : # 70% non-decreasing
797870 s += 0.2
798- except Exception :
871+
872+ except Exception : # noqa: S110
873+ # Silently handle any data processing errors
799874 pass
875+
800876 scores [col ] = s
877+
878+ # Sort by score (descending) then by unique count (ascending) for ties
801879 return [
802880 c
803881 for c , _ in sorted (
0 commit comments