Skip to content

Commit 18b1cf8

Browse files
committed
added documentation
1 parent a5475db commit 18b1cf8

1 file changed

Lines changed: 86 additions & 8 deletions

File tree

src/petab_gui/controllers/table_controllers.py

Lines changed: 86 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -769,35 +769,113 @@ def _detect_time_column(self, df) -> str | None:
769769
return c
770770
return None
771771

772-
def _rank_dose_candidates(self, df) -> list[str]:
773-
"""Lightweight ranking of dose-like columns."""
772+
def _rank_dose_candidates(self, df: pd.DataFrame) -> list[str]:
773+
"""Rank DataFrame columns by likelihood of containing dose/concentration data.
774+
775+
This method implements a lightweight scoring system to identify and rank
776+
columns that are most likely to contain dose, concentration, or drug-related
777+
data. The ranking is based on multiple heuristics including column naming
778+
patterns, data types, value ranges, and statistical properties.
779+
780+
Parameters
781+
----------
782+
df : pd.DataFrame
783+
Input DataFrame containing columns to be evaluated and ranked.
784+
Must contain at least one column with data.
785+
786+
Returns
787+
-------
788+
list[str]
789+
Column names sorted by descending likelihood of containing dose data.
790+
Columns with higher scores appear first. In case of tied scores,
791+
columns with fewer unique values are ranked higher.
792+
793+
Notes
794+
-----
795+
The scoring algorithm considers the following criteria:
796+
797+
- **Name matching** (+2.0 points): Column names containing keywords like
798+
'dose', 'conc', 'concentration', 'drug', 'compound', 'stim', 'input',
799+
or patterns like 'u<digit>' (case-insensitive).
800+
801+
- **Numeric data type** (+1.0 points): Columns with integer or float dtype.
802+
803+
- **Reasonable cardinality** (+0.8 points): Columns with 2-30 unique
804+
non-null values, which is typical for dose series.
805+
806+
- **Non-negative values** (+0.3 points): All values are >= 0 when converted
807+
to numeric (dose/concentration values are typically non-negative).
808+
809+
- **Monotonic tendency** (+0.2 points): At least 70% of consecutive numeric
810+
differences are non-negative (values mostly non-decreasing), indicating potential dose escalation
811+
patterns. Requires at least 5 non-null numeric values.
812+
813+
Raises
814+
------
815+
AttributeError
816+
If df does not have the expected pandas DataFrame interface.
817+
818+
ValueError
819+
If df is empty or contains no valid columns for evaluation.
820+
821+
See Also
822+
--------
823+
pandas.DataFrame.nunique : Count unique values in each column
824+
pandas.to_numeric : Convert argument to numeric type
825+
numpy.diff : Calculate discrete differences along array
826+
827+
Warnings
828+
--------
829+
This function uses broad exception handling to ensure robustness when
830+
processing diverse data types. Individual column evaluation errors are
831+
silently ignored to prevent failure on edge cases like mixed data types
832+
or missing values.
833+
"""
834+
# Compile pattern for dose-related column names
774835
patt = re.compile(
775836
r"\b(dose|conc|concentration|drug|compound|stim|input|u\d+)\b",
776837
re.IGNORECASE,
777838
)
778-
scores = {}
779-
for col in df.columns: # noqa: B007
839+
840+
scores: dict[str, float] = {}
841+
842+
for col in df.columns:
780843
s = 0.0
844+
845+
# Score based on column name pattern matching
781846
if patt.search(col or ""):
782847
s += 2.0
848+
783849
try:
784-
if df[col].dtype.kind in "if":
850+
# Score based on data type (numeric preferred)
851+
if df[col].dtype.kind in "if": # integer or float
785852
s += 1.0
853+
854+
# Score based on reasonable number of unique values
786855
uniq = df[col].nunique(dropna=True)
787-
if 2 <= uniq <= 30:
856+
if 2 <= uniq <= 30: # Reasonable range for dose series?
788857
s += 0.8
858+
859+
# Score based on non-negative values (typical for doses)
789860
if np.all(
790861
pd.to_numeric(df[col], errors="coerce").fillna(0) >= 0
791862
):
792863
s += 0.3
864+
865+
# Score based on monotonic tendency (dose escalation pattern)
793866
ser = pd.to_numeric(df[col], errors="coerce").dropna()
794867
if len(ser) >= 5:
795868
diffs = np.diff(ser.values)
796-
if np.mean(diffs >= 0) >= 0.7:
869+
if np.mean(diffs >= 0) >= 0.7: # 70% non-decreasing
797870
s += 0.2
798-
except Exception:
871+
872+
except Exception: # noqa: S110
873+
# Silently handle any data processing errors
799874
pass
875+
800876
scores[col] = s
877+
878+
# Sort by score (descending) then by unique count (ascending) for ties
801879
return [
802880
c
803881
for c, _ in sorted(

0 commit comments

Comments
 (0)