|
| 1 | +//! Reduction from ClosestSubstring to ILP (Integer Linear Programming). |
| 2 | +//! |
| 3 | +//! Given an alphabet of size `q`, `n` input strings `s_1, ..., s_n` (not |
| 4 | +//! necessarily of equal length), and a window length `ell`, the goal is to |
| 5 | +//! pick a center `c in Sigma^ell` and one length-`ell` window from each input |
| 6 | +//! string that together minimize the worst-case Hamming distance between the |
| 7 | +//! center and any chosen window. The ILP encoding combines the |
| 8 | +//! center-selection variables of ClosestString with one-hot window-choice |
| 9 | +//! indicators, plus a radius variable that is active only on each selected |
| 10 | +//! window. |
| 11 | +//! |
| 12 | +//! - Integer `x_{r, a}` for `r in {0, ..., ell - 1}` and |
| 13 | +//! `a in {0, ..., q - 1}`: `x_{r, a} = 1` iff the center has symbol `a` at |
| 14 | +//! position `r`. The non-negativity of ILP variables together with the |
| 15 | +//! assignment constraint forces every `x_{r, a} in {0, 1}`. |
| 16 | +//! - Integer `y_{i, p}` for input string `s_i` and window start |
| 17 | +//! `p in {0, ..., W_i - 1}` where `W_i = |s_i| - ell + 1`: `y_{i, p} = 1` iff |
| 18 | +//! window `p` is selected from string `s_i`. |
| 19 | +//! - Nonnegative integer radius variable `R`. |
| 20 | +//! - Assignment constraint: `sum_a x_{r, a} = 1` for every position `r`. |
| 21 | +//! - Window-choice constraint: `sum_p y_{i, p} = 1` for every input string. |
| 22 | +//! - Conditional radius constraint per `(i, p)`: |
| 23 | +//! `R + sum_{r} x_{r, s_i[p + r]} - ell * y_{i, p} >= 0`. |
| 24 | +//! When `y_{i, p} = 1`, this becomes `R >= d_H(c, s_i[p..p + ell))`; when |
| 25 | +//! `y_{i, p} = 0`, the constraint is automatically satisfied. |
| 26 | +//! - Objective: minimize `R`. |
| 27 | +//! |
| 28 | +//! Reference: Ming Li, Bin Ma, and Lusheng Wang, "On the closest string and |
| 29 | +//! substring problems," Journal of the ACM 49(2):157-171, 2002. |
| 30 | +//! <https://doi.org/10.1145/506147.506150> |
| 31 | +
|
| 32 | +use crate::models::algebraic::{LinearConstraint, ObjectiveSense, ILP}; |
| 33 | +use crate::models::misc::ClosestSubstring; |
| 34 | +use crate::reduction; |
| 35 | +use crate::rules::traits::{ReduceTo, ReductionResult}; |
| 36 | + |
| 37 | +/// Result of reducing ClosestSubstring to ILP. |
| 38 | +/// |
| 39 | +/// Variable layout (`ILP<i32>`, all non-negative): |
| 40 | +/// - `x_{r, a}` at index `r * alphabet_size + a` for `r in [0, ell)` and |
| 41 | +/// `a in [0, q)`, forced into `{0, 1}` by the assignment constraints. |
| 42 | +/// - `y_{i, p}` at index `q * ell + window_offsets[i] + p` for input string |
| 43 | +/// `s_i` and window start `p in [0, W_i)`, forced into `{0, 1}` by the |
| 44 | +/// window-choice constraints. |
| 45 | +/// - `R` (radius) at index `q * ell + total_num_windows`, a non-negative |
| 46 | +/// integer in `[0, ell]`. |
| 47 | +#[derive(Debug, Clone)] |
| 48 | +pub struct ReductionClosestSubstringToILP { |
| 49 | + target: ILP<i32>, |
| 50 | + alphabet_size: usize, |
| 51 | + substring_length: usize, |
| 52 | + /// Prefix sums of per-string window counts: `window_offsets[i]` is the |
| 53 | + /// number of `y_{j, p}` variables for `j < i`. Has length `num_strings`. |
| 54 | + window_offsets: Vec<usize>, |
| 55 | + /// `window_counts[i] = W_i = |s_i| - ell + 1`. |
| 56 | + window_counts: Vec<usize>, |
| 57 | +} |
| 58 | + |
| 59 | +impl ReductionResult for ReductionClosestSubstringToILP { |
| 60 | + type Source = ClosestSubstring; |
| 61 | + type Target = ILP<i32>; |
| 62 | + |
| 63 | + fn target_problem(&self) -> &ILP<i32> { |
| 64 | + &self.target |
| 65 | + } |
| 66 | + |
| 67 | + /// Decode the integer ILP assignment into the source config layout. |
| 68 | + /// |
| 69 | + /// `ClosestSubstring::evaluate` expects `config` of length `ell + n`: the |
| 70 | + /// first `ell` entries are the center symbols, the remaining `n` entries |
| 71 | + /// are per-string window starts. For each center position `r`, we pick the |
| 72 | + /// unique alphabet symbol `a` with `x_{r, a} = 1`; for each input string |
| 73 | + /// `s_i`, we pick the unique window start `p` with `y_{i, p} = 1`. When no |
| 74 | + /// indicator is set to 1 in some block (which only happens on partial / |
| 75 | + /// infeasible ILP solutions), we fall back to 0 so the returned vector |
| 76 | + /// still has the expected shape. |
| 77 | + fn extract_solution(&self, target_solution: &[usize]) -> Vec<usize> { |
| 78 | + let q = self.alphabet_size; |
| 79 | + let ell = self.substring_length; |
| 80 | + let y_base = q * ell; |
| 81 | + |
| 82 | + let mut out = Vec::with_capacity(ell + self.window_counts.len()); |
| 83 | + |
| 84 | + // Center symbols. |
| 85 | + for r in 0..ell { |
| 86 | + let symbol = (0..q) |
| 87 | + .find(|&a| target_solution.get(r * q + a).copied().unwrap_or(0) == 1) |
| 88 | + .unwrap_or(0); |
| 89 | + out.push(symbol); |
| 90 | + } |
| 91 | + |
| 92 | + // Window starts. |
| 93 | + for (i, &w_i) in self.window_counts.iter().enumerate() { |
| 94 | + let start = (0..w_i) |
| 95 | + .find(|&p| { |
| 96 | + target_solution |
| 97 | + .get(y_base + self.window_offsets[i] + p) |
| 98 | + .copied() |
| 99 | + .unwrap_or(0) |
| 100 | + == 1 |
| 101 | + }) |
| 102 | + .unwrap_or(0); |
| 103 | + out.push(start); |
| 104 | + } |
| 105 | + |
| 106 | + out |
| 107 | + } |
| 108 | +} |
| 109 | + |
| 110 | +#[reduction( |
| 111 | + overhead = { |
| 112 | + num_vars = "alphabet_size * substring_length + total_num_windows + 1", |
| 113 | + num_constraints = "substring_length + num_strings + total_num_windows + 1", |
| 114 | + } |
| 115 | +)] |
| 116 | +impl ReduceTo<ILP<i32>> for ClosestSubstring { |
| 117 | + type Result = ReductionClosestSubstringToILP; |
| 118 | + |
| 119 | + fn reduce_to(&self) -> Self::Result { |
| 120 | + let q = self.alphabet_size(); |
| 121 | + let ell = self.substring_length(); |
| 122 | + let strings = self.strings(); |
| 123 | + let n = strings.len(); |
| 124 | + |
| 125 | + let window_counts: Vec<usize> = strings.iter().map(|s| s.len() - ell + 1).collect(); |
| 126 | + let mut window_offsets: Vec<usize> = Vec::with_capacity(n); |
| 127 | + { |
| 128 | + let mut acc = 0usize; |
| 129 | + for &w in &window_counts { |
| 130 | + window_offsets.push(acc); |
| 131 | + acc += w; |
| 132 | + } |
| 133 | + } |
| 134 | + let total_windows: usize = window_counts.iter().sum(); |
| 135 | + |
| 136 | + let x_idx = |r: usize, a: usize| -> usize { r * q + a }; |
| 137 | + let y_base = q * ell; |
| 138 | + let y_idx = |i: usize, p: usize| -> usize { y_base + window_offsets[i] + p }; |
| 139 | + let r_idx = y_base + total_windows; |
| 140 | + let num_vars = r_idx + 1; |
| 141 | + |
| 142 | + let mut constraints: Vec<LinearConstraint> = |
| 143 | + Vec::with_capacity(ell + n + total_windows + 1); |
| 144 | + |
| 145 | + // Assignment constraints: exactly one symbol per center position. |
| 146 | + // Together with the non-negativity built into ILP<i32>, this also |
| 147 | + // forces every x_{r, a} to lie in {0, 1}. |
| 148 | + for r in 0..ell { |
| 149 | + let terms: Vec<(usize, f64)> = (0..q).map(|a| (x_idx(r, a), 1.0)).collect(); |
| 150 | + constraints.push(LinearConstraint::eq(terms, 1.0)); |
| 151 | + } |
| 152 | + |
| 153 | + // Tight upper bound on R: the worst-case Hamming distance over a |
| 154 | + // length-ell window is at most ell. Added as a single-term `<=` |
| 155 | + // constraint so the solver's bound-tightening pass (which scans for |
| 156 | + // exactly this pattern) picks it up. Without this, R defaults to the |
| 157 | + // full i32 domain, which severely degrades HiGHS performance even on |
| 158 | + // tiny instances. |
| 159 | + constraints.push(LinearConstraint::le(vec![(r_idx, 1.0)], ell as f64)); |
| 160 | + |
| 161 | + // Window-choice constraints: exactly one window per input string. |
| 162 | + // Combined with non-negativity, this forces every y_{i, p} in {0, 1}. |
| 163 | + for (i, &w_i) in window_counts.iter().enumerate() { |
| 164 | + let terms: Vec<(usize, f64)> = (0..w_i).map(|p| (y_idx(i, p), 1.0)).collect(); |
| 165 | + constraints.push(LinearConstraint::eq(terms, 1.0)); |
| 166 | + } |
| 167 | + |
| 168 | + // Conditional radius constraints: for every (input string, window |
| 169 | + // start) pair, R + sum_r x_{r, s_i[p + r]} - ell * y_{i, p} >= 0. |
| 170 | + // - If y_{i, p} = 1: R >= ell - sum_r x_{r, s_i[p + r]} = d_H(c, window). |
| 171 | + // - If y_{i, p} = 0: the LHS is R + (nonneg match count) >= 0, |
| 172 | + // automatically satisfied because R >= 0. |
| 173 | + for (i, s) in strings.iter().enumerate() { |
| 174 | + for p in 0..window_counts[i] { |
| 175 | + let mut terms: Vec<(usize, f64)> = Vec::with_capacity(ell + 2); |
| 176 | + terms.push((r_idx, 1.0)); |
| 177 | + for r in 0..ell { |
| 178 | + terms.push((x_idx(r, s[p + r]), 1.0)); |
| 179 | + } |
| 180 | + terms.push((y_idx(i, p), -(ell as f64))); |
| 181 | + constraints.push(LinearConstraint::ge(terms, 0.0)); |
| 182 | + } |
| 183 | + } |
| 184 | + |
| 185 | + // Objective: minimize R. |
| 186 | + let objective = vec![(r_idx, 1.0)]; |
| 187 | + |
| 188 | + let target = ILP::new(num_vars, constraints, objective, ObjectiveSense::Minimize); |
| 189 | + |
| 190 | + ReductionClosestSubstringToILP { |
| 191 | + target, |
| 192 | + alphabet_size: q, |
| 193 | + substring_length: ell, |
| 194 | + window_offsets, |
| 195 | + window_counts, |
| 196 | + } |
| 197 | + } |
| 198 | +} |
| 199 | + |
| 200 | +#[cfg(feature = "example-db")] |
| 201 | +pub(crate) fn canonical_rule_example_specs() -> Vec<crate::example_db::specs::RuleExampleSpec> { |
| 202 | + vec![crate::example_db::specs::RuleExampleSpec { |
| 203 | + id: "closestsubstring_to_ilp", |
| 204 | + build: || { |
| 205 | + // Canonical issue #1033 instance: binary alphabet, length-3 |
| 206 | + // windows on three length-5 strings. Optimum radius is 1; one |
| 207 | + // optimal center is 010 with windows (0, 1, 0) selecting 000, |
| 208 | + // 010, 110 from s_1, s_2, s_3 respectively. |
| 209 | + let source = ClosestSubstring::new( |
| 210 | + 2, |
| 211 | + vec![ |
| 212 | + vec![0, 0, 0, 1, 1], |
| 213 | + vec![1, 0, 1, 0, 0], |
| 214 | + vec![1, 1, 0, 0, 1], |
| 215 | + ], |
| 216 | + 3, |
| 217 | + ); |
| 218 | + crate::example_db::specs::rule_example_via_ilp::<_, i32>(source) |
| 219 | + }, |
| 220 | + }] |
| 221 | +} |
| 222 | + |
| 223 | +#[cfg(test)] |
| 224 | +#[path = "../unit_tests/rules/closestsubstring_ilp.rs"] |
| 225 | +mod tests; |
0 commit comments