Skip to content

Commit 3e409d1

Browse files
GiggleLiu, zazabap, and claude
authored
Fix #414: [Model] LongestCommonSubsequence (#666)
* Add plan for #414: [Model] LongestCommonSubsequence
* Implement #414: [Model] LongestCommonSubsequence
* chore: remove plan file after implementation
* fix(cli): remove duplicate Vec<u64> format hint
---------
Co-authored-by: zazabap <sweynan@icloud.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 057025f commit 3e409d1

11 files changed

Lines changed: 555 additions & 477 deletions

File tree

docs/paper/reductions.typ

Lines changed: 60 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2190,13 +2190,60 @@ NP-completeness was established by Garey, Johnson, and Stockmeyer @gareyJohnsonS
21902190
*Example.* Consider host graph $G$ with 7 vertices: a $K_4$ clique on ${0, 1, 2, 3}$ and a triangle on ${4, 5, 6}$ connected via edge $(3, 4)$. Pattern $H = K_4$ with vertices ${a, b, c, d}$. The mapping $f(a) = 0, f(b) = 1, f(c) = 2, f(d) = 3$ preserves all 6 edges of $K_4$, confirming a subgraph isomorphism exists.
21912191
]
21922192

2193-
#problem-def("LongestCommonSubsequence")[
2194-
Given $k$ strings $s_1, dots, s_k$ over a finite alphabet $Sigma$, find a longest string $w$ that is a subsequence of every $s_i$. A string $w$ is a _subsequence_ of $s$ if $w$ can be obtained by deleting zero or more characters from $s$ without changing the order of the remaining characters.
2195-
][
2196-
The LCS problem is polynomial-time solvable for $k = 2$ strings via dynamic programming in $O(n_1 n_2)$ time (Wagner & Fischer, 1974), but NP-hard for $k gt.eq 3$ strings @maier1978. It is a foundational problem in bioinformatics (sequence alignment), version control (diff algorithms), and data compression. The problem is listed as SR10 in Garey & Johnson @garey1979.
2193+
#{
2194+
let x = load-model-example("LongestCommonSubsequence")
2195+
let strings = x.instance.strings
2196+
let witness = x.samples.at(0).config
2197+
let fmt-str(s) = "\"" + s.map(c => str(c)).join("") + "\""
2198+
let string-list = strings.map(fmt-str).join(", ")
2199+
let find-embed(target, candidate) = {
2200+
let positions = ()
2201+
let j = 0
2202+
for (i, ch) in target.enumerate() {
2203+
if j < candidate.len() and ch == candidate.at(j) {
2204+
positions.push(i)
2205+
j += 1
2206+
}
2207+
}
2208+
positions
2209+
}
2210+
let embeds = strings.map(s => find-embed(s, witness))
2211+
[
2212+
#problem-def("LongestCommonSubsequence")[
2213+
Given a finite alphabet $Sigma$, a set $R = {r_1, dots, r_m}$ of strings over $Sigma$, and a positive integer $K$, determine whether there exists a string $w in Sigma^*$ with $|w| gt.eq K$ such that every string $r_i in R$ contains $w$ as a _subsequence_: there exist indices $1 lt.eq j_1 < j_2 < dots < j_(|w|) lt.eq |r_i|$ with $r_i[j_t] = w[t]$ for all $t$.
2214+
][
2215+
This is a classic NP-complete string problem, listed as problem SR10 in Garey and Johnson @garey1979. #cite(<maier1978>, form: "prose") proved NP-completeness, while Garey and Johnson note polynomial-time cases for fixed $K$ or fixed $|R|$. For the special case of two strings, the classical dynamic-programming algorithm of #cite(<wagnerfischer1974>, form: "prose") runs in $O(|r_1| dot |r_2|)$ time. The decision model implemented in this repository fixes the witness length to exactly $K$; this is equivalent to the standard "$|w| gt.eq K$" formulation because any longer common subsequence has a length-$K$ prefix, and every prefix of a common subsequence is itself a common subsequence of all strings in $R$.
21972216

2198-
*Example.* Let $s_1 = $ `ABAC` and $s_2 = $ `BACA` over $Sigma = {A, B, C}$. The longest common subsequence has length 3, e.g., `BAC`: positions 1, 2, 3 of $s_1$ match positions 0, 1, 2 of $s_2$.
2199-
]
2217+
*Example.* Let $Sigma = {0, 1}$ and let the input set $R$ contain the strings #string-list. The witness $w = $ #fmt-str(witness) is a common subsequence of every string in $R$.
2218+
2219+
#figure({
2220+
let blue = graph-colors.at(0)
2221+
align(center, stack(dir: ttb, spacing: 0.35cm,
2222+
stack(dir: ltr, spacing: 0pt,
2223+
box(width: 1.2cm, height: 0.45cm, align(center + horizon, text(8pt, "w ="))),
2224+
..witness.enumerate().map(((i, symbol)) => {
2225+
box(width: 0.48cm, height: 0.48cm, fill: blue.transparentize(70%), stroke: 0.5pt + luma(120),
2226+
align(center + horizon, text(9pt, weight: "bold", str(symbol))))
2227+
}),
2228+
),
2229+
..strings.enumerate().map(((ri, s)) => {
2230+
let embed = embeds.at(ri)
2231+
stack(dir: ltr, spacing: 0pt,
2232+
box(width: 1.2cm, height: 0.45cm, align(center + horizon, text(8pt, "r" + str(ri + 1) + " ="))),
2233+
..s.enumerate().map(((i, symbol)) => {
2234+
let fill = if embed.contains(i) { blue.transparentize(78%) } else { white }
2235+
box(width: 0.48cm, height: 0.48cm, fill: fill, stroke: 0.5pt + luma(120),
2236+
align(center + horizon, text(9pt, weight: "bold", str(symbol))))
2237+
}),
2238+
)
2239+
}),
2240+
))
2241+
})
2242+
2243+
The highlighted positions show one left-to-right embedding of $w = $ #fmt-str(witness) in each input string, certifying the YES answer for $K = 3$.
2244+
]
2245+
]
2246+
}
22002247

22012248
#problem-def("SubsetSum")[
22022249
Given a finite set $A = {a_0, dots, a_(n-1)}$ with sizes $s(a_i) in ZZ^+$ and a target $B in ZZ^+$, determine whether there exists a subset $A' subset.eq A$ such that $sum_(a in A') s(a) = B$.
@@ -3585,19 +3632,19 @@ The following reductions to Integer Linear Programming are straightforward formu
35853632
]
35863633

35873634
#reduction-rule("LongestCommonSubsequence", "ILP")[
3588-
The match-pair ILP formulation @blum2021 encodes subsequence alignment as a binary optimization. For two strings $s_1$ (length $n_1$) and $s_2$ (length $n_2$), each position pair $(j_1, j_2)$ where $s_1[j_1] = s_2[j_2]$ yields a binary variable. Constraints enforce one-to-one matching and order preservation (no crossings). The objective maximizes the number of matched pairs.
3635+
A bounded-witness ILP formulation turns the decision version of LCS into a feasibility problem. Binary variables choose the symbol at each witness position and, for every input string, choose where that witness position is realized. Linear constraints enforce symbol consistency and strictly increasing source positions.
35893636
][
3590-
_Construction._ Given strings $s_1$ and $s_2$:
3637+
_Construction._ Given alphabet $Sigma$, strings $R = {r_1, dots, r_m}$, and bound $K$:
35913638

3592-
_Variables:_ Binary $m_(j_1, j_2) in {0, 1}$ for each $(j_1, j_2)$ with $s_1[j_1] = s_2[j_2]$. Interpretation: $m_(j_1, j_2) = 1$ iff position $j_1$ of $s_1$ is matched to position $j_2$ of $s_2$.
3639+
_Variables:_ Binary $x_(p, a) in {0, 1}$ for witness position $p in {1, dots, K}$ and symbol $a in Sigma$, with $x_(p, a) = 1$ iff the $p$-th witness symbol equals $a$. For every input string $r_i$, witness position $p$, and source index $j in {1, dots, |r_i|}$, binary $y_(i, p, j) = 1$ iff the $p$-th witness symbol is matched to position $j$ of $r_i$.
35933640

3594-
_Constraints:_ (1) Each position in $s_1$ matched at most once: $sum_(j_2 : (j_1, j_2) in M) m_(j_1, j_2) lt.eq 1$ for all $j_1$. (2) Each position in $s_2$ matched at most once: $sum_(j_1 : (j_1, j_2) in M) m_(j_1, j_2) lt.eq 1$ for all $j_2$. (3) No crossings: for $(j_1, j_2), (j'_1, j'_2) in M$ with $j_1 < j'_1$ and $j_2 > j'_2$: $m_(j_1, j_2) + m_(j'_1, j'_2) lt.eq 1$.
3641+
_Constraints:_ (1) Exactly one symbol per witness position: $sum_(a in Sigma) x_(p, a) = 1$ for all $p$. (2) Exactly one matched source position for each $(i, p)$: $sum_(j = 1)^(|r_i|) y_(i, p, j) = 1$. (3) Character consistency: if $r_i[j] = a$, then $y_(i, p, j) lt.eq x_(p, a)$. (4) Strictly increasing matches: for consecutive witness positions $p$ and $p + 1$ and all source indices $j' gt.eq j$, $y_(i, p, j') + y_(i, p + 1, j) lt.eq 1$, which forbids matching position $p$ at $j'$ and position $p + 1$ at an index $j$ that is not strictly later.
35953642

3596-
_Objective:_ Maximize $sum_((j_1, j_2) in M) m_(j_1, j_2)$.
3643+
_Objective:_ Use the zero objective. The target ILP is feasible iff the source LCS instance is a YES instance.
35973644

3598-
_Correctness._ ($arrow.r.double$) A common subsequence of length $ell$ defines $ell$ matched pairs that are order-preserving (no crossings) and one-to-one, yielding a feasible ILP solution with objective $ell$. ($arrow.l.double$) An ILP solution with objective $ell$ defines $ell$ matched pairs; constraints (1)--(2) ensure one-to-one matching, and constraint (3) ensures order preservation, so the matched characters form a common subsequence of length $ell$.
3645+
_Correctness._ ($arrow.r.double$) If a witness $w = w_1 dots w_K$ is a common subsequence of every string, set $x_(p, w_p) = 1$ and choose, in every $r_i$, the positions where that embedding occurs. Constraints (1)--(4) are satisfied, so the ILP is feasible. ($arrow.l.double$) Any feasible ILP solution selects exactly one symbol for each witness position and exactly one realization in each source string. Character consistency ensures the chosen positions spell the same witness string in every input string, and the ordering constraints ensure those positions are strictly increasing. Therefore the extracted witness is a common subsequence of length $K$.
35993646

3600-
_Solution extraction._ Collect pairs $(j_1, j_2)$ with $m_(j_1, j_2) = 1$, sort by $j_1$, and read the characters.
3647+
_Solution extraction._ For each witness position $p$, read the unique symbol $a$ with $x_(p, a) = 1$ and output the resulting length-$K$ string.
36013648
]
36023649

36033650
== Unit Disk Mapping

docs/paper/references.bib

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -597,6 +597,17 @@ @article{maier1978
597597
doi = {10.1145/322063.322075}
598598
}
599599

600+
@article{wagnerfischer1974,
601+
author = {Robert A. Wagner and Michael J. Fischer},
602+
title = {The String-to-String Correction Problem},
603+
journal = {Journal of the ACM},
604+
volume = {21},
605+
number = {1},
606+
pages = {168--173},
607+
year = {1974},
608+
doi = {10.1145/321796.321811}
609+
}
610+
600611
@article{blum2021,
601612
author = {Christian Blum and Maria J. Blesa and Borja Calvo},
602613
title = {{ILP}-based reduced variable neighborhood search for the longest common subsequence problem},

problemreductions-cli/src/cli.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,7 @@ Flags by problem type:
251251
RuralPostman (RPP) --graph, --edge-weights, --required-edges, --bound
252252
MultipleChoiceBranching --arcs [--weights] --partition --bound [--num-vertices]
253253
SubgraphIsomorphism --graph (host), --pattern (pattern)
254-
LCS --strings
254+
LCS --strings, --bound [--alphabet-size]
255255
FAS --arcs [--weights] [--num-vertices]
256256
FVS --arcs [--weights] [--num-vertices]
257257
StrongConnectivityAugmentation --arcs, --candidate-arcs, --bound [--num-vertices]
@@ -452,13 +452,13 @@ pub struct CreateArgs {
452452
/// Required edge indices for RuralPostman (comma-separated, e.g., "0,2,4")
453453
#[arg(long)]
454454
pub required_edges: Option<String>,
455-
/// Upper bound or length bound (for BoundedComponentSpanningForest, LengthBoundedDisjointPaths, MultipleChoiceBranching, OptimalLinearArrangement, RuralPostman, SCS, or StringToStringCorrection)
455+
/// Upper bound or length bound (for BoundedComponentSpanningForest, LengthBoundedDisjointPaths, LongestCommonSubsequence, MultipleChoiceBranching, OptimalLinearArrangement, RuralPostman, ShortestCommonSupersequence, or StringToStringCorrection)
456456
#[arg(long, allow_hyphen_values = true)]
457457
pub bound: Option<i64>,
458458
/// Pattern graph edge list for SubgraphIsomorphism (e.g., 0-1,1-2,2-0)
459459
#[arg(long)]
460460
pub pattern: Option<String>,
461-
/// Input strings for LCS (e.g., "ABAC;BACA") or SCS (e.g., "0,1,2;1,2,0")
461+
/// Input strings for LCS (e.g., "ABAC;BACA" or "0,1,0;1,0,1") or SCS (e.g., "0,1,2;1,2,0")
462462
#[arg(long)]
463463
pub strings: Option<String>,
464464
/// Directed arcs for directed graph problems (e.g., 0>1,1>2,2>0)
@@ -497,7 +497,7 @@ pub struct CreateArgs {
497497
/// Number of available workers for StaffScheduling
498498
#[arg(long)]
499499
pub num_workers: Option<u64>,
500-
/// Alphabet size for SCS or StringToStringCorrection (optional; inferred from max symbol + 1 if omitted)
500+
/// Alphabet size for LCS, SCS, or StringToStringCorrection (optional; inferred from the input strings if omitted)
501501
#[arg(long)]
502502
pub alphabet_size: Option<usize>,
503503
/// Functional dependencies for MinimumCardinalityKey (semicolon-separated "lhs>rhs" pairs, e.g., "0,1>2;0,2>3")

problemreductions-cli/src/commands/create.rs

Lines changed: 105 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,9 @@ fn example_for(canonical: &str, graph_type: Option<&str>) -> &'static str {
332332
"--universe 4 --r-sets \"0,1,2,3;0,1\" --s-sets \"0,1,2,3;2,3\" --r-weights 2,5 --s-weights 3,6"
333333
}
334334
"SetBasis" => "--universe 4 --sets \"0,1;1,2;0,2;0,1,2\" --k 3",
335+
"LongestCommonSubsequence" => {
336+
"--strings \"010110;100101;001011\" --bound 3 --alphabet-size 2"
337+
}
335338
"MinimumCardinalityKey" => {
336339
"--num-attributes 6 --dependencies \"0,1>2;0,2>3;1,3>4;2,4>5\" --k 2"
337340
}
@@ -377,6 +380,10 @@ fn help_flag_hint(
377380
) -> &'static str {
378381
match (canonical, field_name) {
379382
("BoundedComponentSpanningForest", "max_weight") => "integer",
383+
("LongestCommonSubsequence", "strings") => {
384+
"raw strings: \"ABAC;BACA\" or symbol lists: \"0,1,0;1,0,1\""
385+
}
386+
("ShortestCommonSupersequence", "strings") => "symbol lists: \"0,1,2;1,2,0\"",
380387
("MultipleChoiceBranching", "partition") => "semicolon-separated groups: \"0,1;2,3\"",
381388
_ => type_format_hint(type_name, graph_type),
382389
}
@@ -1294,18 +1301,85 @@ pub fn create(args: &CreateArgs, out: &OutputConfig) -> Result<()> {
12941301

12951302
// LongestCommonSubsequence
12961303
"LongestCommonSubsequence" => {
1304+
let usage =
1305+
"Usage: pred create LCS --strings \"010110;100101;001011\" --bound 3 [--alphabet-size 2]";
12971306
let strings_str = args.strings.as_deref().ok_or_else(|| {
1298-
anyhow::anyhow!(
1299-
"LCS requires --strings\n\n\
1300-
Usage: pred create LCS --strings \"ABAC;BACA\""
1301-
)
1307+
anyhow::anyhow!("LongestCommonSubsequence requires --strings\n\n{usage}")
13021308
})?;
1303-
let strings: Vec<Vec<u8>> = strings_str
1304-
.split(';')
1305-
.map(|s| s.trim().as_bytes().to_vec())
1306-
.collect();
1309+
let bound_i64 = args.bound.ok_or_else(|| {
1310+
anyhow::anyhow!("LongestCommonSubsequence requires --bound\n\n{usage}")
1311+
})?;
1312+
anyhow::ensure!(
1313+
bound_i64 >= 0,
1314+
"LongestCommonSubsequence requires a nonnegative --bound, got {}",
1315+
bound_i64
1316+
);
1317+
let bound = bound_i64 as usize;
1318+
1319+
let segments: Vec<&str> = strings_str.split(';').map(str::trim).collect();
1320+
let comma_mode = segments.iter().any(|segment| segment.contains(','));
1321+
1322+
let (strings, inferred_alphabet_size): (Vec<Vec<usize>>, usize) = if comma_mode {
1323+
let strings = segments
1324+
.iter()
1325+
.map(|segment| {
1326+
if segment.is_empty() {
1327+
return Ok(Vec::new());
1328+
}
1329+
segment
1330+
.split(',')
1331+
.map(|value| {
1332+
value.trim().parse::<usize>().map_err(|e| {
1333+
anyhow::anyhow!("Invalid LCS alphabet index: {}", e)
1334+
})
1335+
})
1336+
.collect::<Result<Vec<_>>>()
1337+
})
1338+
.collect::<Result<Vec<_>>>()?;
1339+
let inferred = strings
1340+
.iter()
1341+
.flat_map(|string| string.iter())
1342+
.copied()
1343+
.max()
1344+
.map(|value| value + 1)
1345+
.unwrap_or(0);
1346+
(strings, inferred)
1347+
} else {
1348+
let mut encoding = BTreeMap::new();
1349+
let mut next_symbol = 0usize;
1350+
let strings = segments
1351+
.iter()
1352+
.map(|segment| {
1353+
segment
1354+
.as_bytes()
1355+
.iter()
1356+
.map(|byte| {
1357+
let entry = encoding.entry(*byte).or_insert_with(|| {
1358+
let current = next_symbol;
1359+
next_symbol += 1;
1360+
current
1361+
});
1362+
*entry
1363+
})
1364+
.collect::<Vec<_>>()
1365+
})
1366+
.collect::<Vec<_>>();
1367+
(strings, next_symbol)
1368+
};
1369+
1370+
let alphabet_size = args.alphabet_size.unwrap_or(inferred_alphabet_size);
1371+
anyhow::ensure!(
1372+
alphabet_size >= inferred_alphabet_size,
1373+
"--alphabet-size {} is smaller than the inferred alphabet size ({})",
1374+
alphabet_size,
1375+
inferred_alphabet_size
1376+
);
1377+
anyhow::ensure!(
1378+
alphabet_size > 0 || (bound == 0 && strings.iter().all(|string| string.is_empty())),
1379+
"LongestCommonSubsequence requires a positive alphabet. Provide --alphabet-size when all strings are empty and --bound > 0.\n\n{usage}"
1380+
);
13071381
(
1308-
ser(LongestCommonSubsequence::new(strings))?,
1382+
ser(LongestCommonSubsequence::new(alphabet_size, strings, bound))?,
13091383
resolved_variant.clone(),
13101384
)
13111385
}
@@ -3089,6 +3163,7 @@ fn create_random(
30893163
#[cfg(test)]
30903164
mod tests {
30913165
use super::create;
3166+
use super::help_flag_hint;
30923167
use super::help_flag_name;
30933168
use super::parse_bool_rows;
30943169
use super::problem_help_flag_name;
@@ -3118,6 +3193,19 @@ mod tests {
31183193
);
31193194
}
31203195

3196+
#[test]
3197+
fn test_problem_help_uses_problem_specific_lcs_strings_hint() {
3198+
assert_eq!(
3199+
help_flag_hint(
3200+
"LongestCommonSubsequence",
3201+
"strings",
3202+
"Vec<Vec<usize>>",
3203+
None,
3204+
),
3205+
"raw strings: \"ABAC;BACA\" or symbol lists: \"0,1,0;1,0,1\""
3206+
);
3207+
}
3208+
31213209
#[test]
31223210
fn test_problem_help_uses_string_to_string_correction_cli_flags() {
31233211
assert_eq!(
@@ -3134,6 +3222,14 @@ mod tests {
31343222
);
31353223
}
31363224

3225+
#[test]
3226+
fn test_problem_help_keeps_generic_vec_vec_usize_hint_for_other_models() {
3227+
assert_eq!(
3228+
help_flag_hint("SetBasis", "sets", "Vec<Vec<usize>>", None),
3229+
"semicolon-separated sets: \"0,1;1,2;0,2\""
3230+
);
3231+
}
3232+
31373233
#[test]
31383234
fn test_problem_help_uses_k_for_staff_scheduling() {
31393235
assert_eq!(

problemreductions-cli/tests/cli_tests.rs

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2837,6 +2837,46 @@ fn test_create_set_basis_no_flags_uses_actual_cli_flag_names() {
28372837
);
28382838
}
28392839

2840+
#[test]
2841+
fn test_create_lcs_with_raw_strings_infers_alphabet() {
2842+
let output = pred()
2843+
.args(["create", "LCS", "--strings", "ABAC;BACA", "--bound", "2"])
2844+
.output()
2845+
.unwrap();
2846+
assert!(
2847+
output.status.success(),
2848+
"stderr: {}",
2849+
String::from_utf8_lossy(&output.stderr)
2850+
);
2851+
let stdout = String::from_utf8(output.stdout).unwrap();
2852+
let json: serde_json::Value = serde_json::from_str(&stdout).unwrap();
2853+
assert_eq!(json["type"], "LongestCommonSubsequence");
2854+
assert_eq!(json["data"]["alphabet_size"], 3);
2855+
assert_eq!(json["data"]["bound"], 2);
2856+
assert_eq!(
2857+
json["data"]["strings"],
2858+
serde_json::json!([[0, 1, 0, 2], [1, 0, 2, 0]])
2859+
);
2860+
}
2861+
2862+
#[test]
2863+
fn test_create_lcs_rejects_empty_strings_with_positive_bound_without_panicking() {
2864+
let output = pred()
2865+
.args(["create", "LCS", "--strings", "", "--bound", "1"])
2866+
.output()
2867+
.unwrap();
2868+
assert!(!output.status.success());
2869+
let stderr = String::from_utf8_lossy(&output.stderr);
2870+
assert!(
2871+
stderr.contains("Provide --alphabet-size when all strings are empty and --bound > 0"),
2872+
"expected user-facing validation error, got: {stderr}"
2873+
);
2874+
assert!(
2875+
!stderr.contains("panicked at"),
2876+
"create command should reject invalid LCS input without panicking: {stderr}"
2877+
);
2878+
}
2879+
28402880
#[test]
28412881
fn test_create_kcoloring_missing_k() {
28422882
let output = pred()

0 commit comments

Comments
 (0)