Skip to content

Commit dfa14bd

Browse files
feat: rule-based sentence segmentation — fix abbreviation splitting
Replace unicode_segmentation::unicode_sentences(), which splits on abbreviation periods (Mr., p.m., H.G., etc.), inflating the 1-2 word sentence bin to 22% and deflating the mean sentence length to 6.8 words.

New forward-scanning splitter with:
- 3-tier abbreviation classification (strong/internal/weak)
- Mid-dotted-sequence detection (e.g., p.m., H.G., B.B.C.)
- Decimal number handling (42.50)
- Ellipsis handling (split only before uppercase)
- Dialog tag detection ("Go!" he said — no split at !)
- 20 unit tests covering all edge cases

Fingerprint after fix: mean 6.8→12.6, 1-2 word bin 21.9%→9.8%, long sentences (18+ words) 0%→22.3%, p95 14→35.
1 parent 8c17be3 commit dfa14bd

4 files changed

Lines changed: 466 additions & 5 deletions

File tree

src/stylometry/features/lengths.rs

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -39,8 +39,8 @@ pub const HISTOGRAM_BIN_CENTERS: &[f64] =
3939
pub fn sentence_length_histogram(text: &str) -> Vec<f64> {
4040
use unicode_segmentation::UnicodeSegmentation;
4141

42-
let lengths: Vec<u32> = text
43-
.unicode_sentences()
42+
let lengths: Vec<u32> = super::sentences::split_sentences(text)
43+
.iter()
4444
.map(|s| s.unicode_words().count() as u32)
4545
.filter(|&l| l > 0)
4646
.collect();
@@ -126,8 +126,8 @@ pub fn word_lengths(text: &str) -> LengthStats {
126126
}
127127

128128
pub fn sentence_lengths(text: &str) -> LengthStats {
129-
let lengths: Vec<f64> = text
130-
.unicode_sentences()
129+
let lengths: Vec<f64> = super::sentences::split_sentences(text)
130+
.iter()
131131
.map(|s| s.unicode_words().count() as f64)
132132
.filter(|&l| l > 0.0)
133133
.collect();

src/stylometry/features/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,4 +13,5 @@ pub mod ngrams;
1313
pub mod punctuation;
1414
pub mod readability;
1515
pub mod richness;
16+
pub mod sentences;
1617
pub mod vocabulary;

src/stylometry/features/readability.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@ pub struct ReadabilityStats {
2222
impl ReadabilityStats {
2323
pub fn compute(text: &str) -> Self {
2424
let words: Vec<&str> = text.unicode_words().collect();
25-
let sentences = text.unicode_sentences().count() as f64;
25+
let sentences = super::sentences::split_sentences(text).len() as f64;
2626
let word_count = words.len() as f64;
2727

2828
if word_count == 0.0 || sentences == 0.0 {

0 commit comments

Comments
 (0)