diff --git a/pythainlp/soundex/metasound.py b/pythainlp/soundex/metasound.py index 590c470bb..351222325 100644 --- a/pythainlp/soundex/metasound.py +++ b/pythainlp/soundex/metasound.py @@ -16,7 +16,7 @@ _CONS_THANTHAKHAT: str = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ์" _THANTHAKHAT: str = "์" # \u0e4c _C1: str = "กขฃคฆฅ" # sound K -> coded letter 1 -_C2: str = "จฉชฌซฐทฒดฎตสศษ" # D -> 2 +_C2: str = "จฉชฌซฐฏทฑฒถธดฎตสศษ" # D -> 2 _C3: str = "ฟฝพผภบป" # B -> 3 _C4: str = "ง" # NG -> 4 _C5: str = "ลฬรนณฦญ" # N -> 5 @@ -78,7 +78,8 @@ def metasound(text: str, length: int = 4) -> str: chars[i] = " " i += 1 - # retain first consonant, encode the rest + # drop karan blanks, truncate, then keep first consonant and encode rest + chars = [c for c in chars if c != " "] chars = chars[:length] i = 1 while i < len(chars): diff --git a/pythainlp/soundex/prayut_and_somchaip.py b/pythainlp/soundex/prayut_and_somchaip.py index f26aec432..e466ffcfb 100644 --- a/pythainlp/soundex/prayut_and_somchaip.py +++ b/pythainlp/soundex/prayut_and_somchaip.py @@ -18,7 +18,7 @@ _C0: str = "AEIOUHWYอ" _C1: str = "BFPVบฝฟปผพภว" -_C2: str = "CGJKQSXZขฃคฅฆฉขฌกจซศษส" +_C2: str = "CGJKQSXZขฃคฅฆฉฌกจซศษส" _C3: str = "DTฎดฏตฐฑฒถทธ" _C4: str = "Lลฬ" _C5: str = "MNมณน" diff --git a/tests/core/test_soundex.py b/tests/core/test_soundex.py index 33f95cef6..cd42d2013 100644 --- a/tests/core/test_soundex.py +++ b/tests/core/test_soundex.py @@ -169,3 +169,18 @@ def test_complete_soundex_similarity(self): short_code, long_code ) self.assertAlmostEqual(similarity_diff_len, 5 / 11, places=4) + + def test_metasound_karan_truncation(self): + # B1: blanks left by karan removal must be dropped before truncation + # "สรรค์พล" has a karan (ค์) mid-word — the พ after it must not be lost + self.assertEqual(metasound("สรรค์พล", 4), "ส553") + self.assertEqual(metasound("รักษ์นา", 4), "ร150") + # No karan — result must be unaffected by the filtering + self.assertEqual(metasound("บูรณการ", 4), "บ551") + + def test_metasound_consonant_classification(self): + # B2: ถ,ธ,ฏ,ฑ must be coded as class 2 
(same sound as ท,ด) + self.assertEqual(metasound("กถ", 2), "ก2") + self.assertEqual(metasound("กธ", 2), "ก2") + self.assertEqual(metasound("กฏ", 2), "ก2") + self.assertEqual(metasound("กฑ", 2), "ก2")