Skip to content

Commit 3fc7edb

Browse files
committed
Fix case handling for various capitalization issues
* Fix capitalization of multi-word suggestions, camelCase words, proper nouns, and abbreviations * Do not convert suggested words from the dictionary to lower case in build_dict() * Capitalization is now decided in fix_case()
1 parent c84db35 commit 3fc7edb

3 files changed

Lines changed: 324 additions & 7 deletions

File tree

codespell_lib/_spellchecker.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,10 +55,9 @@ def build_dict(
5555
translate_tables = [(x, str.maketrans(x, y)) for x, y in alt_chars]
5656
for line in f:
5757
[key, data] = line.split("->")
58-
# TODO: For now, convert both to lower.
59-
# Someday we can maybe add support for fixing caps.
58+
# Only convert key to lower case.
59+
# Do not modify data to lower case. Leave it as per dictionary.
6060
key = key.lower()
61-
data = data.lower()
6261
if key not in ignore_words:
6362
add_misspelling(key, data, misspellings)
6463
# generate alternative misspellings/fixes

codespell_lib/_text_util.py

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,47 @@
1717
"""
1818

1919

20+
def is_camel_case_word(input_word: str) -> bool:
    """Return True if *input_word* uses camelCase-style mixed casing.

    A word qualifies when it contains no ``_``, ``-`` or space separator,
    is not entirely upper case, and has at least one upper-case character
    after its first character (e.g. ``MySQL``, ``eBay``, ``LessTif``).
    A plainly capitalized word such as ``Skype`` is not camel case.
    """
    if any(separator in input_word for separator in ("_", "-", " ")):
        return False
    if input_word == input_word.upper():
        # All upper case (or no cased characters at all): acronym, not camel.
        return False
    tail = input_word[1:]
    return tail != tail.lower()


def is_camel_case_string(input_string: str) -> bool:
    """Return True if any comma-separated candidate is a camel-case word.

    Candidates are stripped before being checked, so detection does not
    depend on the position of a candidate within ``"a, b, c"``.
    """
    return any(
        is_camel_case_word(candidate.strip())
        for candidate in input_string.split(",")
    )


def _capitalize_candidates(fixword: str) -> str:
    """Upper-case the first character of every comma-separated candidate.

    Only the first character is touched and the original separators are
    kept, so ``don't`` becomes ``Don't`` (not ``Don'T`` as with
    ``str.title()``) and an existing ``Skype`` stays ``Skype``.
    """
    capitalized = []
    for candidate in fixword.split(","):
        body = candidate.lstrip()
        padding = candidate[: len(candidate) - len(body)]
        capitalized.append(padding + body[:1].upper() + body[1:])
    return ",".join(capitalized)


def fix_case(word: str, fixword: str) -> str:
    """Adapt the case of the dictionary suggestion(s) to the misspelled word.

    Args:
        word: The misspelled word exactly as it appears in the checked text.
        fixword: The suggestion(s) as written in the dictionary; several
            candidates are separated by commas.

    Returns:
        The suggestion string with its case adjusted as follows, in order:

        * ``fixword`` entirely upper case (``asscii->ASCII``): returned
          unchanged, whatever the case of ``word``.
        * ``word`` capitalized and no candidate is camel case: every
          candidate gets its first character upper-cased
          (``Skipt`` -> ``Skip, Skype, Skipped``).
        * ``word`` entirely upper case: ``fixword`` is upper-cased
          (``EBYA`` -> ``EBAY``).
        * anything else (lower case, mixed case, or a camel-case
          suggestion such as ``MySQL``): ``fixword`` is returned exactly
          as written in the dictionary.
    """
    if fixword == fixword.upper():
        # Abbreviation/acronym: keep the dictionary spelling.
        return fixword
    if word == word.capitalize() and not is_camel_case_string(fixword):
        # Checked before the all-upper test so that a one-letter word such
        # as "A" is treated as capitalized rather than as shouting.
        return _capitalize_candidates(fixword)
    if word == word.upper():
        return fixword.upper()
    # Lower case, mixed case, or camel-case suggestion: use the dictionary
    # spelling as is.
    return fixword

codespell_lib/tests/test_basic.py

Lines changed: 282 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -845,6 +845,288 @@ def test_case_handling_in_fixes(
845845
_helper_test_case_handling_in_fixes(tmp_path, capsys, reason=True)
846846

847847

848+
def _helper_test_case_handling_in_fix_case(
    tmpdir: pytest.TempPathFactory,
    capsys: pytest.CaptureFixture[str],
    dict_entry: str,
    bad_input: str,
    expected_output: str,
    reason: bool,
) -> None:
    """Run codespell on one input line with a one-entry custom dictionary.

    Writes ``dict_entry`` (optionally followed by the word ``reason``) to
    ``dictionary.txt`` and ``bad_input`` to ``bad.txt`` inside ``tmpdir``,
    runs ``cs.main`` with ``-D`` on them, and asserts that
    ``expected_output`` appears in the captured stdout.

    Args:
        tmpdir: Directory for the temporary files; only ``str(tmpdir)`` is
            used. NOTE(review): the annotation may not match the fixture
            type actually passed by the caller; confirm.
        capsys: Not referenced directly in this body; presumably consumed
            by ``cs.main``; verify before removing.
        dict_entry: Dictionary line, e.g. ``'skipt->skip, Skype, skipped,'``.
        bad_input: Text to check; contains the misspelling in the case
            variant under test.
        expected_output: Suggestion text, in its expected case, that must
            occur in stdout.
        reason: When true, append ``' reason'`` to the dictionary line and
            require ``'reason'`` to occur in stdout.
    """
    d = str(tmpdir)

    with open(op.join(d, 'dictionary.txt'), 'w') as f:
        if reason:
            f.write(dict_entry + ' reason\n')
        else:
            f.write(dict_entry + '\n')
    dictionary_name = f.name

    # the misspelled word is written in whatever case the caller supplies
    with open(op.join(d, 'bad.txt'), 'w') as f:
        f.write(bad_input + '\n')
    _, stdout, _ = cs.main('-D', dictionary_name, f.name, std=True)
    # the suggested fixes must appear in the expected case
    assert expected_output in stdout
    # the reason, if any, must not be modified
    if reason:
        assert 'reason' in stdout
874+
875+
876+
def test_case_handling_in_fix_case(tmpdir, capsys):
    """Test various case handling in fix_case() function."""
    # Each group is (dictionary entry, reason flag, cases); each case is
    # (input text containing the misspelling, text expected in the output).
    typical_multiple = (
        ('early adoptor', 'adopter, adaptor'),
        ('Early Adoptor', 'Adopter, Adaptor'),
        ('EARLY ADOPTOR', 'ADOPTER, ADAPTOR'),
        ('EaRlY AdOpToR', 'adopter, adaptor'),
    )
    groups = (
        # Test typical: Both misspelled and multiple suggested words are
        # coded as lower case in dictionary.
        # Verifying: Capitalize is consistent for all suggested words
        ('adoptor->adopter, adaptor,', False, typical_multiple),
        # Same cases again, with a reason appended to the dictionary entry.
        ('adoptor->adopter, adaptor,', True, typical_multiple),
        # Test abbreviation, acronym, initialism: Suggested word coded as
        # upper case in dictionary.
        ('asscii->ASCII', False, (
            ('asscii', 'ASCII'),
            ('Asscii', 'ASCII'),
            ('AssCii', 'ASCII'),
            ('ASSCII', 'ASCII'),
        )),
        # Test proper nouns: Misspelled coded as lower case in dictionary.
        ('austrailia->Australia', False, (
            ('austrailia', 'Australia'),
            ('Austrailia', 'Australia'),
            ('AustRailia', 'Australia'),
            ('AUSTRAILIA', 'AUSTRALIA'),
        )),
        # Test proper nouns, brand names: Misspelled coded as capitalize
        # in dictionary.
        ('Micosoft->Microsoft', False, (
            ('micosoft', 'Microsoft'),
            ('Micosoft', 'Microsoft'),
            ('MicoSoft', 'Microsoft'),
            ('MICOSOFT', 'MICROSOFT'),
        )),
        # Test typical single: Both misspelled and suggested word coded
        # as lower case in dictionary.
        ('pinapple->pineapple', False, (
            ('pinapple', 'pineapple'),
            ('Pinapple', 'Pineapple'),
        )),
        # Test typical multiple: Both misspelled and multiple suggested
        # words coded as lower case in dictionary.
        ('uspported->supported, unsupported,', False, (
            ('uspported', 'supported, unsupported'),
            ('Uspported', 'Supported, Unsupported'),
            ('USPPORTED', 'SUPPORTED, UNSUPPORTED'),
        )),
        # Test typical multiple & mix: Misspelled coded in lower. Multiple
        # suggested words coded as lower & capitalize case in dictionary.
        ('skipt->skip, Skype, skipped,', False, (
            ('skipt', 'skip, Skype, skipped'),
            ('Skipt', 'Skip, Skype, Skipped'),
            ('SKIPT', 'SKIP, SKYPE, SKIPPED'),
        )),
        # Test CamelCase basic: Suggested word coded as CamelCase in
        # dictionary.
        ('lesstiff->LessTif', False, (
            ('lesstiff', 'LessTif'),
            ('lessTiff', 'LessTif'),
            ('Lesstiff', 'LessTif'),
            ('LessTiff', 'LessTif'),
            ('LESSTIFF', 'LESSTIF'),
        )),
        # Test CamelCase brand names: Suggested word coded as CamelCase
        # in dictionary.
        ('mangodb->MongoDB', False, (
            ('mangodb', 'MongoDB'),
            ('mangoDb', 'MongoDB'),
            ('mangoDB', 'MongoDB'),
            ('Mangodb', 'MongoDB'),
            ('MangoDb', 'MongoDB'),
            ('MangoDB', 'MongoDB'),
        )),
        # Test CamelCase brand names: Suggested word coded as CamelCase
        # (leading lower-case letter) in dictionary.
        ('ebya->eBay', False, (
            ('ebya', 'eBay'),
            ('eBya', 'eBay'),
            ('Ebya', 'eBay'),
            ('EBya', 'eBay'),
            ('EBYA', 'EBAY'),
        )),
        # Special Test CamelCase, brand names: Misspelled is correct
        # spelling but incorrect case. Suggested word is coded as CamelCase
        # in dictionary. For custom dictionary only.
        ('mariadb->MariaDB', False, (
            ('mariadb', 'MariaDB'),
            ('mariaDb', 'MariaDB'),
            ('mariaDB', 'MariaDB'),
            ('Mariadb', 'MariaDB'),
            ('MariaDb', 'MariaDB'),
            ('MariaDB', 'MariaDB'),
        )),
        # Special Test CamelCase, brand names: Misspelled is correct
        # spelling but incorrect case. Multiple suggested words are coded
        # as CamelCase and lower case in dictionary. For custom dictionary
        # only.
        ('mysql->MySQL, mysql,', False, (
            ('mysql', 'MySQL, mysql'),
            ('mySql', 'MySQL, mysql'),
            ('mySQL', 'MySQL, mysql'),
            ('Mysql', 'MySQL, mysql'),
            ('MySql', 'MySQL, mysql'),
            ('MySQL', 'MySQL, mysql'),
        )),
    )
    for dict_entry, reason, cases in groups:
        for bad_input, expected_output in cases:
            _helper_test_case_handling_in_fix_case(
                tmpdir, capsys, dict_entry, bad_input, expected_output,
                reason=reason)
1128+
1129+
8481130
def test_context(
8491131
tmp_path: Path,
8501132
capsys: pytest.CaptureFixture[str],

0 commit comments

Comments
 (0)