Skip to content

Commit 6461d20

Browse files
committed
Fix case handling for various capitalization issues
* Fix capitalization of multi-word fixes, camelCase words, proper nouns and abbreviations * Do not lower-case the suggested words from the dictionary in build_dict() * Capitalization is now decided in fix_case()
1 parent c84db35 commit 6461d20

3 files changed

Lines changed: 327 additions & 7 deletions

File tree

codespell_lib/_spellchecker.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,10 +55,9 @@ def build_dict(
5555
translate_tables = [(x, str.maketrans(x, y)) for x, y in alt_chars]
5656
for line in f:
5757
[key, data] = line.split("->")
58-
# TODO: For now, convert both to lower.
59-
# Someday we can maybe add support for fixing caps.
58+
# Only convert key to lower case.
59+
# Do not modify data to lower case. Leave it as per dictionary.
6060
key = key.lower()
61-
data = data.lower()
6261
if key not in ignore_words:
6362
add_misspelling(key, data, misspellings)
6463
# generate alternative misspellings/fixes

codespell_lib/_text_util.py

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,44 @@
1717
"""
1818

1919

20+
def is_camel_case_word(input_word: str) -> bool:
    """Return True if *input_word* is a single camelCase/PascalCase word.

    A word qualifies when it contains no ``_``, ``-`` or space separator, is
    not entirely upper case, and has at least one upper-case letter after its
    first character (``eBay``, ``MySQL``, ``LessTif``).

    Plain capitalized words such as ``Skype`` are deliberately *not* treated
    as camelCase: only their first letter is upper case, so re-capitalizing
    them is harmless.
    """
    if any(separator in input_word for separator in "_- "):
        return False
    # An upper-case letter past position 0 implies the word is not all lower
    # case; the upper() comparison rules out acronyms such as "ASCII".
    return input_word != input_word.upper() and any(
        char.isupper() for char in input_word[1:]
    )


def is_camel_case_string(input_string: str) -> bool:
    """Return True if any comma-separated word in *input_string* is camelCase.

    Each word is stripped first: suggestions are stored as ``"a, b, c"``, so
    without stripping every word but the first would carry a leading space
    and could never be recognised, making the result depend on word order.
    """
    return any(is_camel_case_word(word.strip()) for word in input_string.split(","))


def _capitalize_first(text: str) -> str:
    """Upper-case only the first character of *text*, leaving the rest as is."""
    # str.capitalize() would lower-case the remainder and str.title() would
    # also capitalize after apostrophes, hyphens and spaces ("don't" ->
    # "Don'T", "a lot" -> "A Lot"); neither is wanted for a suggested fix.
    return text[:1].upper() + text[1:]


def fix_case(word: str, fixword: str) -> str:
    """Adapt the case of the suggested fix(es) to the misspelled *word*.

    Args:
        word: The misspelled word exactly as it appears in the checked text.
        fixword: The suggestion(s) as written in the dictionary; multiple
            suggestions are separated by commas.

    Returns:
        The suggestion string with its case adjusted:

        * an all upper-case ``fixword`` (acronym, e.g. ``ASCII``) is returned
          unchanged whatever the case of ``word``;
        * if ``word`` is capitalized and no suggestion is camelCase, the first
          letter of every suggestion is upper-cased and the suggestions are
          re-joined with ``", "`` (``Skipt`` -> ``Skip, Skype, Skipped``);
        * if ``word`` is all upper case, ``fixword`` is upper-cased;
        * otherwise ``fixword`` is returned as written in the dictionary,
          which preserves camelCase suggestions such as ``MySQL`` or ``eBay``.
    """
    if fixword == fixword.upper():
        # Abbreviation/acronym: keep the dictionary spelling, e.g. asscii->ASCII.
        return fixword
    if word == word.capitalize() and not is_camel_case_string(fixword):
        # Capitalized word and suggestions that are lower case or plain
        # proper nouns: capitalize each suggestion consistently.
        return ", ".join(
            _capitalize_first(suggestion.strip()) for suggestion in fixword.split(",")
        )
    if word == word.upper():
        # Shouting text, e.g. MONDAY: follow suit.
        return fixword.upper()
    # word is lower case, camelCase or otherwise mixed, or a suggestion is
    # camelCase (e.g. mysql->MySQL): use the dictionary spelling.
    return fixword

codespell_lib/tests/test_basic.py

Lines changed: 288 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -845,6 +845,294 @@ def test_case_handling_in_fixes(
845845
_helper_test_case_handling_in_fixes(tmp_path, capsys, reason=True)
846846

847847

848+
def _helper_test_case_handling_in_fix_case(
    tmpdir: Path,
    capsys: pytest.CaptureFixture[str],
    dict_entry: str,
    bad_input: str,
    expected_output: str,
    reason: bool,
) -> None:
    """Run codespell on one input with a one-line dictionary and check the fix.

    Args:
        tmpdir: Directory in which ``dictionary.txt`` and ``bad.txt`` are
            (re)written; the caller passes pytest's ``tmp_path``.
        capsys: Unused here; accepted so the signature mirrors the other
            test helpers.
        dict_entry: The dictionary line to write, without trailing newline,
            e.g. ``"adoptor->adopter, adaptor,"``.
        bad_input: Text containing the misspelling, in whatever case is
            being exercised.
        expected_output: Substring that must appear in codespell's stdout,
            i.e. the suggested fix(es) in their expected case.
        reason: When true, ``" reason"`` is appended to the dictionary line
            and must show up in stdout as well.
    """
    d = str(tmpdir)

    with open(op.join(d, "dictionary.txt"), "w") as f:
        if reason:
            f.write(dict_entry + " reason\n")
        else:
            f.write(dict_entry + "\n")
    dictionary_name = f.name

    # the misspelled text, in the case variant under test
    with open(op.join(d, "bad.txt"), "w") as f:
        f.write(bad_input + "\n")
    result = cs.main("-D", dictionary_name, f.name, std=True)
    assert isinstance(result, tuple)
    code, stdout, _ = result
    assert code == 1
    # all suggested fixes must be in expected_output
    assert expected_output in stdout
    # the reason, if any, must not be modified
    if reason:
        assert "reason" in stdout
877+
878+
879+
def test_case_handling_in_fix_case(
    tmp_path: Path,
    capsys: pytest.CaptureFixture[str],
) -> None:
    """Check the case of suggested fixes for many dictionary/input pairings."""
    # Each scenario is (dictionary entry, reason flag, cases), where cases is
    # a sequence of (text to check, fix expected in the output) pairs.
    scenarios = (
        # Typical: misspelling and multiple suggestions are lower case in
        # the dictionary. Capitalization is consistent for all suggestions.
        (
            "adoptor->adopter, adaptor,",
            False,
            (
                ("early adoptor", "adopter, adaptor"),
                ("Early Adoptor", "Adopter, Adaptor"),
                ("EARLY ADOPTOR", "ADOPTER, ADAPTOR"),
                ("EaRlY AdOpToR", "adopter, adaptor"),
            ),
        ),
        # Same again, with a reason attached to the dictionary entry.
        (
            "adoptor->adopter, adaptor,",
            True,
            (
                ("early adoptor", "adopter, adaptor"),
                ("Early Adoptor", "Adopter, Adaptor"),
                ("EARLY ADOPTOR", "ADOPTER, ADAPTOR"),
                ("EaRlY AdOpToR", "adopter, adaptor"),
            ),
        ),
        # Abbreviation, acronym, initialism: suggestion is upper case in
        # the dictionary.
        (
            "asscii->ASCII",
            False,
            (
                ("asscii", "ASCII"),
                ("Asscii", "ASCII"),
                ("AssCii", "ASCII"),
                ("ASSCII", "ASCII"),
            ),
        ),
        # Proper nouns: misspelling is lower case in the dictionary.
        (
            "austrailia->Australia",
            False,
            (
                ("austrailia", "Australia"),
                ("Austrailia", "Australia"),
                ("AustRailia", "Australia"),
                ("AUSTRAILIA", "AUSTRALIA"),
            ),
        ),
        # Proper nouns, brand names: misspelling is capitalized in the
        # dictionary.
        (
            "Micosoft->Microsoft",
            False,
            (
                ("micosoft", "Microsoft"),
                ("Micosoft", "Microsoft"),
                ("MicoSoft", "Microsoft"),
                ("MICOSOFT", "MICROSOFT"),
            ),
        ),
        # Typical single: misspelling and suggestion are both lower case.
        (
            "pinapple->pineapple",
            False,
            (
                ("pinapple", "pineapple"),
                ("Pinapple", "Pineapple"),
            ),
        ),
        # Typical multiple: misspelling and all suggestions are lower case.
        (
            "uspported->supported, unsupported,",
            False,
            (
                ("uspported", "supported, unsupported"),
                ("Uspported", "Supported, Unsupported"),
                ("USPPORTED", "SUPPORTED, UNSUPPORTED"),
            ),
        ),
        # Typical multiple & mixed: misspelling is lower case; suggestions
        # are a mix of lower case and capitalized.
        (
            "skipt->skip, Skype, skipped,",
            False,
            (
                ("skipt", "skip, Skype, skipped"),
                ("Skipt", "Skip, Skype, Skipped"),
                ("SKIPT", "SKIP, SKYPE, SKIPPED"),
            ),
        ),
        # CamelCase basic: suggestion is CamelCase in the dictionary.
        (
            "lesstiff->LessTif",
            False,
            (
                ("lesstiff", "LessTif"),
                ("lessTiff", "LessTif"),
                ("Lesstiff", "LessTif"),
                ("LessTiff", "LessTif"),
                ("LESSTIFF", "LESSTIF"),
            ),
        ),
        # CamelCase brand names: suggestion is CamelCase in the dictionary.
        (
            "mangodb->MongoDB",
            False,
            (
                ("mangodb", "MongoDB"),
                ("mangoDb", "MongoDB"),
                ("mangoDB", "MongoDB"),
                ("Mangodb", "MongoDB"),
                ("MangoDb", "MongoDB"),
                ("MangoDB", "MongoDB"),
            ),
        ),
        # CamelCase brand names starting with a lower-case letter.
        (
            "ebya->eBay",
            False,
            (
                ("ebya", "eBay"),
                ("eBya", "eBay"),
                ("Ebya", "eBay"),
                ("EBya", "eBay"),
                ("EBYA", "EBAY"),
            ),
        ),
        # Special, custom dictionaries only: the "misspelling" is spelled
        # correctly but has the wrong case; suggestion is CamelCase.
        (
            "mariadb->MariaDB",
            False,
            (
                ("mariadb", "MariaDB"),
                ("mariaDb", "MariaDB"),
                ("mariaDB", "MariaDB"),
                ("Mariadb", "MariaDB"),
                ("MariaDb", "MariaDB"),
                ("MariaDB", "MariaDB"),
            ),
        ),
        # Special, custom dictionaries only: as above, but with both a
        # CamelCase and a lower-case suggestion.
        (
            "mysql->MySQL, mysql,",
            False,
            (
                ("mysql", "MySQL, mysql"),
                ("mySql", "MySQL, mysql"),
                ("mySQL", "MySQL, mysql"),
                ("Mysql", "MySQL, mysql"),
                ("MySql", "MySQL, mysql"),
                ("MySQL", "MySQL, mysql"),
            ),
        ),
    )

    for dict_entry, with_reason, cases in scenarios:
        for bad_input, expected_output in cases:
            _helper_test_case_handling_in_fix_case(
                tmp_path,
                capsys,
                dict_entry,
                bad_input,
                expected_output,
                reason=with_reason,
            )
1134+
1135+
8481136
def test_context(
8491137
tmp_path: Path,
8501138
capsys: pytest.CaptureFixture[str],

0 commit comments

Comments
 (0)