Skip to content

Commit ba2a4e8

Browse files
committed
Fix case handling for various capitalization issues
* Fix capitalization of multiple suggested words, camelCase, proper nouns, and abbreviations * Do not lower-case the suggested words from the dictionary in build_dict() * Capitalization is now decided in fix_case()
1 parent 3a17154 commit ba2a4e8

2 files changed

Lines changed: 304 additions & 47 deletions

File tree

codespell_lib/_codespell.py

Lines changed: 45 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -454,10 +454,10 @@ def build_dict(filename, misspellings, ignore_words):
454454
with codecs.open(filename, mode='r', encoding='utf-8') as f:
455455
for line in f:
456456
[key, data] = line.split('->')
457-
# TODO for now, convert both to lower. Someday we can maybe add
458-
# support for fixing caps.
457+
# Convert key to lower case.
458+
# Do not modify data to lower case. Leave it as per dictionary.
459459
key = key.lower()
460-
data = data.lower()
460+
# data = data.lower()
461461
if key in ignore_words:
462462
continue
463463
data = data.strip()
@@ -493,13 +493,51 @@ def is_text_file(filename):
493493
return True
494494

495495

496+
def is_camel_case_word(input_word):
    """Tell whether a single token is written in mixed case.

    Returns True when ``input_word`` differs from both its lower-cased and
    its upper-cased form and contains no underscore, hyphen or space.
    Note that a plainly capitalized word such as ``Skype`` satisfies this
    test as well as ``MySQL`` or ``eBay``.
    """
    # Any separator character disqualifies the token outright.
    if any(separator in input_word for separator in ('_', '-', ' ')):
        return False
    # Neither entirely lower case nor entirely upper case.
    return input_word not in (input_word.lower(), input_word.upper())
502+
503+
504+
def is_camel_case_string(input_string):
    """Tell whether any comma-separated piece of a string is mixed case.

    ``input_string`` is split on ``,`` and each piece is handed to
    is_camel_case_word() unchanged; True is returned as soon as one piece
    qualifies, False otherwise.

    NOTE: pieces are not stripped, so in ``'skip, Skype'`` the second piece
    is ``' Skype'``; its leading space makes is_camel_case_word() reject it.
    """
    return any(is_camel_case_word(piece)
               for piece in input_string.split(','))
509+
510+
496511
def fix_case(word, fixword):
    """Adapt the case of a dictionary suggestion to the misspelled word.

    ``word`` is the misspelling as it appears in the checked text and
    ``fixword`` is the suggestion string taken from the dictionary, which
    may hold several comma-separated candidates.

    The first matching rule wins:

    1. ``fixword`` is entirely upper case (abbreviation, acronym): use it
       as per dictionary.  Eg. asscii->ASCII
    2. ``word`` is capitalized and is_camel_case_string() reports no
       mixed-case candidate: upper-case the first letter of every
       candidate.  Eg. Weather, Whether / Skip, Skype, Skipped
    3. ``word`` is entirely upper case: upper-case ``fixword``.
       Eg. MONDAY
    4. Anything else (lower case, CamelCase, ...): use ``fixword`` as per
       dictionary.  Eg. mysql->MySQL

    Returns the suggestion string with its case adjusted.
    """
    if fixword == fixword.upper():
        return fixword
    if word == word.capitalize() and not is_camel_case_string(fixword):
        # Only the first letter of each candidate is touched.  str.title()
        # would also capitalize after apostrophes, hyphens and spaces
        # ("don't" -> "Don'T") and lower-case everything else, destroying
        # acronyms that appear in later candidates.
        return ','.join(_capitalize_first(candidate)
                        for candidate in fixword.split(','))
    if word == word.upper():
        return fixword.upper()
    return fixword


def _capitalize_first(candidate):
    """Upper-case the first non-blank character, leaving the rest as is."""
    stripped = candidate.lstrip()
    # Keep the original leading whitespace so separators stay unchanged.
    leading = candidate[:len(candidate) - len(stripped)]
    return leading + stripped[:1].upper() + stripped[1:]
504542

505543

codespell_lib/tests/test_basic.py

Lines changed: 259 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -393,61 +393,280 @@ def test_case_handling(tmpdir, capsys):
393393
assert f.read().decode('utf-8') == 'this has an ASCII error'
394394

395395

396-
def _helper_test_case_handling_in_fixes(tmpdir, capsys, reason):
396+
def _helper_test_case_handling(tmpdir, capsys, dict_entry, bad_input,
                               expected_output, reason):
    """Check the suggestion printed for one input and one dictionary entry.

    Writes ``dict_entry`` (followed by ' reason' when ``reason`` is true)
    to a dictionary file and ``bad_input`` to a text file inside
    ``tmpdir``, runs ``cs.main`` with that dictionary on the text file and
    asserts that ``expected_output`` appears in the captured stdout.
    ``capsys`` is accepted but not used here.
    """
    d = str(tmpdir)

    # single-entry custom dictionary, optionally carrying a reason
    dictionary_name = op.join(d, 'dictionary.txt')
    with open(dictionary_name, 'w') as f:
        f.write(dict_entry + (' reason\n' if reason else '\n'))

    # the text to be checked
    bad_name = op.join(d, 'bad.txt')
    with open(bad_name, 'w') as f:
        f.write(bad_input + '\n')

    # the second element of the result is treated as stdout; the other two
    # are not inspected
    _, stdout, _ = cs.main('-D', dictionary_name, bad_name, std=True)
    # the suggested fixes must appear with the expected case
    assert expected_output in stdout
    # the reason, if any, must not be modified
    if reason:
        assert 'reason' in stdout
445416

446417

447-
def test_case_handling_in_fixes(tmpdir, capsys):
448-
"""Test that the case of fixes is similar to the mispelled word."""
449-
_helper_test_case_handling_in_fixes(tmpdir, capsys, reason=False)
450-
_helper_test_case_handling_in_fixes(tmpdir, capsys, reason=True)
418+
def test_case_handling_in_fix_case(tmpdir, capsys):
    """Test various case handling in fix_case() function."""
    # Each scenario: (dictionary entry, reason flag,
    #                 ((bad input, expected output), ...))
    scenarios = (
        # Typical: misspelled and multiple suggested words are all lower
        # case in the dictionary; capitalization must be consistent for
        # all suggested words.
        ('adoptor->adopter, adaptor,', False, (
            ('early adoptor', 'adopter, adaptor'),
            ('Early Adoptor', 'Adopter, Adaptor'),
            ('EARLY ADOPTOR', 'ADOPTER, ADAPTOR'),
            ('EaRlY AdOpToR', 'adopter, adaptor'),
        )),
        # Same checks with a reason appended to the dictionary entry.
        ('adoptor->adopter, adaptor,', True, (
            ('early adoptor', 'adopter, adaptor'),
            ('Early Adoptor', 'Adopter, Adaptor'),
            ('EARLY ADOPTOR', 'ADOPTER, ADAPTOR'),
            ('EaRlY AdOpToR', 'adopter, adaptor'),
        )),
        # Abbreviation, acronym, initialism: suggested word is upper case
        # in the dictionary.
        ('asscii->ASCII', False, (
            ('asscii', 'ASCII'),
            ('Asscii', 'ASCII'),
            ('AssCii', 'ASCII'),
            ('ASSCII', 'ASCII'),
        )),
        # Proper nouns: misspelled word is lower case in the dictionary.
        ('austrailia->Australia', False, (
            ('austrailia', 'Australia'),
            ('Austrailia', 'Australia'),
            ('AustRailia', 'Australia'),
            ('AUSTRAILIA', 'AUSTRALIA'),
        )),
        # Proper nouns, brand names: misspelled word is capitalized in the
        # dictionary.
        ('Micosoft->Microsoft', False, (
            ('micosoft', 'Microsoft'),
            ('Micosoft', 'Microsoft'),
            ('MicoSoft', 'Microsoft'),
            ('MICOSOFT', 'MICROSOFT'),
        )),
        # Typical single: misspelled and suggested word are both lower
        # case in the dictionary.
        ('pinapple->pineapple', False, (
            ('pinapple', 'pineapple'),
            ('Pinapple', 'Pineapple'),
        )),
        # Typical multiple: misspelled and multiple suggested words are
        # all lower case in the dictionary.
        ('uspported->supported, unsupported,', False, (
            ('uspported', 'supported, unsupported'),
            ('Uspported', 'Supported, Unsupported'),
            ('USPPORTED', 'SUPPORTED, UNSUPPORTED'),
        )),
        # Typical multiple & mix: misspelled word is lower case; suggested
        # words are lower case and capitalized in the dictionary.
        ('skipt->skip, Skype, skipped,', False, (
            ('skipt', 'skip, Skype, skipped'),
            ('Skipt', 'Skip, Skype, Skipped'),
            ('SKIPT', 'SKIP, SKYPE, SKIPPED'),
        )),
        # CamelCase basic: suggested word is CamelCase in the dictionary.
        ('lesstiff->LessTif', False, (
            ('lesstiff', 'LessTif'),
            ('lessTiff', 'LessTif'),
            ('Lesstiff', 'LessTif'),
            ('LessTiff', 'LessTif'),
            ('LESSTIFF', 'LESSTIF'),
        )),
        # CamelCase brand names: suggested word is CamelCase in the
        # dictionary.
        ('mangodb->MongoDB', False, (
            ('mangodb', 'MongoDB'),
            ('mangoDb', 'MongoDB'),
            ('mangoDB', 'MongoDB'),
            ('Mangodb', 'MongoDB'),
            ('MangoDb', 'MongoDB'),
            ('MangoDB', 'MongoDB'),
        )),
        # CamelCase brand names: suggested word is CamelCase in the
        # dictionary.
        ('ebya->eBay', False, (
            ('ebya', 'eBay'),
            ('eBya', 'eBay'),
            ('Ebya', 'eBay'),
            ('EBya', 'eBay'),
            ('EBYA', 'EBAY'),
        )),
        # Special CamelCase, brand names: the "misspelled" word is spelled
        # correctly but has the wrong case; suggested word is CamelCase in
        # the dictionary.  For custom dictionary only.
        ('mariadb->MariaDB', False, (
            ('mariadb', 'MariaDB'),
            ('mariaDb', 'MariaDB'),
            ('mariaDB', 'MariaDB'),
            ('Mariadb', 'MariaDB'),
            ('MariaDb', 'MariaDB'),
            ('MariaDB', 'MariaDB'),
        )),
        # Special CamelCase, brand names: the "misspelled" word is spelled
        # correctly but has the wrong case; suggested words are CamelCase
        # and lower case in the dictionary.  For custom dictionary only.
        ('mysql->MySQL, mysql,', False, (
            ('mysql', 'MySQL, mysql'),
            ('mySql', 'MySQL, mysql'),
            ('mySQL', 'MySQL, mysql'),
            ('Mysql', 'MySQL, mysql'),
            ('MySql', 'MySQL, mysql'),
            ('MySQL', 'MySQL, mysql'),
        )),
    )

    # Run every check in table order, one helper call per pair.
    for dict_entry, with_reason, checks in scenarios:
        for bad_input, expected_output in checks:
            _helper_test_case_handling(tmpdir, capsys,
                                       dict_entry,
                                       bad_input,
                                       expected_output, reason=with_reason)
451670

452671

453672
def test_context(tmpdir, capsys):

0 commit comments

Comments
 (0)