Skip to content

Commit 3fb5e0f

Browse files
committed
Fix --ignore-multiline-regex with --write-changes some more
Consider the following file:

```
$ cat test.txt
codespell:ignore-begin
Thsi line contains a typo
codespell:ignore-end
Thsi line also contains a typo
While this line is correct
```

If we use codespell to fix the second typo:

```
$ codespell \
    --ignore-multiline-regex 'codespell:ignore-begin.*codespell:ignore-end' \
    --write-changes \
    test.txt
FIXED: test.txt
```

indeed that typo is fixed, but the text matching the multiline regexp is gone:

```
$ cat test.txt
This line also contains a typo
While this line is correct
$
```

The problem is that FileOpener.get_lines (returning a list of strings) implements --ignore-multiline-regex by blanking out the text matching the regexp.

Fix this by changing FileOpener.get_lines to return a list of fragments, each modeled as a tuple (ignored: bool, line_number: int, lines: list[str]), and handling this new format elsewhere.
1 parent b10f2dd commit 3fb5e0f

2 files changed

Lines changed: 75 additions & 36 deletions

File tree

codespell_lib/_codespell.py

Lines changed: 56 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -227,12 +227,14 @@ def init_chardet(self) -> None:
227227

228228
self.encdetector = UniversalDetector()
229229

230-
def open(self, filename: str) -> tuple[list[str], str]:
230+
def open(self, filename: str) -> tuple[list[tuple[bool, int, list[str]]], str]:
231231
if self.use_chardet:
232232
return self.open_with_chardet(filename)
233233
return self.open_with_internal(filename)
234234

235-
def open_with_chardet(self, filename: str) -> tuple[list[str], str]:
235+
def open_with_chardet(
236+
self, filename: str
237+
) -> tuple[list[tuple[bool, int, list[str]]], str]:
236238
self.encdetector.reset()
237239
with open(filename, "rb") as fb:
238240
for line in fb:
@@ -259,7 +261,9 @@ def open_with_chardet(self, filename: str) -> tuple[list[str], str]:
259261

260262
return lines, f.encoding
261263

262-
def open_with_internal(self, filename: str) -> tuple[list[str], str]:
264+
def open_with_internal(
265+
self, filename: str
266+
) -> tuple[list[tuple[bool, int, list[str]]], str]:
263267
encoding = None
264268
first_try = True
265269
for encoding in ("utf-8", "iso-8859-1"):
@@ -286,21 +290,25 @@ def open_with_internal(self, filename: str) -> tuple[list[str], str]:
286290

287291
return lines, encoding
288292

289-
def get_lines(self, f: TextIO) -> list[str]:
293+
def get_lines(self, f: TextIO) -> list[tuple[bool, int, list[str]]]:
294+
fragments = []
295+
line_number = 0
290296
if self.ignore_multiline_regex:
291297
text = f.read()
292298
pos = 0
293-
text2 = ""
294299
for m in re.finditer(self.ignore_multiline_regex, text):
295-
text2 += text[pos : m.start()]
296-
# Replace with blank lines so line numbers are unchanged.
297-
text2 += "\n" * m.group().count("\n")
300+
lines = text[pos : m.start()].splitlines(True)
301+
fragments.append((False, line_number, lines))
302+
line_number += len(lines)
303+
lines = m.group().splitlines(True)
304+
fragments.append((True, line_number, lines))
305+
line_number += len(lines) - 1
298306
pos = m.end()
299-
text2 += text[pos:]
300-
lines = text2.splitlines(True)
307+
lines = text[pos:].splitlines(True)
308+
fragments.append((False, line_number, lines))
301309
else:
302-
lines = f.readlines()
303-
return lines
310+
fragments.append((False, line_number, f.readlines()))
311+
return fragments
304312

305313

306314
# -.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-
@@ -870,7 +878,7 @@ def apply_uri_ignore_words(
870878

871879

872880
def parse_lines(
873-
lines: list[str],
881+
fragment: tuple[bool, int, list[str]],
874882
filename: str,
875883
colors: TermColors,
876884
summary: Optional[Summary],
@@ -887,10 +895,13 @@ def parse_lines(
887895
bad_count = 0
888896
changed = False
889897

898+
_, fragment_line_number, lines = fragment
899+
890900
for i, line in enumerate(lines):
891901
line = line.rstrip()
892902
if not line or line in exclude_lines:
893903
continue
904+
line_number = fragment_line_number + i
894905

895906
extra_words_to_ignore = set()
896907
match = inline_ignore_regex.search(line)
@@ -977,7 +988,7 @@ def parse_lines(
977988
continue
978989

979990
cfilename = f"{colors.FILE}{filename}{colors.DISABLE}"
980-
cline = f"{colors.FILE}{i + 1}{colors.DISABLE}"
991+
cline = f"{colors.FILE}{line_number + 1}{colors.DISABLE}"
981992
cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
982993
crightword = f"{colors.FWORD}{fixword}{colors.DISABLE}"
983994

@@ -1028,13 +1039,13 @@ def parse_file(
10281039
options: argparse.Namespace,
10291040
) -> int:
10301041
bad_count = 0
1031-
lines = None
1042+
fragments = None
10321043

10331044
# Read lines.
10341045
if filename == "-":
10351046
f = sys.stdin
10361047
encoding = "utf-8"
1037-
lines = file_opener.get_lines(f)
1048+
fragments = file_opener.get_lines(f)
10381049
else:
10391050
if options.check_filenames:
10401051
for word in extract_words(filename, word_regex, ignore_word_regex):
@@ -1084,42 +1095,51 @@ def parse_file(
10841095
print(f"WARNING: Binary file: {filename}", file=sys.stderr)
10851096
return bad_count
10861097
try:
1087-
lines, encoding = file_opener.open(filename)
1098+
fragments, encoding = file_opener.open(filename)
10881099
except OSError:
10891100
return bad_count
10901101

10911102
# Parse lines.
1092-
bad_count_update, changed = parse_lines(
1093-
lines,
1094-
filename,
1095-
colors,
1096-
summary,
1097-
misspellings,
1098-
ignore_words_cased,
1099-
exclude_lines,
1100-
word_regex,
1101-
ignore_word_regex,
1102-
uri_regex,
1103-
uri_ignore_words,
1104-
context,
1105-
options,
1106-
)
1107-
bad_count += bad_count_update
1103+
changed = False
1104+
for fragment in fragments:
1105+
ignore, _, _ = fragment
1106+
if ignore:
1107+
continue
1108+
1109+
bad_count_update, changed_update = parse_lines(
1110+
fragment,
1111+
filename,
1112+
colors,
1113+
summary,
1114+
misspellings,
1115+
ignore_words_cased,
1116+
exclude_lines,
1117+
word_regex,
1118+
ignore_word_regex,
1119+
uri_regex,
1120+
uri_ignore_words,
1121+
context,
1122+
options,
1123+
)
1124+
bad_count += bad_count_update
1125+
changed = changed or changed_update
11081126

11091127
# Write out lines, if changed.
11101128
if changed:
11111129
if filename == "-":
11121130
print("---")
1113-
for line in lines:
1114-
print(line, end="")
1131+
for _, _, lines in fragments:
1132+
for line in lines:
1133+
print(line, end="")
11151134
else:
11161135
if not options.quiet_level & QuietLevels.FIXES:
11171136
print(
11181137
f"{colors.FWORD}FIXED:{colors.DISABLE} {filename}",
11191138
file=sys.stderr,
11201139
)
11211140
with open(filename, "w", encoding=encoding, newline="") as f:
1122-
f.writelines(lines)
1141+
for _, _, lines in fragments:
1142+
f.writelines(lines)
11231143

11241144
return bad_count
11251145

codespell_lib/tests/test_basic.py

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -994,6 +994,25 @@ def test_ignore_multiline_regex_option(
994994
)
995995
assert fname.read_text() == "This\nThis"
996996

997+
fname.write_text(text)
998+
cs.main(
999+
fname,
1000+
"-w",
1001+
"--ignore-multiline-regex",
1002+
"codespell:ignore-begin.*codespell:ignore-end",
1003+
)
1004+
fixed_text = """
1005+
Please see http://example.com/abandoned for info
1006+
# codespell:ignore-begin
1007+
'''
1008+
abandonned
1009+
abandonned
1010+
'''
1011+
# codespell:ignore-end
1012+
abandoned
1013+
"""
1014+
assert fname.read_text() == fixed_text
1015+
9971016

9981017
def test_uri_regex_option(
9991018
tmp_path: Path,

0 commit comments

Comments
 (0)