forked from InsightSoftwareConsortium/ITKSoftwareGuide
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdoubleWordCheck.py
More file actions
executable file
·100 lines (76 loc) · 2.69 KB
/
doubleWordCheck.py
File metadata and controls
executable file
·100 lines (76 loc) · 2.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python3
"""
Search for doubled words in text
Behavior:
- Reads one or more files (or stdin if none).
- Processes input in "records" delimited by ".\n" (Perl: $/ = ".\n").
- In each record, highlights a repeated word (case-insensitive) where the two
occurrences are separated by whitespace and/or simple HTML tags.
- Removes any leading lines that contain no escape characters.
- Prefixes each remaining line with "<filename>: ".
"""
from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path
ESC = "\x1b"
# Perl: s/\b([a-z]+)((\s|<[^>]+>)+)(\1\b)/\e[7m$1\e[m$2\e[7m$4\e[m/ig
DOUBLE_WORD_RE = re.compile(
r"\b([a-z]+)((?:\s|<[^>]+>)+)(\1\b)",
re.IGNORECASE,
)
# Perl: s/^([^\e]*\n)+//mg
# Interpreted as: drop initial consecutive lines that contain no ESC.
LEADING_NO_ESC_LINES_RE = re.compile(r"^(?:[^\x1b]*\n)+", re.MULTILINE)
def highlight_double_words(record: str) -> str | None:
"""
Return transformed record if a double-word pattern is found; otherwise None.
"""
def repl(m: re.Match[str]) -> str:
w1 = m.group(1)
sep = m.group(2)
w2 = m.group(3) # same text as group(1) as matched
return f"{ESC}[7m{w1}{ESC}[m{sep}{ESC}[7m{w2}{ESC}[m"
new_record, n = DOUBLE_WORD_RE.subn(repl, record, count=1)
if n == 0:
return None
new_record = LEADING_NO_ESC_LINES_RE.sub("", new_record, count=1)
return new_record
def iter_records(text: str, sep: str = ".\n"):
    """
    Yield records split by the exact separator, including the separator (like Perl $/).

    A final chunk that is not terminated by *sep* is yielded as-is; empty
    *text* yields nothing.

    Args:
        text: The full input to split.
        sep: Record separator; must be non-empty.

    Raises:
        ValueError: If *sep* is empty. The error surfaces when iteration starts,
            because this is a generator.
    """
    if not sep:
        # str.find("") always succeeds at the current offset, so an empty
        # separator would never advance and the loop would spin forever.
        raise ValueError("empty separator")

    start = 0
    while True:
        idx = text.find(sep, start)
        if idx == -1:
            # No further separator: emit the unterminated tail, if any.
            if start < len(text):
                yield text[start:]
            break
        end = idx + len(sep)
        yield text[start:end]
        start = end
def process_stream(name: str, data: str, out) -> None:
    """
    Scan *data* record by record and write each record that contains a doubled
    word to *out*, with every line prefixed by "<name>: ".

    Records are delimited by ".\\n". Records without a doubled word produce
    no output.

    Args:
        name: Label placed at the start of every emitted line.
        data: The full text to scan.
        out: Writable text stream; only its ``write`` method is used.
    """
    prefix = f"{name}: "
    # Python's multiline ^ also matches at the very end of a string that ends
    # with a newline (Perl's does not). Excluding that position avoids a
    # dangling prefix after each record, which would double up on the next one.
    line_start = re.compile(r"^(?!\Z)", re.MULTILINE)

    for record in iter_records(data, sep=".\n"):
        transformed = highlight_double_words(record)
        if transformed is None:
            continue
        # Perl: s/^/$ARGV: /mg => prefix each line
        # A callable replacement keeps backslashes in the name literal; a
        # replacement string would parse them as escapes (e.g. "C:\data").
        out.write(line_start.sub(lambda _m: prefix, transformed))
def main() -> int:
    """
    Command-line entry point.

    Scans each file named on the command line, or standard input when no files
    are given, and writes the highlighted records to standard output.

    Returns:
        0 on success; 1 if at least one named file could not be read.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("files", nargs="*", help="Files to scan; if empty, read stdin.")
    args = ap.parse_args()

    if not args.files:
        process_stream("<stdin>", sys.stdin.read(), sys.stdout)
        return 0

    status = 0
    for f in args.files:
        try:
            data = Path(f).read_text(encoding="utf-8", errors="replace")
        except OSError as exc:
            # Report the unreadable path and keep going, so one bad argument
            # does not suppress results for the remaining files.
            print(f"{ap.prog}: {f}: {exc.strerror or exc}", file=sys.stderr)
            status = 1
            continue
        process_stream(f, data, sys.stdout)
    return status
if __name__ == "__main__":
    # Run the CLI and hand main()'s return value to the interpreter as the
    # process exit status.
    sys.exit(main())