Skip to content

Commit 0c8b056

Browse files
committed
[cr_checker]: enhance functionality
- allow for different border fill characters - detect multiple headers - add a new line after adding a new header
1 parent 266ba86 commit 0c8b056

2 files changed

Lines changed: 228 additions & 12 deletions

File tree

cr_checker/tests/test_cr_checker.py

Lines changed: 140 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,7 @@ def test_process_files_fix_inserts_header_after_shebang(tmp_path):
251251
assert results["no_copyright"] == 1
252252
expected_header = header_template.format(year=current_year, author=author)
253253
assert script.read_text(encoding="utf-8") == (
254-
"#!/usr/bin/env python3\n" + expected_header + "print('hi')\n"
254+
"#!/usr/bin/env python3\n" + expected_header + "\n" + "print('hi')\n"
255255
)
256256

257257

@@ -301,4 +301,142 @@ def test_process_files_fix_inserts_header_without_shebang(tmp_path):
301301
assert results["fixed"] == 1
302302
assert results["no_copyright"] == 1
303303
expected_header = header_template.format(year=current_year, author=author)
304-
assert script.read_text(encoding="utf-8") == expected_header + "print('hi')\n"
304+
assert script.read_text(encoding="utf-8") == expected_header + "\n" + "print('hi')\n"
305+
306+
307+
# test that border lines with different fill characters are accepted (flexible matching)
308+
def test_process_files_accepts_flexible_border(tmp_path):
309+
cr_checker = load_cr_checker_module()
310+
test_file = tmp_path / "file.cpp"
311+
current_year = datetime.now().year
312+
# Use '/' fill chars instead of '*' for border lines
313+
header = (
314+
"/////////////////////////////////////////////////////////////////////////////////////\n"
315+
f" * Copyright (c) {current_year} Author\n"
316+
" *\n"
317+
" * See the NOTICE file(s) distributed with this work for additional\n"
318+
" * information regarding copyright ownership.\n"
319+
" *\n"
320+
" * This program and the accompanying materials are made available under the\n"
321+
" * terms of the Apache License Version 2.0 which is available at\n"
322+
" * https://www.apache.org/licenses/LICENSE-2.0\n"
323+
" *\n"
324+
" * SPDX-License-Identifier: Apache-2.0\n"
325+
" /////////////////////////////////////////////////////////////////////////////////////\n"
326+
)
327+
test_file.write_text(header + "int main() {}\n", encoding="utf-8")
328+
header_template = load_template("cpp")
329+
330+
results = cr_checker.process_files(
331+
[test_file],
332+
{"cpp": header_template},
333+
False,
334+
use_mmap=False,
335+
encoding="utf-8",
336+
offset=0,
337+
remove_offset=0,
338+
)
339+
340+
assert results["no_copyright"] == 0
341+
342+
343+
# test that a blank line after the header does not cause a check failure
344+
def test_process_files_accepts_header_with_trailing_blank_line(tmp_path):
345+
cr_checker = load_cr_checker_module()
346+
test_file = tmp_path / "file.py"
347+
header_template = load_template("py")
348+
current_year = datetime.now().year
349+
header = header_template.format(year=current_year, author="Author")
350+
test_file.write_text(header + "\nsome content\n", encoding="utf-8")
351+
352+
results = cr_checker.process_files(
353+
[test_file],
354+
{"py": header_template},
355+
False,
356+
use_mmap=False,
357+
encoding="utf-8",
358+
offset=0,
359+
remove_offset=0,
360+
)
361+
362+
assert results["no_copyright"] == 0
363+
364+
365+
# test that fix_copyright inserts a blank line after the header
366+
def test_process_files_fix_inserts_trailing_blank_line(tmp_path):
367+
cr_checker = load_cr_checker_module()
368+
test_file = tmp_path / "file.py"
369+
test_file.write_text("some content\n", encoding="utf-8")
370+
header_template = load_template("py")
371+
current_year = datetime.now().year
372+
author = "Author"
373+
config = write_config(tmp_path, author)
374+
375+
cr_checker.process_files(
376+
[test_file],
377+
{"py": header_template},
378+
True,
379+
config=config,
380+
use_mmap=False,
381+
encoding="utf-8",
382+
offset=0,
383+
remove_offset=0,
384+
)
385+
386+
expected_header = header_template.format(year=current_year, author=author)
387+
assert test_file.read_text(encoding="utf-8").startswith(expected_header + "\n")
388+
389+
390+
# test that has_duplicate_copyright detects a header that appears twice
391+
def test_has_duplicate_copyright_detects_duplicate(tmp_path):
392+
cr_checker = load_cr_checker_module()
393+
test_file = tmp_path / "file.py"
394+
header_template = load_template("py")
395+
current_year = datetime.now().year
396+
header = header_template.format(year=current_year, author="Author")
397+
test_file.write_text(header + header + "some content\n", encoding="utf-8")
398+
399+
result = cr_checker.has_duplicate_copyright(
400+
test_file, header_template, False, "utf-8", 0
401+
)
402+
403+
assert result is True
404+
405+
406+
# test that has_duplicate_copyright returns False for a single header
407+
def test_has_duplicate_copyright_single_header(tmp_path):
408+
cr_checker = load_cr_checker_module()
409+
test_file = tmp_path / "file.py"
410+
header_template = load_template("py")
411+
current_year = datetime.now().year
412+
header = header_template.format(year=current_year, author="Author")
413+
test_file.write_text(header + "some content\n", encoding="utf-8")
414+
415+
result = cr_checker.has_duplicate_copyright(
416+
test_file, header_template, False, "utf-8", 0
417+
)
418+
419+
assert result is False
420+
421+
422+
# test that process_files counts duplicate headers separately from missing headers
423+
def test_process_files_detects_duplicate_header(tmp_path):
424+
cr_checker = load_cr_checker_module()
425+
test_file = tmp_path / "file.py"
426+
header_template = load_template("py")
427+
current_year = datetime.now().year
428+
header = header_template.format(year=current_year, author="Author")
429+
test_file.write_text(header + header + "some content\n", encoding="utf-8")
430+
431+
results = cr_checker.process_files(
432+
[test_file],
433+
{"py": header_template},
434+
False,
435+
use_mmap=False,
436+
encoding="utf-8",
437+
offset=0,
438+
remove_offset=0,
439+
)
440+
441+
assert results["duplicate_copyright"] == 1
442+
assert results["no_copyright"] == 0

cr_checker/tool/cr_checker.py

Lines changed: 88 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@
2929
BYTES_TO_READ = 4 * 1024
3030
DEFAULT_AUTHOR = "Contributors to the Eclipse Foundation"
3131

32+
BORDER_FILL_PATTERN = re.compile(r"([/*#'\-=+])\1{4,}")
33+
FILL_CHARS_REGEX = r"[/*#'\-=+]+"
34+
3235
LOGGER = logging.getLogger()
3336

3437
COLORS = {
@@ -139,6 +142,28 @@ def convert_bre_to_regex(template: str) -> str:
139142
return escaped
140143

141144

145+
def line_to_flexible_regex(line: str) -> str:
146+
"""
147+
Convert a border line to a regex that accepts any fill characters.
148+
149+
Runs of 5+ identical fill characters (e.g. ``****``) are replaced with
150+
``[/*#'\\-=+]+`` so that alternative styles (e.g. ``////``) are also
151+
accepted.
152+
"""
153+
stripped = line.rstrip("\n")
154+
has_newline = line.endswith("\n")
155+
result = []
156+
last_end = 0
157+
for m in BORDER_FILL_PATTERN.finditer(stripped):
158+
result.append(re.escape(stripped[last_end : m.start()]))
159+
result.append(FILL_CHARS_REGEX)
160+
last_end = m.end()
161+
result.append(re.escape(stripped[last_end:]))
162+
if has_newline:
163+
result.append("\n")
164+
return "".join(result)
165+
166+
142167
def load_templates(path):
143168
"""
144169
Loads the copyright templates from a configuration file.
@@ -196,7 +221,7 @@ def load_exclusion(path):
196221
path (str): Path to the exclusion file.
197222
198223
Returns:
199-
tuple(list, bool): a list of files that are excluded from the coypright check and a boolean indicating whether
224+
tuple(list, bool): a list of files that are excluded from the copyright check and a boolean indicating whether
200225
all paths listed in the exclusion file exist and are files.
201226
"""
202227

@@ -362,13 +387,18 @@ def has_copyright(path, template, use_mmap, encoding, offset, config=None):
362387
IOError: If there is an error opening or reading the file.
363388
"""
364389

365-
load_text = load_text_from_file
366-
if use_mmap:
367-
load_text = load_text_from_file_with_mmap
390+
load_text = load_text_from_file_with_mmap if use_mmap else load_text_from_file
368391

369-
template_regex = convert_bre_to_regex(
370-
template.format(year=r"\\d\{4\}", author=r"\.\*")
371-
)
392+
lines = template.splitlines(keepends=True)
393+
regex_parts = []
394+
for line in lines:
395+
stripped_line = line.rstrip("\n")
396+
if BORDER_FILL_PATTERN.search(stripped_line):
397+
regex_parts.append(line_to_flexible_regex(line))
398+
else:
399+
formatted = line.format(year=r"\\d\{4\}", author=r"\.\*")
400+
regex_parts.append(convert_bre_to_regex(formatted))
401+
template_regex = "".join(regex_parts) + "\n?"
372402

373403
if re.match(template_regex, load_text(path, BYTES_TO_READ, encoding, offset)):
374404
LOGGER.debug("File %s has copyright.", path)
@@ -378,6 +408,41 @@ def has_copyright(path, template, use_mmap, encoding, offset, config=None):
378408
return False
379409

380410

411+
def has_duplicate_copyright(path, template, use_mmap, encoding, offset):
412+
"""
413+
Checks if the copyright header appears more than once in the file.
414+
415+
Args:
416+
path (Path): A `pathlib.Path` object pointing to the file to check.
417+
template (str): The copyright template to search for.
418+
use_mmap (bool): If True, uses memory-mapped file reading.
419+
encoding (str): Encoding type to use when reading the file.
420+
offset (int): Byte offset to skip (e.g. shebang line).
421+
422+
Returns:
423+
bool: True if the copyright header appears more than once, False otherwise.
424+
"""
425+
load_text = load_text_from_file_with_mmap if use_mmap else load_text_from_file
426+
427+
lines = template.splitlines(keepends=True)
428+
regex_parts = []
429+
for line in lines:
430+
stripped_line = line.rstrip("\n")
431+
if BORDER_FILL_PATTERN.search(stripped_line):
432+
regex_parts.append(line_to_flexible_regex(line))
433+
else:
434+
formatted = line.format(year=r"\\d\{4\}", author=r"\.\*")
435+
regex_parts.append(convert_bre_to_regex(formatted))
436+
template_regex = "\n?".join(regex_parts)
437+
438+
content = load_text(path, 2 * BYTES_TO_READ, encoding, offset)
439+
matches = list(re.finditer(template_regex, content))
440+
if len(matches) > 1:
441+
LOGGER.debug("File %s has %d copyright headers.", path, len(matches))
442+
return True
443+
return False
444+
445+
381446
def get_files_from_dir(directory, exts=None):
382447
"""
383448
Finds files in the specified directories. Filters by extensions if provided.
@@ -520,6 +585,7 @@ def fix_copyright(path, copyright_text, encoding, offset, config=None) -> bool:
520585
copyright_text.format(
521586
year=datetime.now().year, author=get_author_from_config(config)
522587
)
588+
+ "\n"
523589
)
524590
for chunk in iter(lambda: temp.read(4096), ""):
525591
handle.write(chunk)
@@ -562,7 +628,7 @@ def process_files(
562628
Returns:
563629
int: The number of files that do not contain the required copyright text.
564630
"""
565-
results = {"no_copyright": 0, "fixed": 0}
631+
results = {"no_copyright": 0, "fixed": 0, "duplicate_copyright": 0}
566632
for item in files:
567633
name = Path(item).name
568634
key = name if name == "BUILD" else Path(item).suffix[1:]
@@ -584,7 +650,12 @@ def process_files(
584650
shebang_offset = detect_shebang_offset(item, encoding)
585651
effective_offset = offset + shebang_offset if offset == 0 else offset
586652

587-
if not has_copyright(
653+
if has_duplicate_copyright(
654+
item, templates[key], use_mmap, encoding, effective_offset
655+
):
656+
LOGGER.error("Duplicate copyright header in: %s", item)
657+
results["duplicate_copyright"] += 1
658+
elif not has_copyright(
588659
item, templates[key], use_mmap, encoding, effective_offset, config
589660
):
590661
if fix:
@@ -771,6 +842,7 @@ def main(argv=None):
771842
)
772843
total_no = results["no_copyright"]
773844
total_fixes = results["fixed"]
845+
total_duplicates = results["duplicate_copyright"]
774846

775847
LOGGER.info("=" * 64)
776848
LOGGER.info("Process completed.")
@@ -780,6 +852,12 @@ def main(argv=None):
780852
total_no,
781853
COLORS["ENDC"],
782854
)
855+
LOGGER.info(
856+
"Total files with duplicate copyright: %s%d%s",
857+
COLORS["RED"] if total_duplicates > 0 else COLORS["GREEN"],
858+
total_duplicates,
859+
COLORS["ENDC"],
860+
)
783861
if not exclusion_valid:
784862
LOGGER.info("The exclusion file contains paths that do not exist.")
785863
if args.fix:
@@ -798,7 +876,7 @@ def main(argv=None):
798876
)
799877
LOGGER.info("=" * 64)
800878

801-
return 0 if (total_no == 0 and exclusion_valid) else 1
879+
return 0 if (total_no == 0 and total_duplicates == 0 and exclusion_valid) else 1
802880

803881

804882
if __name__ == "__main__":

0 commit comments

Comments
 (0)