Skip to content

Commit 691e5f2

Browse files
committed
[cr checker]: fix duplicate header check
1 parent 55ff194 commit 691e5f2

5 files changed

Lines changed: 53 additions & 17 deletions

File tree

BUILD

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,7 @@ copyright_checker(
3737
# Add other directories/files you want to check
3838
],
3939
config = "//cr_checker/resources:config",
40+
exclusion = "//cr_checker/resources:exclusion",
4041
template = "//cr_checker/resources:templates",
4142
visibility = ["//visibility:public"],
4243
)

cr_checker/resources/BUILD

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,3 +26,11 @@ filegroup(
2626
],
2727
visibility = ["//visibility:public"],
2828
)
29+
30+
filegroup(
31+
name = "exclusion",
32+
srcs = [
33+
"exclusion.txt",
34+
],
35+
visibility = ["//visibility:public"],
36+
)

cr_checker/resources/exclusion.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
cr_checker/resources/templates.ini

cr_checker/tool/cr_checker.py

Lines changed: 38 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -408,35 +408,50 @@ def has_copyright(path, template, use_mmap, encoding, offset, config=None):
408408
return False
409409

410410

411-
def has_duplicate_copyright(path, template, use_mmap, encoding, offset):
411+
def has_any_copyright(path, use_mmap, encoding, offset):
412412
"""
413-
Checks if the copyright header appears more than once in the file.
413+
Checks if any copyright notice is present in the file header, regardless of format.
414414
415415
Args:
416416
path (Path): A `pathlib.Path` object pointing to the file to check.
417-
template (str): The copyright template to search for.
418417
use_mmap (bool): If True, uses memory-mapped file reading.
419418
encoding (str): Encoding type to use when reading the file.
420419
offset (int): Byte offset to skip (e.g. shebang line).
421420
422421
Returns:
423-
bool: True if the copyright header appears more than once, False otherwise.
422+
bool: True if any copyright notice is found, False otherwise.
424423
"""
425424
load_text = load_text_from_file_with_mmap if use_mmap else load_text_from_file
425+
content = load_text(path, BYTES_TO_READ, encoding, offset)
426+
return bool(
427+
re.search(
428+
r"Copyright.*SPDX-License-Identifier", content, re.IGNORECASE | re.DOTALL
429+
)
430+
)
426431

427-
lines = template.splitlines(keepends=True)
428-
regex_parts = []
429-
for line in lines:
430-
stripped_line = line.rstrip("\n")
431-
if BORDER_FILL_PATTERN.search(stripped_line):
432-
regex_parts.append(line_to_flexible_regex(line))
433-
else:
434-
formatted = line.format(year=r"\\d\{4\}\(-\\d\{4\}\)\?", author=r"\.\*")
435-
regex_parts.append(convert_bre_to_regex(formatted))
436-
template_regex = "\n?".join(regex_parts)
437432

438-
content = load_text(path, 2 * BYTES_TO_READ, encoding, offset)
439-
matches = list(re.finditer(template_regex, content))
433+
def has_duplicate_copyright(path, template, use_mmap, encoding, offset):
434+
"""
435+
Checks if more than one copyright notice is present in the file header.
436+
437+
The check is format-agnostic: it counts occurrences of ``SPDX-License-Identifier``
438+
within a window of twice the template length, so that headers written by different
439+
tools (e.g. REUSE vs. cr_checker) are both counted while string literals that
440+
embed copyright text further into the file are ignored.
441+
442+
Args:
443+
path (Path): A `pathlib.Path` object pointing to the file to check.
444+
template (str): The copyright template; its length defines the search window.
445+
use_mmap (bool): If True, uses memory-mapped file reading.
446+
encoding (str): Encoding type to use when reading the file.
447+
offset (int): Byte offset to skip (e.g. shebang line).
448+
449+
Returns:
450+
bool: True if more than one copyright notice is found, False otherwise.
451+
"""
452+
load_text = load_text_from_file_with_mmap if use_mmap else load_text_from_file
453+
content = load_text(path, 2 * len(template), encoding, offset)
454+
matches = list(re.finditer(r"SPDX-License-Identifier", content, re.IGNORECASE))
440455
if len(matches) > 1:
441456
LOGGER.debug("File %s has %d copyright headers.", path, len(matches))
442457
return True
@@ -500,6 +515,8 @@ def collect_inputs(inputs, exts=None):
500515
):
501516
LOGGER.debug("Processing file: %s", item)
502517
all_files.append(item)
518+
elif item.is_file():
519+
LOGGER.debug("Skipped (no configuration for file extension): %s", item)
503520
else:
504521
LOGGER.warning("Skipped (input is not a valid file or directory): %s", item)
505522
return all_files
@@ -658,7 +675,11 @@ def process_files(
658675
elif not has_copyright(
659676
item, templates[key], use_mmap, encoding, effective_offset, config
660677
):
661-
if fix:
678+
if has_any_copyright(item, use_mmap, encoding, effective_offset):
679+
LOGGER.warning(
680+
"Wrong copyright format in: %s, expected format from template", item
681+
)
682+
elif fix:
662683
if remove_offset:
663684
remove_old_header(item, encoding, remove_offset)
664685
fix_result = fix_copyright(

cr_checker/tool/pre-commit_wrapper

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -31,5 +31,10 @@ if __name__ == "__main__":
3131
4,
3232
os.path.normpath(os.path.join(script_dir, '..', 'resources', 'config.json')),
3333
)
34+
sys.argv.insert(5, '--exclusion-file')
35+
sys.argv.insert(
36+
6,
37+
os.path.normpath(os.path.join(script_dir, '..', 'resources', 'exclusion.txt')),
38+
)
3439

3540
sys.exit(cr_checker.main(sys.argv[1:]))

0 commit comments

Comments
 (0)