Skip to content

Commit a613969

Browse files
dlvenablesimonelbaz
authored andcommitted
GitHub Action to verify that newly added files have the license header. (opensearch-project#6392)
This includes Python scripts for validation as well as a GitHub Action that runs them and comments on PRs if license headers are missing. Signed-off-by: David Venable <dlv@amazon.com>
1 parent e4cbc77 commit a613969

3 files changed

Lines changed: 321 additions & 0 deletions

File tree

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
#!/usr/bin/env python3
2+
3+
#
4+
# Copyright OpenSearch Contributors
5+
# SPDX-License-Identifier: Apache-2.0
6+
#
7+
# The OpenSearch Contributors require contributions made to
8+
# this file be licensed under the Apache-2.0 license or a
9+
# compatible open source license.
10+
#
11+
12+
"""
13+
License Header Compliance Checker for OpenSearch Data Prepper
14+
15+
This script checks that files contain the required license headers
16+
as specified in CONTRIBUTING.md.
17+
18+
Usage:
19+
python check-license-headers.py file1.java file2.py ...
20+
printf 'file1.java\nfile2.py\n' | python check-license-headers.py
21+
"""
22+
23+
import os
24+
import sys
25+
from pathlib import Path
26+
from typing import List
27+
28+
# File extensions that require license headers
29+
SUPPORTED_EXTENSIONS = {
    # Java ecosystem
    '.java',
    '.groovy',
    '.gradle',
    # Python
    '.py',
    # Shell scripts
    '.sh',
    '.bash',
    '.zsh',
    # YAML files
    '.yaml',
    '.yml',
    # Properties files
    '.properties',
}
36+
37+
def needs_license_header(file_path: str) -> bool:
    """Return True when the file's suffix is one that requires a license header.

    The suffix is lower-cased before the lookup in SUPPORTED_EXTENSIONS, so
    the comparison is case-insensitive.
    """
    extension = Path(file_path).suffix
    return extension.lower() in SUPPORTED_EXTENSIONS
41+
42+
def check_file_header(file_path: str) -> bool:
    """Report whether the file's opening lines contain the full license header.

    Only the first 15 lines are examined, and each of the five required
    phrases must occur somewhere within them.

    Returns:
        False when at least one required phrase is absent. True when all are
        present, and also when the path does not exist or the file cannot be
        read (such files are skipped rather than reported).
    """
    header_phrases = (
        'Copyright OpenSearch Contributors',
        'SPDX-License-Identifier: Apache-2.0',
        'The OpenSearch Contributors require contributions made to',
        'this file be licensed under the Apache-2.0 license or a',
        'compatible open source license.',
    )
    max_header_lines = 15

    if not Path(file_path).exists():
        return True

    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            # readline() yields '' once the file is exhausted, so files
            # shorter than the limit simply contribute fewer lines.
            head = ''.join(handle.readline() for _ in range(max_header_lines))
    except Exception as e:
        print(f"Error reading file {file_path}: {e}", file=sys.stderr)
        return True  # Skip files we can't read

    return all(phrase in head for phrase in header_phrases)
77+
78+
def get_files_to_check() -> List[str]:
    """Collect the file paths to examine.

    Command-line arguments take precedence and are returned as given. When
    there are none, paths are read from stdin, one per line, with surrounding
    whitespace stripped and blank lines dropped.
    """
    cli_paths = sys.argv[1:]
    if cli_paths:
        return cli_paths

    stripped_lines = (raw.strip() for raw in sys.stdin)
    return [entry for entry in stripped_lines if entry]
91+
92+
def _write_violations_output(entry: str) -> None:
    """Append entry to the file named by GITHUB_OUTPUT, if that variable is set."""
    github_output = os.environ.get('GITHUB_OUTPUT')
    if github_output:
        with open(github_output, 'a') as f:
            f.write(entry)


def main():
    """Check every requested file and report those missing the license header.

    Progress is printed per file. The `violations` step output is written for
    GitHub Actions when GITHUB_OUTPUT is set, and the process exits with
    status 1 if any violation was found.
    """
    candidates = get_files_to_check()
    if not candidates:
        print("No files to check", file=sys.stderr)
        return

    print(f"Checking {len(candidates)} files for license headers.")

    violations = []
    for candidate in candidates:
        print(f"Checking: {candidate}")

        if not Path(candidate).exists():
            print(f" File not found: {candidate}")
        elif not needs_license_header(candidate):
            print(f" Skipped (no header needed): {candidate}")
        elif check_file_header(candidate):
            print(f" ✅ Header OK: {candidate}")
        else:
            # Stored as a Markdown list item; it is reused in the step output.
            violations.append(f"- `{candidate}`")
            print(f" ❌ Missing license header: {candidate}")

    if not violations:
        print("\n✅ All files have proper license headers!")
        # Set empty output for GitHub Actions
        _write_violations_output("violations=\n")
        return

    print(f"\n❌ Found {len(violations)} license header violations:")

    # Multi-line values use the heredoc-style delimiter syntax.
    violation_text = '\n'.join(violations)
    _write_violations_output(f"violations<<EOF\n{violation_text}\nEOF\n")

    print("\nViolations:")
    for violation in violations:
        print(f" {violation}")

    sys.exit(1)


if __name__ == "__main__":
    main()

.github/scripts/get-new-files.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#!/usr/bin/env python3
2+
3+
#
4+
# Copyright OpenSearch Contributors
5+
# SPDX-License-Identifier: Apache-2.0
6+
#
7+
# The OpenSearch Contributors require contributions made to
8+
# this file be licensed under the Apache-2.0 license or a
9+
# compatible open source license.
10+
#
11+
12+
"""
13+
Get newly added files from Git.
14+
15+
This script identifies files added in the current PR and outputs them
16+
one per line to stdout.
17+
"""
18+
19+
import os
20+
import subprocess
21+
import sys
22+
23+
def get_newly_added_files():
    """Return the paths of files added on this branch relative to the PR base.

    Runs ``git diff --name-only --diff-filter=A`` between ``origin/<base>``
    and ``HEAD``, where ``<base>`` is taken from the ``GITHUB_BASE_REF``
    environment variable and falls back to ``main`` when that variable is
    unset or empty.

    Returns:
        A list of paths as reported by git. The list is empty when no files
        were added or when git could not be run successfully; in the latter
        case the error is printed to stderr.
    """
    # An empty value would otherwise produce the invalid ref "origin/".
    base_ref = os.environ.get('GITHUB_BASE_REF') or 'main'

    command = [
        'git', 'diff', '--name-only', '--diff-filter=A',
        # -z makes git emit raw, NUL-terminated paths. Without it, paths
        # containing non-ASCII or special characters are printed quoted and
        # escaped, so they no longer match the files on disk.
        '-z',
        f'origin/{base_ref}...HEAD',
    ]

    try:
        result = subprocess.run(
            command, capture_output=True, text=True, check=True
        )
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers the git executable being missing or not runnable.
        print(f"Error getting changed files: {e}", file=sys.stderr)
        return []

    return [path for path in result.stdout.split('\0') if path]
41+
42+
def main():
    """Print each newly added file on its own line on stdout.

    When no files are found, a note is written to stderr and the process
    exits with status 0.
    """
    added = get_newly_added_files()
    if added:
        # One path per line.
        print(*added, sep='\n')
        return

    print("No newly added files found", file=sys.stderr)
    sys.exit(0)


if __name__ == "__main__":
    main()
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
#
2+
# Copyright OpenSearch Contributors
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# The OpenSearch Contributors require contributions made to
6+
# this file be licensed under the Apache-2.0 license or a
7+
# compatible open source license.
8+
#
9+
10+
# Performs a license header check on new files.
11+
# It will comment on PRs if it finds violations.
12+
13+
name: License Header Check

on:
  pull_request:
    types: [opened, synchronize, reopened]

jobs:
  license-header-check:
    runs-on: ubuntu-latest
    name: Check License Headers on New Files

    # Grant only what this job uses: read access for checkout and write
    # access to create or update the PR comment.
    permissions:
      contents: read
      pull-requests: write

    steps:
      - name: Checkout code
        uses: actions/checkout@v6
        with:
          # Full history is fetched so get-new-files.py can diff
          # origin/<base>...HEAD.
          fetch-depth: 0

      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.14'

      - name: Run License Header Check
        id: license-check
        run: |
          python .github/scripts/get-new-files.py | python .github/scripts/check-license-headers.py

      # Runs only when the check step failed and reported violations.
      - name: Comment on PR
        if: failure() && steps.license-check.outputs.violations != ''
        uses: actions/github-script@v8
        with:
          script: |
            const violations = process.env.VIOLATIONS;

            const body = [
              '## ⚠️ License Header Violations Found',
              '',
              'The following newly added files are missing required license headers:',
              '',
              violations,
              '',
              'Please add the appropriate license header to each file and push your changes.',
              '',
              '**See the license header requirements:** https://github.com/opensearch-project/data-prepper/blob/main/CONTRIBUTING.md#license-headers'
            ].join('\n');

            // Fetch every page of comments: a single listComments call only
            // returns the first page, so an existing bot comment on a busy
            // PR could be missed and a duplicate posted.
            const comments = await github.paginate(github.rest.issues.listComments, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              per_page: 100,
            });

            const botComment = comments.find(comment =>
              comment.user.type === 'Bot' &&
              comment.body.includes('License Header Violations Found')
            );

            // Reuse the existing comment when there is one so the PR gets a
            // single, up-to-date report.
            if (botComment) {
              await github.rest.issues.updateComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                comment_id: botComment.id,
                body: body
              });
            } else {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: context.issue.number,
                body: body
              });
            }
        env:
          VIOLATIONS: ${{ steps.license-check.outputs.violations }}

      # Rewrites an earlier bot comment once the check passes; it never
      # creates a new comment.
      - name: Update PR comment (all violations resolved)
        if: success()
        uses: actions/github-script@v8
        with:
          script: |
            // Fetch every page of comments so an earlier bot comment is found
            // even on PRs with many comments.
            const comments = await github.paginate(github.rest.issues.listComments, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              per_page: 100,
            });

            const botComment = comments.find(comment =>
              comment.user.type === 'Bot' &&
              (comment.body.includes('License Header Violations Found') || comment.body.includes('License Header Check Passed'))
            );

            if (botComment) {
              const successBody = [
                '## ✅ License Header Check Passed',
                '',
                'All newly added files have proper license headers. Great work! 🎉'
              ].join('\n');

              await github.rest.issues.updateComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                comment_id: botComment.id,
                body: successBody
              });
            }

0 commit comments

Comments
 (0)