Skip to content

Commit c1c8858

Browse files
Add safety filters and more name variants
1 parent 2cc6b5b commit c1c8858

1 file changed

Lines changed: 16 additions & 15 deletions

File tree

etc/scripts/dataset_pipeline/annotate_composites.py

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -11,42 +11,44 @@
1111
MARKER_RE = re.compile(r'\{\{([^}]*)\}\}', re.DOTALL)
1212
VERSION_SUFFIX_RE = re.compile(r'\s+v?\d[\d.]*(?:\s*(?:only|or[ -]later|\+))?$', re.IGNORECASE)
1313

14-
# extra short forms used in rule text that the license index doesnt have
14+
# short forms that appear in rule text but aren't in the license index
1515
EXTRA_NAMES = {
16-
'gpl-2.0': ['GPL-2.0', 'GPLv2', 'GPL 2.0', 'GPL version 2'],
16+
'gpl-2.0': ['GPL-2.0', 'GPLv2', 'GPL 2.0', 'GPL version 2', 'GPL Version 2'],
1717
'gpl-2.0-plus': ['GPL-2.0+', 'GPLv2+', 'GPL 2.0 or later'],
1818
'gpl-3.0': ['GPL-3.0', 'GPLv3', 'GPL 3.0', 'GPL version 3'],
1919
'gpl-3.0-plus': ['GPL-3.0+', 'GPLv3+', 'GPL 3.0 or later'],
20+
'lgpl-2.0-plus': ['LGPL-2.0', 'LGPL-2.0+', 'LGPL 2.0'],
2021
'lgpl-2.1': ['LGPL-2.1', 'LGPLv2.1', 'LGPL 2.1'],
21-
'lgpl-2.1-plus': ['LGPL-2.1+', 'LGPLv2.1+'],
22+
'lgpl-2.1-plus': ['LGPL-2.1+', 'LGPLv2.1+', 'LGPL 2.1 or later'],
2223
'lgpl-3.0': ['LGPL-3.0', 'LGPLv3', 'LGPL 3.0'],
2324
'lgpl-3.0-plus': ['LGPL-3.0+', 'LGPLv3+'],
2425
'agpl-3.0': ['AGPL-3.0', 'AGPLv3', 'AGPL 3.0'],
2526
'agpl-3.0-plus': ['AGPL-3.0+', 'AGPLv3+'],
27+
'mpl-1.1': ['MPL-1.1', 'MPL 1.1'],
2628
'mpl-2.0': ['MPL-2.0', 'MPL 2.0'],
2729
'apache-2.0': ['Apache-2.0', 'Apache 2.0'],
28-
'bsd-new': ['BSD-3-Clause', 'BSD 3-Clause'],
29-
'bsd-simplified': ['BSD-2-Clause', 'BSD 2-Clause'],
30+
'bsd-new': ['BSD-3-Clause', 'BSD 3-Clause', 'BSD-3-clause'],
31+
'bsd-simplified': ['BSD-2-Clause', 'BSD 2-Clause', 'BSD 2-clause'],
3032
'mit': ['MIT License', 'MIT license', 'MIT'],
3133
'isc': ['ISC License', 'ISC license', 'ISC'],
3234
'artistic-2.0': ['Artistic-2.0', 'Artistic 2.0'],
3335
'epl-1.0': ['EPL-1.0', 'EPL 1.0'],
3436
'epl-2.0': ['EPL-2.0', 'EPL 2.0'],
3537
'cc-by-4.0': ['CC-BY-4.0', 'CC BY 4.0'],
36-
'unlicense': ['Unlicense'],
38+
'unlicense': ['Unlicense', 'UNLICENSE'],
3739
}
3840

3941

4042
def strip_version_suffix(name):
41-
"""removing trailing version from a license name"""
43+
"""remove trailing version number from a license name"""
4244
result = VERSION_SUFFIX_RE.sub('', name).strip()
4345
if len(result) < 10 or result == name:
4446
return None
4547
return result
4648

4749

4850
def get_candidate_names(lic):
49-
"""collect names to search for.longest first"""
51+
"""collect names to search for, longest first"""
5052
names = []
5153
if lic.name:
5254
names.append(lic.name)
@@ -67,7 +69,7 @@ def get_candidate_names(lic):
6769

6870

6971
def find_in_text(text, candidates):
70-
"""case insensitive search.returns matched span having original case"""
72+
"""case insensitive search, returns the matched text in original case"""
7173
text_lower = text.lower()
7274
for name in candidates:
7375
if not name or len(name) < 3:
@@ -85,7 +87,7 @@ def find_in_text(text, candidates):
8587
@click.option('--limit', type=int, default=None)
8688
@click.option('--dry-run', is_flag=True)
8789
def main(rules_dir, expression_filter, limit, dry_run):
88-
"""annotate rules with required phrase markers"""
90+
"""Annotate composite license rules with required phrase markers"""
8991
if not rules_dir:
9092
repo_rules = Path(__file__).resolve().parents[3] / 'src' / 'licensedcode' / 'data' / 'rules'
9193
rules_dir = str(repo_rules) if repo_rules.is_dir() else default_rules_data_dir
@@ -111,6 +113,10 @@ def main(rules_dir, expression_filter, limit, dry_run):
111113
continue
112114
if getattr(rule, 'is_required_phrase', False):
113115
continue
116+
if getattr(rule, 'skip_for_required_phrase_generation', False):
117+
continue
118+
if not getattr(rule, 'is_approx_matchable', True):
119+
continue
114120
text = rule.text or ''
115121
if MARKER_RE.search(text):
116122
continue
@@ -157,8 +163,3 @@ def main(rules_dir, expression_filter, limit, dry_run):
157163

158164
if __name__ == '__main__':
159165
main()
160-
161-
# commands:
162-
# python etc/scripts/dataset_pipeline/annotate_composites.py --dry-run
163-
# python etc/scripts/dataset_pipeline/annotate_composites.py --expression-filter apache --limit 20
164-
# python etc/scripts/dataset_pipeline/annotate_composites.py

0 commit comments

Comments
 (0)