# Matches one ``{{...}}`` marker; group 1 captures the text between the braces
# (any characters except ``}``, newlines included).
MARKER_RE = re.compile(r'\{\{([^}]*)\}\}', re.DOTALL)

# Matches a trailing version suffix at the end of a name: whitespace, an
# optional "v", digits and dots, then optionally "only", "or later",
# "or-later" or "+". Case-insensitive.
VERSION_SUFFIX_RE = re.compile(r'\s+v?\d[\d.]*(?:\s*(?:only|or[ -]later|\+))?$', re.IGNORECASE)
1313
14- # extra short forms used in rule text that the license index doesnt have
14+ # short forms that appear in rule text but aren't in the license index
# Maps an identifier (presumably the license key -- confirm against the code
# that reads this table) to extra spellings to look for in rule text.
EXTRA_NAMES = {
    'gpl-2.0': ['GPL-2.0', 'GPLv2', 'GPL 2.0', 'GPL version 2', 'GPL Version 2'],
    'gpl-2.0-plus': ['GPL-2.0+', 'GPLv2+', 'GPL 2.0 or later'],
    'gpl-3.0': ['GPL-3.0', 'GPLv3', 'GPL 3.0', 'GPL version 3'],
    'gpl-3.0-plus': ['GPL-3.0+', 'GPLv3+', 'GPL 3.0 or later'],
    # NOTE(review): unlike the other "-plus" entries, this one also lists the
    # plain (non "+") spellings -- confirm that is intended.
    'lgpl-2.0-plus': ['LGPL-2.0', 'LGPL-2.0+', 'LGPL 2.0'],
    'lgpl-2.1': ['LGPL-2.1', 'LGPLv2.1', 'LGPL 2.1'],
    'lgpl-2.1-plus': ['LGPL-2.1+', 'LGPLv2.1+', 'LGPL 2.1 or later'],
    'lgpl-3.0': ['LGPL-3.0', 'LGPLv3', 'LGPL 3.0'],
    'lgpl-3.0-plus': ['LGPL-3.0+', 'LGPLv3+'],
    'agpl-3.0': ['AGPL-3.0', 'AGPLv3', 'AGPL 3.0'],
    'agpl-3.0-plus': ['AGPL-3.0+', 'AGPLv3+'],
    'mpl-1.1': ['MPL-1.1', 'MPL 1.1'],
    'mpl-2.0': ['MPL-2.0', 'MPL 2.0'],
    'apache-2.0': ['Apache-2.0', 'Apache 2.0'],
    'bsd-new': ['BSD-3-Clause', 'BSD 3-Clause', 'BSD-3-clause'],
    'bsd-simplified': ['BSD-2-Clause', 'BSD 2-Clause', 'BSD 2-clause'],
    'mit': ['MIT License', 'MIT license', 'MIT'],
    'isc': ['ISC License', 'ISC license', 'ISC'],
    'artistic-2.0': ['Artistic-2.0', 'Artistic 2.0'],
    'epl-1.0': ['EPL-1.0', 'EPL 1.0'],
    'epl-2.0': ['EPL-2.0', 'EPL 2.0'],
    'cc-by-4.0': ['CC-BY-4.0', 'CC BY 4.0'],
    'unlicense': ['Unlicense', 'UNLICENSE'],
}
3840
3941
def strip_version_suffix(name):
    """Remove a trailing version number from a license name.

    Applies VERSION_SUFFIX_RE to ``name`` and strips surrounding whitespace
    from what is left.

    Returns the stripped name, or None when the result is identical to
    ``name`` or is shorter than 10 characters.
    """
    result = VERSION_SUFFIX_RE.sub('', name).strip()
    # Reject results that are unchanged or too short to be worth returning.
    if len(result) < 10 or result == name:
        return None
    return result
4648
4749
4850def get_candidate_names (lic ):
49- """collect names to search for. longest first"""
51+ """collect names to search for, longest first"""
5052 names = []
5153 if lic .name :
5254 names .append (lic .name )
@@ -67,7 +69,7 @@ def get_candidate_names(lic):
6769
6870
6971def find_in_text (text , candidates ):
70- """case insensitive search. returns matched span having original case"""
72+ """case insensitive search, returns the matched text in original case"""
7173 text_lower = text .lower ()
7274 for name in candidates :
7375 if not name or len (name ) < 3 :
@@ -85,7 +87,7 @@ def find_in_text(text, candidates):
8587@click .option ('--limit' , type = int , default = None )
8688@click .option ('--dry-run' , is_flag = True )
8789def main (rules_dir , expression_filter , limit , dry_run ):
88- """annotate rules with required phrase markers"""
90+ """Annotate composite license rules with required phrase markers"""
8991 if not rules_dir :
9092 repo_rules = Path (__file__ ).resolve ().parents [3 ] / 'src' / 'licensedcode' / 'data' / 'rules'
9193 rules_dir = str (repo_rules ) if repo_rules .is_dir () else default_rules_data_dir
@@ -111,6 +113,10 @@ def main(rules_dir, expression_filter, limit, dry_run):
111113 continue
112114 if getattr (rule , 'is_required_phrase' , False ):
113115 continue
116+ if getattr (rule , 'skip_for_required_phrase_generation' , False ):
117+ continue
118+ if not getattr (rule , 'is_approx_matchable' , True ):
119+ continue
114120 text = rule .text or ''
115121 if MARKER_RE .search (text ):
116122 continue
@@ -157,8 +163,3 @@ def main(rules_dir, expression_filter, limit, dry_run):
157163
158164if __name__ == '__main__' :
159165 main ()
160-
161- # commands:
162- # python etc/scripts/dataset_pipeline/annotate_composites.py --dry-run
163- # python etc/scripts/dataset_pipeline/annotate_composites.py --expression-filter apache --limit 20
164- # python etc/scripts/dataset_pipeline/annotate_composites.py
0 commit comments