-
-
Notifications
You must be signed in to change notification settings - Fork 720
Expand file tree
/
Copy pathtallies.py
More file actions
493 lines (395 loc) · 16.8 KB
/
tallies.py
File metadata and controls
493 lines (395 loc) · 16.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# ScanCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/scancode-toolkit for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#
from collections import Counter
import attr
from commoncode.cliutils import POST_SCAN_GROUP, PluggableCommandLineOption
from plugincode.post_scan import PostScanPlugin, post_scan_impl
from summarycode.utils import (get_resource_tallies, set_resource_tallies,
sorted_counter)
# Tracing flags: when either one is True, debug messages are logged to stdout.
TRACE = False
TRACE_LIGHT = False


def logger_debug(*args):
    """
    Do nothing and return None: this is the tracing function used when both
    TRACE and TRACE_LIGHT are False.
    """
    return None


if TRACE or TRACE_LIGHT:
    import logging
    import sys

    logger = logging.getLogger(__name__)
    logging.basicConfig(stream=sys.stdout)
    logger.setLevel(logging.DEBUG)

    def logger_debug(*args):
        """
        Log at the DEBUG level all the `args` joined with a single space.
        A non-empty string is logged as-is; any other value (including an
        empty string) is logged using its repr().
        """
        parts = []
        for arg in args:
            if isinstance(arg, str) and arg:
                parts.append(arg)
            else:
                parts.append(repr(arg))
        return logger.debug(' '.join(parts))
"""
Create summarized scan data.
"""
@post_scan_impl
class Tallies(PostScanPlugin):
    """
    Compute tallies for license, copyright and other scans at the codebase level
    """
    run_order = 15
    sort_order = 15

    # the codebase-level "tallies" attribute is a mapping that starts empty
    codebase_attributes = {'tallies': attr.ib(default=attr.Factory(dict))}

    options = [
        PluggableCommandLineOption(
            ('--tallies',),
            is_flag=True,
            default=False,
            help='Compute tallies for license, copyright and other scans at the codebase level.',
            help_group=POST_SCAN_GROUP,
        ),
    ]

    def is_enabled(self, tallies, **kwargs):
        """
        Return the value of the `tallies` option as-is.
        """
        return tallies

    def process_codebase(self, codebase, tallies, **kwargs):
        """
        Compute the tallies of `codebase` without keeping file and directory
        details and merge them in the `tallies` codebase attribute.
        """
        if TRACE_LIGHT:
            logger_debug('Tallies:process_codebase')

        codebase_tallies = compute_codebase_tallies(codebase, keep_details=False, **kwargs)
        codebase.attributes.tallies.update(codebase_tallies)
@post_scan_impl
class TalliesWithDetails(PostScanPlugin):
    """
    Compute tallies of different scan attributes of a scan at the codebase level and
    keep file and directory details.

    The scan attributes that are tallied are:

    - detected_license_expression
    - copyrights
    - holders
    - authors
    - programming_language
    - packages
    """
    # codebase-level mapping holding the tallies for the whole codebase
    codebase_attributes = {'tallies': attr.ib(default=attr.Factory(dict))}

    # resource-level mapping holding the tallies of each file and directory:
    # this is where details are stored when keep details is True
    resource_attributes = {'tallies': attr.ib(default=attr.Factory(dict))}

    run_order = 100
    sort_order = 100

    options = [
        PluggableCommandLineOption(
            ('--tallies-with-details',),
            is_flag=True,
            default=False,
            help='Compute tallies of license, copyright and other scans at the codebase level, '
                 'keeping intermediate details at the file and directory level.',
            help_group=POST_SCAN_GROUP,
        ),
    ]

    def is_enabled(self, tallies_with_details, **kwargs):
        """
        Return the value of the `tallies_with_details` option as-is.
        """
        return tallies_with_details

    def process_codebase(self, codebase, tallies_with_details, **kwargs):
        """
        Compute the tallies of `codebase` keeping file and directory details
        and merge them in the `tallies` codebase attribute.
        """
        codebase_tallies = compute_codebase_tallies(codebase, keep_details=True, **kwargs)
        codebase.attributes.tallies.update(codebase_tallies)
def compute_codebase_tallies(codebase, keep_details, **kwargs):
    """
    Tally every resource of `codebase` for the available scans and return the
    tallies of the root resource.

    If `keep_details` is True, the tallies of each file and directory are kept
    in the `tallies` resource attribute and the returned value is the root
    `tallies` attribute. Otherwise the returned value is read from the
    "tallies" entry of the root `extra_data`, defaulting to an empty mapping.
    """
    from summarycode.copyright_tallies import author_tallies
    from summarycode.copyright_tallies import copyright_tallies
    from summarycode.copyright_tallies import holder_tallies

    # tallier functions keyed by the resource attribute they tally, listed in
    # the order they are run
    talliers_by_attribute = {
        'detected_license_expression': license_tallies,
        'copyrights': copyright_tallies,
        'holders': holder_tallies,
        'authors': author_tallies,
        'programming_language': language_tallies,
        'packages': package_tallies,
    }

    # only the attributes present on the root resource can be tallied
    root = codebase.root
    summarizers = [
        tallier
        for attribute, tallier in talliers_by_attribute.items()
        if hasattr(root, attribute)
    ]

    if TRACE:
        logger_debug('compute_codebase_tallies with:', summarizers)

    # walk bottom-up: each tallier is given the resource and its direct children
    for resource in codebase.walk(topdown=False):
        direct_children = resource.children(codebase)

        for summarizer in summarizers:
            _summary_data = summarizer(resource, direct_children, keep_details=keep_details)
            if TRACE:
                logger_debug(
                    'tallies for:', resource.path,
                    'after tallies:', summarizer,
                    'is:', _summary_data,
                )

        codebase.save_resource(resource)

    # the codebase-level tallies are the tallies of the root resource
    tallies = root.tallies if keep_details else root.extra_data.get('tallies', {})

    if TRACE:
        logger_debug('codebase tallies:', tallies)

    return tallies
def _package_license_detections(package_data, key='license_detections'):
    """
    Return a list of the non-empty license detections stored under `key` in
    `package_data`.

    `package_data` can be a list of package data items or a single item. Each
    item can be a mapping or an object: detections are read with `.get(key)`
    for a mapping and with `getattr()` otherwise.
    """
    if not package_data:
        return []

    # a single mapping, or an object carrying the detections directly, is
    # handled as a one-item list
    if isinstance(package_data, dict) or hasattr(package_data, key):
        package_data = [package_data]

    detections = []
    for package in package_data:
        if isinstance(package, dict):
            found = package.get(key)
        else:
            found = getattr(package, key, None)
        detections.extend(detection for detection in found or [] if detection)
    return detections


def license_tallies(resource, children, keep_details=False):
    """
    Populate a license_expressions list of mappings such as
        {value: "expression", count: "count of occurences"}
    sorted by decreasing count.

    The tallied expressions come from the `resource` license detections,
    license clues and package data license detections, combined with the
    tallies already set on each of the direct `children`. A file without any
    detected expression is tallied with a None value.

    The tallies are set on the `resource` (as a resource attribute if
    `keep_details` is True) and returned.
    """
    LIC_EXP = 'detected_license_expression'
    LIC_DET = 'license_detections'
    LIC_CLUE = 'license_clues'
    PACKAGE_DATA = 'package_data'

    # Collect current data
    detected_expressions = [
        detection['license_expression']
        for detection in getattr(resource, LIC_DET, None) or []
    ]
    detected_expressions.extend(
        clue['license_expression']
        for clue in getattr(resource, LIC_CLUE, None) or []
    )

    # The detections must be read from each package data item: looking them up
    # with getattr() on a `package_data` list itself never finds anything.
    # NOTE(review): assumes each package data item carries its own
    # "license_detections" list of mappings -- confirm against the data model.
    package_data = getattr(resource, PACKAGE_DATA, None)
    detected_expressions.extend(
        detection['license_expression']
        for detection in _package_license_detections(package_data, key=LIC_DET)
    )

    license_expressions = []
    if not detected_expressions and resource.is_file:
        # also count files with no detection
        license_expressions.append(None)
    else:
        license_expressions.extend(detected_expressions)

    # Collect direct children expression tallies
    for child in children:
        child_tallies = get_resource_tallies(child, key=LIC_EXP, as_attribute=keep_details) or []
        for child_tally in child_tallies:
            # each child tally is a value/count mapping: expand it back to
            # `count` discrete values so everything is tallied together
            # TODO: review this: this feels rather weird
            child_value = child_tally.get('value')
            license_expressions.extend([child_value] * child_tally['count'])

    # summarize proper
    licenses_counter = tally_licenses(license_expressions)
    tallied = sorted_counter(licenses_counter)
    set_resource_tallies(resource, key=LIC_EXP, value=tallied, as_attribute=keep_details)
    return tallied
def tally_licenses(license_expressions):
    """
    Return a Counter mapping each license expression found in the
    `license_expressions` list to its count of occurrences.
    """
    # TODO: we could normalize and/or sort each license_expression before
    # summarization and consider other equivalence or containment checks
    expression_counts = Counter()
    expression_counts.update(license_expressions)
    return expression_counts
def language_tallies(resource, children, keep_details=False):
    """
    Tally the programming language of `resource` together with the language
    tallies of its direct `children`, set the result on the `resource` and
    return it. The result is a list of mappings such as
        {value: "programming_language", count: "count of occurences"}
    sorted by decreasing count.
    """
    PROG_LANG = 'programming_language'

    language = getattr(resource, PROG_LANG, [])
    if language:
        languages = [language]
    elif resource.is_file:
        # also count files with no detection
        languages = [None]
    else:
        languages = []

    # Collect direct children language tallies
    for child in children:
        child_tallies = get_resource_tallies(child, key=PROG_LANG, as_attribute=keep_details) or []
        for child_tally in child_tallies:
            child_language = child_tally.get('value')
            if not child_language:
                # children tallies without a value are not carried over
                continue
            languages.extend([child_language] * child_tally['count'])

    # summarize proper
    tallied = sorted_counter(tally_languages(languages))
    set_resource_tallies(resource, key=PROG_LANG, value=tallied, as_attribute=keep_details)
    return tallied
def tally_languages(languages):
    """
    Return a Counter mapping each language found in the `languages` list to
    its count of occurrences.
    """
    # TODO: consider aggregating related languages (C/C++, etc)
    language_counts = Counter()
    language_counts.update(languages)
    return language_counts
# Names of the attributes that tally_values() knows how to tally.
TALLYABLE_ATTRS = {
    'detected_license_expression',
    'copyrights',
    'holders',
    'authors',
    'programming_language',
    # 'packages',
}


def tally_values(values, attribute):
    """
    Return a mapping of {value: count of occurences} for a list of `values`
    of a given `attribute`, computed with the tallier function specific to
    this attribute. Return an empty mapping if `attribute` is not one of the
    TALLYABLE_ATTRS.
    """
    if attribute not in TALLYABLE_ATTRS:
        return {}

    from summarycode.copyright_tallies import tally_copyrights
    from summarycode.copyright_tallies import tally_persons

    talliers_by_attribute = {
        'detected_license_expression': tally_licenses,
        'copyrights': tally_copyrights,
        'holders': tally_persons,
        'authors': tally_persons,
        'programming_language': tally_languages,
    }
    tallier = talliers_by_attribute[attribute]
    return tallier(values)
@post_scan_impl
class KeyFilesTallies(PostScanPlugin):
    """
    Compute tallies of a scan at the codebase level for only key files.
    """
    run_order = 150
    sort_order = 150

    # codebase-level mapping holding the tallies of key files
    codebase_attributes = {'tallies_of_key_files': attr.ib(default=attr.Factory(dict))}

    options = [
        PluggableCommandLineOption(
            ('--tallies-key-files',),
            is_flag=True,
            default=False,
            help='Compute tallies for license, copyright and other scans for key, '
                 'top-level files. Key files are top-level codebase files such '
                 'as COPYING, README and package manifests as reported by the '
                 '--classify option "is_legal", "is_readme", "is_manifest", "is_notice" '
                 'and "is_top_level" flags.',
            help_group=POST_SCAN_GROUP,
            required_options=['classify', 'tallies'],
        ),
    ]

    def is_enabled(self, tallies_key_files, **kwargs):
        """
        Return the value of the `tallies_key_files` option as-is.
        """
        return tallies_key_files

    def process_codebase(self, codebase, tallies_key_files, **kwargs):
        """
        Tally the key files of `codebase`.
        """
        tally_codebase_key_files(codebase, **kwargs)
def tally_codebase_key_files(codebase, field='tallies', **kwargs):
    """
    Tally the key files of `codebase` and store the result in the
    `tallies_of_key_files` codebase attribute.

    A key file is a top-level file that is also a readme, a legal or a
    manifest file. Only the attributes already present in the codebase
    `tallies` and listed in TALLYABLE_ATTRS are tallied.

    Note: the `field` argument is not used.
    """
    available_keys = codebase.attributes.tallies.keys()

    if TRACE:
        logger_debug('tallieables:', available_keys)

    # TODO: we cannot summarize packages with "key files" for now
    # one list of discrete values for each attribute to tally
    values_by_key = {key: [] for key in available_keys if key in TALLYABLE_ATTRS}

    for resource in codebase.walk(topdown=True):
        is_key_file = (
            resource.is_file
            and resource.is_top_level
            and (resource.is_readme or resource.is_legal or resource.is_manifest)
        )
        if not is_key_file:
            continue

        for key, values in values_by_key.items():
            # note we assume things are stored as extra-data, not as direct
            # Resource attributes
            resource_tallies = get_resource_tallies(resource, key=key, as_attribute=False) or []
            for tally in resource_tallies:
                # each tally is a value/count mapping: expand it back to
                # `count` discrete values
                value = tally.get('value')
                if value:
                    values.extend([value] * tally['count'])

    # tally all the attributes first, then sort each tally
    counters_by_key = [(key, tally_values(values, key)) for key, values in values_by_key.items()]
    sorted_tallies = {key: sorted_counter(counter) for key, counter in counters_by_key}

    codebase.attributes.tallies_of_key_files = sorted_tallies

    if TRACE:
        logger_debug('codebase tallies_of_key_files:', sorted_tallies)
@post_scan_impl
class FacetTallies(PostScanPlugin):
    """
    Compute tallies for a scan at the codebase level, grouping by facets.
    """
    run_order = 200
    sort_order = 200

    # the codebase-level "tallies_by_facet" attribute is a list that starts empty
    codebase_attributes = {'tallies_by_facet': attr.ib(default=attr.Factory(list))}

    options = [
        PluggableCommandLineOption(
            ('--tallies-by-facet',),
            is_flag=True,
            default=False,
            help='Compute tallies for license, copyright and other scans and group the '
                 'results by facet.',
            help_group=POST_SCAN_GROUP,
            required_options=['facet', 'tallies'],
        ),
    ]

    def is_enabled(self, tallies_by_facet, **kwargs):
        """
        Return the value of the `tallies_by_facet` option as-is.
        """
        return tallies_by_facet

    def process_codebase(self, codebase, tallies_by_facet, **kwargs):
        """
        Tally the files of `codebase` grouped by facet.
        """
        if TRACE_LIGHT:
            logger_debug('FacetTallies:process_codebase')

        tally_codebase_by_facet(codebase, **kwargs)
def tally_codebase_by_facet(codebase, **kwargs):
    """
    Tally the files of `codebase` grouped by facet and append one
    {facet: ..., tallies: ...} mapping for each known facet to the
    `tallies_by_facet` codebase attribute.
    """
    from summarycode import facet as facet_module

    talliable = codebase.attributes.tallies.keys()

    if TRACE:
        logger_debug('tally_codebase_by_facet for attributes:', talliable)

    # for each facet, one list of discrete values for each tallied attribute
    values_by_key_by_facet = {
        facet: {key: [] for key in talliable}
        for facet in facet_module.FACETS
    }

    for resource in codebase.walk(topdown=True):
        if not resource.is_file:
            continue

        for facet in resource.facets:
            # note: this will fail loudly if the facet is not a known one
            values_by_key = values_by_key_by_facet[facet]
            for key, values in values_by_key.items():
                # note we assume things are stored as extra-data, not as direct
                # Resource attributes
                resource_tallies = get_resource_tallies(resource, key=key, as_attribute=False) or []
                for tally in resource_tallies:
                    # each tally is a value/count mapping: expand it back to
                    # `count` discrete values
                    value = tally.get('value')
                    if value:
                        values.extend([value] * tally['count'])

    final_tallies = []
    for facet, values_by_key in values_by_key_by_facet.items():
        sorted_tallies = {
            key: sorted_counter(tally_values(values, key))
            for key, values in values_by_key.items()
        }
        final_tallies.append({'facet': facet, 'tallies': sorted_tallies})

    codebase.attributes.tallies_by_facet.extend(final_tallies)

    if TRACE:
        logger_debug('codebase tallies_by_facet:', final_tallies)
def add_files(packages, resource):
    """
    Yield each package mapping of the `packages` list after updating it
    in-place: its "files" list is created if missing or empty, and the skinny
    mapping of `resource` is appended to it unless already present.
    """
    for package in packages:
        package_files = package.get('files') or []
        package['files'] = package_files

        resource_data = resource.to_dict(skinny=True)
        if resource_data not in package_files:
            package_files.append(resource_data)

        yield package
def package_tallies(resource, children, keep_details=False):
    """
    Collect the packages of `resource` and the package tallies of its direct
    `children` in a single list, set this list as the "packages" tallies of
    the `resource` and return it.

    Note: `keep_details` is never used, as we are not keeping details of
    packages as this has no value.
    """
    # Collect current data
    own_packages = resource.packages or []

    if TRACE_LIGHT and own_packages:
        from packagedcode.models import Package
        packs = [Package(**package) for package in own_packages]
        logger_debug('package_tallier: for:', resource,
                     'current_packages are:', packs)

    # each package of this resource gets this resource added to its "files"
    packages = list(add_files(own_packages, resource))

    if TRACE_LIGHT and packages:
        logger_debug()
        from packagedcode.models import Package  # NOQA
        packs = [Package(**package) for package in packages]
        logger_debug('package_tallier: for:', resource,
                     'packages are:', packs)

    # Collect direct children packages tallies
    for child in children:
        packages.extend(get_resource_tallies(child, key='packages', as_attribute=False) or [])

    # summarize proper
    set_resource_tallies(resource, key='packages', value=packages, as_attribute=False)
    return packages