1 change: 1 addition & 0 deletions setup.cfg
@@ -191,6 +191,7 @@ scancode_post_scan =
filter-clues = cluecode.plugin_filter_clues:RedundantCluesFilter
consolidate = summarycode.plugin_consolidate:Consolidator
licenses-reference = licensedcode.plugin_licenses_reference:LicensesReference
review = summarycode.review:AmbiguousDetectionsReviewPlugin


# scancode_output_filter is the entry point for filter plugins executed after
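(Not part of the diff.) The line added above registers a new post-scan plugin entry point. For orientation, a scancode post-scan plugin is a class exposing is_enabled() and process_codebase(), as the existing LicensesReference plugin further down in this diff shows. Below is a minimal, hypothetical sketch of that shape; the class body, the enabling flag, and the ambiguity heuristic are illustration-only assumptions, and the real summarycode/review.py may differ.

# Hypothetical sketch only -- the real AmbiguousDetectionsReviewPlugin is not
# shown in this diff. Imports mirror how other post-scan plugins are wired.
from plugincode.post_scan import PostScanPlugin
from plugincode.post_scan import post_scan_impl


@post_scan_impl
class AmbiguousDetectionsReviewPlugin(PostScanPlugin):

    def is_enabled(self, review, **kwargs):
        # Assumed to be tied to a --review command-line flag.
        return review

    def process_codebase(self, codebase, **kwargs):
        # Walk all resources and collect detections that look ambiguous.
        # The "unknown in the license expression" test is illustrative only.
        ambiguous = []
        for resource in codebase.walk(topdown=False):
            for detection in getattr(resource, 'license_detections', None) or []:
                if 'unknown' in (detection.get('license_expression') or ''):
                    ambiguous.append((resource.path, detection))
        return ambiguous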
175 changes: 166 additions & 9 deletions src/licensedcode/detection.py
@@ -23,6 +23,7 @@
from licensedcode.match import LicenseMatch
from licensedcode.match import set_matched_lines
from licensedcode.models import Rule
from licensedcode.models import BasicRule
from licensedcode.models import compute_relevance
from licensedcode.spans import Span
from licensedcode.tokenize import query_tokenizer
@@ -128,6 +129,7 @@ class DetectionRule(Enum):
PACKAGE_UNKNOWN_REFERENCE_TO_LOCAL_FILE = 'package-unknown-reference-to-local-file'
PACKAGE_ADD_FROM_SIBLING_FILE = 'from-package-sibling-file'
PACKAGE_ADD_FROM_FILE = 'from-package-file'
PACKAGE_LICENSE = 'package-license'


@attr.s
@@ -159,7 +161,6 @@ class LicenseDetection:
)

detection_log = attr.ib(
repr=False,
default=attr.Factory(list),
metadata=dict(
help='A list of detection DetectionRule explaining how '
@@ -263,12 +264,20 @@ def identifier(self):
"""
data = []
for match in self.matches:
tokenized_matched_text = tuple(query_tokenizer(match['matched_text']))
identifier = (
match['rule_identifier'],
match['match_coverage'],
tokenized_matched_text,
)
if isinstance(match, dict):
tokenized_matched_text = tuple(query_tokenizer(match['matched_text']))
identifier = (
match['rule_identifier'],
match['match_coverage'],
tokenized_matched_text,
)
else:
tokenized_matched_text = tuple(query_tokenizer(match.matched_text))
identifier = (
match.identifier,
match.coverage(),
tokenized_matched_text,
)
data.append(identifier)

# Return a positive hash value for the tuple
@@ -279,8 +288,12 @@ def get_start_end_line(self):
Returns start and end line for a license detection issue, from the
license match(es).
"""
start_line = min([match['start_line'] for match in self.matches])
end_line = max([match['end_line'] for match in self.matches])
if isinstance(self.matches[0], dict):
start_line = min([match['start_line'] for match in self.matches])
end_line = max([match['end_line'] for match in self.matches])
else:
start_line = min([match.start_line for match in self.matches])
end_line = max([match.end_line for match in self.matches])
return start_line, end_line

def rules_length(self):
@@ -432,6 +445,150 @@ def dict_fields(attr, value):
return detection


@attr.s
class LicenseDetectionFromResult(LicenseDetection):
"""
A LicenseDetection object created from a LicenseDetection
mapping, i.e. a results mapping. The LicenseMatch objects in
`matches` are LicenseMatchFromResult objects too, as these are
created from data mappings and do not have the input text/spans
available.
"""

@classmethod
def from_license_detection_mapping(cls, license_detection_mapping, file_path):

matches_from_results = matches_from_license_match_mappings(
license_match_mappings=license_detection_mapping["matches"]
)

detection = cls(
license_expression=license_detection_mapping["license_expression"],
detection_log=license_detection_mapping["detection_log"],
matches=matches_from_results,
file_region=None,
)
detection.file_region = detection.get_file_region(path=file_path)
return detection


def detections_from_license_detection_mappings(license_detection_mappings, file_path):

license_detections = []

for license_detection_mapping in license_detection_mappings:
license_detections.append(
LicenseDetectionFromResult.from_license_detection_mapping(
license_detection_mapping=license_detection_mapping,
file_path=file_path,
)
)

return license_detections


@attr.s
class LicenseMatchFromResult(LicenseMatch):

match_score = attr.ib(
default=None,
metadata=dict(
help='License Detection Score')
)

matched_length = attr.ib(
default=None,
metadata=dict(
help='License match length')
)

match_coverage = attr.ib(
default=None,
metadata=dict(
help='License match coverage')
)

text = attr.ib(
default=None,
metadata=dict(
help='Text which was matched')
)

def score(self):
return self.match_score

def len(self):
return self.matched_length

def coverage(self):
return self.match_coverage

@property
def matched_text(self):
return self.text

@property
def identifier(self):
return self.rule.identifier

@classmethod
def from_license_match_mapping(cls, license_match_mapping):

rule = RuleFromResult.from_license_match_mapping(
license_match_mapping=license_match_mapping,
)

if "matched_text" in license_match_mapping:
matched_text = license_match_mapping["matched_text"]
else:
matched_text = None

return cls(
start_line=license_match_mapping["start_line"],
end_line=license_match_mapping["end_line"],
match_score=license_match_mapping["score"],
matched_length=license_match_mapping["matched_length"],
match_coverage=license_match_mapping["match_coverage"],
matcher=license_match_mapping["matcher"],
text=matched_text,
rule=rule,
qspan=None,
ispan=None,
)

def matches_from_license_match_mappings(license_match_mappings):

license_matches = []

for license_match_mapping in license_match_mappings:
license_matches.append(
LicenseMatchFromResult.from_license_match_mapping(
license_match_mapping=license_match_mapping
)
)

return license_matches


@attr.s
class RuleFromResult(BasicRule):

@classmethod
def from_license_match_mapping(cls, license_match_mapping):
return cls(
license_expression=license_match_mapping["license_expression"],
identifier=license_match_mapping["rule_identifier"],
referenced_filenames=license_match_mapping["referenced_filenames"],
is_license_text=license_match_mapping["is_license_text"],
is_license_notice=license_match_mapping["is_license_notice"],
is_license_reference=license_match_mapping["is_license_reference"],
is_license_tag=license_match_mapping["is_license_tag"],
is_license_intro=license_match_mapping["is_license_intro"],
length=license_match_mapping["rule_length"],
relevance=license_match_mapping["rule_relevance"],
)


def get_detections_from_mappings(detection_mappings):
"""
Return a list of LicenseDetection objects from a list of
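(Not part of the diff.) The *FromResult classes added above rebuild detection objects from saved scan output rather than from live matching. A rough usage sketch, assuming a scancode JSON result whose per-file entries carry a license_detections list with the detailed per-match fields read by from_license_match_mapping (the file name and exact keys below are assumptions):

# Illustrative only: rehydrate LicenseDetection objects from saved results so
# they can be post-processed without re-running license detection.
import json

from licensedcode.detection import detections_from_license_detection_mappings

with open('scan-results.json') as results_file:
    scan = json.load(results_file)

for file_data in scan.get('files', []):
    detections = detections_from_license_detection_mappings(
        license_detection_mappings=file_data.get('license_detections', []),
        file_path=file_data.get('path'),
    )
    for detection in detections:
        # These detections expose the same API as ones computed live, e.g.
        # license_expression and get_start_end_line() over their matches.
        print(detection.license_expression, detection.get_start_end_line())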
82 changes: 1 addition & 81 deletions src/licensedcode/plugin_licenses_reference.py
@@ -61,7 +61,7 @@ class LicensesReference(PostScanPlugin):
def is_enabled(self, licenses_reference, **kwargs):
return licenses_reference

def process_codebase(self, codebase, licenses_reference, **kwargs):
def process_codebase(self, codebase, **kwargs):
"""
Get unique License and Rule data from all license detections in a codebase-level
list and only refer to them in the resource level detections.
@@ -221,83 +221,3 @@ def get_reference_data(match):
_ = match.pop('licenses')

return ref_data


def get_license_detection_references(license_detections_by_path):
"""
Get LicenseDetection data for references from a mapping of path:[LicenseDetection],
i.e. path and a list of LicenseDetection at that path.

Also removes `matches` and `detection_log` from each LicenseDetection mapping
and only keeps a LicenseExpression string and a computed identifier per detection,
as this LicenseDetection data is referenced at top-level by the identifier.
"""
detection_objects = []

for path, detections in license_detections_by_path.items():

for detection in detections:
detection_obj = LicenseDetection(**detection)
_matches = detection.pop('matches')
_detection_log = detection.pop('detection_log')
detection_obj.file_region = detection_obj.get_file_region(path=path)
detection["id"] = detection_obj.identifier

detection_objects.append(detection_obj)

detection_references = UniqueDetection.get_unique_detections(detection_objects)
return detection_references


@attr.s
class UniqueDetection:
"""
A unique License Detection.
"""
unique_identifier = attr.ib(type=int)
license_detection = attr.ib()
files = attr.ib(factory=list)

@classmethod
def get_unique_detections(cls, license_detections):
"""
Get all unique license detections from a list of
LicenseDetections.
"""
identifiers = get_identifiers(license_detections)
unique_detection_counts = dict(Counter(identifiers))

unique_license_detections = []
for detection_identifier in unique_detection_counts.keys():
file_regions = (
detection.file_region
for detection in license_detections
if detection_identifier == detection.identifier
)
all_detections = (
detection
for detection in license_detections
if detection_identifier == detection.identifier
)

detection = next(all_detections)
unique_license_detections.append(
cls(
files=list(file_regions),
license_detection=attr.asdict(detection),
unique_identifier=detection.identifier,
)
)

return unique_license_detections


def get_identifiers(license_detections):
"""
Get identifiers for all license detections.
"""
identifiers = (
detection.identifier
for detection in license_detections
)
return identifiers
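(Not part of the diff.) The helpers removed from this plugin reshape per-file license detections into codebase-level references: each resource-level detection keeps only its license expression plus a computed identifier, and one UniqueDetection per distinct identifier records the full detection once, together with the file regions where it occurs. A rough sketch of the resulting shapes, with field names taken from the code above and all concrete values invented:

# Illustrative data shapes only; values are invented.
# Resource-level detection after get_license_detection_references() has popped
# matches/detection_log and added the computed identifier:
resource_level_detection = {
    'license_expression': 'mit',
    'id': 4056457144,  # positive hash over (rule id, coverage, matched tokens)
}

# Codebase-level reference built by UniqueDetection.get_unique_detections():
# one entry per distinct identifier, listing every file region it was seen in.
unique_detection_reference = {
    'unique_identifier': 4056457144,
    'license_detection': {
        'license_expression': 'mit',
        # full detection details are kept once here instead of per file
    },
    'files': [
        # FileRegion-like mappings: path plus start/end lines of the detection
        {'path': 'src/module.py', 'start_line': 3, 'end_line': 3},
        {'path': 'README.rst', 'start_line': 12, 'end_line': 12},
    ],
}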
2 changes: 1 addition & 1 deletion src/summarycode/plugin_consolidate.py
@@ -170,7 +170,7 @@ def process_codebase(self, codebase, **kwargs):
# TODO: Have a "catch-all" Component for the things that we haven't grouped
consolidations = []
root = codebase.root
if hasattr(root, 'packages') and hasattr(root, 'copyrights') and hasattr(root, 'license_detections'):
if hasattr(root, 'package_data') and hasattr(root, 'copyrights') and hasattr(root, 'license_detections'):
consolidations.extend(get_consolidated_packages(codebase))
if hasattr(root, 'copyrights') and hasattr(root, 'license_detections'):
consolidations.extend(get_holders_consolidated_components(codebase))