1 change: 1 addition & 0 deletions setup.cfg
@@ -191,6 +191,7 @@ scancode_post_scan =
filter-clues = cluecode.plugin_filter_clues:RedundantCluesFilter
consolidate = summarycode.plugin_consolidate:Consolidator
licenses-reference = licensedcode.plugin_licenses_reference:LicensesReference
review = summarycode.review:AmbiguousDetectionsReviewPlugin


# scancode_output_filter is the entry point for filter plugins executed after
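(Not part of the diff.) The line added above registers a new post-scan plugin entry point. For orientation, a scancode post-scan plugin is a class exposing is_enabled() and process_codebase(), as the existing LicensesReference plugin further down in this diff shows. Below is a minimal, hypothetical sketch of that shape; the class body, the enabling flag, and the ambiguity heuristic are illustration-only assumptions, and the real summarycode/review.py may differ.

# Hypothetical sketch only -- the real AmbiguousDetectionsReviewPlugin is not
# shown in this diff. Imports mirror how other post-scan plugins are wired.
from plugincode.post_scan import PostScanPlugin
from plugincode.post_scan import post_scan_impl


@post_scan_impl
class AmbiguousDetectionsReviewPlugin(PostScanPlugin):

    def is_enabled(self, review, **kwargs):
        # Assumed to be tied to a --review command-line flag.
        return review

    def process_codebase(self, codebase, **kwargs):
        # Walk all resources and collect detections that look ambiguous.
        # The "unknown in the license expression" test is illustrative only.
        ambiguous = []
        for resource in codebase.walk(topdown=False):
            for detection in getattr(resource, 'license_detections', None) or []:
                if 'unknown' in (detection.get('license_expression') or ''):
                    ambiguous.append((resource.path, detection))
        return ambiguous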
175 changes: 166 additions & 9 deletions src/licensedcode/detection.py
@@ -23,6 +23,7 @@
from licensedcode.match import LicenseMatch
from licensedcode.match import set_matched_lines
from licensedcode.models import Rule
from licensedcode.models import BasicRule
from licensedcode.models import compute_relevance
from licensedcode.spans import Span
from licensedcode.tokenize import query_tokenizer
@@ -128,6 +129,7 @@ class DetectionRule(Enum):
PACKAGE_UNKNOWN_REFERENCE_TO_LOCAL_FILE = 'package-unknown-reference-to-local-file'
PACKAGE_ADD_FROM_SIBLING_FILE = 'from-package-sibling-file'
PACKAGE_ADD_FROM_FILE = 'from-package-file'
PACKAGE_LICENSE = 'package-license'


@attr.s
@@ -159,7 +161,6 @@ class LicenseDetection:
)

detection_log = attr.ib(
repr=False,
default=attr.Factory(list),
metadata=dict(
help='A list of detection DetectionRule explaining how '
@@ -263,12 +264,20 @@ def identifier(self):
"""
data = []
for match in self.matches:
tokenized_matched_text = tuple(query_tokenizer(match['matched_text']))
identifier = (
match['rule_identifier'],
match['match_coverage'],
tokenized_matched_text,
)
if isinstance(match, dict):
tokenized_matched_text = tuple(query_tokenizer(match['matched_text']))
identifier = (
match['rule_identifier'],
match['match_coverage'],
tokenized_matched_text,
)
else:
tokenized_matched_text = tuple(query_tokenizer(match.matched_text))
identifier = (
match.identifier,
match.coverage(),
tokenized_matched_text,
)
data.append(identifier)

# Return a positive hash value for the tuple
@@ -279,8 +288,12 @@ def get_start_end_line(self):
Returns start and end line for a license detection issue, from the
license match(es).
"""
start_line = min([match['start_line'] for match in self.matches])
end_line = max([match['end_line'] for match in self.matches])
if isinstance(self.matches[0], dict):
start_line = min([match['start_line'] for match in self.matches])
end_line = max([match['end_line'] for match in self.matches])
else:
start_line = min([match.start_line for match in self.matches])
end_line = max([match.end_line for match in self.matches])
return start_line, end_line

def rules_length(self):
@@ -432,6 +445,150 @@ def dict_fields(attr, value):
return detection


@attr.s
class LicenseDetectionFromResult(LicenseDetection):
"""
A LicenseDetection object created from a LicenseDetection
mapping, i.e. a results mapping. The LicenseMatch objects in
`matches` are LicenseMatchFromResult objects too, as these are
created from data mappings and do not have the input text/spans
available.
"""

@classmethod
def from_license_detection_mapping(cls, license_detection_mapping, file_path):

matches_from_results = matches_from_license_match_mappings(
license_match_mappings=license_detection_mapping["matches"]
)

detection = cls(
license_expression=license_detection_mapping["license_expression"],
detection_log=license_detection_mapping["detection_log"],
matches=matches_from_results,
file_region=None,
)
detection.file_region = detection.get_file_region(path=file_path)
return detection


def detections_from_license_detection_mappings(license_detection_mappings, file_path):

license_detections = []

for license_detection_mapping in license_detection_mappings:
license_detections.append(
LicenseDetectionFromResult.from_license_detection_mapping(
license_detection_mapping=license_detection_mapping,
file_path=file_path,
)
)

return license_detections


@attr.s
class LicenseMatchFromResult(LicenseMatch):

match_score = attr.ib(
default=None,
metadata=dict(
help='License Detection Score')
)

matched_length = attr.ib(
default=None,
metadata=dict(
help='License match length')
)

match_coverage = attr.ib(
default=None,
metadata=dict(
help='License match coverage')
)

text = attr.ib(
default=None,
metadata=dict(
help='Text which was matched')
)

def score(self):
return self.match_score

def len(self):
return self.matched_length

def coverage(self):
return self.match_coverage

@property
def matched_text(self):
return self.text

@property
def identifier(self):
return self.rule.identifier

@classmethod
def from_license_match_mapping(cls, license_match_mapping):

rule = RuleFromResult.from_license_match_mapping(
license_match_mapping=license_match_mapping,
)

if "matched_text" in license_match_mapping:
matched_text = license_match_mapping["matched_text"]
else:
matched_text = None

return cls(
start_line=license_match_mapping["start_line"],
end_line=license_match_mapping["end_line"],
match_score=license_match_mapping["score"],
matched_length=license_match_mapping["matched_length"],
match_coverage=license_match_mapping["match_coverage"],
matcher=license_match_mapping["matcher"],
text=matched_text,
rule=rule,
qspan=None,
ispan=None,
)

def matches_from_license_match_mappings(license_match_mappings):

license_matches = []

for license_match_mapping in license_match_mappings:
license_matches.append(
LicenseMatchFromResult.from_license_match_mapping(
license_match_mapping=license_match_mapping
)
)

return license_matches


@attr.s
class RuleFromResult(BasicRule):

@classmethod
def from_license_match_mapping(cls, license_match_mapping):
return cls(
license_expression=license_match_mapping["license_expression"],
identifier=license_match_mapping["rule_identifier"],
referenced_filenames=license_match_mapping["referenced_filenames"],
is_license_text=license_match_mapping["is_license_text"],
is_license_notice=license_match_mapping["is_license_notice"],
is_license_reference=license_match_mapping["is_license_reference"],
is_license_tag=license_match_mapping["is_license_tag"],
is_license_intro=license_match_mapping["is_license_intro"],
length=license_match_mapping["rule_length"],
relevance=license_match_mapping["rule_relevance"],
)


def get_detections_from_mappings(detection_mappings):
"""
Return a list of LicenseDetection objects from a list of
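(Not part of the diff.) The *FromResult classes added above rebuild detection objects from saved scan output rather than from live matching. A rough usage sketch, assuming a scancode JSON result whose per-file entries carry a license_detections list with the detailed per-match fields read by from_license_match_mapping (the file name and exact keys below are assumptions):

# Illustrative only: rehydrate LicenseDetection objects from saved results so
# they can be post-processed without re-running license detection.
import json

from licensedcode.detection import detections_from_license_detection_mappings

with open('scan-results.json') as results_file:
    scan = json.load(results_file)

for file_data in scan.get('files', []):
    detections = detections_from_license_detection_mappings(
        license_detection_mappings=file_data.get('license_detections', []),
        file_path=file_data.get('path'),
    )
    for detection in detections:
        # These detections expose the same API as ones computed live, e.g.
        # license_expression and get_start_end_line() over their matches.
        print(detection.license_expression, detection.get_start_end_line())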
82 changes: 1 addition & 81 deletions src/licensedcode/plugin_licenses_reference.py
@@ -61,7 +61,7 @@ class LicensesReference(PostScanPlugin):
def is_enabled(self, licenses_reference, **kwargs):
return licenses_reference

def process_codebase(self, codebase, licenses_reference, **kwargs):
def process_codebase(self, codebase, **kwargs):
"""
Get unique License and Rule data from all license detections in a codebase-level
list and only refer to them in the resource level detections.
@@ -221,83 +221,3 @@ def get_reference_data(match):
_ = match.pop('licenses')

return ref_data


def get_license_detection_references(license_detections_by_path):
"""
Get LicenseDetection data for references from a mapping of path:[LicenseDetection],
i.e. path and a list of LicenseDetection at that path.

Also removes `matches` and `detection_log` from each LicenseDetection mapping
and only keeps a LicenseExpression string and a computed identifier per detection,
as this LicenseDetection data is referenced at top-level by the identifier.
"""
detection_objects = []

for path, detections in license_detections_by_path.items():

for detection in detections:
detection_obj = LicenseDetection(**detection)
_matches = detection.pop('matches')
_detection_log = detection.pop('detection_log')
detection_obj.file_region = detection_obj.get_file_region(path=path)
detection["id"] = detection_obj.identifier

detection_objects.append(detection_obj)

detection_references = UniqueDetection.get_unique_detections(detection_objects)
return detection_references


@attr.s
class UniqueDetection:
"""
A unique License Detection.
"""
unique_identifier = attr.ib(type=int)
license_detection = attr.ib()
files = attr.ib(factory=list)

@classmethod
def get_unique_detections(cls, license_detections):
"""
Get all unique license detections from a list of
LicenseDetections.
"""
identifiers = get_identifiers(license_detections)
unique_detection_counts = dict(Counter(identifiers))

unique_license_detections = []
for detection_identifier in unique_detection_counts.keys():
file_regions = (
detection.file_region
for detection in license_detections
if detection_identifier == detection.identifier
)
all_detections = (
detection
for detection in license_detections
if detection_identifier == detection.identifier
)

detection = next(all_detections)
unique_license_detections.append(
cls(
files=list(file_regions),
license_detection=attr.asdict(detection),
unique_identifier=detection.identifier,
)
)

return unique_license_detections


def get_identifiers(license_detections):
"""
Get identifiers for all license detections.
"""
identifiers = (
detection.identifier
for detection in license_detections
)
return identifiers
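(Not part of the diff.) The helpers removed from this plugin reshape per-file license detections into codebase-level references: each resource-level detection keeps only its license expression plus a computed identifier, and one UniqueDetection per distinct identifier records the full detection once, together with the file regions where it occurs. A rough sketch of the resulting shapes, with field names taken from the code above and all concrete values invented:

# Illustrative data shapes only; values are invented.
# Resource-level detection after get_license_detection_references() has popped
# matches/detection_log and added the computed identifier:
resource_level_detection = {
    'license_expression': 'mit',
    'id': 4056457144,  # positive hash over (rule id, coverage, matched tokens)
}

# Codebase-level reference built by UniqueDetection.get_unique_detections():
# one entry per distinct identifier, listing every file region it was seen in.
unique_detection_reference = {
    'unique_identifier': 4056457144,
    'license_detection': {
        'license_expression': 'mit',
        # full detection details are kept once here instead of per file
    },
    'files': [
        # FileRegion-like mappings: path plus start/end lines of the detection
        {'path': 'src/module.py', 'start_line': 3, 'end_line': 3},
        {'path': 'README.rst', 'start_line': 12, 'end_line': 12},
    ],
}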
2 changes: 1 addition & 1 deletion src/summarycode/plugin_consolidate.py
@@ -170,7 +170,7 @@ def process_codebase(self, codebase, **kwargs):
# TODO: Have a "catch-all" Component for the things that we haven't grouped
consolidations = []
root = codebase.root
if hasattr(root, 'packages') and hasattr(root, 'copyrights') and hasattr(root, 'license_detections'):
if hasattr(root, 'package_data') and hasattr(root, 'copyrights') and hasattr(root, 'license_detections'):
consolidations.extend(get_consolidated_packages(codebase))
if hasattr(root, 'copyrights') and hasattr(root, 'license_detections'):
consolidations.extend(get_holders_consolidated_components(codebase))