Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion scanpipe/pipelines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,10 @@ def flag_ignored_resources(self):
ignored_patterns = ignored_patterns.splitlines()
ignored_patterns.extend(flag.DEFAULT_IGNORED_PATTERNS)

flag.flag_ignored_patterns(self.project, patterns=ignored_patterns)
flag.flag_ignored_patterns(
codebaseresources=self.project.codebaseresources.no_status(),
patterns=ignored_patterns,
)

def extract_archive(self, location, target):
"""Extract archive at `location` to `target`. Save errors as messages."""
Expand Down
47 changes: 19 additions & 28 deletions scanpipe/pipelines/deploy_to_develop.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,10 @@
from scanpipe.pipes import matchcode
from scanpipe.pipes import purldb
from scanpipe.pipes import scancode
from scanpipe.pipes.d2d_config import DefaultEcosystemConfig


class DeployToDevelop(Pipeline):
class DeployToDevelop(Pipeline, DefaultEcosystemConfig):
Comment thread
AyanSinhaMahapatra marked this conversation as resolved.
Outdated
"""
Establish relationships between two code trees: deployment and development.

Expand Down Expand Up @@ -64,6 +65,8 @@ def steps(cls):
cls.flag_empty_files,
cls.flag_whitespace_files,
cls.flag_ignored_resources,
cls.load_ecosystem_config,
cls.load_ecosystem_config_ruby,
cls.map_about_files,
cls.map_checksum,
cls.match_archives_to_purldb,
Expand Down Expand Up @@ -91,33 +94,6 @@ def steps(cls):
cls.create_local_files_packages,
)

purldb_package_extensions = [".jar", ".war", ".zip"]
purldb_resource_extensions = [
".map",
".js",
".mjs",
".ts",
".d.ts",
".jsx",
".tsx",
".css",
".scss",
".less",
".sass",
".soy",
".class",
]
doc_extensions = [
".pdf",
".doc",
".docx",
".ppt",
".pptx",
".tex",
".odt",
".odp",
]

def get_inputs(self):
"""Locate the ``from`` and ``to`` input files."""
self.from_files, self.to_files = d2d.get_inputs(self.project)
Expand Down Expand Up @@ -152,6 +128,15 @@ def flag_whitespace_files(self):
"""Flag whitespace files with size less than or equal to 100 byte as ignored."""
d2d.flag_whitespace_files(project=self.project)

def load_ecosystem_config(self):
    """
    Load the d2d ecosystem configurations on this pipeline for the
    selected option groups.
    """
    selected_options = self.selected_groups
    d2d.load_ecosystem_config(pipeline=self, options=selected_options)

@optional_step("Ruby")
def load_ecosystem_config_ruby(self):
    """
    Load Ruby specific configurations for d2d steps.

    This step intentionally does nothing by itself: it only exists so that
    "Ruby" is exposed as a selectable option, which in turn lets the Ruby
    ecosystem configuration be loaded.
    """
Comment on lines +135 to +141
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this a leftover or are we planning to duplicate those methods?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ruby is the only option which does not require any code to match source/binary as we already have sha1 matching working there correctly. But the loading of ecosystem configurations and showing/selecting these options in the UI happens on the basis of having optional steps, so I'm having a dummy step here to enable loading the config and showing this as an option everywhere that we can select.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ruby should not be special. I get that the mapping is already done with sha1matching, but I think you should not name this "load_ecosystem_config_ruby" as a "tag" step for ruby: this is confusing. I suggest instead to rename this to "map_ruby": it may not do anything just now, but it will likely do things later as explained in:
#1436 (comment)

therefore the "pass" there will be temporary.


def map_about_files(self):
"""Map ``from/`` .ABOUT files to their related ``to/`` resources."""
d2d.map_about_files(project=self.project, logger=self.log)
Expand Down Expand Up @@ -268,6 +253,7 @@ def flag_mapped_resources_archives_and_ignored_directories(self):
def perform_house_keeping_tasks(self):
"""
On deployed side
- Ignore specific files based on ecosystem based configurations.
- PurlDB match files with ``no-java-source`` and empty status,
if no match is found update status to ``requires-review``.
- Update status for uninteresting files.
Expand All @@ -278,6 +264,11 @@ def perform_house_keeping_tasks(self):
"""
d2d.match_resources_with_no_java_source(project=self.project, logger=self.log)
d2d.handle_dangling_deployed_legal_files(project=self.project, logger=self.log)
d2d.ignore_unmapped_resources_from_config(
project=self.project,
patterns_to_ignore=self.deployed_resource_path_exclusions,
logger=self.log,
)
d2d.match_unmapped_resources(
project=self.project,
matched_extensions=self.purldb_resource_extensions,
Expand Down
74 changes: 74 additions & 0 deletions scanpipe/pipes/d2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
from scanpipe.models import CodebaseRelation
from scanpipe.models import CodebaseResource
from scanpipe.models import convert_glob_to_django_regex
from scanpipe.pipes import d2d_config
from scanpipe.pipes import flag
from scanpipe.pipes import get_resource_diff_ratio
from scanpipe.pipes import js
Expand All @@ -66,6 +67,16 @@
TO = "to/"


# Ecosystem configuration classes known to the d2d pipes.
# ``load_ecosystem_config`` indexes them by their ``ecosystem_option`` value
# to find the configuration matching each selected pipeline option.
ECOSYSTEM_CONFIGS = [
    d2d_config.DefaultEcosystemConfig,
    d2d_config.JavaEcosystemConfig,
    d2d_config.JavaScriptEcosystemConfig,
    d2d_config.RubyEcosystemConfig,
    d2d_config.RustEcosystemConfig,
    d2d_config.GoEcosystemConfig,
]
Comment thread
AyanSinhaMahapatra marked this conversation as resolved.
Outdated


def get_inputs(project):
"""
Locate the ``from`` and ``to`` input files in project inputs/ directory.
Expand Down Expand Up @@ -114,6 +125,55 @@ def get_best_path_matches(to_resource, matches):
return matches


def load_ecosystem_config(pipeline, options):
    """
    Add ecosystem specific configurations for each ecosystem selected
    as `options` to the `pipeline`.

    The "Default" configuration is always applied first. `options` is an
    iterable of option names and may be empty or ``None``; names that have no
    matching entry in ``ECOSYSTEM_CONFIGS`` are skipped.
    """
    configs_by_ecosystem = {
        ecosystem.ecosystem_option: ecosystem for ecosystem in ECOSYSTEM_CONFIGS
    }

    # Add default configurations which are common across ecosystems
    add_ecosystem_config(
        pipeline=pipeline,
        configs_by_ecosystem=configs_by_ecosystem,
        selected_option="Default",
    )

    # Add configurations for each selected ecosystem.
    # ``options or ()`` keeps the default-only behavior when nothing is selected
    # instead of failing on ``None``.
    for selected_option in options or ():
        if selected_option not in configs_by_ecosystem:
            continue

        add_ecosystem_config(
            pipeline=pipeline,
            configs_by_ecosystem=configs_by_ecosystem,
            selected_option=selected_option,
        )


def add_ecosystem_config(pipeline, configs_by_ecosystem, selected_option):
    """
    Merge the configuration values of the `selected_option` ecosystem into the
    matching attributes of `pipeline`.

    `configs_by_ecosystem` maps an option name to its ecosystem configuration
    object. For each supported configuration attribute, the non-empty values of
    the ecosystem configuration are appended to the values already present on
    the `pipeline`, skipping values that are already there.

    A new list is always assigned to the `pipeline`: the lists stored on the
    ecosystem configuration and on the pipeline class are never mutated, as
    they are shared class-level attributes.

    Do nothing when `selected_option` has no entry in `configs_by_ecosystem`.
    """
    d2d_pipeline_configs = [
        "purldb_package_extensions",
        "purldb_resource_extensions",
        "deployed_resource_path_exclusions",
    ]

    ecosystem_config = configs_by_ecosystem.get(selected_option)
    if ecosystem_config is None:
        return

    for pipeline_config in d2d_pipeline_configs:
        config_value = getattr(ecosystem_config, pipeline_config, None)
        if not config_value:
            continue

        # Copy the current values: ``list.extend`` works in place and returns
        # ``None``, so its result must never be used as the new value.
        merged_values = list(getattr(pipeline, pipeline_config, None) or [])
        for value in config_value:
            if value not in merged_values:
                merged_values.append(value)

        setattr(pipeline, pipeline_config, merged_values)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not ideal to set values from all the way down here, shouldn't we return those to a higher location that will explicitly set the values?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if I can change this, as this is the cleanest way where we can update a bunch of configuration attributes based on all the ecosystems/options selected. If we return these values, there's no use of this function at all, which refactors and gets rid of duplicate code.



def get_from_files_for_scanning(resources):
"""
Return resources in the "from/" side which has been mapped to the "to/"
Expand Down Expand Up @@ -1460,6 +1520,20 @@ def match_resources_with_no_java_source(project, logger=None):
)


def ignore_unmapped_resources_from_config(project, patterns_to_ignore, logger=None):
    """
    Ignore unmapped resources for a project using `patterns_to_ignore`.

    Only ``to/`` resources without a status are considered. The ones whose
    path matches one of the `patterns_to_ignore` are flagged with the
    ``flag.IGNORED_FROM_CONFIG`` status. The number of flagged resources is
    reported through `logger` when provided.
    """
    ignored_resources_count = flag.flag_ignored_patterns(
        codebaseresources=project.codebaseresources.to_codebase().no_status(),
        patterns=patterns_to_ignore,
        status=flag.IGNORED_FROM_CONFIG,
    )
    if logger:
        logger(
            f"Ignoring {ignored_resources_count:,d} to/ resources with "
            "ecosystem specific configurations."
        )


def match_unmapped_resources(project, matched_extensions=None, logger=None):
"""
Match resources with empty status to PurlDB, if unmatched
Expand Down
115 changes: 115 additions & 0 deletions scanpipe/pipes/d2d_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.


class EcosystemConfig:
    """
    Base class for ecosystem specific configurations to be defined
    for each ecosystem.

    Subclasses override the class attributes below; every list defaults to
    empty so that a subclass only declares what is relevant to its ecosystem.
    """

    # This should be defined for each ecosystem which
    # is an option in the pipelines
    ecosystem_option = None

    # These are extensions for packages of this ecosystem which
    # need to be matched from purldb
    purldb_package_extensions = []

    # These are extensions for resources of this ecosystem which
    # need to be matched from purldb
    purldb_resource_extensions = []

    # Extensions for document files which do not require review
    doc_extensions = []

    # Paths in the deployed binaries/archives (on the to/ side) which
    # do not need review even if they are not matched to the source side
    deployed_resource_path_exclusions = []

    # Paths in the development/source archive (on the from/ side) which
    # should not be considered even if unmapped to the deployed side when
    # assessing what to review on the deployed side
    devel_resource_path_exclusions = []

    # Symbols which are found in ecosystem specific standard libraries
    # which are not so useful in mapping
    standard_symbols_to_exclude = []


class DefaultEcosystemConfig(EcosystemConfig):
    """
    Configurations which are common across multiple ecosystems.

    Registered under the "Default" option name.
    """

    ecosystem_option = "Default"
    # Generic archive extensions, not tied to a single ecosystem.
    purldb_package_extensions = [".zip", ".tar.gz", ".tar.xz"]
    devel_resource_path_exclusions = ["*/tests/*"]
    doc_extensions = [
        ".pdf",
        ".doc",
        ".docx",
        ".ppt",
        ".pptx",
        ".tex",
        ".odt",
        ".odp",
    ]


class JavaEcosystemConfig(EcosystemConfig):
    """Configurations for the "Java" option: PurlDB package and resource extensions."""

    ecosystem_option = "Java"
    purldb_package_extensions = [".jar", ".war"]
    purldb_resource_extensions = [".class"]


class JavaScriptEcosystemConfig(EcosystemConfig):
    """Configurations for the "JavaScript" option: PurlDB resource extensions."""

    ecosystem_option = "JavaScript"
    purldb_resource_extensions = [
        ".map",
        ".js",
        ".mjs",
        ".ts",
        ".d.ts",
        ".jsx",
        ".tsx",
        ".css",
        ".scss",
        ".less",
        ".sass",
        ".soy",
    ]


class GoEcosystemConfig(EcosystemConfig):
    """Configurations for the "Go" option: PurlDB resource extensions."""

    ecosystem_option = "Go"
    purldb_resource_extensions = [".go"]


class RustEcosystemConfig(EcosystemConfig):
    """Configurations for the "Rust" option: PurlDB resource extensions."""

    ecosystem_option = "Rust"
    purldb_resource_extensions = [".rs"]


class RubyEcosystemConfig(EcosystemConfig):
    """
    Configurations for the "Ruby" option: PurlDB package and resource
    extensions, plus deployed (to/ side) path patterns that do not need review.
    """

    ecosystem_option = "Ruby"
    purldb_package_extensions = [".gem"]
    purldb_resource_extensions = [".rb"]
    deployed_resource_path_exclusions = ["*checksums.yaml.gz*", "*metadata.gz*"]
7 changes: 4 additions & 3 deletions scanpipe/pipes/flag.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
IGNORED_DEFAULT_IGNORES = "ignored-default-ignores"
IGNORED_DATA_FILE_NO_CLUES = "ignored-data-file-no-clues"
IGNORED_DOC_FILE = "ignored-doc-file"
IGNORED_FROM_CONFIG = "ignored-from-config"
IGNORED_BY_MAX_FILE_SIZE = "ignored-by-max-file-size"

COMPLIANCE_LICENSES = "compliance-licenses"
Expand Down Expand Up @@ -92,15 +93,15 @@ def flag_ignored_directories(project):
return qs.update(status=IGNORED_DIRECTORY)


def flag_ignored_patterns(project, patterns):
def flag_ignored_patterns(codebaseresources, patterns, status=IGNORED_PATTERN):
"""Flag codebase resource as ``ignored`` status from list of ``patterns``."""
if isinstance(patterns, str):
patterns = patterns.splitlines()

update_count = 0
for pattern in patterns:
qs = project.codebaseresources.no_status().path_pattern(pattern)
update_count += qs.update(status=IGNORED_PATTERN)
qs = codebaseresources.path_pattern(pattern)
update_count += qs.update(status=status)

return update_count

Expand Down
11 changes: 8 additions & 3 deletions scanpipe/tests/pipes/test_flag.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,9 @@ def test_scanpipe_pipes_flag_flag_ignored_directories(self):

def test_scanpipe_pipes_flag_flag_ignored_patterns(self):
patterns = ["*.ext", "dir/*"]
updated = flag.flag_ignored_patterns(self.project1, patterns)
updated = flag.flag_ignored_patterns(
self.project1.codebaseresources.no_status(), patterns
)

self.assertEqual(3, updated)
self.resource1.refresh_from_db()
Expand All @@ -86,7 +88,8 @@ def test_scanpipe_pipes_flag_flag_ignored_patterns(self):
make_resource_file(self.project1, "path/deeper/policies.yml")
make_resource_file(self.project1, "path/other-policies.yml")
updated = flag.flag_ignored_patterns(
self.project1, flag.DEFAULT_IGNORED_PATTERNS
self.project1.codebaseresources.no_status(),
flag.DEFAULT_IGNORED_PATTERNS,
)
self.assertEqual(3, updated)

Expand All @@ -97,7 +100,9 @@ def test_scanpipe_pipes_flag_flag_ignored_patterns(self):
project2, "a.cdx.json.zip-extract/__MACOSX/._a.cdx.json"
)
make_resource_file(project2, "a.cdx.json.zip-extract/a.cdx.json")
updated = flag.flag_ignored_patterns(project2, flag.DEFAULT_IGNORED_PATTERNS)
updated = flag.flag_ignored_patterns(
project2.codebaseresources.no_status(), flag.DEFAULT_IGNORED_PATTERNS
)
self.assertEqual(2, updated)
ignored_qs = project2.codebaseresources.status(flag.IGNORED_PATTERN)
self.assertEqual(2, ignored_qs.count())
Expand Down
5 changes: 4 additions & 1 deletion scanpipe/tests/test_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,8 +422,11 @@ def test_scanpipe_pipeline_class_flag_ignored_resources(self):
with mock.patch("scanpipe.pipes.flag.flag_ignored_patterns") as mock_flag:
mock_flag.return_value = None
pipeline.flag_ignored_resources()

mock_flag.assert_called_once()
patterns_args = ["*.ext", *flag.DEFAULT_IGNORED_PATTERNS]
mock_flag.assert_called_with(project1, patterns=patterns_args)
self.assertEqual(mock_flag.mock_calls[0].kwargs["patterns"], patterns_args)
self.assertEqual(mock_flag.mock_calls[0].kwargs["codebaseresources"].count(), 0)

def test_scanpipe_pipeline_class_extract_archive(self):
project1 = Project.objects.create(name="Analysis")
Expand Down