Skip to content

Commit bfd69f3

Browse files
authored
feat: Scan Dockerfile for typos (#422)
1 parent 00090ff commit bfd69f3

File tree

18 files changed

+530
-9
lines changed

18 files changed

+530
-9
lines changed

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ docker run elementsinteractive/twyn --help
7070
| `--dependency-file` | `str` (path) | Dependency file to analyze. Supported: `requirements.txt`, `poetry.lock`, `uv.lock`, etc. |
7171
| `--dependency` | `str` (multiple allowed) | Dependency to analyze directly. Can be specified multiple times. |
7272
| `--selector-method` | `all`, `first-letter`, `nearby-letter` | Method for selecting possible typosquats. |
73-
| `--package-ecosystem` | `pypi`, `npm` | Package ecosystem for analysis. |
73+
| `--package-ecosystem` | `pypi`, `npm`, `dockerhub` | Package ecosystem for analysis. |
7474
| `-v` | flag | Enable info-level logging. |
7575
| `-vv` | flag | Enable debug-level logging. |
7676
| `--no-cache` | flag | Disable use of trusted packages cache. Always fetch from the source. |
@@ -170,6 +170,7 @@ The following dependency file formats are supported:
170170
- `package-lock.json` (v1, v2, v3)
171171
- `yarn.lock` (v1, v2)
172172
- `pnpm-lock.yaml` (v9)
173+
- `Dockerfile`
173174

174175
### Check dependencies introduced through the CLI
175176

@@ -226,6 +227,7 @@ logging_level="debug"
226227
allowlist=["my_package"]
227228
pypi_source="https://mirror-with-trusted-dependencies.com/file-pypi.json"
228229
npm_source="https://mirror-with-trusted-dependencies.com/file-npm.json"
230+
dockerhub_source="https://mirror-with-trusted-dependencies.com/file-dh.json"
229231
```
230232

231233
The file format for each reference is as follows:

src/twyn/base/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
"package-lock.json": dependency_parser.PackageLockJsonParser,
3333
"pnpm-lock.yaml": dependency_parser.PnpmLockParser,
3434
"yarn.lock": dependency_parser.YarnLockParser,
35+
"Dockerfile": dependency_parser.DockerfileParser,
3536
}
3637
"""Mapping of dependency file names to their parser classes."""
3738

src/twyn/cli.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def entry_point() -> None:
7474
)
7575
@click.option(
7676
"--package-ecosystem",
77-
type=click.Choice(["pypi", "npm"]),
77+
type=click.Choice(["pypi", "npm", "dockerhub"]),
7878
default=None,
7979
help="Package ecosystem for dependency analysis (pypi, npm or dockerhub).",
8080
)
@@ -129,6 +129,11 @@ def entry_point() -> None:
129129
type=str,
130130
help="Alternative npm source URL to use for fetching trusted packages.",
131131
)
132+
@click.option(
133+
"--dockerhub-source",
134+
type=str,
135+
help="Alternative DockerHub source URL to use for fetching trusted packages.",
136+
)
132137
def run( # noqa: C901, PLR0912
133138
config: str,
134139
dependency_file: tuple[str],
@@ -144,6 +149,7 @@ def run( # noqa: C901, PLR0912
144149
recursive: bool,
145150
pypi_source: str | None,
146151
npm_source: str | None,
152+
dockerhub_source: str | None,
147153
) -> NoReturn:
148154
if vv:
149155
logger.setLevel(logging.DEBUG)
@@ -175,6 +181,7 @@ def run( # noqa: C901, PLR0912
175181
recursive=recursive,
176182
pypi_source=pypi_source,
177183
npm_source=npm_source,
184+
dockerhub_source=dockerhub_source,
178185
)
179186
except TwynError as e:
180187
raise CliError(str(e)) from e

src/twyn/config/config_handler.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ class TwynConfiguration:
4242
"""Alternative PyPI source URL."""
4343
npm_source: str | None
4444
"""Alternative npm source URL."""
45+
dockerhub_source: str | None
46+
"""Alternative DockerHub source URL."""
4547
use_cache: bool
4648
"""Whether to use cached trusted packages."""
4749
package_ecosystem: PackageEcosystems | None
@@ -64,6 +66,8 @@ class ReadTwynConfiguration:
6466
"""Optional alternative PyPI source URL."""
6567
npm_source: str | None = None
6668
"""Optional alternative npm source URL."""
69+
dockerhub_source: str | None = None
70+
"""Optional alternative DockerHub source URL."""
6771
use_cache: bool | None = None
6872
"""Optional setting for using cached trusted packages."""
6973
package_ecosystem: PackageEcosystems | None = None
@@ -87,6 +91,7 @@ def resolve_config( # noqa: C901, PLR0912
8791
recursive: bool | None = None,
8892
pypi_source: str | None = None,
8993
npm_source: str | None = None,
94+
dockerhub_source: str | None = None,
9095
) -> TwynConfiguration:
9196
"""Resolve the configuration for Twyn.
9297
@@ -141,12 +146,21 @@ def resolve_config( # noqa: C901, PLR0912
141146
else:
142147
final_npm_source = None
143148

149+
# Determine final dockerhub_source from CLI, config file, or default
150+
if dockerhub_source is not None:
151+
final_dockerhub_source = dockerhub_source
152+
elif read_config.dockerhub_source is not None:
153+
final_dockerhub_source = read_config.dockerhub_source
154+
else:
155+
final_dockerhub_source = None
156+
144157
return TwynConfiguration(
145158
dependency_files=dependency_files or read_config.dependency_files or set(),
146159
selector_method=final_selector_method,
147160
allowlist=read_config.allowlist,
148161
pypi_source=final_pypi_source,
149162
npm_source=final_npm_source,
163+
dockerhub_source=final_dockerhub_source,
150164
use_cache=final_use_cache,
151165
package_ecosystem=package_ecosystem or read_config.package_ecosystem,
152166
recursive=final_recursive,
@@ -196,6 +210,7 @@ def _get_read_config(self, toml: TOMLDocument) -> ReadTwynConfiguration:
196210
allowlist=allowlist,
197211
pypi_source=twyn_config_data.get("pypi_source"),
198212
npm_source=twyn_config_data.get("npm_source"),
213+
dockerhub_source=twyn_config_data.get("dockerhub_source"),
199214
use_cache=twyn_config_data.get("use_cache"),
200215
package_ecosystem=twyn_config_data.get("package_ecosystem"),
201216
recursive=twyn_config_data.get("recursive"),

src/twyn/dependency_managers/managers.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
from twyn.dependency_managers.exceptions import NoMatchingDependencyManagerError
55
from twyn.dependency_parser.parsers.constants import (
6+
DOCKERFILE,
67
PACKAGE_LOCK_JSON,
78
PNPM_LOCK_YAML,
89
POETRY_LOCK,
@@ -11,9 +12,11 @@
1112
YARN_LOCK,
1213
)
1314
from twyn.trusted_packages.managers.base import TrustedPackagesProtocol
15+
from twyn.trusted_packages.managers.trusted_dockerhub_packages_manager import TrustedDockerHubPackageManager
1416
from twyn.trusted_packages.managers.trusted_npm_packages_manager import TrustedNpmPackageManager
1517
from twyn.trusted_packages.managers.trusted_pypi_packages_manager import TrustedPackages
1618
from twyn.trusted_packages.references.base import AbstractPackageReference
19+
from twyn.trusted_packages.references.top_dockerhub_reference import TopDockerHubReference
1720
from twyn.trusted_packages.references.top_npm_reference import TopNpmReference
1821
from twyn.trusted_packages.references.top_pypi_reference import TopPyPiReference
1922

@@ -52,15 +55,27 @@ def get_alternative_source(self, sources: dict[str, str]) -> str | None:
5255
dependency_files={PACKAGE_LOCK_JSON, YARN_LOCK, PNPM_LOCK_YAML},
5356
trusted_packages_manager=TrustedNpmPackageManager,
5457
)
58+
5559
pypi_dependency_manager = DependencyManager(
5660
name="pypi",
5761
trusted_packages_source=TopPyPiReference,
5862
dependency_files={UV_LOCK, POETRY_LOCK, REQUIREMENTS_TXT},
5963
trusted_packages_manager=TrustedPackages,
6064
)
6165

66+
dockerhub_dependency_manager = DependencyManager(
67+
name="dockerhub",
68+
trusted_packages_source=TopDockerHubReference,
69+
dependency_files={DOCKERFILE},
70+
trusted_packages_manager=TrustedDockerHubPackageManager,
71+
)
72+
6273

63-
DEPENDENCY_MANAGERS: list[DependencyManager] = [pypi_dependency_manager, npm_dependency_manager]
74+
DEPENDENCY_MANAGERS: list[DependencyManager] = [
75+
pypi_dependency_manager,
76+
npm_dependency_manager,
77+
dockerhub_dependency_manager,
78+
]
6479
"""List of available dependency manager classes."""
6580

6681
PACKAGE_ECOSYSTEMS = {x.name for x in DEPENDENCY_MANAGERS}

src/twyn/dependency_parser/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Dependency parsers."""
22

3+
from twyn.dependency_parser.parsers.dockerfile_parser import DockerfileParser
34
from twyn.dependency_parser.parsers.lock_parser import PoetryLockParser, UvLockParser
45
from twyn.dependency_parser.parsers.package_lock_json import PackageLockJsonParser
56
from twyn.dependency_parser.parsers.pnpm_lock_parser import PnpmLockParser
@@ -13,4 +14,5 @@
1314
"PackageLockJsonParser",
1415
"YarnLockParser",
1516
"PnpmLockParser",
17+
"DockerfileParser",
1618
]

src/twyn/dependency_parser/parsers/constants.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,6 @@
1515

1616
YARN_LOCK = "yarn.lock"
1717
"""Filename for Yarn package lock files."""
18+
19+
DOCKERFILE = "Dockerfile"
20+
"""Filename for Docker container definition files."""
Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
import logging
2+
import re
3+
4+
from typing_extensions import override
5+
6+
from twyn.dependency_parser.parsers.abstract_parser import AbstractParser
7+
from twyn.dependency_parser.parsers.constants import DOCKERFILE
8+
9+
logger = logging.getLogger("twyn")
10+
11+
12+
class DockerfileParser(AbstractParser):
13+
"""Parser for Dockerfile dependencies (FROM instructions)."""
14+
15+
# Pattern for variable substitution in Dockerfile
16+
VARIABLE_PATTERN = re.compile(
17+
r"\$\{(?P<name>[a-zA-Z_][a-zA-Z0-9_]*)(?::-(?P<default>[^}$]+))?\}|\$(?P<short_name>[a-zA-Z_][a-zA-Z0-9_]*)"
18+
)
19+
20+
def __init__(self, file_path: str = DOCKERFILE) -> None:
    """Create a parser for the Dockerfile at ``file_path``.

    Args:
        file_path: Path of the file to parse; defaults to the ``DOCKERFILE`` constant.
            It is forwarded unchanged to the parent class initializer.
    """
    super().__init__(file_path)
22+
23+
@override
24+
def parse(self) -> set[str]:
25+
"""Parse Dockerfile and return base image names from FROM instructions.
26+
27+
Handles variable substitution and excludes stage names from previous FROM instructions.
28+
"""
29+
with self.file_handler.open("r") as fp:
30+
lines = fp.readlines()
31+
32+
# Handle line continuations (\)
33+
raw_instructions = self._handle_line_continuations(lines)
34+
35+
# Parse instructions and resolve variables
36+
return self._extract_base_images(raw_instructions)
37+
38+
def _handle_line_continuations(self, lines: list[str]) -> list[str]:
39+
"""Handle Dockerfile line continuations with backslash."""
40+
raw_instructions = []
41+
buffer = ""
42+
43+
for line in lines:
44+
line = line.strip() # noqa: PLW2901
45+
if not line or line.startswith("#"):
46+
continue
47+
48+
if line.endswith("\\"):
49+
buffer += line[:-1] + " "
50+
else:
51+
buffer += line
52+
raw_instructions.append(buffer)
53+
buffer = ""
54+
55+
return raw_instructions
56+
57+
def _extract_base_images(self, instructions: list[str]) -> set[str]:
58+
"""Extract base images from Dockerfile instructions."""
59+
env: dict[str, str] = {}
60+
images: set[str] = set()
61+
stages: set[str] = set()
62+
63+
for instruction in instructions:
64+
parts = instruction.split(None, 1)
65+
if len(parts) < 2:
66+
continue
67+
68+
cmd = parts[0].upper()
69+
args = parts[1]
70+
71+
if cmd in ("ARG", "ENV"):
72+
self._parse_variable_assignment(args, env)
73+
elif cmd == "FROM":
74+
self._parse_from_instruction(args, env, images, stages)
75+
return images
76+
77+
def _parse_variable_assignment(self, args: str, env: dict[str, str]) -> None:
78+
"""Parse ARG or ENV instruction and update environment variables."""
79+
if "=" in args:
80+
# Handle KEY=VALUE pairs
81+
for part in args.split():
82+
if "=" in part:
83+
key, val = part.split("=", 1)
84+
env[key] = self._resolve_variables(val.strip("\"'"), env)
85+
else:
86+
# Handle KEY VALUE pairs (space-separated)
87+
parts = args.split(None, 1)
88+
if parts:
89+
key = parts[0]
90+
val = parts[1] if len(parts) > 1 else ""
91+
env[key] = self._resolve_variables(val.strip("\"'"), env)
92+
93+
def _parse_from_instruction(self, args: str, env: dict[str, str], images: set[str], stages: set[str]) -> None:
94+
"""Parse FROM instruction and extract base image."""
95+
# Strip flags like --platform=...
96+
clean_args = re.sub(r"--\S+", "", args).strip().split()
97+
if not clean_args:
98+
return
99+
100+
image_name = clean_args[0]
101+
resolved_image = self._resolve_variables(image_name, env)
102+
103+
if resolved_image not in stages:
104+
image_name_only = self._extract_image_name(resolved_image)
105+
106+
# Ignore the special 'scratch' no-op image
107+
if image_name_only.lower() != "scratch":
108+
images.add(image_name_only)
109+
110+
for i, part in enumerate(clean_args):
111+
if part.lower() == "as" and i + 1 < len(clean_args):
112+
stages.add(clean_args[i + 1])
113+
114+
def _extract_image_name(self, image_with_tag: str) -> str:
115+
"""Extract image name without tag/version/digest from a Docker image reference.
116+
117+
Examples:
118+
ubuntu:20.04 -> ubuntu
119+
node:16-alpine -> node
120+
registry.hub.docker.com/library/nginx:latest -> registry.hub.docker.com/library/nginx
121+
localhost:5000/myapp:v1.0 -> localhost:5000/myapp
122+
nginx@sha256:23q... -> nginx
123+
"""
124+
# Strip off the digest FIRST
125+
if "@" in image_with_tag:
126+
image_with_tag = image_with_tag.split("@")[0]
127+
128+
# Find the last ':' in the string
129+
last_colon_idx = image_with_tag.rfind(":")
130+
131+
if last_colon_idx == -1:
132+
# No colon found, return as-is
133+
return image_with_tag
134+
135+
potential_tag = image_with_tag[last_colon_idx + 1 :]
136+
name_part = image_with_tag[:last_colon_idx]
137+
138+
if (
139+
potential_tag.isdigit() and "/" not in potential_tag and "/" not in name_part.split("/")[-1]
140+
if name_part
141+
else True
142+
):
143+
# This looks like a registry with port, don't strip it
144+
return image_with_tag
145+
146+
# Otherwise, strip the tag
147+
return name_part
148+
149+
def _resolve_variables(self, text: str, env: dict[str, str]) -> str:
150+
"""Resolve variable substitutions in text using environment variables."""
151+
152+
def replace(match: re.Match[str]) -> str:
153+
name = match.group("name") or match.group("short_name")
154+
default = match.group("default")
155+
return env.get(name, default if default is not None else match.group(0))
156+
157+
result = text
158+
iterations = 0
159+
max_iterations = 20 # Circuit breaker for recursive variables like PATH=$PATH
160+
161+
while iterations < max_iterations:
162+
new_result = self.VARIABLE_PATTERN.sub(replace, result)
163+
if new_result == result:
164+
break
165+
result = new_result
166+
iterations += 1
167+
168+
return result

0 commit comments

Comments
 (0)