Skip to content

Commit 0d673a8

Browse files
authored
Add option to exclude base image components (#1825)
* Add option to exclude base image components
* Address review feedback
* Address feedback
* Remove references to filtered out components from dependency roots
* Handle null layerIndices
* Properly handle components with 0 layer indices
* Use files other than system package manager DBs when possible to determine component to layer map
* Respond to feedback
* Rename flag to ExcludeBaseImageComponents and add docs
* Ensure duplicate layer IDs are handled properly
1 parent 63dafd9 commit 0d673a8

6 files changed

Lines changed: 796 additions & 5 deletions

File tree

docs/detectors/linux.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,3 +54,15 @@ For example:
5454

5555
- Windows container scanning is not supported
5656
- Multiplatform images are not supported
57+
58+
## Excluding Base Image Components
59+
60+
When scanning container images, many detected components may originate from the base image rather than from layers added by the user's Dockerfile. The `--ExcludeBaseImageComponents` flag filters out components that exist exclusively in base image layers, so only components introduced by the user's own layers are reported.
61+
62+
```sh
63+
--ExcludeBaseImageComponents
64+
```
65+
66+
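A component is excluded only if **all** of its associated container layers are marked as base image layers. If a component appears in at least one non-base-image layer, it is retained. Components that have no layer information, or whose layers cannot be resolved against the scanned container's details, are also retained.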
67+
68+
This flag has no effect on non-container scans (i.e., directory-based detection).

src/Microsoft.ComponentDetection.Detectors/linux/LinuxScanner.cs

Lines changed: 109 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,21 @@ internal class LinuxScanner : ILinuxScanner
3030

3131
private static readonly IList<string> ScopeSquashedParameter = ["--scope", "squashed"];
3232

33+
/// <summary>
/// Paths of shared package manager database files (dpkg, apk and rpm). A location that
/// points at one of these files is not used for layer attribution when a package has any
/// other known location, because the database is rewritten every time any package is
/// installed or removed and would therefore attribute unrelated packages to later layers.
/// Lookups against this set ignore case.
/// </summary>
private static readonly HashSet<string> PackageManagerDatabasePaths = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
    // dpkg
    "/var/lib/dpkg/status",

    // apk
    "/lib/apk/db/installed",

    // rpm (several on-disk database formats)
    "/var/lib/rpm/Packages",
    "/var/lib/rpm/Packages.db",
    "/var/lib/rpm/rpmdb.sqlite",
};
47+
3348
private static readonly SemaphoreSlim ContainerSemaphore = new SemaphoreSlim(2);
3449

3550
/// <summary>
@@ -179,11 +194,16 @@ private IEnumerable<LayerMappedLinuxComponents> ProcessSyftOutputWithTelemetry(
179194
}
180195
}
181196

197+
// Build a file path → layerID map from the top-level files listing.
198+
// This allows us to determine layer attribution for files owned by a package
199+
// even when the artifact's locations only reference the package manager database.
200+
var filePathToLayerId = BuildFilePathToLayerMap(syftOutput.Files);
201+
182202
// Create components using only enabled factories
183203
var componentsWithLayers = validArtifacts
184204
.DistinctBy(artifact => (artifact.Name, artifact.Version, artifact.Type))
185205
.Select(artifact =>
186-
this.CreateComponentWithLayers(artifact, syftOutput.Distro, enabledFactories)
206+
this.CreateComponentWithLayers(artifact, syftOutput.Distro, enabledFactories, filePathToLayerId)
187207
)
188208
.Where(result => result.Component != null)
189209
.Select(result => (Component: result.Component!, result.LayerIds))
@@ -388,7 +408,8 @@ private async Task<string> RunSyftCoreAsync(
388408
private (TypedComponent? Component, IEnumerable<string> LayerIds) CreateComponentWithLayers(
389409
ArtifactElement artifact,
390410
Distro distro,
391-
HashSet<IArtifactComponentFactory> enabledFactories
411+
HashSet<IArtifactComponentFactory> enabledFactories,
412+
Dictionary<string, string> filePathToLayerId
392413
)
393414
{
394415
if (!this.artifactTypeToFactoryLookup.TryGetValue(artifact.Type, out var factory))
@@ -408,12 +429,96 @@ HashSet<IArtifactComponentFactory> enabledFactories
408429
return (null, []);
409430
}
410431

411-
var layerIds = artifact.Locations?.Select(location => location.LayerId).Distinct() ?? [];
412-
return (component, layerIds);
432+
// Collect layer IDs from the artifact's locations, filtering out entries with null/empty layer IDs.
433+
var locationLayerIds = artifact.Locations?
434+
.Where(location => !string.IsNullOrEmpty(location.Path) && !string.IsNullOrEmpty(location.LayerId))
435+
.Select(location => (location.Path, location.LayerId))
436+
.ToList() ?? [];
437+
438+
// Also consult the metadata files property to find additional owned files,
439+
// and look up their layer IDs from the top-level file listing.
440+
if (artifact.Metadata?.Files != null)
441+
{
442+
foreach (var file in artifact.Metadata.Files)
443+
{
444+
// The File union type can be either a FileFile object or a plain string path.
445+
var filePath = file.FileFile?.Path ?? file.String;
446+
if (!string.IsNullOrEmpty(filePath) && filePathToLayerId.TryGetValue(filePath, out var layerId))
447+
{
448+
locationLayerIds.Add((filePath, layerId));
449+
}
450+
}
451+
}
452+
453+
// Exclude well-known package manager database paths from layer attribution,
454+
// unless they are the only known locations for this component.
455+
var nonDbLayerIds = locationLayerIds
456+
.Where(loc => !IsPackageManagerDatabasePath(loc.Path))
457+
.Select(loc => loc.LayerId)
458+
.Distinct()
459+
.ToList();
460+
461+
if (nonDbLayerIds.Count > 0)
462+
{
463+
return (component, nonDbLayerIds);
464+
}
465+
466+
// Fall back to database path layer IDs if no other locations are available.
467+
var allLayerIds = locationLayerIds
468+
.Select(loc => loc.LayerId)
469+
.Where(id => !string.IsNullOrEmpty(id))
470+
.Distinct()
471+
.ToList();
472+
return (component, allLayerIds);
413473
}
414474

415475
/// <summary>
/// Empties the cache of syft runs. Exists so tests can isolate themselves from one another;
/// not intended for production callers.
/// </summary>
internal static void ResetCache()
{
    SyftRunCache.Clear();
}
479+
480+
/// <summary>
/// Indexes the top-level files listing of the syft output by file path, so that the layer
/// of a file owned by a package can be looked up even when the artifact's own locations
/// only point at the package manager database.
/// </summary>
/// <param name="files">The top-level file entries from the syft output; may be null.</param>
/// <returns>
/// A case-sensitive map from file path to layer ID. Entries without a location, path or
/// layer ID are skipped, and the first entry seen for a given path wins.
/// </returns>
private static Dictionary<string, string> BuildFilePathToLayerMap(FileElement[] files)
{
    var layerIdByPath = new Dictionary<string, string>(StringComparer.Ordinal);
    if (files is null)
    {
        return layerIdByPath;
    }

    foreach (var entry in files)
    {
        var location = entry.Location;
        if (location == null || string.IsNullOrEmpty(location.Path) || string.IsNullOrEmpty(location.LayerId))
        {
            // Nothing usable to index for this entry.
            continue;
        }

        // TryAdd keeps the first layer ID recorded for a path and ignores later duplicates.
        layerIdByPath.TryAdd(location.Path, location.LayerId);
    }

    return layerIdByPath;
}
504+
505+
/// <summary>
506+
/// Determines whether a file path is a well-known package manager database file
507+
/// that should be excluded from layer attribution.
508+
/// </summary>
509+
private static bool IsPackageManagerDatabasePath(string path)
510+
{
511+
if (string.IsNullOrEmpty(path))
512+
{
513+
return false;
514+
}
515+
516+
if (PackageManagerDatabasePaths.Contains(path))
517+
{
518+
return true;
519+
}
520+
521+
// Cover any file under /var/lib/rpm/ (RPM database can use multiple files)
522+
return path.StartsWith("/var/lib/rpm/", StringComparison.OrdinalIgnoreCase);
523+
}
419524
}

src/Microsoft.ComponentDetection.Orchestrator/Commands/ScanSettings.cs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,10 @@ public class ScanSettings : BaseSettings
8686
[Description("Whether or not to cleanup files that are created during detection, based on the rules provided in each detector. Defaults to 'true'.")]
8787
public bool? CleanupCreatedFiles { get; set; }
8888

89+
/// <summary>
/// Gets or sets a value indicating whether components that originate exclusively from
/// base image layers are filtered out when scanning containers. Bound to the
/// <c>--ExcludeBaseImageComponents</c> command-line option.
/// </summary>
[CommandOption("--ExcludeBaseImageComponents")]
90+
[Description("When enabled, filters out components that originate exclusively from base image layers when scanning containers.")]
91+
public bool ExcludeBaseImageComponents { get; set; }
92+
8993
/// <inheritdoc />
9094
public override ValidationResult Validate()
9195
{

src/Microsoft.ComponentDetection.Orchestrator/Services/GraphTranslation/DefaultGraphTranslationService.cs

Lines changed: 161 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,15 +41,175 @@ public ScanResult GenerateScanResultFromProcessingResult(
4141

4242
ReconcileDependencyGraphIds(dependencyGraphs, mergedComponents);
4343

44+
var componentsToOutput = mergedComponents;
45+
if (settings.ExcludeBaseImageComponents)
46+
{
47+
var originalCount = mergedComponents.Count;
48+
componentsToOutput = this.FilterOutBaseImageComponents(componentsToOutput, detectorProcessingResult.ContainersDetailsMap);
49+
var filteredCount = originalCount - componentsToOutput.Count;
50+
51+
if (filteredCount > 0)
52+
{
53+
this.logger.LogInformation("Filtered out {FilteredCount} of {TotalCount} components that originate exclusively from base image layers. {RetainedCount} components remain.", filteredCount, originalCount, componentsToOutput.Count);
54+
}
55+
else
56+
{
57+
this.logger.LogInformation("Base image component filtering is enabled but no components were filtered out ({TotalCount} total).", originalCount);
58+
}
59+
60+
PruneFilteredComponentsFromGraphs(dependencyGraphs, componentsToOutput);
61+
PruneFilteredComponentReferrers(componentsToOutput);
62+
}
63+
4464
return new DefaultGraphScanResult
4565
{
46-
ComponentsFound = mergedComponents.Select(x => this.ConvertToContract(x)).ToList(),
66+
ComponentsFound = componentsToOutput.Select(x => this.ConvertToContract(x)).ToList(),
4767
ContainerDetailsMap = detectorProcessingResult.ContainersDetailsMap,
4868
DependencyGraphs = dependencyGraphs,
4969
SourceDirectory = settings.SourceDirectory.ToString(),
5070
};
5171
}
5272

73+
/// <summary>
/// Drops every component that comes solely from base image layers. A component is dropped
/// only when it references container layers and each referenced layer, in every container,
/// has <see cref="DockerLayer.IsBaseImage"/> set to true. Components without container
/// references, or with at least one layer that is not a base image layer, are kept.
/// </summary>
/// <param name="components">The detected components to filter.</param>
/// <param name="containerDetailsMap">Container details, including layer information, keyed by container detail ID.</param>
/// <returns>
/// The components that survive filtering. When <paramref name="containerDetailsMap"/> is null
/// or empty, the input list itself is returned unchanged.
/// </returns>
internal List<DetectedComponent> FilterOutBaseImageComponents(
    List<DetectedComponent> components,
    Dictionary<int, ContainerDetails> containerDetailsMap)
{
    if (containerDetailsMap is null || containerDetailsMap.Count == 0)
    {
        return components;
    }

    // containerDetailId -> (layerIndex -> DockerLayer). Containers without layers get no entry.
    var layersByContainer = new Dictionary<int, Dictionary<int, DockerLayer>>();
    foreach (var entry in containerDetailsMap)
    {
        var layers = entry.Value.Layers;
        if (layers == null)
        {
            continue;
        }

        // When several layers share an index, the first one encountered is the one kept.
        var layersByIndex = new Dictionary<int, DockerLayer>();
        foreach (var layer in layers)
        {
            layersByIndex.TryAdd(layer.LayerIndex, layer);
        }

        layersByContainer[entry.Key] = layersByIndex;
    }

    var kept = new List<DetectedComponent>();
    foreach (var candidate in components)
    {
        if (!IsExclusivelyFromBaseImage(candidate, layersByContainer))
        {
            kept.Add(candidate);
            continue;
        }

        this.logger.LogDebug("Filtering out component {ComponentId} because all associated layers are from the base image.", candidate.Component.Id);
    }

    return kept;
}
116+
117+
/// <summary>
/// Decides whether every container layer referenced by <paramref name="component"/> is a
/// base image layer according to <paramref name="layerLookup"/>.
/// </summary>
/// <param name="component">The component whose container layer references are inspected.</param>
/// <param name="layerLookup">Layers indexed by container detail ID and then by layer index.</param>
/// <returns>
/// True only when the component has layer references and all of them resolve to layers with
/// <see cref="DockerLayer.IsBaseImage"/> set. Any missing or unresolvable information yields false.
/// </returns>
private static bool IsExclusivelyFromBaseImage(DetectedComponent component, Dictionary<int, Dictionary<int, DockerLayer>> layerLookup)
{
    // No container layer references means the component did not come from a container scan.
    var layerIdsByContainer = component.ContainerLayerIds;
    if (layerIdsByContainer == null || layerIdsByContainer.Count == 0)
    {
        return false;
    }

    foreach (var (containerId, indices) in layerIdsByContainer)
    {
        // Unknown container details, or no layer indices recorded for them: keep the component.
        if (!layerLookup.TryGetValue(containerId, out var layersByIndex) || indices == null || !indices.Any())
        {
            return false;
        }

        // A single unresolved or non-base-image layer is enough to keep the component.
        if (indices.Any(index => !layersByIndex.TryGetValue(index, out var layer) || !layer.IsBaseImage))
        {
            return false;
        }
    }

    return true;
}
151+
152+
/// <summary>
153+
/// Removes component IDs from dependency graphs that are no longer present in the output components list.
154+
/// </summary>
155+
private static void PruneFilteredComponentsFromGraphs(DependencyGraphCollection graphs, List<DetectedComponent> retainedComponents)
156+
{
157+
if (graphs == null || graphs.Count == 0)
158+
{
159+
return;
160+
}
161+
162+
var retainedIds = new HashSet<string>(retainedComponents.Select(c => c.Component.Id));
163+
164+
foreach (var graphWithMetadata in graphs.Values)
165+
{
166+
var graph = graphWithMetadata.Graph;
167+
168+
// Remove nodes that are no longer in retained components.
169+
var idsToRemove = graph.Keys.Where(id => !retainedIds.Contains(id)).ToList();
170+
foreach (var id in idsToRemove)
171+
{
172+
graph.Remove(id);
173+
}
174+
175+
// Remove references to removed IDs from remaining nodes' dependency sets.
176+
// Normalize empty edge sets to null for consistent leaf-node serialization.
177+
foreach (var nodeId in graph.Keys.ToList())
178+
{
179+
var edges = graph[nodeId];
180+
if (edges != null)
181+
{
182+
edges.RemoveWhere(id => !retainedIds.Contains(id));
183+
if (edges.Count == 0)
184+
{
185+
graph[nodeId] = null;
186+
}
187+
}
188+
}
189+
190+
// Clean up metadata sets.
191+
graphWithMetadata.ExplicitlyReferencedComponentIds?.RemoveWhere(id => !retainedIds.Contains(id));
192+
graphWithMetadata.DevelopmentDependencies?.RemoveWhere(id => !retainedIds.Contains(id));
193+
graphWithMetadata.Dependencies?.RemoveWhere(id => !retainedIds.Contains(id));
194+
}
195+
}
196+
197+
/// <summary>
198+
/// Removes references to filtered-out components from the DependencyRoots and AncestralDependencyRoots
199+
/// of retained components, so that TopLevelReferrers and AncestralReferrers in the output don't
200+
/// reference components that were removed from ComponentsFound.
201+
/// </summary>
202+
private static void PruneFilteredComponentReferrers(List<DetectedComponent> retainedComponents)
203+
{
204+
var retainedIds = new HashSet<string>(retainedComponents.Select(c => c.Component.Id));
205+
206+
foreach (var component in retainedComponents)
207+
{
208+
component.DependencyRoots?.RemoveWhere(root => !retainedIds.Contains(root.Id));
209+
component.AncestralDependencyRoots?.RemoveWhere(root => !retainedIds.Contains(root.Id));
210+
}
211+
}
212+
53213
private static ConcurrentHashSet<string> MergeTargetFrameworks(ConcurrentHashSet<string> left, ConcurrentHashSet<string> right)
54214
{
55215
if (left == null && right == null)

0 commit comments

Comments
 (0)