Skip to content

Commit 39b2cff

Browse files
committed
Use files other than system package manager DBs when possible to determine component to layer map
1 parent e864d21 commit 39b2cff

3 files changed

Lines changed: 320 additions & 7 deletions

File tree

src/Microsoft.ComponentDetection.Detectors/linux/LinuxScanner.cs

Lines changed: 106 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,21 @@ internal class LinuxScanner : ILinuxScanner
3030

3131
private static readonly IList<string> ScopeSquashedParameter = ["--scope", "squashed"];
3232

33+
/// <summary>
34+
/// Well-known package manager database paths whose layer attribution should be ignored
35+
/// when determining which layer a system package belongs to. These files are shared across
36+
/// all packages managed by the same package manager and get updated whenever any package is
37+
/// installed or removed, causing unrelated packages to appear as modified in later layers.
38+
/// </summary>
/// <remarks>
/// Membership checks against this set ignore case, because the set is constructed with
/// <see cref="StringComparer.OrdinalIgnoreCase"/>. The entries are the dpkg status file,
/// the apk installed database, and three RPM database file names under /var/lib/rpm/.
/// </remarks>
39+
private static readonly HashSet<string> PackageManagerDatabasePaths = new(StringComparer.OrdinalIgnoreCase)
40+
{
41+
"/var/lib/dpkg/status",
42+
"/lib/apk/db/installed",
43+
"/var/lib/rpm/Packages",
44+
"/var/lib/rpm/Packages.db",
45+
"/var/lib/rpm/rpmdb.sqlite",
46+
};
47+
3348
private static readonly SemaphoreSlim ContainerSemaphore = new SemaphoreSlim(2);
3449

3550
/// <summary>
@@ -179,11 +194,16 @@ private IEnumerable<LayerMappedLinuxComponents> ProcessSyftOutputWithTelemetry(
179194
}
180195
}
181196

197+
// Build a file path → layerID map from the top-level files listing.
198+
// This allows us to determine layer attribution for files owned by a package
199+
// even when the artifact's locations only reference the package manager database.
200+
var filePathToLayerId = BuildFilePathToLayerMap(syftOutput.Files);
201+
182202
// Create components using only enabled factories
183203
var componentsWithLayers = validArtifacts
184204
.DistinctBy(artifact => (artifact.Name, artifact.Version, artifact.Type))
185205
.Select(artifact =>
186-
this.CreateComponentWithLayers(artifact, syftOutput.Distro, enabledFactories)
206+
this.CreateComponentWithLayers(artifact, syftOutput.Distro, enabledFactories, filePathToLayerId)
187207
)
188208
.Where(result => result.Component != null)
189209
.Select(result => (Component: result.Component!, result.LayerIds))
@@ -388,7 +408,8 @@ private async Task<string> RunSyftCoreAsync(
388408
private (TypedComponent? Component, IEnumerable<string> LayerIds) CreateComponentWithLayers(
389409
ArtifactElement artifact,
390410
Distro distro,
391-
HashSet<IArtifactComponentFactory> enabledFactories
411+
HashSet<IArtifactComponentFactory> enabledFactories,
412+
Dictionary<string, string> filePathToLayerId
392413
)
393414
{
394415
if (!this.artifactTypeToFactoryLookup.TryGetValue(artifact.Type, out var factory))
@@ -408,12 +429,93 @@ HashSet<IArtifactComponentFactory> enabledFactories
408429
return (null, []);
409430
}
410431

411-
var layerIds = artifact.Locations?.Select(location => location.LayerId).Distinct() ?? [];
412-
return (component, layerIds);
432+
// Collect layer IDs from the artifact's locations.
433+
var locationLayerIds = artifact.Locations?
434+
.Select(location => (location.Path, location.LayerId))
435+
.ToList() ?? [];
436+
437+
// Also consult the metadata files property to find additional owned files,
438+
// and look up their layer IDs from the top-level file listing.
439+
if (artifact.Metadata?.Files != null)
440+
{
441+
foreach (var file in artifact.Metadata.Files)
442+
{
443+
var filePath = file.FileFile?.Path;
444+
if (!string.IsNullOrEmpty(filePath) && filePathToLayerId.TryGetValue(filePath, out var layerId))
445+
{
446+
locationLayerIds.Add((filePath, layerId));
447+
}
448+
}
449+
}
450+
451+
// Exclude well-known package manager database paths from layer attribution,
452+
// unless they are the only known locations for this component.
453+
var nonDbLayerIds = locationLayerIds
454+
.Where(loc => !IsPackageManagerDatabasePath(loc.Path))
455+
.Select(loc => loc.LayerId)
456+
.Distinct()
457+
.ToList();
458+
459+
if (nonDbLayerIds.Count > 0)
460+
{
461+
return (component, nonDbLayerIds);
462+
}
463+
464+
// Fall back to database path layer IDs if no other locations are available.
465+
var allLayerIds = locationLayerIds
466+
.Select(loc => loc.LayerId)
467+
.Distinct()
468+
.ToList();
469+
return (component, allLayerIds);
413470
}
414471

415472
/// <summary>
/// Empties the syft run cache. Exists only so that tests can isolate themselves from one another.
/// </summary>
internal static void ResetCache()
{
    SyftRunCache.Clear();
}
476+
477+
/// <summary>
/// Indexes the top-level files listing of the syft output by file path, so that the
/// layer of a file owned by a package can be looked up even when the artifact's own
/// locations only reference the package manager database.
/// </summary>
/// <param name="files">The top-level file entries from the syft output; may be null.</param>
/// <returns>
/// A dictionary from file path to layer ID, keyed with ordinal (case-sensitive) comparison.
/// It is empty when <paramref name="files"/> is null. Entries without a location, path,
/// or layer ID are skipped, and the first entry seen for a given path is the one kept.
/// </returns>
private static Dictionary<string, string> BuildFilePathToLayerMap(FileElement[] files)
{
    var layerByPath = new Dictionary<string, string>(StringComparer.Ordinal);

    foreach (var entry in files ?? [])
    {
        var location = entry.Location;
        if (location == null || string.IsNullOrEmpty(location.Path) || string.IsNullOrEmpty(location.LayerId))
        {
            // Nothing usable to index for this entry.
            continue;
        }

        // TryAdd keeps the first layer recorded for a path and ignores later duplicates.
        layerByPath.TryAdd(location.Path, location.LayerId);
    }

    return layerByPath;
}
501+
502+
/// <summary>
/// Determines whether a file path is a well-known package manager database file
/// that should be excluded from layer attribution.
/// </summary>
/// <param name="path">The file path to classify; may be null or empty.</param>
/// <returns>
/// <c>true</c> when <paramref name="path"/> is listed in <see cref="PackageManagerDatabasePaths"/>
/// or lies under one of the RPM database directories; <c>false</c> otherwise, including
/// for a null or empty path.
/// </returns>
private static bool IsPackageManagerDatabasePath(string path)
{
    if (string.IsNullOrEmpty(path))
    {
        return false;
    }

    if (PackageManagerDatabasePaths.Contains(path))
    {
        return true;
    }

    // The RPM database can consist of multiple files, and its directory differs by
    // distribution: /var/lib/rpm/ is the traditional location, while some distributions
    // keep it under /usr/lib/sysimage/rpm/ or /usr/share/rpm/. Treat any file under these
    // directories as database content so it does not drive layer attribution.
    return path.StartsWith("/var/lib/rpm/", StringComparison.OrdinalIgnoreCase)
        || path.StartsWith("/usr/lib/sysimage/rpm/", StringComparison.OrdinalIgnoreCase)
        || path.StartsWith("/usr/share/rpm/", StringComparison.OrdinalIgnoreCase);
}
419521
}

src/Microsoft.ComponentDetection.Orchestrator/Services/GraphTranslation/DefaultGraphTranslationService.cs

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,19 @@ public ScanResult GenerateScanResultFromProcessingResult(
4444
var componentsToOutput = mergedComponents;
4545
if (settings.FilterBaseImageComponents)
4646
{
47-
componentsToOutput = FilterOutBaseImageComponents(componentsToOutput, detectorProcessingResult.ContainersDetailsMap);
47+
var originalCount = mergedComponents.Count;
48+
componentsToOutput = this.FilterOutBaseImageComponents(componentsToOutput, detectorProcessingResult.ContainersDetailsMap);
49+
var filteredCount = originalCount - componentsToOutput.Count;
50+
51+
if (filteredCount > 0)
52+
{
53+
this.logger.LogInformation("Filtered out {FilteredCount} of {TotalCount} components that originate exclusively from base image layers. {RetainedCount} components remain.", filteredCount, originalCount, componentsToOutput.Count);
54+
}
55+
else
56+
{
57+
this.logger.LogInformation("Base image component filtering is enabled but no components were filtered out ({TotalCount} total).", originalCount);
58+
}
59+
4860
PruneFilteredComponentsFromGraphs(dependencyGraphs, componentsToOutput);
4961
PruneFilteredComponentReferrers(componentsToOutput);
5062
}
@@ -67,7 +79,7 @@ public ScanResult GenerateScanResultFromProcessingResult(
6779
/// <param name="components">The list of detected components to filter.</param>
6880
/// <param name="containerDetailsMap">The map of container details with layer information.</param>
6981
/// <returns>A filtered list of components excluding those exclusively from base image layers.</returns>
70-
internal static List<DetectedComponent> FilterOutBaseImageComponents(
82+
internal List<DetectedComponent> FilterOutBaseImageComponents(
7183
List<DetectedComponent> components,
7284
Dictionary<int, ContainerDetails> containerDetailsMap)
7385
{
@@ -86,7 +98,20 @@ internal static List<DetectedComponent> FilterOutBaseImageComponents(
8698
}
8799
}
88100

89-
return components.Where(component => !IsExclusivelyFromBaseImage(component, layerLookup)).ToList();
101+
var retained = new List<DetectedComponent>();
102+
foreach (var component in components)
103+
{
104+
if (IsExclusivelyFromBaseImage(component, layerLookup))
105+
{
106+
this.logger.LogDebug("Filtering out component {ComponentId} because all associated layers are from the base image.", component.Component.Id);
107+
}
108+
else
109+
{
110+
retained.Add(component);
111+
}
112+
}
113+
114+
return retained;
90115
}
91116

92117
private static bool IsExclusivelyFromBaseImage(DetectedComponent component, Dictionary<int, Dictionary<int, DockerLayer>> layerLookup)

test/Microsoft.ComponentDetection.Detectors.Tests/LinuxScannerTests.cs

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,10 @@ public class LinuxScannerTests
9696
"version":"1.0.0",
9797
"type":"deb",
9898
"locations": [
99+
{
100+
"path": "/usr/bin/test",
101+
"layerID": "sha256:f95fc50d21d981f1efe1f04109c2c3287c271794f5d9e4fdf9888851a174a971"
102+
},
99103
{
100104
"path": "/var/lib/dpkg/status",
101105
"layerID": "sha256:f95fc50d21d981f1efe1f04109c2c3287c271794f5d9e4fdf9888851a174a971"
@@ -1466,4 +1470,186 @@ public async Task TestLinuxScanner_CancelledCaller_DoesNotBlockOnInFlightSyftRun
14661470
var result1 = await task1;
14671471
result1.Should().NotBeEmpty();
14681472
}
1473+
1474+
[TestMethod]
1475+
public void TestLinuxScanner_ProcessSyftOutput_ExcludesPackageManagerDatabasePathsFromLayerAttribution()
1476+
{
1477+
// Simulates a scenario where a package (curl) is installed in layer1,
1478+
// but the dpkg status file is also modified in layer2 by a different package install.
1479+
// curl should only be attributed to layer1.
1480+
var syftOutputJson = """
1481+
{
1482+
"distro": { "id": "ubuntu", "versionID": "22.04" },
1483+
"artifacts": [
1484+
{
1485+
"name": "curl",
1486+
"version": "7.81.0",
1487+
"type": "deb",
1488+
"locations": [
1489+
{
1490+
"path": "/usr/bin/curl",
1491+
"layerID": "sha256:layer1"
1492+
},
1493+
{
1494+
"path": "/var/lib/dpkg/status",
1495+
"layerID": "sha256:layer2"
1496+
}
1497+
]
1498+
}
1499+
],
1500+
"source": {
1501+
"id": "sha256:abc",
1502+
"name": "test-image",
1503+
"type": "image",
1504+
"version": "sha256:abc"
1505+
}
1506+
}
1507+
""";
1508+
var syftOutput = SyftOutput.FromJson(syftOutputJson);
1509+
var containerLayers = new List<DockerLayer>
1510+
{
1511+
new() { DiffId = "sha256:layer1", LayerIndex = 0, IsBaseImage = true },
1512+
new() { DiffId = "sha256:layer2", LayerIndex = 1, IsBaseImage = false },
1513+
};
1514+
var enabledTypes = new HashSet<ComponentType> { ComponentType.Linux };
1515+
1516+
var result = this.linuxScanner.ProcessSyftOutput(syftOutput, containerLayers, enabledTypes).ToList();
1517+
1518+
// curl should only appear in layer1, not layer2
1519+
var layer1Entry = result.FirstOrDefault(r => r.DockerLayer.DiffId == "sha256:layer1");
1520+
var layer2Entry = result.FirstOrDefault(r => r.DockerLayer.DiffId == "sha256:layer2");
1521+
1522+
layer1Entry.Should().NotBeNull();
1523+
layer1Entry.Components.Should().ContainSingle();
1524+
((LinuxComponent)layer1Entry.Components.First()).Name.Should().Be("curl");
1525+
1526+
// layer2 should have no components (or not exist in results)
1527+
layer2Entry?.Components.Should().BeEmpty();
1528+
}
1529+
1530+
[TestMethod]
1531+
public void TestLinuxScanner_ProcessSyftOutput_PackageWithOnlyDatabasePath_FallsBackToDatabaseLayer()
1532+
{
1533+
// If a package only has the database path as its location (no real file paths
1534+
// and no metadata files), the database path layer is used as a fallback.
1535+
var syftOutputJson = """
1536+
{
1537+
"distro": { "id": "alpine", "versionID": "3.18" },
1538+
"artifacts": [
1539+
{
1540+
"name": "musl",
1541+
"version": "1.2.4",
1542+
"type": "apk",
1543+
"locations": [
1544+
{
1545+
"path": "/lib/apk/db/installed",
1546+
"layerID": "sha256:layer1"
1547+
}
1548+
]
1549+
}
1550+
],
1551+
"source": {
1552+
"id": "sha256:abc",
1553+
"name": "test-image",
1554+
"type": "image",
1555+
"version": "sha256:abc"
1556+
}
1557+
}
1558+
""";
1559+
var syftOutput = SyftOutput.FromJson(syftOutputJson);
1560+
var containerLayers = new List<DockerLayer>
1561+
{
1562+
new() { DiffId = "sha256:layer1", LayerIndex = 0, IsBaseImage = true },
1563+
};
1564+
var enabledTypes = new HashSet<ComponentType> { ComponentType.Linux };
1565+
1566+
var result = this.linuxScanner.ProcessSyftOutput(syftOutput, containerLayers, enabledTypes).ToList();
1567+
1568+
// The component should fall back to the database path's layer
1569+
var layer1Entry = result.FirstOrDefault(r => r.DockerLayer.DiffId == "sha256:layer1");
1570+
layer1Entry.Should().NotBeNull();
1571+
layer1Entry.Components.Should().ContainSingle();
1572+
((LinuxComponent)layer1Entry.Components.First()).Name.Should().Be("musl");
1573+
}
1574+
1575+
[TestMethod]
1576+
public void TestLinuxScanner_ProcessSyftOutput_UsesMetadataFilesForLayerAttribution()
1577+
{
1578+
// Simulates a scenario where a package (curl) only has the package DB in its
1579+
// artifact locations, but has owned files in metadata.files. The top-level files[]
1580+
// listing provides the layer mapping for those owned files. The component should
1581+
// be attributed to the layer of its owned files, not the DB layer.
1582+
var syftOutputJson = """
1583+
{
1584+
"distro": { "id": "mariner", "versionID": "3.0" },
1585+
"artifacts": [
1586+
{
1587+
"name": "curl",
1588+
"version": "8.11.1",
1589+
"type": "rpm",
1590+
"locations": [
1591+
{
1592+
"path": "/var/lib/rpm/rpmdb.sqlite",
1593+
"layerID": "sha256:layer2"
1594+
}
1595+
],
1596+
"metadata": {
1597+
"files": [
1598+
{ "path": "/usr/bin/curl" },
1599+
{ "path": "/usr/lib/libcurl.so" }
1600+
]
1601+
}
1602+
}
1603+
],
1604+
"files": [
1605+
{
1606+
"id": "file1",
1607+
"location": {
1608+
"path": "/usr/bin/curl",
1609+
"layerID": "sha256:layer1"
1610+
}
1611+
},
1612+
{
1613+
"id": "file2",
1614+
"location": {
1615+
"path": "/usr/lib/libcurl.so",
1616+
"layerID": "sha256:layer1"
1617+
}
1618+
},
1619+
{
1620+
"id": "file3",
1621+
"location": {
1622+
"path": "/var/lib/rpm/rpmdb.sqlite",
1623+
"layerID": "sha256:layer2"
1624+
}
1625+
}
1626+
],
1627+
"source": {
1628+
"id": "sha256:abc",
1629+
"name": "test-image",
1630+
"type": "image",
1631+
"version": "sha256:abc"
1632+
}
1633+
}
1634+
""";
1635+
var syftOutput = SyftOutput.FromJson(syftOutputJson);
1636+
var containerLayers = new List<DockerLayer>
1637+
{
1638+
new() { DiffId = "sha256:layer1", LayerIndex = 0, IsBaseImage = true },
1639+
new() { DiffId = "sha256:layer2", LayerIndex = 1, IsBaseImage = false },
1640+
};
1641+
var enabledTypes = new HashSet<ComponentType> { ComponentType.Linux };
1642+
1643+
var result = this.linuxScanner.ProcessSyftOutput(syftOutput, containerLayers, enabledTypes).ToList();
1644+
1645+
// curl should be attributed to layer1 (where its real files are), not layer2 (DB layer)
1646+
var layer1Entry = result.FirstOrDefault(r => r.DockerLayer.DiffId == "sha256:layer1");
1647+
var layer2Entry = result.FirstOrDefault(r => r.DockerLayer.DiffId == "sha256:layer2");
1648+
1649+
layer1Entry.Should().NotBeNull();
1650+
layer1Entry.Components.Should().ContainSingle();
1651+
((LinuxComponent)layer1Entry.Components.First()).Name.Should().Be("curl");
1652+
1653+
layer2Entry?.Components.Should().BeEmpty();
1654+
}
14691655
}

0 commit comments

Comments
 (0)