diff --git a/src/Microsoft.ComponentDetection.Common/DockerService.cs b/src/Microsoft.ComponentDetection.Common/DockerService.cs index faf3148b9..20b638bbc 100644 --- a/src/Microsoft.ComponentDetection.Common/DockerService.cs +++ b/src/Microsoft.ComponentDetection.Common/DockerService.cs @@ -2,8 +2,8 @@ namespace Microsoft.ComponentDetection.Common; using System; +using System.Collections.Concurrent; using System.Collections.Generic; -using System.IO; using System.Linq; using System.Text.Json; using System.Threading; @@ -23,6 +23,13 @@ internal class DockerService : IDockerService private const string BaseImageDigestAnnotation = "image.base.digest"; private static readonly DockerClient Client = new DockerClientConfiguration().CreateClient(); + + /// + /// Tracks in-flight image pulls so each image is pulled at most once concurrently. + /// Concurrent callers for the same image await the same task. + /// + private static readonly ConcurrentDictionary> PullCache = new(); + private static int incrementingContainerId; private readonly ILogger logger; @@ -77,11 +84,13 @@ public async Task ImageExistsLocallyAsync(string image, CancellationToken { var imageInspectResponse = await this.InspectImageAndSanitizeVarsAsync(image, cancellationToken); record.ImageInspectResponse = JsonSerializer.Serialize(imageInspectResponse); + this.logger.LogDebug("Image {Image} found locally", image); return true; } catch (Exception e) { record.ExceptionMessage = e.Message; + this.logger.LogDebug("Image {Image} not found locally", image); cancellationToken.ThrowIfCancellationRequested(); return false; } @@ -95,6 +104,48 @@ private async Task InspectImageAndSanitizeVarsAsync(string } public async Task TryPullImageAsync(string image, CancellationToken cancellationToken = default) + { + // Check if already available locally before attempting a pull + if (await this.ImageExistsLocallyAsync(image, cancellationToken)) + { + return true; + } + + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + var existingTask = PullCache.GetOrAdd(image, tcs.Task); + + if (existingTask != tcs.Task) + { + // Another caller is already pulling this image — await their result + this.logger.LogDebug("Image {Image} is already being pulled by another caller, waiting", image); + return await existingTask.WaitAsync(cancellationToken); + } + + // We own this cache entry — perform the actual pull. + try + { + this.logger.LogDebug("Pulling image {Image}...", image); + var result = await this.PullImageCoreAsync(image, cancellationToken); + this.logger.LogDebug("Pull of image {Image} completed (success={Success})", image, result); + tcs.SetResult(result); + return result; + } + catch (Exception ex) + { + this.logger.LogDebug(ex, "Pull of image {Image} failed", image); + tcs.SetException(ex); + throw; + } + finally + { + // Remove the entry once complete. The cache only deduplicates concurrent + // in-flight pulls — subsequent callers will hit ImageExistsLocallyAsync + // for images that were already pulled successfully. + PullCache.TryRemove(image, out _); + } + } + + private async Task PullImageCoreAsync(string image, CancellationToken cancellationToken) { using var record = new DockerServiceTryPullImageTelemetryRecord { @@ -213,7 +264,7 @@ public async Task InspectImageAsync(string image, Cancellation var stream = await AttachContainerAsync(container.ID, cancellationToken); await StartContainerAsync(container.ID, cancellationToken); - this.logger.LogInformation("Container {ContainerId} started for image {Image}, reading output...", container.ID, image); + this.logger.LogInformation("Container {ContainerId} started with image {Image} to execute {Command}, reading output...", container.ID, image, commandJson); // Flush telemetry before the long-running ReadOutput so we get mid-scan // data in App Insights even if the process hangs during the read. @@ -353,7 +404,6 @@ private static async Task CreateContainerAsync( { var binds = new List { - $"{Path.GetTempPath()}:/tmp", "/var/run/docker.sock:/var/run/docker.sock", }; diff --git a/src/Microsoft.ComponentDetection.Detectors/linux/LinuxContainerDetector.cs b/src/Microsoft.ComponentDetection.Detectors/linux/LinuxContainerDetector.cs index 0a5a4009c..019852a31 100644 --- a/src/Microsoft.ComponentDetection.Detectors/linux/LinuxContainerDetector.cs +++ b/src/Microsoft.ComponentDetection.Detectors/linux/LinuxContainerDetector.cs @@ -662,6 +662,24 @@ await this.dockerService.ImageExistsLocallyAsync(refWithDigest, cancellationToke refWithDigest, cancellationToken ); + + if (baseImageDetails == null) + { + record.BaseImageLayerMessage = "Failed to inspect base image after pull"; + this.logger.LogInformation( + "Failed to inspect base image {BaseImage} after pull. Results will not be mapped to base image layers", + refWithDigest + ); + return 0; + } + + this.logger.LogDebug( + "Base image {BaseImage} resolved for {ContainerImage} with {LayerCount} layers", + refWithDigest, + image, + baseImageDetails.Layers.Count() + ); + if (!ValidateBaseImageLayers(scannedImageDetails, baseImageDetails)) { record.BaseImageLayerMessage = diff --git a/src/Microsoft.ComponentDetection.Detectors/linux/LinuxScanner.cs b/src/Microsoft.ComponentDetection.Detectors/linux/LinuxScanner.cs index 0010604d6..00200f3e7 100644 --- a/src/Microsoft.ComponentDetection.Detectors/linux/LinuxScanner.cs +++ b/src/Microsoft.ComponentDetection.Detectors/linux/LinuxScanner.cs @@ -1,6 +1,7 @@ namespace Microsoft.ComponentDetection.Detectors.Linux; using System; +using System.Collections.Concurrent; using System.Collections.Generic; using System.Linq; using System.Text.Json; @@ -31,6 +32,13 @@ internal class LinuxScanner : ILinuxScanner private static readonly SemaphoreSlim ContainerSemaphore = new SemaphoreSlim(2); + /// + /// Caches in-flight syft runs. + /// When multiple detectors scan the same image concurrently, the second + /// caller awaits the already-running task instead of launching a new container. + /// + private static readonly ConcurrentDictionary<(string Source, LinuxScannerScope Scope, string Binds), Task> SyftRunCache = new(); + private static readonly int SemaphoreTimeout = Convert.ToInt32( TimeSpan.FromHours(1).TotalMilliseconds ); @@ -253,6 +261,7 @@ private IEnumerable ProcessSyftOutputWithTelemetry( /// /// Runs the Syft scanner container and returns the stdout output. + /// Results are cached so that callers with identical parameters share a single container run. /// private async Task RunSyftAsync( string syftSource, @@ -261,6 +270,51 @@ private async Task RunSyftAsync( LinuxScannerTelemetryRecord record, LinuxScannerSyftTelemetryRecord syftTelemetryRecord, CancellationToken cancellationToken) + { + var bindsKey = string.Join(";", (additionalBinds ?? []).OrderBy(b => b, StringComparer.Ordinal)); + var cacheKey = (syftSource, scope, bindsKey); + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + var existingTask = SyftRunCache.GetOrAdd(cacheKey, tcs.Task); + + if (existingTask != tcs.Task) + { + // Another caller is already running syft for this image+scope — await their result, + // but allow this caller's cancellation token to abort the wait. + this.logger.LogDebug("Syft run for {SyftSource} (scope={Scope}) is already in-flight, reusing existing result", syftSource, scope); + return await existingTask.WaitAsync(cancellationToken); + } + + // We own this cache entry — run syft and propagate the result. + try + { + var result = await this.RunSyftCoreAsync(syftSource, scope, additionalBinds ?? [], record, syftTelemetryRecord, cancellationToken); + tcs.SetResult(result); + return result; + } + catch (Exception ex) + { + tcs.SetException(ex); + throw; + } + finally + { + // Remove the entry once complete. The cache only deduplicates concurrent + // in-flight calls — keeping completed entries would leak memory for the + // lifetime of the process. + SyftRunCache.TryRemove(cacheKey, out _); + } + } + + /// + /// Executes the Syft scanner container and returns the stdout output. + /// + private async Task RunSyftCoreAsync( + string syftSource, + LinuxScannerScope scope, + IList additionalBinds, + LinuxScannerTelemetryRecord record, + LinuxScannerSyftTelemetryRecord syftTelemetryRecord, + CancellationToken cancellationToken) { var acquired = false; var stdout = string.Empty; @@ -357,4 +411,9 @@ HashSet enabledFactories var layerIds = artifact.Locations?.Select(location => location.LayerId).Distinct() ?? []; return (component, layerIds); } + + /// + /// Clears the syft run cache. Intended for test isolation only. + /// + internal static void ResetCache() => SyftRunCache.Clear(); } diff --git a/test/Microsoft.ComponentDetection.Detectors.Tests/LinuxContainerDetectorTests.cs b/test/Microsoft.ComponentDetection.Detectors.Tests/LinuxContainerDetectorTests.cs index eb6780186..5fdf30a26 100644 --- a/test/Microsoft.ComponentDetection.Detectors.Tests/LinuxContainerDetectorTests.cs +++ b/test/Microsoft.ComponentDetection.Detectors.Tests/LinuxContainerDetectorTests.cs @@ -1495,4 +1495,50 @@ public async Task ExecuteDetectorAsync_ScanThrowsOce_OtherImageStillScanned() // The first image should have produced components componentRecorder.GetDetectedComponents().Should().NotBeEmpty(); } + + [TestMethod] + public async Task TestLinuxContainerDetector_DuplicateImageReferences_ScansOnlyOnceAsync() + { + var componentRecorder = new ComponentRecorder(); + + // Pass the exact same image reference twice. + var scanRequest = new ScanRequest( + new DirectoryInfo(Path.GetTempPath()), + (_, __) => false, + this.mockLogger.Object, + null, + [NodeLatestImage, NodeLatestImage], + componentRecorder + ); + + var linuxContainerDetector = new LinuxContainerDetector( + this.mockSyftLinuxScanner.Object, + this.mockDockerService.Object, + this.mockLinuxContainerDetectorLogger.Object + ); + + var scanResult = await linuxContainerDetector.ExecuteDetectorAsync(scanRequest); + + scanResult.ResultCode.Should().Be(ProcessingResultCode.Success); + scanResult.ContainerDetails.Should().ContainSingle(); + + var detectedComponents = componentRecorder.GetDetectedComponents().ToList(); + detectedComponents.Should().ContainSingle(); + detectedComponents.First().Component.Id.Should().Be(BashPackageId); + + // Both references resolve to the same ImageId via InspectImageAsync, + // so the ConcurrentDictionary in ProcessImagesAsync deduplicates them. + this.mockSyftLinuxScanner.Verify( + scanner => + scanner.ScanLinuxAsync( + It.IsAny(), + It.IsAny>(), + It.IsAny(), + It.IsAny>(), + It.IsAny(), + It.IsAny() + ), + Times.Once + ); + } } diff --git a/test/Microsoft.ComponentDetection.Detectors.Tests/LinuxScannerTests.cs b/test/Microsoft.ComponentDetection.Detectors.Tests/LinuxScannerTests.cs index 5178eec0b..a315c57d2 100644 --- a/test/Microsoft.ComponentDetection.Detectors.Tests/LinuxScannerTests.cs +++ b/test/Microsoft.ComponentDetection.Detectors.Tests/LinuxScannerTests.cs @@ -228,6 +228,9 @@ public class LinuxScannerTests public LinuxScannerTests() { + // Clear the static syft run cache to prevent cross-test interference. + LinuxScanner.ResetCache(); + this.mockDockerService = new Mock(); this.mockDockerService.Setup(service => service.CanPingDockerAsync(It.IsAny()) @@ -1027,4 +1030,440 @@ public void TestLinuxScanner_ProcessSyftOutput_ReturnsComponentsWithoutLayerInfo entry.Components.Select(c => (c as LinuxComponent)!.Name) .Should().Contain("bash").And.Contain("openssl"); } + + [TestMethod] + public async Task TestLinuxScanner_ConcurrentScansSameImage_RunsSyftOnlyOnceAsync() + { + LinuxScanner.ResetCache(); + + // Use a TCS so the mock doesn't complete synchronously — both callers + // must enter GetOrAdd while the task is still in-flight. + var syftTcs = new TaskCompletionSource<(string, string)>(); + + this.mockDockerService.Setup(service => + service.CreateAndRunContainerAsync( + It.IsAny(), + It.IsAny>(), + It.IsAny>(), + It.IsAny() + ) + ) + .Returns(syftTcs.Task); + + var enabledTypes = new HashSet + { + ComponentType.Linux, + ComponentType.Npm, + ComponentType.Pip, + }; + + var layers = new[] + { + new DockerLayer + { + LayerIndex = 0, + DiffId = "sha256:f95fc50d21d981f1efe1f04109c2c3287c271794f5d9e4fdf9888851a174a971", + }, + }; + + var scanner1 = new LinuxScanner( + this.mockDockerService.Object, + this.mockLogger.Object, + this.componentFactories, + this.artifactFilters + ); + var scanner2 = new LinuxScanner( + this.mockDockerService.Object, + this.mockLogger.Object, + this.componentFactories, + this.artifactFilters + ); + + // Both start while the task is still pending — they should share one run. + var task1 = scanner1.ScanLinuxAsync("same_hash", layers, 0, enabledTypes, LinuxScannerScope.AllLayers); + var task2 = scanner2.ScanLinuxAsync("same_hash", layers, 0, enabledTypes, LinuxScannerScope.AllLayers); + + // Complete the single syft run. + syftTcs.SetResult((SyftOutputNoAuthorOrLicense, string.Empty)); + + var results = await Task.WhenAll(task1, task2); + + results[0].Should().NotBeEmpty(); + results[1].Should().NotBeEmpty(); + results[0].First().Components.Should().ContainSingle(); + results[1].First().Components.Should().ContainSingle(); + + this.mockDockerService.Verify( + service => + service.CreateAndRunContainerAsync( + It.IsAny(), + It.IsAny>(), + It.IsAny>(), + It.IsAny() + ), + Times.Once + ); + } + + [TestMethod] + public async Task TestLinuxScanner_ConcurrentScansDifferentImages_RunsSyftForEachAsync() + { + LinuxScanner.ResetCache(); + + this.mockDockerService.Setup(service => + service.CreateAndRunContainerAsync( + It.IsAny(), + It.IsAny>(), + It.IsAny>(), + It.IsAny() + ) + ) + .ReturnsAsync((SyftOutputNoAuthorOrLicense, string.Empty)); + + var enabledTypes = new HashSet + { + ComponentType.Linux, + ComponentType.Npm, + ComponentType.Pip, + }; + + var layers = new[] + { + new DockerLayer + { + LayerIndex = 0, + DiffId = "sha256:f95fc50d21d981f1efe1f04109c2c3287c271794f5d9e4fdf9888851a174a971", + }, + }; + + var scanner1 = new LinuxScanner( + this.mockDockerService.Object, + this.mockLogger.Object, + this.componentFactories, + this.artifactFilters + ); + var scanner2 = new LinuxScanner( + this.mockDockerService.Object, + this.mockLogger.Object, + this.componentFactories, + this.artifactFilters + ); + + var task1 = scanner1.ScanLinuxAsync("image_hash_A", layers, 0, enabledTypes, LinuxScannerScope.AllLayers); + var task2 = scanner2.ScanLinuxAsync("image_hash_B", layers, 0, enabledTypes, LinuxScannerScope.AllLayers); + + var results = await Task.WhenAll(task1, task2); + + results[0].Should().NotBeEmpty(); + results[1].Should().NotBeEmpty(); + + this.mockDockerService.Verify( + service => + service.CreateAndRunContainerAsync( + It.IsAny(), + It.IsAny>(), + It.IsAny>(), + It.IsAny() + ), + Times.Exactly(2) + ); + } + + [TestMethod] + public async Task TestLinuxScanner_ConcurrentScansDifferentScopes_RunsSyftForEachAsync() + { + LinuxScanner.ResetCache(); + + this.mockDockerService.Setup(service => + service.CreateAndRunContainerAsync( + It.IsAny(), + It.IsAny>(), + It.IsAny>(), + It.IsAny() + ) + ) + .ReturnsAsync((SyftOutputNoAuthorOrLicense, string.Empty)); + + var enabledTypes = new HashSet + { + ComponentType.Linux, + ComponentType.Npm, + ComponentType.Pip, + }; + + var layers = new[] + { + new DockerLayer + { + LayerIndex = 0, + DiffId = "sha256:f95fc50d21d981f1efe1f04109c2c3287c271794f5d9e4fdf9888851a174a971", + }, + }; + + var scanner1 = new LinuxScanner( + this.mockDockerService.Object, + this.mockLogger.Object, + this.componentFactories, + this.artifactFilters + ); + var scanner2 = new LinuxScanner( + this.mockDockerService.Object, + this.mockLogger.Object, + this.componentFactories, + this.artifactFilters + ); + + var task1 = scanner1.ScanLinuxAsync("same_hash", layers, 0, enabledTypes, LinuxScannerScope.AllLayers); + var task2 = scanner2.ScanLinuxAsync("same_hash", layers, 0, enabledTypes, LinuxScannerScope.Squashed); + + var results = await Task.WhenAll(task1, task2); + + results[0].Should().NotBeEmpty(); + results[1].Should().NotBeEmpty(); + + this.mockDockerService.Verify( + service => + service.CreateAndRunContainerAsync( + It.IsAny(), + It.IsAny>(), + It.IsAny>(), + It.IsAny() + ), + Times.Exactly(2) + ); + } + + [TestMethod] + public async Task TestLinuxScanner_SyftCacheKey_BindOrderDoesNotMatterAsync() + { + LinuxScanner.ResetCache(); + + const string syftOutputWithSource = """ + { + "distro": { "id": "test", "versionID": "1.0" }, + "artifacts": [], + "source": { + "id": "sha256:abc", + "name": "/img", + "type": "image", + "version": "sha256:abc", + "metadata": { + "userInput": "/img", + "imageID": "sha256:img", + "layers": [], + "labels": {} + } + } + } + """; + + // Use a TCS so the mock doesn't complete synchronously — both callers + // must enter GetOrAdd while the task is still in-flight. + var syftTcs = new TaskCompletionSource<(string, string)>(); + + this.mockDockerService.Setup(service => + service.CreateAndRunContainerAsync( + It.IsAny(), + It.IsAny>(), + It.IsAny>(), + It.IsAny() + ) + ) + .Returns(syftTcs.Task); + + var scanner1 = new LinuxScanner( + this.mockDockerService.Object, + this.mockLogger.Object, + this.componentFactories, + this.artifactFilters + ); + var scanner2 = new LinuxScanner( + this.mockDockerService.Object, + this.mockLogger.Object, + this.componentFactories, + this.artifactFilters + ); + + // Both calls start concurrently with binds in different order. + var task1 = scanner1.GetSyftOutputAsync( + "oci-dir:/img", + ["/host/a:/container/a:ro", "/host/b:/container/b:ro"], + LinuxScannerScope.AllLayers + ); + var task2 = scanner2.GetSyftOutputAsync( + "oci-dir:/img", + ["/host/b:/container/b:ro", "/host/a:/container/a:ro"], + LinuxScannerScope.AllLayers + ); + + // Complete the single syft run. + syftTcs.SetResult((syftOutputWithSource, string.Empty)); + + var results = await Task.WhenAll(task1, task2); + + results[0].Should().NotBeNull(); + results[1].Should().NotBeNull(); + + // Bind order shouldn't matter — both should share a single container run. + this.mockDockerService.Verify( + service => + service.CreateAndRunContainerAsync( + It.IsAny(), + It.IsAny>(), + It.IsAny>(), + It.IsAny() + ), + Times.Once + ); + } + + [TestMethod] + public async Task TestLinuxScanner_FailedSyftRun_RemovesCacheEntry_AllowsRetryAsync() + { + LinuxScanner.ResetCache(); + + var callCount = 0; + this.mockDockerService.Setup(service => + service.CreateAndRunContainerAsync( + It.IsAny(), + It.IsAny>(), + It.IsAny>(), + It.IsAny() + ) + ) + .ReturnsAsync(() => + { + var current = Interlocked.Increment(ref callCount); + if (current == 1) + { + throw new InvalidOperationException("Simulated Docker failure"); + } + + return (SyftOutputNoAuthorOrLicense, string.Empty); + }); + + var enabledTypes = new HashSet + { + ComponentType.Linux, + ComponentType.Npm, + ComponentType.Pip, + }; + + var layers = new[] + { + new DockerLayer + { + LayerIndex = 0, + DiffId = "sha256:f95fc50d21d981f1efe1f04109c2c3287c271794f5d9e4fdf9888851a174a971", + }, + }; + + // First call should fail. + Func firstCall = async () => + await this.linuxScanner.ScanLinuxAsync( + "retry_hash", + layers, + 0, + enabledTypes, + LinuxScannerScope.AllLayers + ); + + await firstCall.Should().ThrowAsync(); + + // Second call should succeed because the failed cache entry was removed. + var result = await this.linuxScanner.ScanLinuxAsync( + "retry_hash", + layers, + 0, + enabledTypes, + LinuxScannerScope.AllLayers + ); + + result.Should().NotBeEmpty(); + result.First().Components.Should().ContainSingle(); + + this.mockDockerService.Verify( + service => + service.CreateAndRunContainerAsync( + It.IsAny(), + It.IsAny>(), + It.IsAny>(), + It.IsAny() + ), + Times.Exactly(2) + ); + } + + [TestMethod] + public async Task TestLinuxScanner_CancelledCaller_DoesNotBlockOnInFlightSyftRunAsync() + { + LinuxScanner.ResetCache(); + + // Use a TCS to control when the syft container "completes", + // so the first caller's run stays in-flight while we cancel the second. + var syftCompletionSource = new TaskCompletionSource<(string, string)>(); + + this.mockDockerService.Setup(service => + service.CreateAndRunContainerAsync( + It.IsAny(), + It.IsAny>(), + It.IsAny>(), + It.IsAny() + ) + ) + .Returns(syftCompletionSource.Task); + + var enabledTypes = new HashSet { ComponentType.Linux }; + + var layers = new[] + { + new DockerLayer + { + LayerIndex = 0, + DiffId = "sha256:f95fc50d21d981f1efe1f04109c2c3287c271794f5d9e4fdf9888851a174a971", + }, + }; + + var scanner1 = new LinuxScanner( + this.mockDockerService.Object, + this.mockLogger.Object, + this.componentFactories, + this.artifactFilters + ); + var scanner2 = new LinuxScanner( + this.mockDockerService.Object, + this.mockLogger.Object, + this.componentFactories, + this.artifactFilters + ); + + // First caller starts the syft run (it will block on syftCompletionSource). + var task1 = scanner1.ScanLinuxAsync("cancel_hash", layers, 0, enabledTypes, LinuxScannerScope.AllLayers); + + // Second caller with a cancellable token joins the same in-flight run. + using var cts = new CancellationTokenSource(); + var task2 = scanner2.ScanLinuxAsync("cancel_hash", layers, 0, enabledTypes, LinuxScannerScope.AllLayers, cts.Token); + + // Cancel the second caller while the first is still running. + await cts.CancelAsync(); + + // The second caller should throw OperationCanceledException promptly. + try + { + await task2; + Assert.Fail("Expected OperationCanceledException was not thrown"); + } + catch (OperationCanceledException) + { + // Expected — the second caller was cancelled while waiting for the in-flight run. + } + + // The first caller should still be running (not cancelled). + task1.IsCompleted.Should().BeFalse(); + + // Now let the first caller complete normally. + syftCompletionSource.SetResult((SyftOutputNoAuthorOrLicense, string.Empty)); + var result1 = await task1; + result1.Should().NotBeEmpty(); + } }