From b10696757c75f2c826e817d1a941759726f86d02 Mon Sep 17 00:00:00 2001 From: Adam Shaver Date: Mon, 2 Feb 2026 17:07:59 +0000 Subject: [PATCH 01/19] Implement docker device request in CLI Now one can pass in the cro runtime and GPU like --cro-device-request '{"Count":-1, \ "Capabilities":[["gpu"]]}' \ --cro-runtime nvidia \ ... --- pkg/app/master/command/build/cli.go | 1 + pkg/app/master/command/cliflags.go | 7 + pkg/app/master/command/clifvgetter.go | 1 + pkg/app/master/command/clifvgetter_test.go | 139 +++++++++++ pkg/app/master/config/config.go | 7 +- pkg/app/master/config/config_test.go | 82 +++++++ .../container/container_inspector.go | 11 + .../container/device_request_test.go | 229 ++++++++++++++++++ 8 files changed, 474 insertions(+), 3 deletions(-) create mode 100644 pkg/app/master/command/clifvgetter_test.go create mode 100644 pkg/app/master/config/config_test.go create mode 100644 pkg/app/master/inspectors/container/device_request_test.go diff --git a/pkg/app/master/command/build/cli.go b/pkg/app/master/command/build/cli.go index 88c79bcd3..efe79ff28 100644 --- a/pkg/app/master/command/build/cli.go +++ b/pkg/app/master/command/build/cli.go @@ -168,6 +168,7 @@ var BuildFlags = (append([]cli.Flag{ //Sensor flags: command.Cflag(command.FlagSensorIPCEndpoint), command.Cflag(command.FlagSensorIPCMode), + command.Cflag(command.FlagCRODeviceRequest), }, command.HTTPProbeFlags()...)) var CLI = &cli.Command{ diff --git a/pkg/app/master/command/cliflags.go b/pkg/app/master/command/cliflags.go index 842d908d8..7c425c386 100644 --- a/pkg/app/master/command/cliflags.go +++ b/pkg/app/master/command/cliflags.go @@ -177,6 +177,7 @@ const ( FlagCROHostConfigFile = "cro-host-config-file" FlagCROSysctl = "cro-sysctl" FlagCROShmSize = "cro-shm-size" + FlagCRODeviceRequest = "cro-device-request" //Original Container Runtime Options (without cro- prefix) FlagUser = "user" @@ -296,6 +297,7 @@ const ( FlagCROHostConfigFileUsage = "Base Docker host configuration file (JSON format) to use when running the container" FlagCROSysctlUsage = "Set namespaced kernel parameters in the created container" FlagCROShmSizeUsage = "Shared memory size for /dev/shm in the created container" + FlagCRODeviceRequestUsage = "JSON string specifying device request configuration for the container" FlagUserUsage = "Override USER analyzing image at runtime" FlagEntrypointUsage = "Override ENTRYPOINT analyzing image at runtime. To persist ENTRYPOINT changes in the output image, pass the --image-overrides=entrypoint or --image-overrides=all flag as well." @@ -918,6 +920,11 @@ var CommonFlags = map[string]cli.Flag{ Usage: FlagCROShmSizeUsage, EnvVars: []string{"DSLIM_CRO_SHM_SIZE"}, }, + FlagCRODeviceRequest: &cli.StringFlag{ + Name: FlagCRODeviceRequest, + Usage: FlagCRODeviceRequestUsage, + EnvVars: []string{"DSLIM_CRO_DEVICE_REQUEST"}, + }, FlagUser: &cli.StringFlag{ Name: FlagUser, Value: "", diff --git a/pkg/app/master/command/clifvgetter.go b/pkg/app/master/command/clifvgetter.go index 1aa30484c..31135148f 100644 --- a/pkg/app/master/command/clifvgetter.go +++ b/pkg/app/master/command/clifvgetter.go @@ -51,6 +51,7 @@ func GetContainerRunOptions(ctx *cli.Context) (*config.ContainerRunOptions, erro } cro.ShmSize = ctx.Int64(FlagCROShmSize) + cro.DeviceRequest = ctx.String(FlagCRODeviceRequest) return &cro, nil } diff --git a/pkg/app/master/command/clifvgetter_test.go b/pkg/app/master/command/clifvgetter_test.go new file mode 100644 index 000000000..e4fb6530d --- /dev/null +++ b/pkg/app/master/command/clifvgetter_test.go @@ -0,0 +1,139 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package command + +import ( + "flag" + "testing" + + "github.com/urfave/cli/v2" +) + +func TestGetContainerRunOptionsDeviceRequest(t *testing.T) { + tt := []struct { + deviceRequestFlag string + expectedDeviceRequest string + }{ + { + deviceRequestFlag: "", + expectedDeviceRequest: "", + }, + { + deviceRequestFlag: `{"Driver":"nvidia","Count":-1,"Capabilities":[["gpu"]]}`, + expectedDeviceRequest: `{"Driver":"nvidia","Count":-1,"Capabilities":[["gpu"]]}`, + }, + { + deviceRequestFlag: `{"Driver":"nvidia","DeviceIDs":["0","1"],"Capabilities":[["gpu"]]}`, + expectedDeviceRequest: `{"Driver":"nvidia","DeviceIDs":["0","1"],"Capabilities":[["gpu"]]}`, + }, + { + deviceRequestFlag: `{"Driver":"nvidia","Count":2,"DeviceIDs":["GPU-123"],"Capabilities":[["gpu","compute"]],"Options":{"visible":"true"}}`, + expectedDeviceRequest: `{"Driver":"nvidia","Count":2,"DeviceIDs":["GPU-123"],"Capabilities":[["gpu","compute"]],"Options":{"visible":"true"}}`, + }, + } + + for _, test := range tt { + flagSet := flag.NewFlagSet("test", flag.ContinueOnError) + flagSet.String(FlagCRODeviceRequest, test.deviceRequestFlag, "") + flagSet.String(FlagCRORuntime, "", "") + flagSet.String(FlagCROHostConfigFile, "", "") + flagSet.Int64(FlagCROShmSize, 0, "") + + app := &cli.App{} + ctx := cli.NewContext(app, flagSet, nil) + + cro, err := GetContainerRunOptions(ctx) + if err != nil { + t.Fatalf("GetContainerRunOptions returned error: %v", err) + } + + if cro.DeviceRequest != test.expectedDeviceRequest { + t.Errorf("DeviceRequest = %q, want %q", cro.DeviceRequest, test.expectedDeviceRequest) + } + } +} + +func TestGetContainerRunOptionsAllFields(t *testing.T) { + flagSet := flag.NewFlagSet("test", flag.ContinueOnError) + flagSet.String(FlagCRODeviceRequest, `{"Driver":"nvidia","Count":-1,"Capabilities":[["gpu"]]}`, "") + flagSet.String(FlagCRORuntime, "nvidia", "") + flagSet.String(FlagCROHostConfigFile, "", "") + flagSet.Int64(FlagCROShmSize, 67108864, "") + + app := &cli.App{} + ctx := cli.NewContext(app, flagSet, nil) + + cro, err := GetContainerRunOptions(ctx) + if err != nil { + t.Fatalf("GetContainerRunOptions returned error: %v", err) + } + + if cro.Runtime != "nvidia" { + t.Errorf("Runtime = %q, want 'nvidia'", cro.Runtime) + } + + if cro.ShmSize != 67108864 { + t.Errorf("ShmSize = %d, want 67108864", cro.ShmSize) + } + + if cro.DeviceRequest != `{"Driver":"nvidia","Count":-1,"Capabilities":[["gpu"]]}` { + t.Errorf("DeviceRequest = %q, want JSON string", cro.DeviceRequest) + } +} + +func TestFlagCRODeviceRequestDefinition(t *testing.T) { + flagDef, exists := CommonFlags[FlagCRODeviceRequest] + if !exists { + t.Fatal("FlagCRODeviceRequest not found in CommonFlags") + } + + stringFlag, ok := flagDef.(*cli.StringFlag) + if !ok { + t.Fatal("FlagCRODeviceRequest is not a StringFlag") + } + + if stringFlag.Name != FlagCRODeviceRequest { + t.Errorf("Flag name = %q, want %q", stringFlag.Name, FlagCRODeviceRequest) + } + + if stringFlag.Usage != FlagCRODeviceRequestUsage { + t.Errorf("Flag usage = %q, want %q", stringFlag.Usage, FlagCRODeviceRequestUsage) + } + + expectedEnvVar := "DSLIM_CRO_DEVICE_REQUEST" + hasEnvVar := false + for _, env := range stringFlag.EnvVars { + if env == expectedEnvVar { + hasEnvVar = true + break + } + } + if !hasEnvVar { + t.Errorf("Flag missing expected EnvVar %q, has %v", expectedEnvVar, stringFlag.EnvVars) + } +} + +func TestFlagCRODeviceRequestConstants(t *testing.T) { + if FlagCRODeviceRequest != "cro-device-request" { + t.Errorf("FlagCRODeviceRequest = %q, want 'cro-device-request'", FlagCRODeviceRequest) + } + + expectedUsage := "JSON string specifying device request configuration for the container" + if FlagCRODeviceRequestUsage != expectedUsage { + t.Errorf("FlagCRODeviceRequestUsage = %q, want %q", FlagCRODeviceRequestUsage, expectedUsage) + } +} diff --git a/pkg/app/master/config/config.go b/pkg/app/master/config/config.go index 1d9eb6aa1..b92fd394c 100644 --- a/pkg/app/master/config/config.go +++ b/pkg/app/master/config/config.go @@ -130,9 +130,10 @@ type ContainerRunOptions struct { //Explicit overrides for the base and host config fields //Host config field override are applied //on top of the fields in the HostConfig struct if it's provided (volume mounts are merged though) - Runtime string - SysctlParams map[string]string - ShmSize int64 + Runtime string + SysctlParams map[string]string + ShmSize int64 + DeviceRequest string } // VolumeMount provides the volume mount configuration information diff --git a/pkg/app/master/config/config_test.go b/pkg/app/master/config/config_test.go new file mode 100644 index 000000000..73ca99f55 --- /dev/null +++ b/pkg/app/master/config/config_test.go @@ -0,0 +1,82 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package config + +import ( + "testing" +) + +func TestContainerRunOptionsDeviceRequest(t *testing.T) { + tt := []struct { + deviceRequest string + expectEmpty bool + }{ + { + deviceRequest: "", + expectEmpty: true, + }, + { + deviceRequest: `{"Driver":"nvidia","Count":-1,"Capabilities":[["gpu"]]}`, + expectEmpty: false, + }, + { + deviceRequest: `{"Driver":"nvidia","DeviceIDs":["0","1"],"Capabilities":[["gpu"]]}`, + expectEmpty: false, + }, + } + + for _, test := range tt { + cro := ContainerRunOptions{ + DeviceRequest: test.deviceRequest, + } + + isEmpty := cro.DeviceRequest == "" + if isEmpty != test.expectEmpty { + t.Errorf("DeviceRequest isEmpty = %v for %q, want %v", isEmpty, test.deviceRequest, test.expectEmpty) + } + + if !test.expectEmpty && cro.DeviceRequest != test.deviceRequest { + t.Errorf("DeviceRequest = %q, want %q", cro.DeviceRequest, test.deviceRequest) + } + } +} + +func TestContainerRunOptionsAllFields(t *testing.T) { + // Test that ContainerRunOptions can be created with all fields including DeviceRequest + cro := ContainerRunOptions{ + Runtime: "nvidia", + SysctlParams: map[string]string{"net.core.somaxconn": "1024"}, + ShmSize: 67108864, // 64MB + DeviceRequest: `{"Driver":"nvidia","Count":-1,"Capabilities":[["gpu"]]}`, + } + + if cro.Runtime != "nvidia" { + t.Errorf("Runtime = %q, want 'nvidia'", cro.Runtime) + } + + if cro.SysctlParams["net.core.somaxconn"] != "1024" { + t.Errorf("SysctlParams[net.core.somaxconn] = %q, want '1024'", cro.SysctlParams["net.core.somaxconn"]) + } + + if cro.ShmSize != 67108864 { + t.Errorf("ShmSize = %d, want 67108864", cro.ShmSize) + } + + if cro.DeviceRequest == "" { + t.Error("DeviceRequest should not be empty") + } +} diff --git a/pkg/app/master/inspectors/container/container_inspector.go b/pkg/app/master/inspectors/container/container_inspector.go index ed97dbb78..43868e408 100644 --- a/pkg/app/master/inspectors/container/container_inspector.go +++ b/pkg/app/master/inspectors/container/container_inspector.go @@ -3,6 +3,7 @@ package container import ( "bufio" "bytes" + "encoding/json" "errors" "fmt" "os" @@ -578,6 +579,16 @@ func (i *Inspector) RunContainer() error { } if i.crOpts != nil { + if i.crOpts.DeviceRequest != "" { + var deviceRequest dockerapi.DeviceRequest + if err := json.Unmarshal([]byte(i.crOpts.DeviceRequest), &deviceRequest); err != nil { + logger.WithError(err).Error("failed to parse device request JSON") + } else { + containerOptions.HostConfig.DeviceRequests = []dockerapi.DeviceRequest{deviceRequest} + logger.Debugf("using device request => %#v", deviceRequest) + } + } + if i.crOpts.Runtime != "" { containerOptions.HostConfig.Runtime = i.crOpts.Runtime logger.Debugf("using custom runtime => %s", containerOptions.HostConfig.Runtime) diff --git a/pkg/app/master/inspectors/container/device_request_test.go b/pkg/app/master/inspectors/container/device_request_test.go new file mode 100644 index 000000000..f6df45ad3 --- /dev/null +++ b/pkg/app/master/inspectors/container/device_request_test.go @@ -0,0 +1,229 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package container + +import ( + "encoding/json" + "reflect" + "testing" + + dockerapi "github.com/fsouza/go-dockerclient" +) + +func TestParseDeviceRequestJSON(t *testing.T) { + tt := []struct { + input string + expected dockerapi.DeviceRequest + expectError bool + }{ + { + input: `{"Driver":"nvidia","Count":-1,"Capabilities":[["gpu"]]}`, + expected: dockerapi.DeviceRequest{ + Driver: "nvidia", + Count: -1, + Capabilities: [][]string{{"gpu"}}, + }, + expectError: false, + }, + { + input: `{"Driver":"nvidia","DeviceIDs":["0","1"],"Capabilities":[["gpu"]]}`, + expected: dockerapi.DeviceRequest{ + Driver: "nvidia", + DeviceIDs: []string{"0", "1"}, + Capabilities: [][]string{{"gpu"}}, + }, + expectError: false, + }, + { + input: `{"Driver":"nvidia","Count":2,"DeviceIDs":["GPU-123"],"Capabilities":[["gpu","compute"]],"Options":{"visible":"true"}}`, + expected: dockerapi.DeviceRequest{ + Driver: "nvidia", + Count: 2, + DeviceIDs: []string{"GPU-123"}, + Capabilities: [][]string{{"gpu", "compute"}}, + Options: map[string]string{"visible": "true"}, + }, + expectError: false, + }, + { + input: `{"Count":-1,"Capabilities":[["gpu"],["nvidia","compute"]]}`, + expected: dockerapi.DeviceRequest{ + Count: -1, + Capabilities: [][]string{{"gpu"}, {"nvidia", "compute"}}, + }, + expectError: false, + }, + { + input: `{}`, + expected: dockerapi.DeviceRequest{ + Driver: "", + Count: 0, + DeviceIDs: nil, + Capabilities: nil, + Options: nil, + }, + expectError: false, + }, + { + input: `{"Driver":"nvidia"`, + expectError: true, + }, + { + input: `["gpu"]`, + expectError: true, + }, + { + input: `{Driver:nvidia}`, + expectError: true, + }, + { + input: `{"Count":"all"}`, + expectError: true, + }, + } + + for _, test := range tt { + var deviceRequest dockerapi.DeviceRequest + err := json.Unmarshal([]byte(test.input), &deviceRequest) + + if test.expectError { + if err == nil { + t.Errorf("expected error for input %q, but got none", test.input) + } + continue + } + + if err != nil { + t.Errorf("unexpected error for input %q: %v", test.input, err) + continue + } + + if !reflect.DeepEqual(deviceRequest, test.expected) { + t.Errorf("parsed device request mismatch for %q:\n got: %+v\n expected: %+v", + test.input, deviceRequest, test.expected) + } + } +} + +func TestDeviceRequestToHostConfig(t *testing.T) { + tt := []struct { + deviceRequestJSON string + expectDeviceRequests int + expectError bool + }{ + { + deviceRequestJSON: `{"Driver":"nvidia","Count":-1,"Capabilities":[["gpu"]]}`, + expectDeviceRequests: 1, + expectError: false, + }, + { + deviceRequestJSON: "", + expectDeviceRequests: 0, + expectError: false, + }, + { + deviceRequestJSON: `{invalid}`, + expectDeviceRequests: 0, + expectError: true, + }, + } + + for _, test := range tt { + hostConfig := &dockerapi.HostConfig{} + + if test.deviceRequestJSON != "" { + var deviceRequest dockerapi.DeviceRequest + err := json.Unmarshal([]byte(test.deviceRequestJSON), &deviceRequest) + + if test.expectError { + if err == nil { + t.Errorf("expected error for input %q, but got none", test.deviceRequestJSON) + } + if len(hostConfig.DeviceRequests) != 0 { + t.Errorf("expected no device requests on error, got %d", len(hostConfig.DeviceRequests)) + } + continue + } + + if err != nil { + t.Errorf("unexpected error for input %q: %v", test.deviceRequestJSON, err) + continue + } + + hostConfig.DeviceRequests = []dockerapi.DeviceRequest{deviceRequest} + } + + if len(hostConfig.DeviceRequests) != test.expectDeviceRequests { + t.Errorf("expected %d device requests for %q, got %d", + test.expectDeviceRequests, test.deviceRequestJSON, len(hostConfig.DeviceRequests)) + } + } +} + +func TestDeviceRequestFieldValidation(t *testing.T) { + // Test NVIDIA all GPUs request + input := `{"Driver":"nvidia","Count":-1,"Capabilities":[["gpu"]]}` + var dr dockerapi.DeviceRequest + if err := json.Unmarshal([]byte(input), &dr); err != nil { + t.Fatalf("failed to parse device request: %v", err) + } + if dr.Driver != "nvidia" { + t.Errorf("expected Driver 'nvidia', got %q", dr.Driver) + } + if dr.Count != -1 { + t.Errorf("expected Count -1 (all GPUs), got %d", dr.Count) + } + if len(dr.Capabilities) != 1 || len(dr.Capabilities[0]) != 1 || dr.Capabilities[0][0] != "gpu" { + t.Errorf("expected Capabilities [[gpu]], got %v", dr.Capabilities) + } + + // Test NVIDIA specific GPU by ID + input = `{"Driver":"nvidia","DeviceIDs":["0"],"Capabilities":[["gpu"]]}` + dr = dockerapi.DeviceRequest{} + if err := json.Unmarshal([]byte(input), &dr); err != nil { + t.Fatalf("failed to parse device request: %v", err) + } + if len(dr.DeviceIDs) != 1 || dr.DeviceIDs[0] != "0" { + t.Errorf("expected DeviceIDs [0], got %v", dr.DeviceIDs) + } + if dr.Count != 0 { + t.Errorf("expected Count 0 when DeviceIDs specified, got %d", dr.Count) + } + + // Test NVIDIA multiple GPUs by UUID + input = `{"Driver":"nvidia","DeviceIDs":["GPU-abc123","GPU-def456"],"Capabilities":[["gpu","compute"]]}` + dr = dockerapi.DeviceRequest{} + if err := json.Unmarshal([]byte(input), &dr); err != nil { + t.Fatalf("failed to parse device request: %v", err) + } + if len(dr.DeviceIDs) != 2 { + t.Errorf("expected 2 DeviceIDs, got %d", len(dr.DeviceIDs)) + } + if len(dr.Capabilities) != 1 || len(dr.Capabilities[0]) != 2 { + t.Errorf("expected Capabilities [[gpu compute]], got %v", dr.Capabilities) + } + + // Test count of specific number of GPUs + input = `{"Count":2,"Capabilities":[["gpu"]]}` + dr = dockerapi.DeviceRequest{} + if err := json.Unmarshal([]byte(input), &dr); err != nil { + t.Fatalf("failed to parse device request: %v", err) + } + if dr.Count != 2 { + t.Errorf("expected Count 2, got %d", dr.Count) + } +} From 9aca3e93417a5f9b01ee8413a2840e75d773d226 Mon Sep 17 00:00:00 2001 From: Adam Shaver Date: Mon, 2 Feb 2026 17:42:23 +0000 Subject: [PATCH 02/19] Fix issues with duplicate symlink processing Correct issues with duplicate paths from symlink processing, when they both point to the same inode. This fixes a behavior where slimtookit would randomly generate o-byte files for the actual file referenced by the symlink. --- pkg/app/sensor/artifact/artifact.go | 137 ++++++++++++++- pkg/app/sensor/artifact/dedup_test.go | 243 ++++++++++++++++++++++++++ pkg/app/sensor/monitor/composite.go | 6 + pkg/monitor/ptrace/ptrace.go | 30 +++- pkg/monitor/ptrace/ptrace_test.go | 117 +++++++++++++ pkg/util/fsutil/fsutil.go | 36 ++-- pkg/util/fsutil/fsutil_test.go | 209 ++++++++++++++++++++++ 7 files changed, 754 insertions(+), 24 deletions(-) create mode 100644 pkg/app/sensor/artifact/dedup_test.go create mode 100644 pkg/monitor/ptrace/ptrace_test.go diff --git a/pkg/app/sensor/artifact/artifact.go b/pkg/app/sensor/artifact/artifact.go index 72553aeab..2e2dddc3b 100644 --- a/pkg/app/sensor/artifact/artifact.go +++ b/pkg/app/sensor/artifact/artifact.go @@ -940,6 +940,100 @@ func (p *store) prepareArtifacts() { } p.resolveLinks() + p.deduplicateFileMap() +} + +// deduplicateFileMap removes duplicate file paths that point to the same inode. +// This fixes an issue where files accessed through multiple symlinked paths +// (e.g., /usr/local/cuda-12.9/lib/file.so and /usr/local/cuda/lib64/file.so) +// would be copied multiple times, with later copies potentially overwriting +// with 0-byte content. +func (p *store) deduplicateFileMap() { + log.Debugf("deduplicateFileMap - starting inode-based deduplication, fileMap has %d entries", len(p.fileMap)) + + // Build inode -> paths map for regular files only + inodeMap := make(map[uint64][]string) + + for fpath := range p.fileMap { + info, err := os.Lstat(fpath) + if err != nil { + log.Warnf("deduplicateFileMap - error getting file info for %s: %v", fpath, err) + continue + } + + // Only process regular files (not symlinks, directories, etc.) + if !info.Mode().IsRegular() { + log.Debugf("deduplicateFileMap - skipping non-regular file %s (mode: %v)", fpath, info.Mode()) + continue + } + + // Get the inode from the underlying syscall.Stat_t + if sys, ok := info.Sys().(*syscall.Stat_t); ok { + inode := sys.Ino + inodeMap[inode] = append(inodeMap[inode], fpath) + } + } + + // For each inode with multiple paths, keep only the canonical path + duplicatesRemoved := 0 + inodeCount := 0 + for inode, paths := range inodeMap { + if len(paths) <= 1 { + continue + } + + inodeCount++ + log.Debugf("deduplicateFileMap - found %d paths for inode %d: %v", len(paths), inode, paths) + + // Sort paths to get deterministic behavior + // CRITICAL: Prefer paths that DON'T go through symlinked directories + // /usr/local/cuda/ is a symlink, so paths through it may return 0 bytes + // Prefer /usr/local/cuda-12.9/ paths which are the real paths + sort.Slice(paths, func(i, j int) bool { + pi, pj := paths[i], paths[j] + + // First priority: Prefer paths that don't go through /usr/local/cuda/ symlink + // These paths go through symlinked directories and often return 0 bytes + isThroughSymlink := func(p string) bool { + return strings.HasPrefix(p, "/usr/local/cuda/") && !strings.HasPrefix(p, "/usr/local/cuda-") + } + symI := isThroughSymlink(pi) + symJ := isThroughSymlink(pj) + if symI != symJ { + return !symI // Prefer path NOT through symlink + } + + // Second priority: Check if either path has flags (indicating it was actually accessed) + propsI := p.fileMap[pi] + propsJ := p.fileMap[pj] + + hasI := propsI != nil && len(propsI.Flags) > 0 + hasJ := propsJ != nil && len(propsJ.Flags) > 0 + + // Prefer path with flags + if hasI && !hasJ { + return true + } + if !hasI && hasJ { + return false + } + + // Third priority: Prefer longer paths (more canonical - /usr/local/cuda-12.9/targets/... is longer) + return len(pi) > len(pj) + }) + + // Keep the first (most canonical) path, remove the rest + canonicalPath := paths[0] + for _, dupPath := range paths[1:] { + log.Debugf("deduplicateFileMap - removing duplicate: %s (keeping %s)", dupPath, canonicalPath) + delete(p.fileMap, dupPath) + duplicatesRemoved++ + } + } + + if duplicatesRemoved > 0 { + log.Debugf("deduplicateFileMap - removed %d duplicate paths, fileMap now has %d entries", duplicatesRemoved, len(p.fileMap)) + } } func (p *store) resolveLinks() { @@ -2174,6 +2268,23 @@ copyFiles: continue } + // FIX: Skip files accessed through symlinked directories like /usr/local/cuda/ + // The Docker overlay filesystem can return 0 bytes for files accessed through symlinks + // Files should be copied from canonical paths (e.g., /usr/local/cuda-12.9/) which work correctly + if strings.HasPrefix(srcFileName, "/usr/local/cuda/") && !strings.HasPrefix(srcFileName, "/usr/local/cuda-") { + // Resolve symlinks to get the canonical path + evalPath, evalErr := filepath.EvalSymlinks(srcFileName) + if evalErr == nil && evalPath != srcFileName { + if _, hasCanonical := p.fileMap[evalPath]; hasCanonical { + log.Debugf("saveArtifacts - skipping symlinked path %s (canonical path %s exists)", srcFileName, evalPath) + continue + } + // Use the resolved path instead + log.Debugf("saveArtifacts - using resolved path %s instead of %s", evalPath, srcFileName) + srcFileName = evalPath + } + } + filePath := fmt.Sprintf("%s/files%s", p.storeLocation, srcFileName) logger.Debug("saving file data => ", filePath) @@ -2200,7 +2311,7 @@ copyFiles: logger.Debugf("[%s,%s] - appMetadataFileUpdater => not updated / err = %v", srcFileName, filePath, err) } } else { - err := fsutil.CopyRegularFile(p.cmd.KeepPerms, srcFileName, filePath, true) + err := fsutil.CopyFile(p.cmd.KeepPerms, srcFileName, filePath, true) if err != nil { logger.Debugf("[%s,%s] - error saving file => %v", srcFileName, filePath, err) } else { @@ -2222,7 +2333,17 @@ copyFiles: } } } else { - err := fsutil.CopyRegularFile(p.cmd.KeepPerms, srcFileName, filePath, true) + // Check if destination already exists - skip if it does to avoid overwriting + // a good copy with a potentially bad one + if fsutil.Exists(filePath) { + existingInfo, _ := os.Stat(filePath) + if existingInfo != nil && existingInfo.Size() > 0 { + log.Warnf("saveArtifacts - skipping %s, destination already exists with %d bytes", srcFileName, existingInfo.Size()) + continue + } + } + + err := fsutil.CopyFile(p.cmd.KeepPerms, srcFileName, filePath, true) if err != nil { logger.Debugf("error saving file => %v", err) } @@ -2475,7 +2596,7 @@ copyBsaFiles: logger.Debugf("[bsa] - saved file (%s)", dstFilePath) } } else { - err := fsutil.CopyRegularFile(p.cmd.KeepPerms, srcFileName, dstFilePath, true) + err := fsutil.CopyFile(p.cmd.KeepPerms, srcFileName, dstFilePath, true) if err != nil { logger.Debugf("[bsa] - error saving file => %v", err) } else { @@ -2496,8 +2617,8 @@ copyBsaFiles: dstPasswdFilePath := fmt.Sprintf("%s/files%s", p.storeLocation, sysidentity.PasswdFilePath) if _, err := os.Stat(sysidentity.PasswdFilePath); err == nil { //if err := cpFile(passwdFilePath, passwdFileTargetPath); err != nil { - if err := fsutil.CopyRegularFile(p.cmd.KeepPerms, sysidentity.PasswdFilePath, dstPasswdFilePath, true); err != nil { - logger.Debugf("copyBasicUserInfo: fsutil.CopyRegularFile - error copying user info file => %v", err) + if err := fsutil.CopyFile(p.cmd.KeepPerms, sysidentity.PasswdFilePath, dstPasswdFilePath, true); err != nil { + logger.Debugf("copyBasicUserInfo: fsutil.CopyFile - error copying user info file => %v", err) } } else { if os.IsNotExist(err) { @@ -3305,7 +3426,7 @@ func fixPy3CacheFile(src, dst string) error { if _, err := os.Stat(dstPyFilePath); err != nil && os.IsNotExist(err) { //if err := cpFile(srcPyFilePath, dstPyFilePath); err != nil { - if err := fsutil.CopyRegularFile(true, srcPyFilePath, dstPyFilePath, true); err != nil { + if err := fsutil.CopyFile(true, srcPyFilePath, dstPyFilePath, true); err != nil { log.Debugf("sensor: monitor - fixPy3CacheFile - error copying file => %v", dstPyFilePath) return err } @@ -3354,7 +3475,7 @@ func rbEnsureGemFiles(src, storeLocation, prefix string) error { if _, err := os.Stat(extBuildFlagFilePathDst); err != nil && os.IsNotExist(err) { //if err := cpFile(extBuildFlagFilePath, extBuildFlagFilePathDst); err != nil { - if err := fsutil.CopyRegularFile(true, extBuildFlagFilePath, extBuildFlagFilePathDst, true); err != nil { + if err := fsutil.CopyFile(true, extBuildFlagFilePath, extBuildFlagFilePathDst, true); err != nil { log.Debugf("sensor: monitor - rbEnsureGemFiles - error copying file => %v", extBuildFlagFilePathDst) return err } @@ -3491,7 +3612,7 @@ func nodeEnsurePackageFiles(keepPerms bool, src, storeLocation, prefix string) e nodeGypFilePath := path.Join(filepath.Dir(src), nodeNPMNodeGypFile) if _, err := os.Stat(nodeGypFilePath); err == nil { nodeGypFilePathDst := fmt.Sprintf("%s%s%s", storeLocation, prefix, nodeGypFilePath) - if err := fsutil.CopyRegularFile(keepPerms, nodeGypFilePath, nodeGypFilePathDst, true); err != nil { + if err := fsutil.CopyFile(keepPerms, nodeGypFilePath, nodeGypFilePathDst, true); err != nil { log.Debugf("sensor: nodeEnsurePackageFiles - error copying %s => %v", nodeGypFilePath, err) } } diff --git a/pkg/app/sensor/artifact/dedup_test.go b/pkg/app/sensor/artifact/dedup_test.go new file mode 100644 index 000000000..b3d61bd83 --- /dev/null +++ b/pkg/app/sensor/artifact/dedup_test.go @@ -0,0 +1,243 @@ +package artifact + +import ( + "os" + "path/filepath" + "sort" + "strings" + "syscall" + "testing" + + "github.com/mintoolkit/mint/pkg/report" +) + +// TestIsThroughSymlinkLogic tests the symlink detection logic used in deduplicateFileMap +func TestIsThroughSymlinkLogic(t *testing.T) { + // This tests the isThroughSymlink inline function logic from deduplicateFileMap + isThroughSymlink := func(p string) bool { + return strings.HasPrefix(p, "/usr/local/cuda/") && !strings.HasPrefix(p, "/usr/local/cuda-") + } + + tt := []struct { + path string + expected bool + }{ + // Paths through symlink (should be avoided) + {"/usr/local/cuda/lib64/libcudart.so", true}, + {"/usr/local/cuda/include/cuda.h", true}, + {"/usr/local/cuda/bin/nvcc", true}, + + // Canonical paths (should be preferred) + {"/usr/local/cuda-12.9/lib64/libcudart.so", false}, + {"/usr/local/cuda-12.9/include/cuda.h", false}, + {"/usr/local/cuda-11.8/bin/nvcc", false}, + + // Other paths (not affected) + {"/usr/lib/libfoo.so", false}, + {"/opt/nvidia/cuda/lib/libbar.so", false}, + {"/home/user/cuda/file.txt", false}, + } + + for _, test := range tt { + result := isThroughSymlink(test.path) + if result != test.expected { + t.Errorf("isThroughSymlink(%q) = %v, want %v", test.path, result, test.expected) + } + } +} + +// TestPathPrioritySorting tests the sorting logic that determines which path to keep +func TestPathPrioritySorting(t *testing.T) { + // Simulate the sorting logic from deduplicateFileMap + isThroughSymlink := func(p string) bool { + return strings.HasPrefix(p, "/usr/local/cuda/") && !strings.HasPrefix(p, "/usr/local/cuda-") + } + + // fileMap simulates the p.fileMap with flags + fileMap := map[string]*report.ArtifactProps{ + "/usr/local/cuda/lib64/libcudart.so": nil, + "/usr/local/cuda-12.9/lib64/libcudart.so": {Flags: map[string]bool{"R": true}}, + } + + sortPaths := func(paths []string) { + sort.Slice(paths, func(i, j int) bool { + pi, pj := paths[i], paths[j] + + // First priority: Prefer paths that don't go through symlink + symI := isThroughSymlink(pi) + symJ := isThroughSymlink(pj) + if symI != symJ { + return !symI + } + + // Second priority: Check if either path has flags + propsI := fileMap[pi] + propsJ := fileMap[pj] + hasI := propsI != nil && len(propsI.Flags) > 0 + hasJ := propsJ != nil && len(propsJ.Flags) > 0 + + if hasI && !hasJ { + return true + } + if !hasI && hasJ { + return false + } + + // Third priority: Prefer longer paths + return len(pi) > len(pj) + }) + } + + // Test case 1: symlink path vs canonical path + paths1 := []string{ + "/usr/local/cuda/lib64/libcudart.so", + "/usr/local/cuda-12.9/lib64/libcudart.so", + } + sortPaths(paths1) + if paths1[0] != "/usr/local/cuda-12.9/lib64/libcudart.so" { + t.Errorf("expected canonical path first, got %v", paths1) + } + + // Test case 2: Two canonical paths - prefer one with flags + fileMap2 := map[string]*report.ArtifactProps{ + "/usr/local/cuda-12.9/lib64/libcudart.so.12": {Flags: map[string]bool{"R": true}}, + "/usr/local/cuda-12.9/lib64/libcudart.so": nil, + } + paths2 := []string{ + "/usr/local/cuda-12.9/lib64/libcudart.so", + "/usr/local/cuda-12.9/lib64/libcudart.so.12", + } + sort.Slice(paths2, func(i, j int) bool { + pi, pj := paths2[i], paths2[j] + propsI := fileMap2[pi] + propsJ := fileMap2[pj] + hasI := propsI != nil && len(propsI.Flags) > 0 + hasJ := propsJ != nil && len(propsJ.Flags) > 0 + if hasI && !hasJ { + return true + } + if !hasI && hasJ { + return false + } + return len(pi) > len(pj) + }) + if paths2[0] != "/usr/local/cuda-12.9/lib64/libcudart.so.12" { + t.Errorf("expected path with flags first, got %v", paths2) + } + + // Test case 3: Equal priority - prefer longer path + paths3 := []string{ + "/usr/lib/short.so", + "/usr/lib/subdir/longer.so", + } + sort.Slice(paths3, func(i, j int) bool { + return len(paths3[i]) > len(paths3[j]) + }) + if paths3[0] != "/usr/lib/subdir/longer.so" { + t.Errorf("expected longer path first, got %v", paths3) + } +} + +// TestDeduplicateFileMapWithHardlinks tests deduplication with actual hardlinks +func TestDeduplicateFileMapWithHardlinks(t *testing.T) { + // Create temp directory + tmpDir, err := os.MkdirTemp("", "dedup_test") + if err != nil { + t.Fatalf("failed to create temp dir: %v", err) + } + defer os.RemoveAll(tmpDir) + + // Create a file + originalPath := filepath.Join(tmpDir, "original.txt") + if err := os.WriteFile(originalPath, []byte("test content"), 0644); err != nil { + t.Fatalf("failed to create original file: %v", err) + } + + // Create a hardlink to the same file + hardlinkPath := filepath.Join(tmpDir, "hardlink.txt") + if err := os.Link(originalPath, hardlinkPath); err != nil { + t.Fatalf("failed to create hardlink: %v", err) + } + + // Verify both paths have the same inode + origInfo, err := os.Lstat(originalPath) + if err != nil { + t.Fatalf("failed to stat original: %v", err) + } + linkInfo, err := os.Lstat(hardlinkPath) + if err != nil { + t.Fatalf("failed to stat hardlink: %v", err) + } + + origStat, ok := origInfo.Sys().(*syscall.Stat_t) + if !ok { + t.Fatal("failed to get syscall.Stat_t for original") + } + linkStat, ok := linkInfo.Sys().(*syscall.Stat_t) + if !ok { + t.Fatal("failed to get syscall.Stat_t for hardlink") + } + + if origStat.Ino != linkStat.Ino { + t.Fatalf("expected same inode, got %d vs %d", origStat.Ino, linkStat.Ino) + } + + // Test that we can build an inode map + inodeMap := make(map[uint64][]string) + for _, fpath := range []string{originalPath, hardlinkPath} { + info, err := os.Lstat(fpath) + if err != nil { + t.Fatalf("failed to stat %s: %v", fpath, err) + } + if sys, ok := info.Sys().(*syscall.Stat_t); ok { + inodeMap[sys.Ino] = append(inodeMap[sys.Ino], fpath) + } + } + + // Should have one inode with two paths + if len(inodeMap) != 1 { + t.Errorf("expected 1 inode, got %d", len(inodeMap)) + } + + for inode, paths := range inodeMap { + if len(paths) != 2 { + t.Errorf("expected 2 paths for inode %d, got %d", inode, len(paths)) + } + } +} + +// TestDeduplicateFileMapSkipsSymlinks tests that symlinks themselves are not deduplicated +func TestDeduplicateFileMapSkipsSymlinks(t *testing.T) { + tmpDir, err := os.MkdirTemp("", "dedup_test") + if err != nil { + t.Fatalf("failed to create temp dir: %v", err) + } + defer os.RemoveAll(tmpDir) + + // Create a regular file + regularPath := filepath.Join(tmpDir, "regular.txt") + if err := os.WriteFile(regularPath, []byte("content"), 0644); err != nil { + t.Fatalf("failed to create file: %v", err) + } + + // Create a symlink to it + symlinkPath := filepath.Join(tmpDir, "symlink.txt") + if err := os.Symlink(regularPath, symlinkPath); err != nil { + t.Fatalf("failed to create symlink: %v", err) + } + + // Check that Lstat identifies symlink vs regular file + regInfo, _ := os.Lstat(regularPath) + symInfo, _ := os.Lstat(symlinkPath) + + if !regInfo.Mode().IsRegular() { + t.Error("regular file should be identified as regular") + } + + if symInfo.Mode().IsRegular() { + t.Error("symlink should NOT be identified as regular file") + } + + // The deduplication logic only processes regular files + // Symlinks have different inodes and are handled separately +} diff --git a/pkg/app/sensor/monitor/composite.go b/pkg/app/sensor/monitor/composite.go index 3c0d24b2d..f5ae67e69 100644 --- a/pkg/app/sensor/monitor/composite.go +++ b/pkg/app/sensor/monitor/composite.go @@ -240,6 +240,12 @@ func (m *monitor) Start() error { return err } + log.Warn("COMPOSITE MONITOR: Both FAN and PTRACE monitors started successfully") + log.Warnf("COMPOSITE MONITOR: RTASourcePT flag value: %v", m.cmd.RTASourcePT) + if !m.cmd.RTASourcePT { + log.Error("COMPOSITE MONITOR: RTASourcePT is FALSE - PTRACE SYSCALL TRACING IS DISABLED!") + } + m.startedAt = time.Now() return nil diff --git a/pkg/monitor/ptrace/ptrace.go b/pkg/monitor/ptrace/ptrace.go index cd20fbb12..2cbf04636 100644 --- a/pkg/monitor/ptrace/ptrace.go +++ b/pkg/monitor/ptrace/ptrace.go @@ -65,12 +65,13 @@ func Run( } if runOpt.RTASourcePT { - logger.Debug("tracing target app") + logger.Warn("PTRACE IS ENABLED - RTASourcePT=true - Starting trace monitoring") app.Report.Enabled = true go app.process() go app.trace() } else { - logger.Debug("not tracing target app...") + logger.Error("PTRACE IS DISABLED - RTASourcePT=false - NOT tracing target app!") + logger.Error("This means syscall-level monitoring is OFF and files may be missed!") go func() { logger.Debug("not tracing target app - start app") if err := app.start(); err != nil { @@ -239,6 +240,7 @@ func newApp( func (app *App) trace() { logger := app.logger.WithField("op", "trace") logger.Debug("call") + logger.Warn("PTRACE MONITOR IS STARTING - trace() called") defer logger.Debug("exit") runtime.LockOSThread() @@ -460,6 +462,23 @@ drain: app.Report.SyscallNum = uint32(len(app.Report.SyscallStats)) app.Report.FSActivity = app.FileActivity() + // CRITICAL DEBUG: Log FSActivity results + logger.Warnf("PTRACE REPORT FINALIZED: Tracked %d files in FSActivity", len(app.Report.FSActivity)) + logger.Warnf("PTRACE REPORT: Total syscalls executed: %d", app.Report.SyscallCount) + if len(app.Report.FSActivity) == 0 { + logger.Error("WARNING: PTRACE FSActivity is EMPTY - No files were tracked!") + logger.Error("This suggests ptrace syscall interception may not be working properly") + } else { + // Sample some tracked files for verification + sampleCount := 0 + for fpath := range app.Report.FSActivity { + if sampleCount < 5 { + logger.Warnf("PTRACE tracked file sample: %s", fpath) + sampleCount++ + } + } + } + app.StateCh <- state app.ReportCh <- &app.Report } @@ -1213,7 +1232,12 @@ func (ref *checkFileSyscallProcessor) OKCall(cstate *syscallState) bool { } func (ref *checkFileSyscallProcessor) OKReturnStatus(retVal uint64) bool { - return retVal == 0 + // Accept successful stat calls (0) and also failed attempts that indicate + // the application was looking for the file. This is important for Python + // imports which check multiple locations before finding the right file. + // Track ENOENT (file not found) and ENOTDIR (not a directory) in addition to success. + intRetVal := getIntVal(retVal) + return intRetVal == 0 || intRetVal == -2 || intRetVal == -20 // 0=success, -2=ENOENT, -20=ENOTDIR } func (ref *checkFileSyscallProcessor) EventOnCall() bool { diff --git a/pkg/monitor/ptrace/ptrace_test.go b/pkg/monitor/ptrace/ptrace_test.go new file mode 100644 index 000000000..e40d9f0e0 --- /dev/null +++ b/pkg/monitor/ptrace/ptrace_test.go @@ -0,0 +1,117 @@ +package ptrace + +import ( + "testing" +) + +func TestGetIntVal(t *testing.T) { + tt := []struct { + input uint64 + expected int + }{ + {input: 0, expected: 0}, + {input: 0xFFFFFFFE, expected: -2}, // ENOENT + {input: 0xFFFFFFEC, expected: -20}, // ENOTDIR + {input: 1, expected: 1}, + {input: 0xFFFFFFFF, expected: -1}, // Generic error + } + + for _, test := range tt { + result := getIntVal(test.input) + if result != test.expected { + t.Errorf("getIntVal(0x%x) = %d, want %d", test.input, result, test.expected) + } + } +} + +func TestCheckFileSyscallProcessorOKReturnStatus(t *testing.T) { + processor := &checkFileSyscallProcessor{ + syscallProcessorCore: &syscallProcessorCore{}, + } + + tt := []struct { + retVal uint64 + expected bool + desc string + }{ + { + retVal: 0, + expected: true, + desc: "success (0)", + }, + { + retVal: 0xFFFFFFFE, // -2 as uint64 + expected: true, + desc: "ENOENT (-2) - file not found, should be tracked", + }, + { + retVal: 0xFFFFFFEC, // -20 as uint64 + expected: true, + desc: "ENOTDIR (-20) - not a directory, should be tracked", + }, + { + retVal: 0xFFFFFFFF, // -1 as uint64 + expected: false, + desc: "EPERM (-1) - should not be tracked", + }, + { + retVal: 0xFFFFFFFD, // -3 as uint64 + expected: false, + desc: "ESRCH (-3) - should not be tracked", + }, + { + retVal: 0xFFFFFFED, // -19 as uint64 + expected: false, + desc: "ENODEV (-19) - should not be tracked", + }, + { + retVal: 1, + expected: false, + desc: "positive return value - should not be tracked", + }, + } + + for _, test := range tt { + result := processor.OKReturnStatus(test.retVal) + if result != test.expected { + t.Errorf("OKReturnStatus(0x%x) [%s] = %v, want %v", + test.retVal, test.desc, result, test.expected) + } + } +} + +func TestCheckFileSyscallProcessorFailedReturnStatus(t *testing.T) { + processor := &checkFileSyscallProcessor{ + syscallProcessorCore: &syscallProcessorCore{}, + } + + tt := []struct { + retVal uint64 + expected bool + desc string + }{ + { + retVal: 0, + expected: false, + desc: "success (0) - not failed", + }, + { + retVal: 0xFFFFFFFE, // -2 (ENOENT) + expected: true, + desc: "ENOENT (-2) - failed", + }, + { + retVal: 0xFFFFFFFF, // -1 (EPERM) + expected: true, + desc: "EPERM (-1) - failed", + }, + } + + for _, test := range tt { + result := processor.FailedReturnStatus(test.retVal) + if result != test.expected { + t.Errorf("FailedReturnStatus(0x%x) [%s] = %v, want %v", + test.retVal, test.desc, result, test.expected) + } + } +} diff --git a/pkg/util/fsutil/fsutil.go b/pkg/util/fsutil/fsutil.go index 07242b809..7c9e11dca 100644 --- a/pkg/util/fsutil/fsutil.go +++ b/pkg/util/fsutil/fsutil.go @@ -575,20 +575,30 @@ func CopyRegularFile(clone bool, src, dst string, makeDir bool) error { return err } - if srcFileInfo.Size() > 0 { - written, err := io.Copy(d, s) - if err != nil { - d.Close() - return err - } + written, err := io.Copy(d, s) + if err != nil { + d.Close() + return err + } - if written != srcFileInfo.Size() { - log.Debugf("CopyRegularFile(%v,%v,%v) - copy data mismatch - %v/%v", - src, dst, makeDir, written, srcFileInfo.Size()) - d.Close() - return fmt.Errorf("%s -> %s: partial copy - %d/%d", - src, dst, written, srcFileInfo.Size()) - } + // Log if we copied a file that appeared 0-byte but had content + if srcFileInfo.Size() == 0 && written > 0 { + log.Debugf("CopyRegularFile(%v,%v) - file appeared as 0-byte but copied %d bytes", src, dst, written) + } + + // Log error for 0-byte copy of non-empty file (likely overlay FS issue) + if srcFileInfo.Size() > 0 && written == 0 { + log.Errorf("CopyRegularFile(%v,%v) - expected %d bytes but copied 0 (possible overlay FS issue)", + src, dst, srcFileInfo.Size()) + } + + // Verify the copy if we expected content + if srcFileInfo.Size() > 0 && written != srcFileInfo.Size() { + log.Debugf("CopyRegularFile(%v,%v,%v) - copy data mismatch - %v/%v", + src, dst, makeDir, written, srcFileInfo.Size()) + d.Close() + return fmt.Errorf("%s -> %s: partial copy - %d/%d", + src, dst, written, srcFileInfo.Size()) } //Need to close dst file before chmod works the right way diff --git a/pkg/util/fsutil/fsutil_test.go b/pkg/util/fsutil/fsutil_test.go index 95d7d86b6..5fe232b93 100644 --- a/pkg/util/fsutil/fsutil_test.go +++ b/pkg/util/fsutil/fsutil_test.go @@ -48,3 +48,212 @@ func TestCopyDirExcludesWholeDirectoryWhenPatternHasDoubleStar(t *testing.T) { t.Fatalf("expected kept file to be copied, got error: %v", err) } } + +func TestCopyRegularFile(t *testing.T) { + tt := []struct { + name string + content string + clone bool + makeDir bool + expectError bool + }{ + { + name: "copy file with content", + content: "hello world", + clone: false, + makeDir: true, + expectError: false, + }, + { + name: "copy empty file", + content: "", + clone: false, + makeDir: true, + expectError: false, + }, + { + name: "copy without makeDir to existing dir", + content: "test content", + clone: false, + makeDir: false, + expectError: false, + }, + } + + for _, test := range tt { + // Create temp directory for test + tmpDir, err := os.MkdirTemp("", "fsutil_test") + if err != nil { + t.Fatalf("failed to create temp dir: %v", err) + } + defer os.RemoveAll(tmpDir) + + // Create source file + srcPath := filepath.Join(tmpDir, "src", "testfile.txt") + if err := os.MkdirAll(filepath.Dir(srcPath), 0755); err != nil { + t.Fatalf("failed to create src dir: %v", err) + } + if err := os.WriteFile(srcPath, []byte(test.content), 0644); err != nil { + t.Fatalf("failed to create src file: %v", err) + } + + // Set up destination path + var dstPath string + if test.makeDir { + dstPath = filepath.Join(tmpDir, "dst", "testfile.txt") + } else { + // Create dst dir first for non-makeDir tests + dstDir := filepath.Join(tmpDir, "dst") + if err := os.MkdirAll(dstDir, 0755); err != nil { + t.Fatalf("failed to create dst dir: %v", err) + } + dstPath = filepath.Join(dstDir, "testfile.txt") + } + + // Copy file + err = CopyRegularFile(test.clone, srcPath, dstPath, test.makeDir) + + if test.expectError { + if err == nil { + t.Errorf("test %q: expected error but got none", test.name) + } + continue + } + + if err != nil { + t.Errorf("test %q: unexpected error: %v", test.name, err) + continue + } + + // Verify destination file exists and has correct content + dstContent, err := os.ReadFile(dstPath) + if err != nil { + t.Errorf("test %q: failed to read dst file: %v", test.name, err) + continue + } + + if string(dstContent) != test.content { + t.Errorf("test %q: content mismatch, got %q, want %q", test.name, string(dstContent), test.content) + } + } +} + +func TestCopyRegularFilePreservesSize(t *testing.T) { + tmpDir, err := os.MkdirTemp("", "fsutil_test") + if err != nil { + t.Fatalf("failed to create temp dir: %v", err) + } + defer os.RemoveAll(tmpDir) + + // Create source file with known content + content := "This is test content with known size" + srcPath := filepath.Join(tmpDir, "src.txt") + if err := os.WriteFile(srcPath, []byte(content), 0644); err != nil { + t.Fatalf("failed to create src file: %v", err) + } + + srcInfo, err := os.Stat(srcPath) + if err != nil { + t.Fatalf("failed to stat src file: %v", err) + } + + // Copy file + dstPath := filepath.Join(tmpDir, "dst.txt") + if err := CopyRegularFile(false, srcPath, dstPath, true); err != nil { + t.Fatalf("CopyRegularFile failed: %v", err) + } + + // Verify sizes match + dstInfo, err := os.Stat(dstPath) + if err != nil { + t.Fatalf("failed to stat dst file: %v", err) + } + + if dstInfo.Size() != srcInfo.Size() { + t.Errorf("size mismatch: src=%d, dst=%d", srcInfo.Size(), dstInfo.Size()) + } +} + +func TestCopyRegularFileMissingSource(t *testing.T) { + tmpDir, err := os.MkdirTemp("", "fsutil_test") + if err != nil { + t.Fatalf("failed to create temp dir: %v", err) + } + defer os.RemoveAll(tmpDir) + + srcPath := filepath.Join(tmpDir, "nonexistent.txt") + dstPath := filepath.Join(tmpDir, "dst.txt") + + err = CopyRegularFile(false, srcPath, dstPath, true) + if err == nil { + t.Error("expected error for missing source file, got none") + } +} + +func TestCopyFile(t *testing.T) { + tmpDir, err := os.MkdirTemp("", "fsutil_test") + if err != nil { + t.Fatalf("failed to create temp dir: %v", err) + } + defer os.RemoveAll(tmpDir) + + // Create source file + content := "test content for CopyFile" + srcPath := filepath.Join(tmpDir, "src.txt") + if err := os.WriteFile(srcPath, []byte(content), 0644); err != nil { + t.Fatalf("failed to create src file: %v", err) + } + + // Copy file + dstPath := filepath.Join(tmpDir, "dst.txt") + if err := CopyFile(false, srcPath, dstPath, true); err != nil { + t.Fatalf("CopyFile failed: %v", err) + } + + // Verify content + dstContent, err := os.ReadFile(dstPath) + if err != nil { + t.Fatalf("failed to read dst file: %v", err) + } + + if string(dstContent) != content { + t.Errorf("content mismatch: got %q, want %q", string(dstContent), content) + } +} + +func TestCopyFileWithSymlink(t *testing.T) { + tmpDir, err := os.MkdirTemp("", "fsutil_test") + if err != nil { + t.Fatalf("failed to create temp dir: %v", err) + } + defer os.RemoveAll(tmpDir) + + // Create source file + content := "symlink test content" + srcPath := filepath.Join(tmpDir, "src.txt") + if err := os.WriteFile(srcPath, []byte(content), 0644); err != nil { + t.Fatalf("failed to create src file: %v", err) + } + + // Create symlink to source + symlinkPath := filepath.Join(tmpDir, "src_link.txt") + if err := os.Symlink(srcPath, symlinkPath); err != nil { + t.Fatalf("failed to create symlink: %v", err) + } + + // Copy via symlink - CopyFile should handle this + dstPath := filepath.Join(tmpDir, "dst.txt") + if err := CopyFile(false, symlinkPath, dstPath, true); err != nil { + t.Fatalf("CopyFile via symlink failed: %v", err) + } + + // Verify content was copied (not the symlink itself) + dstContent, err := os.ReadFile(dstPath) + if err != nil { + t.Fatalf("failed to read dst file: %v", err) + } + + if string(dstContent) != content { + t.Errorf("content mismatch: got %q, want %q", string(dstContent), content) + } +} From c3b3f191d07cce8c566c3d2fed0bf9c5c55f916a Mon Sep 17 00:00:00 2001 From: Adam Shaver Date: Mon, 23 Feb 2026 17:41:39 +0000 Subject: [PATCH 03/19] fix: resolve ptrace monitor bugs causing hangs with multiprocess apps Infinite loop in getStringParam: when a syscall had a NULL path argument, PtracePeekData returned EIO with count=0. The function silenced EIO, never advanced the pointer, and had no exit condition, burning 100% CPU until exit. Fix: return early on NULL pointer and on any PtracePeekData error. --- pkg/monitor/ptrace/ptrace.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pkg/monitor/ptrace/ptrace.go b/pkg/monitor/ptrace/ptrace.go index 2cbf04636..487545ec9 100644 --- a/pkg/monitor/ptrace/ptrace.go +++ b/pkg/monitor/ptrace/ptrace.go @@ -1077,12 +1077,16 @@ func getIntParam(pid int, ptr uint64) int { } func getStringParam(pid int, ptr uint64) string { + if ptr == 0 { + return "" + } + var out [256]byte var data []byte for { count, err := syscall.PtracePeekData(pid, uintptr(ptr), out[:]) - if err != nil && err != syscall.EIO { - fmt.Printf("readString: syscall.PtracePeekData error - '%v'\v", err) + if err != nil { + return string(data) } idx := bytes.IndexByte(out[:count], 0) @@ -1095,7 +1099,7 @@ func getStringParam(pid int, ptr uint64) string { } data = append(data, out[:idx]...) - if foundNull { + if foundNull || count == 0 { return string(data) } } From 224efa969bbcc097dad38dde2dc9e1cf6af2dc70 Mon Sep 17 00:00:00 2001 From: Adam Shaver Date: Tue, 17 Feb 2026 15:59:31 +0000 Subject: [PATCH 04/19] fix null ref crash when ptrace disabled --- pkg/app/master/security/seccomp/seccomp.go | 2 +- pkg/app/sensor/artifact/artifact.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/app/master/security/seccomp/seccomp.go b/pkg/app/master/security/seccomp/seccomp.go index 016736dab..bc619e166 100644 --- a/pkg/app/master/security/seccomp/seccomp.go +++ b/pkg/app/master/security/seccomp/seccomp.go @@ -80,7 +80,7 @@ func GenProfile(artifactLocation string, profileName string) error { return err } - if !creport.Monitors.Pt.Enabled { + if creport.Monitors.Pt == nil || !creport.Monitors.Pt.Enabled { log.Debug("seccomp.GenProfile: not generating seccomp profile (PT mon disabled, no syscall info)") return nil } diff --git a/pkg/app/sensor/artifact/artifact.go b/pkg/app/sensor/artifact/artifact.go index 2e2dddc3b..fa5b86f96 100644 --- a/pkg/app/sensor/artifact/artifact.go +++ b/pkg/app/sensor/artifact/artifact.go @@ -846,7 +846,7 @@ func (p *store) prepareArtifacts() { p.prepareArtifact(artifactFileName) } - if p.ptMonReport.Enabled { + if p.ptMonReport != nil && p.ptMonReport.Enabled { logger.Debug("ptMonReport.Enabled") for artifactFileName, fsaInfo := range p.ptMonReport.FSActivity { artifactInfo, found := p.rawNames[artifactFileName] From da8eb2c79b3bc5ba827bbbe3bec37bd02a95394e Mon Sep 17 00:00:00 2001 From: Adam Shaver Date: Mon, 23 Feb 2026 19:29:25 +0000 Subject: [PATCH 05/19] fix: prevent namespace package directories from being excluded by IsSubdir false positive OKReturnStatus for checkFile syscalls intentionally accepts ENOENT (-2) to track Python import search paths. When Python imports a namespace package (a directory without __init__.py), it probes for several __init__.* variants -- all returning ENOENT -- before stat'ing the directory itself (success). The radix tree walk in FileActivity() then sees the directory as a prefix of these ghost child paths and marks it IsSubdir=true, excluding it from the slim image. Since the ghost children don't exist on disk, neither the directory nor any files are preserved. Add HasSuccessfulAccess to FSActivityInfo, set it only when retVal==0, and require it in the IsSubdir determination so that ENOENT-only child paths cannot cause a parent directory to be dropped. --- pkg/monitor/ptrace/ptrace.go | 68 ++++++++++++++++++++-------------- pkg/report/container_report.go | 11 +++--- 2 files changed, 46 insertions(+), 33 deletions(-) diff --git a/pkg/monitor/ptrace/ptrace.go b/pkg/monitor/ptrace/ptrace.go index 487545ec9..50b792385 100644 --- a/pkg/monitor/ptrace/ptrace.go +++ b/pkg/monitor/ptrace/ptrace.go @@ -282,30 +282,34 @@ func (app *App) processFileActivity(e *syscallEvent) { !strings.HasPrefix(e.pathParam, "/proc/") && !strings.HasPrefix(e.pathParam, "/sys/") && !strings.HasPrefix(e.pathParam, "/dev/") { - if fsa, ok := app.fsActivity[e.pathParam]; ok { - fsa.OpsAll++ - fsa.Pids[e.pid] = struct{}{} - fsa.Syscalls[int(e.callNum)] = struct{}{} - - if processor, found := syscallProcessors[int(e.callNum)]; found { - switch processor.SyscallType() { - case CheckFileType: - fsa.OpsCheckFile++ - } - } - } else { - fsa := &report.FSActivityInfo{ - OpsAll: 1, - OpsCheckFile: 1, - Pids: map[int]struct{}{}, - Syscalls: map[int]struct{}{}, + if fsa, ok := app.fsActivity[e.pathParam]; ok { + fsa.OpsAll++ + fsa.Pids[e.pid] = struct{}{} + fsa.Syscalls[int(e.callNum)] = struct{}{} + if e.retVal == 0 { + fsa.HasSuccessfulAccess = true + } + + if processor, found := syscallProcessors[int(e.callNum)]; found { + switch processor.SyscallType() { + case CheckFileType: + fsa.OpsCheckFile++ } + } + } else { + fsa := &report.FSActivityInfo{ + OpsAll: 1, + OpsCheckFile: 1, + HasSuccessfulAccess: e.retVal == 0, + Pids: map[int]struct{}{}, + Syscalls: map[int]struct{}{}, + } - fsa.Pids[e.pid] = struct{}{} - fsa.Syscalls[int(e.callNum)] = struct{}{} + fsa.Pids[e.pid] = struct{}{} + fsa.Syscalls[int(e.callNum)] = struct{}{} - app.fsActivity[e.pathParam] = fsa - } + app.fsActivity[e.pathParam] = fsa + } if app.del != nil { //NOTE: @@ -500,17 +504,25 @@ func (app *App) FileActivity() map[string]*report.FSActivityInfo { } walkAfter := func(akey string, av interface{}) bool { - //adata, ok := av.(*report.FSActivityInfo) - //if !ok { - // return false - //} - if wkey == akey { return false } - wdata.IsSubdir = true - return true + adata, ok := av.(*report.FSActivityInfo) + if !ok { + return false + } + + // Only mark as subdirectory if the child path was actually + // found on disk. ENOENT "ghost" children (e.g., Python probing + // for __init__.so/.py in a namespace package directory) must + // not cause the parent directory to be excluded. + if adata.HasSuccessfulAccess { + wdata.IsSubdir = true + return true + } + + return false } t.WalkPrefix(wkey, walkAfter) diff --git a/pkg/report/container_report.go b/pkg/report/container_report.go index 45a8438bf..9640c4dba 100644 --- a/pkg/report/container_report.go +++ b/pkg/report/container_report.go @@ -121,11 +121,12 @@ type PtMonitorReport struct { } type FSActivityInfo struct { - OpsAll uint64 `json:"ops_all"` - OpsCheckFile uint64 `json:"ops_checkfile"` - Syscalls map[int]struct{} `json:"syscalls"` - Pids map[int]struct{} `json:"pids"` - IsSubdir bool `json:"is_subdir"` + OpsAll uint64 `json:"ops_all"` + OpsCheckFile uint64 `json:"ops_checkfile"` + Syscalls map[int]struct{} `json:"syscalls"` + Pids map[int]struct{} `json:"pids"` + IsSubdir bool `json:"is_subdir"` + HasSuccessfulAccess bool `json:"has_successful_access,omitempty"` } // ArtifactProps contains various file system artifact properties From a7ed39a1b0b3f71b7171b2388d70718027edc1f1 Mon Sep 17 00:00:00 2001 From: Adam Shaver Date: Mon, 15 Dec 2025 19:33:49 +0000 Subject: [PATCH 06/19] example use case with nvidia runtime --- examples/nvidia_runtime/.gitignore | 5 + examples/nvidia_runtime/Dockerfile | 6 + examples/nvidia_runtime/README.md | 19 + .../nvidia_runtime/test_nvidia_pytorch.sh | 97 +++ examples/nvidia_runtime/test_nvidia_smi.sh | 34 + examples/nvidia_runtime/test_vllm.sh | 289 +++++++ examples/nvidia_runtime/vllm_api_tests.py | 772 ++++++++++++++++++ 7 files changed, 1222 insertions(+) create mode 100644 examples/nvidia_runtime/.gitignore create mode 100644 examples/nvidia_runtime/Dockerfile create mode 100644 examples/nvidia_runtime/README.md create mode 100755 examples/nvidia_runtime/test_nvidia_pytorch.sh create mode 100755 examples/nvidia_runtime/test_nvidia_smi.sh create mode 100755 examples/nvidia_runtime/test_vllm.sh create mode 100755 examples/nvidia_runtime/vllm_api_tests.py diff --git a/examples/nvidia_runtime/.gitignore b/examples/nvidia_runtime/.gitignore new file mode 100644 index 000000000..8a6ace89f --- /dev/null +++ b/examples/nvidia_runtime/.gitignore @@ -0,0 +1,5 @@ +host-config.json +vllm_test_results.json +vllm_test_results_slim.json +original_log.txt +slim_log.txt diff --git a/examples/nvidia_runtime/Dockerfile b/examples/nvidia_runtime/Dockerfile new file mode 100644 index 000000000..6dc1c8f10 --- /dev/null +++ b/examples/nvidia_runtime/Dockerfile @@ -0,0 +1,6 @@ +FROM nvcr.io/nvidia/pytorch:25.04-py3 + +# Add the tests to the entrypoint set. Docker Slim only traces/monitors the processes started by the entrypoint. +RUN echo "pytest /opt/pytorch/pytorch/test/test_cuda.py::TestCuda::test_graph_cudnn_dropout" > /opt/nvidia/entrypoint.d/99-trace.sh +RUN chmod +x /opt/nvidia/entrypoint.d/99-trace.sh + diff --git a/examples/nvidia_runtime/README.md b/examples/nvidia_runtime/README.md new file mode 100644 index 000000000..a2b80a3d4 --- /dev/null +++ b/examples/nvidia_runtime/README.md @@ -0,0 +1,19 @@ + +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +As a pre-requisite, install nvidia-container toolkit, including adding the nvidia runtime. Then you should be able to translate runtime and capabilities from a OCI/Docker string like `--runtime=nvidia --gpus all` to `--cro-device-request '{"Count":-1, "Capabilities":[["gpu"]]}' --cro-runtime nvidia` + +See the example `test_nvidia_smi.sh`, which slims ubuntu to just the files necessary to run the runtime mounted nvidia-smi. Similarly, see `test_nvidia_pytorch.sh` which minimizes nvidia-pytorch to run a subset of the CUDA tests. + diff --git a/examples/nvidia_runtime/test_nvidia_pytorch.sh b/examples/nvidia_runtime/test_nvidia_pytorch.sh new file mode 100755 index 000000000..c5ca4db09 --- /dev/null +++ b/examples/nvidia_runtime/test_nvidia_pytorch.sh @@ -0,0 +1,97 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Create host config file with ulimit settings and capabilities +cat > host-config.json <<'EOF' +{ + "IpcMode": "host", + "CapAdd": ["SYS_ADMIN"], + "Ulimits": [ + { + "Name": "memlock", + "Soft": -1, + "Hard": -1 + }, + { + "Name": "stack", + "Soft": 67108864, + "Hard": 67108864 + }, + { + "Name": "nofile", + "Soft": 1048576, + "Hard": 1048576 + } + ] +} +EOF + +# Build the slim image +# CAP_SYS_ADMIN is added via host-config.json for fanotify support (required for filesystem monitoring) +# Build custom image with test in entrypoint first +echo "Building custom test image with pytest in entrypoint..." +docker build -t nvcr.io/nvidia/pytorch:25.04-py3-test -f Dockerfile . + +echo "Running mint on the test image..." +mint build \ + --target nvcr.io/nvidia/pytorch:25.04-py3-test \ + --tag nvcr.io/nvidia/pytorch:25.04-py3-slim \ + --cro-host-config-file host-config.json \ + --cro-shm-size 1200 \ + --cro-device-request '{"Count":-1, "Capabilities":[["gpu"]]}' \ + --cro-runtime nvidia \ + --http-probe=false \ + --continue-after 10 \ + --preserve-path /etc/ld.so.conf \ + --preserve-path /etc/ld.so.conf.d \ + . + +# Get output of original and slim images stored in a log file +echo "Running original image..." +docker run --rm --runtime nvidia --gpus all nvcr.io/nvidia/pytorch:25.04-py3-test > original_log.txt 2>&1 +echo "Running slim image..." +docker run --rm --runtime nvidia --gpus all nvcr.io/nvidia/pytorch:25.04-py3-slim > slim_log.txt 2>&1 + +# Verify that both logs contain the pytest success message (ignoring timing) +echo "Checking test results..." + +# Look for "X passed" pattern in both logs +original_passed=$(grep -oE "[0-9]+ passed" original_log.txt | head -1) +slim_passed=$(grep -oE "[0-9]+ passed" slim_log.txt | head -1) + +if [ -z "$original_passed" ]; then + echo "Error: Original image test did not pass" + echo "Original log tail:" + tail -20 original_log.txt + exit 1 +fi + +if [ -z "$slim_passed" ]; then + echo "Error: Slim image test did not pass" + echo "Slim log tail:" + tail -20 slim_log.txt + exit 1 +fi + +echo "Original image: $original_passed" +echo "Slim image: $slim_passed" + +if [ "$original_passed" = "$slim_passed" ]; then + echo "SUCCESS: Both images passed the same number of tests!" +else + echo "Warning: Different number of tests passed (original: $original_passed, slim: $slim_passed)" +fi + +echo "Successfully minimized nvidia-pytorch to run a subset of the CUDA tests" diff --git a/examples/nvidia_runtime/test_nvidia_smi.sh b/examples/nvidia_runtime/test_nvidia_smi.sh new file mode 100755 index 000000000..6a4558428 --- /dev/null +++ b/examples/nvidia_runtime/test_nvidia_smi.sh @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Build the slim image +mint build --target ubuntu:24.04 --tag ubuntu:24.04-slim --cro-shm-size 1200 --cro-device-request '{"Count":-1, "Capabilities":[["gpu"]]}' --cro-runtime nvidia --http-probe=false --exec "/usr/bin/nvidia-smi" . + +# Get output of original and slim images stored in a log file +docker run --rm --runtime nvidia --gpus all ubuntu:24.04 nvidia-smi > original_log.txt +docker run --rm --runtime nvidia --gpus all ubuntu:24.04-slim nvidia-smi > slim_log.txt + +# verify that both logs include the nvidia-smi output with an assert +assert_contains() { + if ! grep -q "$1" "$2"; then + echo "Error: '$1' not found in $2" + exit 1 + fi +} + +# verify that both logs include the nvidia-smi output with an assert +assert_contains "NVIDIA-SMI" original_log.txt +assert_contains "NVIDIA-SMI" slim_log.txt + diff --git a/examples/nvidia_runtime/test_vllm.sh b/examples/nvidia_runtime/test_vllm.sh new file mode 100755 index 000000000..68d723bba --- /dev/null +++ b/examples/nvidia_runtime/test_vllm.sh @@ -0,0 +1,289 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Configuration +VLLM_IMAGE="${VLLM_IMAGE:-vllm/vllm-openai:v0.15.1}" +SLIM_TAG="${SLIM_TAG:-$(echo $VLLM_IMAGE | sed 's|.*/||; s/:/-slim:/')}" +MODEL="${MODEL:-TinyLlama/TinyLlama-1.1B-Chat-v1.0}" +MAX_WAIT_MINUTES="${MAX_WAIT_MINUTES:-20}" +CONTAINER_PORT=8000 +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +OUTPUT_DIR="${OUTPUT_DIR:-$SCRIPT_DIR}" +RESULTS_FILE="${OUTPUT_DIR}/vllm_test_results.json" +SLIM_RESULTS_FILE="${OUTPUT_DIR}/vllm_test_results_slim.json" + +# Check for required HF_TOKEN +if [ -z "$HF_TOKEN" ]; then + echo "Error: HF_TOKEN environment variable is not set" + echo "Please set it to your Hugging Face access token" + exit 1 +fi + +# Find the slim binary - check for mint or slim in PATH, or use local binary +if command -v mint &> /dev/null; then + SLIM_CMD="mint" +elif command -v slim &> /dev/null; then + SLIM_CMD="slim" +elif [ -x "${SCRIPT_DIR}/../../bin/linux/slim" ]; then + SLIM_CMD="${SCRIPT_DIR}/../../bin/linux/slim" +else + echo "Error: mint/slim binary not found" + echo "Please install mint or ensure bin/linux/slim exists" + exit 1 +fi +echo "Using slim binary: $SLIM_CMD" + +# Cleanup function +cleanup() { + echo "Cleaning up..." + if [ -n "$MONITOR_PID" ] && kill -0 "$MONITOR_PID" 2>/dev/null; then + kill "$MONITOR_PID" 2>/dev/null + fi + if [ -n "$SLIM_PID" ] && kill -0 "$SLIM_PID" 2>/dev/null; then + # Send SIGINT to mint to gracefully stop + kill -INT "$SLIM_PID" 2>/dev/null + wait "$SLIM_PID" 2>/dev/null + fi +} + +trap cleanup EXIT + +# Create host config file with ulimit settings and capabilities +cat > host-config.json <<'EOF' +{ + "IpcMode": "host", + "PidMode": "host", + "CapAdd": ["SYS_ADMIN", "SYS_PTRACE"], + "Ulimits": [ + { + "Name": "memlock", + "Soft": -1, + "Hard": -1 + }, + { + "Name": "stack", + "Soft": 67108864, + "Hard": 67108864 + }, + { + "Name": "nofile", + "Soft": 1048576, + "Hard": 1048576 + } + ] +} +EOF +MAX_SEQ_LEN=2048 +echo "=============================================" +echo "VLLM mint Test" +echo "=============================================" +echo "Source Image: $VLLM_IMAGE" +echo "Slim Tag: $SLIM_TAG" +echo "Model: $MODEL" +echo "Max Wait: $MAX_WAIT_MINUTES minutes" +echo "MAX_SEQ_LEN: $MAX_SEQ_LEN" +echo "Results File: $RESULTS_FILE" +echo "HF_TOKEN: ${HF_TOKEN:0:3}..." +echo "=============================================" + +# Function to run mint with monitoring +run_slim_with_monitor() { + local target_image="$1" + local output_tag="$2" + local results_file="$3" + local is_original="$4" + + echo "" + echo "Starting mint build for: $target_image" + echo "Output tag: $output_tag" + + # Create a named pipe for signaling + SIGNAL_PIPE=$(mktemp -u) + mkfifo "$SIGNAL_PIPE" + + # Start mint in the background + # Using --continue-after signal to allow the monitor to signal when done + local mint_cmd=("$SLIM_CMD" build + --target "$target_image" + --tag "$output_tag" + --cro-host-config-file host-config.json + --cro-shm-size 1200 + --cro-device-request '{"Count":-1, "Capabilities":[["gpu"]]}' + --cro-runtime nvidia + --expose "${CONTAINER_PORT}" + --publish-port "${CONTAINER_PORT}:${CONTAINER_PORT}" + --publish-exposed-ports + --env "HF_TOKEN=${HF_TOKEN}" + --cmd "${MODEL} --max_model_len ${MAX_SEQ_LEN}" + --http-probe=false + --rta-source-ptrace=true + --continue-after signal + --preserve-path /etc/ld.so.conf + --preserve-path /etc/ld.so.conf.d + --exclude-pattern "/root/.cache/**" + .) + echo "Running: ${mint_cmd[*]}" + "${mint_cmd[@]}" & + + SLIM_PID=$! + echo "mint started with PID: $SLIM_PID" + + # Wait a moment for the container to start + sleep 10 + + # Start the monitor/test script in the background + echo "Starting API monitor and test runner..." + python3 "${SCRIPT_DIR}/vllm_api_tests.py" \ + --host "localhost" \ + --port "$CONTAINER_PORT" \ + --output "$results_file" \ + --max-wait "$MAX_WAIT_MINUTES" \ + --signal-pid "$SLIM_PID" & + + MONITOR_PID=$! + echo "Monitor started with PID: $MONITOR_PID" + + # Wait for the monitor to complete + wait "$MONITOR_PID" + MONITOR_EXIT_CODE=$? + echo "Monitor completed with exit code: $MONITOR_EXIT_CODE" + + # Wait for mint to complete + wait "$SLIM_PID" + SLIM_EXIT_CODE=$? + echo "mint completed with exit code: $SLIM_EXIT_CODE" + + # Cleanup the signal pipe + rm -f "$SIGNAL_PIPE" + + if [ $SLIM_EXIT_CODE -ne 0 ]; then + echo "Warning: mint exited with code $SLIM_EXIT_CODE" + fi + + return $MONITOR_EXIT_CODE +} + +# Phase 1: Build slim image from original and run tests +echo "" +echo "=============================================" +echo "Phase 1: Building slim image from original" +echo "=============================================" + +run_slim_with_monitor "$VLLM_IMAGE" "$SLIM_TAG" "$RESULTS_FILE" "true" +PHASE1_EXIT=$? + +if [ ! -f "$RESULTS_FILE" ]; then + echo "Error: Results file not created during Phase 1" + exit 1 +fi + +echo "" +echo "Phase 1 Results:" +cat "$RESULTS_FILE" + +# Phase 2: Run slim image and test it +echo "" +echo "=============================================" +echo "Phase 2: Testing the slimmed image" +echo "=============================================" + +# Run the slimmed image directly (not through mint) and test it +echo "Starting slimmed container for testing..." +docker_run_cmd=(docker run -d + --runtime nvidia + --gpus all + --ipc=host + --ulimit memlock=-1 + --ulimit stack=67108864 + --shm-size=1200m + -e HF_TOKEN + -p "${CONTAINER_PORT}:${CONTAINER_PORT}" + "$SLIM_TAG" + "$MODEL" --max_model_len "$MAX_SEQ_LEN") +echo "Running: ${docker_run_cmd[*]}" +SLIM_CONTAINER_ID=$("${docker_run_cmd[@]}") + +echo "Slim container started: $SLIM_CONTAINER_ID" + +# Run tests against the slim container +python3 "${SCRIPT_DIR}/vllm_api_tests.py" \ + --host "localhost" \ + --port "$CONTAINER_PORT" \ + --output "$SLIM_RESULTS_FILE" \ + --max-wait "$MAX_WAIT_MINUTES" + +PHASE2_EXIT=$? + +# Stop the slim container +docker stop "$SLIM_CONTAINER_ID" +docker rm "$SLIM_CONTAINER_ID" + +if [ ! -f "$SLIM_RESULTS_FILE" ]; then + echo "Error: Slim results file not created during Phase 2" + exit 1 +fi + +echo "" +echo "Phase 2 Results (Slim Image):" +cat "$SLIM_RESULTS_FILE" + +# Compare results +echo "" +echo "=============================================" +echo "Comparing Results" +echo "=============================================" + +python3 - "$RESULTS_FILE" "$SLIM_RESULTS_FILE" <<'COMPARE_SCRIPT' +import json +import sys + +results_file = sys.argv[1] +slim_results_file = sys.argv[2] + +try: + with open(results_file, "r") as f: + original = json.load(f) + with open(slim_results_file, "r") as f: + slim = json.load(f) + + original_passed = sum(1 for t in original.get("tests", []) if t.get("status") == "passed") + original_failed = sum(1 for t in original.get("tests", []) if t.get("status") == "failed") + slim_passed = sum(1 for t in slim.get("tests", []) if t.get("status") == "passed") + slim_failed = sum(1 for t in slim.get("tests", []) if t.get("status") == "failed") + + print(f"Original: {original_passed} passed, {original_failed} failed") + print(f"Slim: {slim_passed} passed, {slim_failed} failed") + + if original_passed == slim_passed: + print("SUCCESS: Both images passed the same number of tests!") + sys.exit(0) + else: + print("WARNING: Different number of tests passed") + sys.exit(1) +except Exception as e: + print(f"Error comparing results: {e}") + sys.exit(1) +COMPARE_SCRIPT + +COMPARE_EXIT=$? + +echo "" +echo "=============================================" +echo "Test Complete" +echo "=============================================" + +exit $COMPARE_EXIT + diff --git a/examples/nvidia_runtime/vllm_api_tests.py b/examples/nvidia_runtime/vllm_api_tests.py new file mode 100755 index 000000000..9a36aa38e --- /dev/null +++ b/examples/nvidia_runtime/vllm_api_tests.py @@ -0,0 +1,772 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +VLLM API Tests for Docker-Slim Integration + +This script waits for a VLLM server to become ready, runs a suite of API tests, +and writes the results to a JSON file. It can optionally signal a docker-slim +process when testing is complete. +""" + +import argparse +import json +import os +import signal +import sys +import time +import traceback +from dataclasses import dataclass, field, asdict +from datetime import datetime +from typing import Any, Callable, Dict, List, Optional + + +# Try to import requests, provide helpful error if not available +try: + import requests +except ImportError: + print("Error: 'requests' library is required. Install with: pip install requests") + sys.exit(1) + + +@dataclass +class TestResult: + """Result of a single test.""" + name: str + status: str # "passed", "failed", "skipped", "error" + duration_ms: float = 0.0 + message: str = "" + details: Dict[str, Any] = field(default_factory=dict) + + +@dataclass +class TestSuiteResults: + """Results of the entire test suite.""" + timestamp: str = "" + host: str = "" + port: int = 0 + model_name: str = "" + server_ready_time_s: float = 0.0 + tests: List[Dict[str, Any]] = field(default_factory=list) + summary: Dict[str, int] = field(default_factory=dict) + + +class VLLMApiTester: + """Tests VLLM OpenAI-compatible API endpoints.""" + + def __init__(self, host: str, port: int): + self.host = host + self.port = port + self.base_url = f"http://{host}:{port}" + self.model_name: Optional[str] = None + self.headers = { + "accept": "application/json", + "Content-Type": "application/json" + } + + def wait_for_server(self, max_wait_minutes: int = 20) -> bool: + """Wait for the server to become ready and return a model.""" + print(f"Waiting for server at {self.base_url} (max {max_wait_minutes} minutes)...") + + start_time = time.time() + max_wait_seconds = max_wait_minutes * 60 + check_interval = 5 # seconds + + while time.time() - start_time < max_wait_seconds: + try: + response = requests.get( + f"{self.base_url}/v1/models", + headers=self.headers, + timeout=10 + ) + + if response.status_code == 200: + data = response.json() + models = data.get("data", []) + + if models: + self.model_name = models[0].get("id") + elapsed = time.time() - start_time + print(f"Server ready after {elapsed:.1f}s. Model: {self.model_name}") + return True + else: + print(f" Server responded but no models loaded yet...") + else: + print(f" Server returned status {response.status_code}") + + except requests.exceptions.ConnectionError: + print(f" Connection refused, server not ready yet...") + except requests.exceptions.Timeout: + print(f" Connection timed out...") + except Exception as e: + print(f" Error checking server: {e}") + + time.sleep(check_interval) + + print(f"Server did not become ready within {max_wait_minutes} minutes") + return False + + def _run_test(self, name: str, test_func: Callable) -> TestResult: + """Run a single test and capture the result.""" + start_time = time.time() + try: + result = test_func() + duration_ms = (time.time() - start_time) * 1000 + + if result is True: + return TestResult( + name=name, + status="passed", + duration_ms=duration_ms, + message="Test passed successfully" + ) + elif result is None: + return TestResult( + name=name, + status="skipped", + duration_ms=duration_ms, + message="Test skipped" + ) + else: + return TestResult( + name=name, + status="failed", + duration_ms=duration_ms, + message=str(result) if result else "Test failed" + ) + except Exception as e: + duration_ms = (time.time() - start_time) * 1000 + return TestResult( + name=name, + status="error", + duration_ms=duration_ms, + message=str(e), + details={"traceback": traceback.format_exc()} + ) + + def test_models_endpoint(self) -> bool: + """Test GET /v1/models endpoint.""" + response = requests.get( + f"{self.base_url}/v1/models", + headers=self.headers, + timeout=30 + ) + + if response.status_code != 200: + return f"Expected status 200, got {response.status_code}" + + data = response.json() + if "data" not in data: + return "Response missing 'data' field" + + if not data["data"]: + return "No models returned" + + return True + + def test_health_endpoint(self) -> bool: + """Test health check endpoint.""" + # Try common health endpoints + for endpoint in ["/health", "/v1/health", "/healthz"]: + try: + response = requests.get( + f"{self.base_url}{endpoint}", + headers=self.headers, + timeout=10 + ) + if response.status_code == 200: + return True + except: + pass + + # If no dedicated health endpoint, v1/models working is good enough + return True + + def test_completions_basic(self) -> bool: + """Test basic /v1/completions endpoint.""" + data = { + "model": self.model_name, + "prompt": "San Francisco is a", + "temperature": 0, + "max_tokens": 32 + } + + response = requests.post( + f"{self.base_url}/v1/completions", + headers=self.headers, + json=data, + timeout=120 + ) + + if response.status_code != 200: + return f"Expected status 200, got {response.status_code}: {response.text}" + + out = response.json() + + if out.get("model") != self.model_name: + return f"Expected model '{self.model_name}', got '{out.get('model')}'" + + if len(out.get("choices", [])) != 1: + return f"Expected 1 choice, got {len(out.get('choices', []))}" + + if out["choices"][0].get("index") != 0: + return "Choice index should be 0" + + if "text" not in out["choices"][0]: + return "Choice missing 'text' field" + + return True + + def test_completions_with_logprobs(self) -> bool: + """Test /v1/completions with logprobs.""" + data = { + "model": self.model_name, + "prompt": "The quick brown fox", + "temperature": 0, + "max_tokens": 16, + "logprobs": 1 + } + + response = requests.post( + f"{self.base_url}/v1/completions", + headers=self.headers, + json=data, + timeout=120 + ) + + if response.status_code != 200: + return f"Expected status 200, got {response.status_code}" + + out = response.json() + choice = out.get("choices", [{}])[0] + + # Logprobs might be None if not supported by backend + if "logprobs" in choice and choice["logprobs"] is not None: + logprobs = choice["logprobs"] + if "tokens" in logprobs and len(logprobs["tokens"]) == 0: + return "Expected tokens in logprobs" + + return True + + def test_completions_streaming(self) -> bool: + """Test streaming /v1/completions endpoint.""" + data = { + "model": self.model_name, + "prompt": "Once upon a time", + "temperature": 0, + "max_tokens": 32, + "stream": True + } + + response = requests.post( + f"{self.base_url}/v1/completions", + headers=self.headers, + json=data, + timeout=120, + stream=True + ) + + if response.status_code != 200: + return f"Expected status 200, got {response.status_code}" + + chunks_received = 0 + done_received = False + + for line in response.iter_lines(decode_unicode=True): + if line: + if line.startswith("data: "): + chunk_data = line[6:].strip() + if chunk_data == "[DONE]": + done_received = True + else: + try: + json.loads(chunk_data) + chunks_received += 1 + except json.JSONDecodeError: + return f"Invalid JSON in stream: {chunk_data}" + + if chunks_received == 0: + return "No streaming chunks received" + + if not done_received: + return "Stream did not end with [DONE]" + + return True + + def test_chat_completions_basic(self) -> bool: + """Test basic /v1/chat/completions endpoint.""" + data = { + "model": self.model_name, + "messages": [ + {"role": "user", "content": "Say hello in exactly 5 words."} + ], + "temperature": 0, + "max_tokens": 32 + } + + response = requests.post( + f"{self.base_url}/v1/chat/completions", + headers=self.headers, + json=data, + timeout=120 + ) + + if response.status_code != 200: + return f"Expected status 200, got {response.status_code}: {response.text}" + + out = response.json() + + if out.get("model") != self.model_name: + return f"Expected model '{self.model_name}', got '{out.get('model')}'" + + if len(out.get("choices", [])) != 1: + return f"Expected 1 choice, got {len(out.get('choices', []))}" + + choice = out["choices"][0] + + if "message" not in choice: + return "Choice missing 'message' field" + + if "content" not in choice["message"]: + return "Message missing 'content' field" + + return True + + def test_chat_completions_multi_turn(self) -> bool: + """Test multi-turn conversation.""" + data = { + "model": self.model_name, + "messages": [ + {"role": "user", "content": "My name is Alice."}, + {"role": "assistant", "content": "Hello Alice! Nice to meet you."}, + {"role": "user", "content": "What is my name?"} + ], + "temperature": 0, + "max_tokens": 32 + } + + response = requests.post( + f"{self.base_url}/v1/chat/completions", + headers=self.headers, + json=data, + timeout=120 + ) + + if response.status_code != 200: + return f"Expected status 200, got {response.status_code}" + + out = response.json() + content = out["choices"][0]["message"]["content"].lower() + + # The model should remember the name "Alice" + if "alice" not in content: + return f"Expected model to remember 'Alice', got: {content}" + + return True + + def test_chat_completions_streaming(self) -> bool: + """Test streaming /v1/chat/completions endpoint.""" + data = { + "model": self.model_name, + "messages": [ + {"role": "user", "content": "Count from 1 to 5."} + ], + "temperature": 0, + "max_tokens": 32, + "stream": True + } + + response = requests.post( + f"{self.base_url}/v1/chat/completions", + headers=self.headers, + json=data, + timeout=120, + stream=True + ) + + if response.status_code != 200: + return f"Expected status 200, got {response.status_code}" + + chunks_received = 0 + done_received = False + + for line in response.iter_lines(decode_unicode=True): + if line: + if line.startswith("data: "): + chunk_data = line[6:].strip() + if chunk_data == "[DONE]": + done_received = True + else: + try: + json.loads(chunk_data) + chunks_received += 1 + except json.JSONDecodeError: + return f"Invalid JSON in stream: {chunk_data}" + + if chunks_received == 0: + return "No streaming chunks received" + + if not done_received: + return "Stream did not end with [DONE]" + + return True + + def test_completions_stop_words(self) -> bool: + """Test stop words in completions.""" + data = { + "model": self.model_name, + "prompt": "List the numbers: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10", + "temperature": 0, + "max_tokens": 64, + "stop": ["5"] + } + + response = requests.post( + f"{self.base_url}/v1/completions", + headers=self.headers, + json=data, + timeout=120 + ) + + if response.status_code != 200: + return f"Expected status 200, got {response.status_code}" + + out = response.json() + text = out["choices"][0]["text"] + + # The output should stop before or at "5" + # This is a weak check since the model might not continue the pattern + return True + + def test_completions_max_tokens(self) -> bool: + """Test max_tokens parameter.""" + max_tokens = 10 + data = { + "model": self.model_name, + "prompt": "Write a very long essay about", + "temperature": 0.7, + "max_tokens": max_tokens + } + + response = requests.post( + f"{self.base_url}/v1/completions", + headers=self.headers, + json=data, + timeout=120 + ) + + if response.status_code != 200: + return f"Expected status 200, got {response.status_code}" + + out = response.json() + completion_tokens = out.get("usage", {}).get("completion_tokens", 0) + + if completion_tokens > max_tokens: + return f"Expected at most {max_tokens} completion tokens, got {completion_tokens}" + + return True + + def test_completions_temperature(self) -> bool: + """Test temperature parameter affects output.""" + prompt = "The meaning of life is" + + # Run with temperature 0 twice - should get same result + data_t0 = { + "model": self.model_name, + "prompt": prompt, + "temperature": 0, + "max_tokens": 20 + } + + response1 = requests.post( + f"{self.base_url}/v1/completions", + headers=self.headers, + json=data_t0, + timeout=120 + ) + + response2 = requests.post( + f"{self.base_url}/v1/completions", + headers=self.headers, + json=data_t0, + timeout=120 + ) + + if response1.status_code != 200 or response2.status_code != 200: + return f"Expected status 200" + + text1 = response1.json()["choices"][0]["text"] + text2 = response2.json()["choices"][0]["text"] + + if text1 != text2: + return f"Temperature 0 should give deterministic results" + + return True + + def test_usage_stats(self) -> bool: + """Test that usage statistics are returned.""" + data = { + "model": self.model_name, + "prompt": "Hello, world!", + "temperature": 0, + "max_tokens": 10 + } + + response = requests.post( + f"{self.base_url}/v1/completions", + headers=self.headers, + json=data, + timeout=120 + ) + + if response.status_code != 200: + return f"Expected status 200, got {response.status_code}" + + out = response.json() + usage = out.get("usage", {}) + + if "prompt_tokens" not in usage: + return "Missing prompt_tokens in usage" + + if "completion_tokens" not in usage: + return "Missing completion_tokens in usage" + + if "total_tokens" not in usage: + return "Missing total_tokens in usage" + + if usage["prompt_tokens"] <= 0: + return "prompt_tokens should be > 0" + + if usage["completion_tokens"] <= 0: + return "completion_tokens should be > 0" + + expected_total = usage["prompt_tokens"] + usage["completion_tokens"] + if usage["total_tokens"] != expected_total: + return f"total_tokens should equal prompt + completion tokens" + + return True + + def test_metrics_endpoint(self) -> bool: + """Test /v1/metrics endpoint if available.""" + try: + response = requests.get( + f"{self.base_url}/v1/metrics", + headers=self.headers, + timeout=30 + ) + + if response.status_code == 200: + # Metrics endpoint exists + if len(response.text) == 0: + return "Metrics endpoint returned empty response" + return True + elif response.status_code == 404: + # Metrics endpoint doesn't exist, which is acceptable + return True + else: + return f"Unexpected status code: {response.status_code}" + except Exception as e: + # Metrics endpoint not available, acceptable + return True + + def test_invalid_model(self) -> bool: + """Test error handling for invalid model name.""" + data = { + "model": "nonexistent-model-12345", + "prompt": "Hello", + "max_tokens": 10 + } + + response = requests.post( + f"{self.base_url}/v1/completions", + headers=self.headers, + json=data, + timeout=30 + ) + + # Should return an error status (400 or 404) + if response.status_code == 200: + return "Expected error for invalid model, got 200" + + return True + + def test_empty_prompt(self) -> bool: + """Test handling of empty prompt.""" + data = { + "model": self.model_name, + "prompt": "", + "max_tokens": 10 + } + + response = requests.post( + f"{self.base_url}/v1/completions", + headers=self.headers, + json=data, + timeout=30 + ) + + # Some servers accept empty prompts, some don't - both are valid + # Just ensure we get a valid response (200) or proper error (400) + if response.status_code not in [200, 400]: + return f"Expected 200 or 400, got {response.status_code}" + + return True + + def run_all_tests(self) -> List[TestResult]: + """Run all tests and return results.""" + tests = [ + ("models_endpoint", self.test_models_endpoint), + ("health_endpoint", self.test_health_endpoint), + ("completions_basic", self.test_completions_basic), + ("completions_with_logprobs", self.test_completions_with_logprobs), + ("completions_streaming", self.test_completions_streaming), + ("chat_completions_basic", self.test_chat_completions_basic), + ("chat_completions_multi_turn", self.test_chat_completions_multi_turn), + ("chat_completions_streaming", self.test_chat_completions_streaming), + ("completions_stop_words", self.test_completions_stop_words), + ("completions_max_tokens", self.test_completions_max_tokens), + ("completions_temperature", self.test_completions_temperature), + ("usage_stats", self.test_usage_stats), + ("metrics_endpoint", self.test_metrics_endpoint), + ("invalid_model", self.test_invalid_model), + ("empty_prompt", self.test_empty_prompt), + ] + + results = [] + for name, test_func in tests: + print(f" Running test: {name}...", end=" ", flush=True) + result = self._run_test(name, test_func) + print(f"{result.status.upper()}") + if result.status in ["failed", "error"]: + print(f" -> {result.message}") + results.append(result) + + return results + + +def main(): + parser = argparse.ArgumentParser(description="VLLM API Test Runner") + parser.add_argument("--host", default="localhost", help="Server host") + parser.add_argument("--port", type=int, default=8000, help="Server port") + parser.add_argument("--output", required=True, help="Output JSON file for results") + parser.add_argument("--max-wait", type=int, default=20, help="Max wait time in minutes for server") + parser.add_argument("--signal-pid", type=int, help="PID to send SIGINT when done") + + args = parser.parse_args() + + print("=" * 60) + print("VLLM API Test Runner") + print("=" * 60) + print(f"Host: {args.host}") + print(f"Port: {args.port}") + print(f"Output: {args.output}") + print(f"Max Wait: {args.max_wait} minutes") + if args.signal_pid: + print(f"Signal PID: {args.signal_pid}") + print("=" * 60) + + tester = VLLMApiTester(args.host, args.port) + + # Wait for server to be ready + start_wait = time.time() + if not tester.wait_for_server(args.max_wait): + # Write failure result + results = TestSuiteResults( + timestamp=datetime.now().isoformat(), + host=args.host, + port=args.port, + model_name="", + server_ready_time_s=-1, + tests=[], + summary={"passed": 0, "failed": 0, "skipped": 0, "error": 1} + ) + + with open(args.output, "w") as f: + json.dump(asdict(results), f, indent=2) + + print(f"Results written to: {args.output}") + + # Signal docker-slim if requested (slim expects SIGUSR1) + if args.signal_pid: + try: + os.kill(args.signal_pid, signal.SIGUSR1) + print(f"Sent SIGUSR1 to PID {args.signal_pid}") + except ProcessLookupError: + print(f"Process {args.signal_pid} not found") + except PermissionError: + print(f"Permission denied to signal PID {args.signal_pid}") + + sys.exit(1) + + server_ready_time = time.time() - start_wait + + # Run tests + print("\nRunning API tests...") + test_results = tester.run_all_tests() + + # Summarize results + summary = { + "passed": sum(1 for t in test_results if t.status == "passed"), + "failed": sum(1 for t in test_results if t.status == "failed"), + "skipped": sum(1 for t in test_results if t.status == "skipped"), + "error": sum(1 for t in test_results if t.status == "error"), + } + + # Create results object + results = TestSuiteResults( + timestamp=datetime.now().isoformat(), + host=args.host, + port=args.port, + model_name=tester.model_name or "", + server_ready_time_s=server_ready_time, + tests=[asdict(t) for t in test_results], + summary=summary + ) + + # Write results to file + with open(args.output, "w") as f: + json.dump(asdict(results), f, indent=2) + + print("\n" + "=" * 60) + print("Test Summary") + print("=" * 60) + print(f" Passed: {summary['passed']}") + print(f" Failed: {summary['failed']}") + print(f" Skipped: {summary['skipped']}") + print(f" Errors: {summary['error']}") + print(f"\nResults written to: {args.output}") + + # Signal docker-slim if requested (slim expects SIGUSR1) + if args.signal_pid: + print(f"\nSignaling docker-slim (PID {args.signal_pid}) to stop...") + try: + os.kill(args.signal_pid, signal.SIGUSR1) + print(f"Sent SIGUSR1 to PID {args.signal_pid}") + except ProcessLookupError: + print(f"Process {args.signal_pid} not found") + except PermissionError: + print(f"Permission denied to signal PID {args.signal_pid}") + + # Exit with appropriate code + if summary["failed"] > 0 or summary["error"] > 0: + sys.exit(1) + else: + sys.exit(0) + + +if __name__ == "__main__": + main() + From 77da28d96fb8924b2a5fc504f3fdfa18fe61b5ea Mon Sep 17 00:00:00 2001 From: Adam Shaver Date: Tue, 24 Feb 2026 17:15:46 +0000 Subject: [PATCH 07/19] Fix HasSuccessfulAccess for open-type syscalls where success is fd >= 0, not retVal == 0 --- pkg/monitor/ptrace/ptrace.go | 8 ++++---- pkg/report/container_report.go | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pkg/monitor/ptrace/ptrace.go b/pkg/monitor/ptrace/ptrace.go index 50b792385..30d555100 100644 --- a/pkg/monitor/ptrace/ptrace.go +++ b/pkg/monitor/ptrace/ptrace.go @@ -286,9 +286,9 @@ func (app *App) processFileActivity(e *syscallEvent) { fsa.OpsAll++ fsa.Pids[e.pid] = struct{}{} fsa.Syscalls[int(e.callNum)] = struct{}{} - if e.retVal == 0 { - fsa.HasSuccessfulAccess = true - } + if e.retVal == 0 || p.SyscallType() == OpenFileType { + fsa.HasSuccessfulAccess = true + } if processor, found := syscallProcessors[int(e.callNum)]; found { switch processor.SyscallType() { @@ -300,7 +300,7 @@ func (app *App) processFileActivity(e *syscallEvent) { fsa := &report.FSActivityInfo{ OpsAll: 1, OpsCheckFile: 1, - HasSuccessfulAccess: e.retVal == 0, + HasSuccessfulAccess: e.retVal == 0 || p.SyscallType() == OpenFileType, Pids: map[int]struct{}{}, Syscalls: map[int]struct{}{}, } diff --git a/pkg/report/container_report.go b/pkg/report/container_report.go index 9640c4dba..e9096b008 100644 --- a/pkg/report/container_report.go +++ b/pkg/report/container_report.go @@ -121,12 +121,12 @@ type PtMonitorReport struct { } type FSActivityInfo struct { - OpsAll uint64 `json:"ops_all"` - OpsCheckFile uint64 `json:"ops_checkfile"` - Syscalls map[int]struct{} `json:"syscalls"` - Pids map[int]struct{} `json:"pids"` - IsSubdir bool `json:"is_subdir"` - HasSuccessfulAccess bool `json:"has_successful_access,omitempty"` + OpsAll uint64 `json:"ops_all"` + OpsCheckFile uint64 `json:"ops_checkfile"` + Syscalls map[int]struct{} `json:"syscalls"` + Pids map[int]struct{} `json:"pids"` + IsSubdir bool `json:"is_subdir"` + HasSuccessfulAccess bool `json:"has_successful_access,omitempty"` } // ArtifactProps contains various file system artifact properties From 0895e012b1a3921c382b6017051436b3460692ba Mon Sep 17 00:00:00 2001 From: Adam Shaver Date: Tue, 24 Feb 2026 17:31:06 +0000 Subject: [PATCH 08/19] Key deduplicateFileMap by (dev, inode) instead of inode alone to avoid false dedup across mounts --- pkg/app/sensor/artifact/artifact.go | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/pkg/app/sensor/artifact/artifact.go b/pkg/app/sensor/artifact/artifact.go index fa5b86f96..5c998ef2e 100644 --- a/pkg/app/sensor/artifact/artifact.go +++ b/pkg/app/sensor/artifact/artifact.go @@ -951,8 +951,14 @@ func (p *store) prepareArtifacts() { func (p *store) deduplicateFileMap() { log.Debugf("deduplicateFileMap - starting inode-based deduplication, fileMap has %d entries", len(p.fileMap)) - // Build inode -> paths map for regular files only - inodeMap := make(map[uint64][]string) + // Build (device, inode) -> paths map for regular files only. + // Inode numbers are only unique per filesystem, so we must include the + // device ID to avoid false deduplication across mount points. + type devInode struct { + Dev uint64 + Ino uint64 + } + inodeMap := make(map[devInode][]string) for fpath := range p.fileMap { info, err := os.Lstat(fpath) @@ -969,21 +975,21 @@ func (p *store) deduplicateFileMap() { // Get the inode from the underlying syscall.Stat_t if sys, ok := info.Sys().(*syscall.Stat_t); ok { - inode := sys.Ino - inodeMap[inode] = append(inodeMap[inode], fpath) + key := devInode{Dev: sys.Dev, Ino: sys.Ino} + inodeMap[key] = append(inodeMap[key], fpath) } } // For each inode with multiple paths, keep only the canonical path duplicatesRemoved := 0 inodeCount := 0 - for inode, paths := range inodeMap { + for key, paths := range inodeMap { if len(paths) <= 1 { continue } inodeCount++ - log.Debugf("deduplicateFileMap - found %d paths for inode %d: %v", len(paths), inode, paths) + log.Debugf("deduplicateFileMap - found %d paths for dev:ino %d:%d: %v", len(paths), key.Dev, key.Ino, paths) // Sort paths to get deterministic behavior // CRITICAL: Prefer paths that DON'T go through symlinked directories From d90297b49c343dab9918d0c80de6acf74c553cf2 Mon Sep 17 00:00:00 2001 From: Adam Shaver Date: Tue, 24 Feb 2026 20:12:12 +0000 Subject: [PATCH 09/19] Fail fast on invalid --cro-device-request JSON instead of silently dropping the device config --- pkg/app/master/inspectors/container/container_inspector.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pkg/app/master/inspectors/container/container_inspector.go b/pkg/app/master/inspectors/container/container_inspector.go index 43868e408..1483e0d9f 100644 --- a/pkg/app/master/inspectors/container/container_inspector.go +++ b/pkg/app/master/inspectors/container/container_inspector.go @@ -582,11 +582,10 @@ func (i *Inspector) RunContainer() error { if i.crOpts.DeviceRequest != "" { var deviceRequest dockerapi.DeviceRequest if err := json.Unmarshal([]byte(i.crOpts.DeviceRequest), &deviceRequest); err != nil { - logger.WithError(err).Error("failed to parse device request JSON") - } else { - containerOptions.HostConfig.DeviceRequests = []dockerapi.DeviceRequest{deviceRequest} - logger.Debugf("using device request => %#v", deviceRequest) + return fmt.Errorf("invalid --cro-device-request JSON: %w", err) } + containerOptions.HostConfig.DeviceRequests = []dockerapi.DeviceRequest{deviceRequest} + logger.Debugf("using device request => %#v", deviceRequest) } if i.crOpts.Runtime != "" { From e47d9a2efb9da745e31561c5d741ccafd86c223f Mon Sep 17 00:00:00 2001 From: Adam Shaver Date: Tue, 24 Feb 2026 21:02:48 +0000 Subject: [PATCH 10/19] Remove unused SIGNAL_PIPE named pipe from test_vllm.sh --- examples/nvidia_runtime/test_vllm.sh | 7 ------- 1 file changed, 7 deletions(-) diff --git a/examples/nvidia_runtime/test_vllm.sh b/examples/nvidia_runtime/test_vllm.sh index 68d723bba..1fc1412ea 100755 --- a/examples/nvidia_runtime/test_vllm.sh +++ b/examples/nvidia_runtime/test_vllm.sh @@ -110,10 +110,6 @@ run_slim_with_monitor() { echo "Starting mint build for: $target_image" echo "Output tag: $output_tag" - # Create a named pipe for signaling - SIGNAL_PIPE=$(mktemp -u) - mkfifo "$SIGNAL_PIPE" - # Start mint in the background # Using --continue-after signal to allow the monitor to signal when done local mint_cmd=("$SLIM_CMD" build @@ -166,9 +162,6 @@ run_slim_with_monitor() { SLIM_EXIT_CODE=$? echo "mint completed with exit code: $SLIM_EXIT_CODE" - # Cleanup the signal pipe - rm -f "$SIGNAL_PIPE" - if [ $SLIM_EXIT_CODE -ne 0 ]; then echo "Warning: mint exited with code $SLIM_EXIT_CODE" fi From f10d5a850df1602e9a27d7bb3dc58e163dcf59af Mon Sep 17 00:00:00 2001 From: akshaver <168006157+akshaver@users.noreply.github.com> Date: Fri, 27 Feb 2026 21:01:55 -0600 Subject: [PATCH 11/19] Update pkg/monitor/ptrace/ptrace.go Co-authored-by: kilo-code-bot[bot] <240665456+kilo-code-bot[bot]@users.noreply.github.com> --- pkg/monitor/ptrace/ptrace.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/monitor/ptrace/ptrace.go b/pkg/monitor/ptrace/ptrace.go index 30d555100..2a3b07db4 100644 --- a/pkg/monitor/ptrace/ptrace.go +++ b/pkg/monitor/ptrace/ptrace.go @@ -286,7 +286,9 @@ func (app *App) processFileActivity(e *syscallEvent) { fsa.OpsAll++ fsa.Pids[e.pid] = struct{}{} fsa.Syscalls[int(e.callNum)] = struct{}{} - if e.retVal == 0 || p.SyscallType() == OpenFileType { + if p.OKReturnStatus(e.retVal) { + fsa.HasSuccessfulAccess = true + } fsa.HasSuccessfulAccess = true } From 3f8dcb47fbdf48d9dd499642bc8b60017265256b Mon Sep 17 00:00:00 2001 From: akshaver <168006157+akshaver@users.noreply.github.com> Date: Fri, 27 Feb 2026 21:02:12 -0600 Subject: [PATCH 12/19] Update pkg/monitor/ptrace/ptrace.go Co-authored-by: kilo-code-bot[bot] <240665456+kilo-code-bot[bot]@users.noreply.github.com> --- pkg/monitor/ptrace/ptrace.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/monitor/ptrace/ptrace.go b/pkg/monitor/ptrace/ptrace.go index 2a3b07db4..e483046b4 100644 --- a/pkg/monitor/ptrace/ptrace.go +++ b/pkg/monitor/ptrace/ptrace.go @@ -302,7 +302,7 @@ func (app *App) processFileActivity(e *syscallEvent) { fsa := &report.FSActivityInfo{ OpsAll: 1, OpsCheckFile: 1, - HasSuccessfulAccess: e.retVal == 0 || p.SyscallType() == OpenFileType, + HasSuccessfulAccess: p.OKReturnStatus(e.retVal), Pids: map[int]struct{}{}, Syscalls: map[int]struct{}{}, } From 755b6d556af774d87b8bc5fc4f304ea8bc322209 Mon Sep 17 00:00:00 2001 From: Adam Shaver Date: Mon, 2 Mar 2026 16:17:35 +0000 Subject: [PATCH 13/19] patch up kilo-code-bot fix --- pkg/monitor/ptrace/ptrace.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/pkg/monitor/ptrace/ptrace.go b/pkg/monitor/ptrace/ptrace.go index e483046b4..9efc22cfc 100644 --- a/pkg/monitor/ptrace/ptrace.go +++ b/pkg/monitor/ptrace/ptrace.go @@ -289,8 +289,6 @@ func (app *App) processFileActivity(e *syscallEvent) { if p.OKReturnStatus(e.retVal) { fsa.HasSuccessfulAccess = true } - fsa.HasSuccessfulAccess = true - } if processor, found := syscallProcessors[int(e.callNum)]; found { switch processor.SyscallType() { From 27c344ba06d3f28b631095deb24a0f873e505053 Mon Sep 17 00:00:00 2001 From: Adam Shaver Date: Mon, 2 Mar 2026 18:01:14 +0000 Subject: [PATCH 14/19] correct indentation --- pkg/monitor/ptrace/ptrace.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/monitor/ptrace/ptrace.go b/pkg/monitor/ptrace/ptrace.go index 9efc22cfc..c5a87afb4 100644 --- a/pkg/monitor/ptrace/ptrace.go +++ b/pkg/monitor/ptrace/ptrace.go @@ -286,9 +286,9 @@ func (app *App) processFileActivity(e *syscallEvent) { fsa.OpsAll++ fsa.Pids[e.pid] = struct{}{} fsa.Syscalls[int(e.callNum)] = struct{}{} - if p.OKReturnStatus(e.retVal) { - fsa.HasSuccessfulAccess = true - } + if p.OKReturnStatus(e.retVal) { + fsa.HasSuccessfulAccess = true + } if processor, found := syscallProcessors[int(e.callNum)]; found { switch processor.SyscallType() { From a2ad80e62c606d0f3f1883163d9c9ae3df695fe5 Mon Sep 17 00:00:00 2001 From: Adam Shaver Date: Mon, 2 Mar 2026 18:38:43 +0000 Subject: [PATCH 15/19] Revert HasSuccessfulAccess to use retVal==0 check instead of OKReturnStatus The kilo-code-bot suggested changing the HasSuccessfulAccess condition from `e.retVal == 0 || p.SyscallType() == OpenFileType` to `p.OKReturnStatus(e.retVal)`, arguing that the former incorrectly treated all OpenFileType syscalls as successful. However, the original expression was correct: - For OpenFileType (open, openat, readlink): the outer condition on line 276-278 already filters to successful calls via `p.OKReturnStatus(e.retVal)`, which requires fd >= 0. The `|| p.SyscallType() == OpenFileType` clause is redundant, not wrong -- it simply ensures HasSuccessfulAccess=true for events that already passed the success filter. - For CheckFileType (stat, access, lstat): `e.retVal == 0` correctly marks only files that actually exist on disk. The replacement `p.OKReturnStatus(e.retVal)` also accepts ENOENT (-2) and ENOTDIR (-20), causing non-existent "ghost" paths to be marked as HasSuccessfulAccess=true. This broke the namespace package fix (da8eb2c7): when Python probes for __init__.cpython-311.so, __init__.so, __init__.py in a directory that has none of them (namespace package), those ENOENT stat results created ghost children with HasSuccessfulAccess=true. The FileActivity() radix tree walk then marked the parent directory as IsSubdir and excluded it from the slim image. Since the ghost children don't exist on disk, neither the directory nor its contents were preserved, causing libraries to go missing. --- pkg/monitor/ptrace/ptrace.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/monitor/ptrace/ptrace.go b/pkg/monitor/ptrace/ptrace.go index c5a87afb4..30d555100 100644 --- a/pkg/monitor/ptrace/ptrace.go +++ b/pkg/monitor/ptrace/ptrace.go @@ -286,9 +286,9 @@ func (app *App) processFileActivity(e *syscallEvent) { fsa.OpsAll++ fsa.Pids[e.pid] = struct{}{} fsa.Syscalls[int(e.callNum)] = struct{}{} - if p.OKReturnStatus(e.retVal) { - fsa.HasSuccessfulAccess = true - } + if e.retVal == 0 || p.SyscallType() == OpenFileType { + fsa.HasSuccessfulAccess = true + } if processor, found := syscallProcessors[int(e.callNum)]; found { switch processor.SyscallType() { @@ -300,7 +300,7 @@ func (app *App) processFileActivity(e *syscallEvent) { fsa := &report.FSActivityInfo{ OpsAll: 1, OpsCheckFile: 1, - HasSuccessfulAccess: p.OKReturnStatus(e.retVal), + HasSuccessfulAccess: e.retVal == 0 || p.SyscallType() == OpenFileType, Pids: map[int]struct{}{}, Syscalls: map[int]struct{}{}, } From 2d09c8ae57c2ece997969e4864f99c465c704b9d Mon Sep 17 00:00:00 2001 From: Adam Shaver Date: Thu, 19 Mar 2026 21:47:54 +0000 Subject: [PATCH 16/19] chore: moved to vLLM v0.17.1-cu130 for testing --- examples/nvidia_runtime/test_vllm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nvidia_runtime/test_vllm.sh b/examples/nvidia_runtime/test_vllm.sh index 1fc1412ea..47544bf0c 100755 --- a/examples/nvidia_runtime/test_vllm.sh +++ b/examples/nvidia_runtime/test_vllm.sh @@ -15,7 +15,7 @@ # limitations under the License. # Configuration -VLLM_IMAGE="${VLLM_IMAGE:-vllm/vllm-openai:v0.15.1}" +VLLM_IMAGE="${VLLM_IMAGE:-vllm/vllm-openai:v0.17.1-cu130}" SLIM_TAG="${SLIM_TAG:-$(echo $VLLM_IMAGE | sed 's|.*/||; s/:/-slim:/')}" MODEL="${MODEL:-TinyLlama/TinyLlama-1.1B-Chat-v1.0}" MAX_WAIT_MINUTES="${MAX_WAIT_MINUTES:-20}" From b33fb91afd21307caaca9064b3f5c68d15c90036 Mon Sep 17 00:00:00 2001 From: Adam Shaver Date: Thu, 19 Mar 2026 21:48:28 +0000 Subject: [PATCH 17/19] lint: fixed indenting in ptrace.go with gofmt --- pkg/monitor/ptrace/ptrace.go | 48 ++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/pkg/monitor/ptrace/ptrace.go b/pkg/monitor/ptrace/ptrace.go index 30d555100..0b15edead 100644 --- a/pkg/monitor/ptrace/ptrace.go +++ b/pkg/monitor/ptrace/ptrace.go @@ -282,34 +282,34 @@ func (app *App) processFileActivity(e *syscallEvent) { !strings.HasPrefix(e.pathParam, "/proc/") && !strings.HasPrefix(e.pathParam, "/sys/") && !strings.HasPrefix(e.pathParam, "/dev/") { - if fsa, ok := app.fsActivity[e.pathParam]; ok { - fsa.OpsAll++ - fsa.Pids[e.pid] = struct{}{} - fsa.Syscalls[int(e.callNum)] = struct{}{} - if e.retVal == 0 || p.SyscallType() == OpenFileType { - fsa.HasSuccessfulAccess = true - } + if fsa, ok := app.fsActivity[e.pathParam]; ok { + fsa.OpsAll++ + fsa.Pids[e.pid] = struct{}{} + fsa.Syscalls[int(e.callNum)] = struct{}{} + if e.retVal == 0 || p.SyscallType() == OpenFileType { + fsa.HasSuccessfulAccess = true + } - if processor, found := syscallProcessors[int(e.callNum)]; found { - switch processor.SyscallType() { - case CheckFileType: - fsa.OpsCheckFile++ + if processor, found := syscallProcessors[int(e.callNum)]; found { + switch processor.SyscallType() { + case CheckFileType: + fsa.OpsCheckFile++ + } + } + } else { + fsa := &report.FSActivityInfo{ + OpsAll: 1, + OpsCheckFile: 1, + HasSuccessfulAccess: e.retVal == 0 || p.SyscallType() == OpenFileType, + Pids: map[int]struct{}{}, + Syscalls: map[int]struct{}{}, } - } - } else { - fsa := &report.FSActivityInfo{ - OpsAll: 1, - OpsCheckFile: 1, - HasSuccessfulAccess: e.retVal == 0 || p.SyscallType() == OpenFileType, - Pids: map[int]struct{}{}, - Syscalls: map[int]struct{}{}, - } - fsa.Pids[e.pid] = struct{}{} - fsa.Syscalls[int(e.callNum)] = struct{}{} + fsa.Pids[e.pid] = struct{}{} + fsa.Syscalls[int(e.callNum)] = struct{}{} - app.fsActivity[e.pathParam] = fsa - } + app.fsActivity[e.pathParam] = fsa + } if app.del != nil { //NOTE: From a245d14719499649cfc6678cbfbec4984d235673 Mon Sep 17 00:00:00 2001 From: Adam Shaver Date: Thu, 2 Apr 2026 20:46:31 +0000 Subject: [PATCH 18/19] add unit tests for bugs related to ghost paths --- pkg/monitor/ptrace/ptrace_test.go | 610 ++++++++++++++++++++++++++++++ 1 file changed, 610 insertions(+) diff --git a/pkg/monitor/ptrace/ptrace_test.go b/pkg/monitor/ptrace/ptrace_test.go index e40d9f0e0..09814c818 100644 --- a/pkg/monitor/ptrace/ptrace_test.go +++ b/pkg/monitor/ptrace/ptrace_test.go @@ -2,8 +2,29 @@ package ptrace import ( "testing" + + log "github.com/sirupsen/logrus" + + "github.com/mintoolkit/mint/pkg/mondel" + "github.com/mintoolkit/mint/pkg/report" + "github.com/mintoolkit/mint/pkg/system" ) +// mockPublisher captures MonitorDataEvents published during processFileActivity. +type mockPublisher struct { + events []*report.MonitorDataEvent +} + +func (m *mockPublisher) Publish(event *report.MonitorDataEvent) error { + m.events = append(m.events, event) + return nil +} + +func (m *mockPublisher) Stop() {} + +// Verify mockPublisher satisfies the interface. +var _ mondel.Publisher = (*mockPublisher)(nil) + func TestGetIntVal(t *testing.T) { tt := []struct { input uint64 @@ -115,3 +136,592 @@ func TestCheckFileSyscallProcessorFailedReturnStatus(t *testing.T) { } } } + +// TestOKReturnStatusAndFailedReturnStatusOverlap demonstrates that for +// checkFileSyscallProcessor, OKReturnStatus and FailedReturnStatus are NOT +// complementary. ENOENT (-2) and ENOTDIR (-20) return true for BOTH methods. +// This is by design: OKReturnStatus means "this path is interesting enough to +// track" while FailedReturnStatus means "the syscall itself did not succeed." +func TestOKReturnStatusAndFailedReturnStatusOverlap(t *testing.T) { + processor := &checkFileSyscallProcessor{ + syscallProcessorCore: &syscallProcessorCore{}, + } + + tt := []struct { + retVal uint64 + okReturn bool + failReturn bool + desc string + }{ + { + retVal: 0, + okReturn: true, + failReturn: false, + desc: "success (0): OK=true, Failed=false — no overlap", + }, + { + retVal: 0xFFFFFFFE, // -2 + okReturn: true, + failReturn: true, + desc: "ENOENT (-2): OK=true AND Failed=true — OVERLAP (path tracked but file doesn't exist)", + }, + { + retVal: 0xFFFFFFEC, // -20 + okReturn: true, + failReturn: true, + desc: "ENOTDIR (-20): OK=true AND Failed=true — OVERLAP (path tracked but not a directory)", + }, + { + retVal: 0xFFFFFFFF, // -1 + okReturn: false, + failReturn: true, + desc: "EPERM (-1): OK=false, Failed=true — no overlap", + }, + } + + for _, test := range tt { + okResult := processor.OKReturnStatus(test.retVal) + failResult := processor.FailedReturnStatus(test.retVal) + if okResult != test.okReturn { + t.Errorf("[%s] OKReturnStatus(0x%x) = %v, want %v", + test.desc, test.retVal, okResult, test.okReturn) + } + if failResult != test.failReturn { + t.Errorf("[%s] FailedReturnStatus(0x%x) = %v, want %v", + test.desc, test.retVal, failResult, test.failReturn) + } + } +} + +// newTestApp creates a minimal App suitable for testing processFileActivity +// and FileActivity without needing exec, ptrace, or signal infrastructure. +func newTestApp() *App { + return &App{ + fsActivity: map[string]*report.FSActivityInfo{}, + logger: log.WithField("test", true), + } +} + +// newTestAppWithPublisher creates a test App with a mock event publisher, +// so we can capture MonitorDataEvents emitted by processFileActivity. +func newTestAppWithPublisher(pub mondel.Publisher) *App { + return &App{ + del: pub, + fsActivity: map[string]*report.FSActivityInfo{}, + logger: log.WithField("test", true), + } +} + +// lookupStatCallNum returns the syscall number for "stat" (or "newfstatat" on +// architectures that don't have "stat"). Panics if neither is found, since +// init() should have registered them. +func lookupStatCallNum() uint32 { + if num, found := system.LookupCallNumber("stat"); found { + return num + } + if num, found := system.LookupCallNumber("newfstatat"); found { + return num + } + panic("neither stat nor newfstatat syscall found — init() may not have run") +} + +// lookupOpenatCallNum returns the syscall number for "openat". +func lookupOpenatCallNum() uint32 { + num, found := system.LookupCallNumber("openat") + if !found { + panic("openat syscall not found") + } + return num +} + +// TestProcessFileActivity_GhostPathsRecorded shows that stat calls returning +// ENOENT (-2) still get recorded in fsActivity (because OKReturnStatus accepts +// ENOENT), but with HasSuccessfulAccess=false. +func TestProcessFileActivity_GhostPathsRecorded(t *testing.T) { + app := newTestApp() + statNum := lookupStatCallNum() + + // Simulate a stat call that returned ENOENT (file not found) + app.processFileActivity(&syscallEvent{ + returned: true, + pid: 1, + callNum: statNum, + retVal: 0xFFFFFFFE, // -2 = ENOENT + pathParam: "/usr/lib/foo/__init__.cpython-311.so", + }) + + fsa, found := app.fsActivity["/usr/lib/foo/__init__.cpython-311.so"] + if !found { + t.Fatal("ghost path (ENOENT stat) was NOT recorded in fsActivity — " + + "OKReturnStatus should accept ENOENT to track Python import probes") + } + + if fsa.HasSuccessfulAccess { + t.Error("ghost path should have HasSuccessfulAccess=false, got true — " + + "ENOENT paths should not be marked as successfully accessed") + } + + if fsa.OpsAll != 1 { + t.Errorf("expected OpsAll=1, got %d", fsa.OpsAll) + } +} + +// TestProcessFileActivity_SuccessPathsRecorded shows that stat calls returning +// success (0) are recorded with HasSuccessfulAccess=true. +func TestProcessFileActivity_SuccessPathsRecorded(t *testing.T) { + app := newTestApp() + statNum := lookupStatCallNum() + + // Simulate a stat call that succeeded (file exists) + app.processFileActivity(&syscallEvent{ + returned: true, + pid: 1, + callNum: statNum, + retVal: 0, // success + pathParam: "/usr/lib/foo/__init__.py", + }) + + fsa, found := app.fsActivity["/usr/lib/foo/__init__.py"] + if !found { + t.Fatal("successful stat path was NOT recorded in fsActivity") + } + + if !fsa.HasSuccessfulAccess { + t.Error("successful stat path should have HasSuccessfulAccess=true, got false") + } +} + +// TestProcessFileActivity_OpenFileAlwaysSuccessful shows that for OpenFileType +// syscalls (e.g., openat), HasSuccessfulAccess is always true when the path +// passes OKReturnStatus (which requires fd >= 0 for open-type calls). +func TestProcessFileActivity_OpenFileAlwaysSuccessful(t *testing.T) { + app := newTestApp() + openatNum := lookupOpenatCallNum() + + // Simulate an openat call that succeeded (returned fd=3) + app.processFileActivity(&syscallEvent{ + returned: true, + pid: 1, + callNum: openatNum, + retVal: 3, // fd=3, success + pathParam: "/etc/config.json", + }) + + fsa, found := app.fsActivity["/etc/config.json"] + if !found { + t.Fatal("successful openat path was NOT recorded in fsActivity") + } + + if !fsa.HasSuccessfulAccess { + t.Error("successful openat path should have HasSuccessfulAccess=true") + } +} + +// TestProcessFileActivity_EPERMNotRecorded shows that stat calls returning +// EPERM (-1) are NOT recorded, because OKReturnStatus rejects them. +func TestProcessFileActivity_EPERMNotRecorded(t *testing.T) { + app := newTestApp() + statNum := lookupStatCallNum() + + app.processFileActivity(&syscallEvent{ + returned: true, + pid: 1, + callNum: statNum, + retVal: 0xFFFFFFFF, // -1 = EPERM + pathParam: "/root/secret", + }) + + if _, found := app.fsActivity["/root/secret"]; found { + t.Error("EPERM stat path should NOT be recorded — OKReturnStatus rejects -1") + } +} + +// TestFileActivity_GhostChildrenDontTriggerIsSubdir demonstrates the fix from +// da8eb2c7: ghost children (ENOENT stat results) with HasSuccessfulAccess=false +// do NOT cause the parent directory to be marked IsSubdir=true and excluded. +// +// Scenario: Python namespace package — directory /usr/lib/foo/ is stat'd +// successfully, but probes for __init__.*.so and __init__.py all return ENOENT. +// The directory must NOT be excluded. +func TestFileActivity_GhostChildrenDontTriggerIsSubdir(t *testing.T) { + app := newTestApp() + statNum := lookupStatCallNum() + + // Parent directory stat — success + app.processFileActivity(&syscallEvent{ + returned: true, pid: 1, callNum: statNum, + retVal: 0, pathParam: "/usr/lib/foo", + }) + + // Ghost children — ENOENT probes for __init__ variants + ghostPaths := []string{ + "/usr/lib/foo/__init__.cpython-311.so", + "/usr/lib/foo/__init__.so", + "/usr/lib/foo/__init__.py", + } + for _, p := range ghostPaths { + app.processFileActivity(&syscallEvent{ + returned: true, pid: 1, callNum: statNum, + retVal: 0xFFFFFFFE, pathParam: p, // ENOENT + }) + } + + result := app.FileActivity() + + // The parent directory must NOT be filtered out + if _, found := result["/usr/lib/foo"]; !found { + t.Error("/usr/lib/foo was excluded by FileActivity() — " + + "ghost children with HasSuccessfulAccess=false should not trigger IsSubdir") + } + + // Ghost children should still be present (they are leaf nodes, not IsSubdir) + for _, p := range ghostPaths { + if fsa, found := result[p]; found { + if fsa.HasSuccessfulAccess { + t.Errorf("ghost path %s should have HasSuccessfulAccess=false", p) + } + } + // Note: ghost paths ARE expected in the result — this is side effect #2 + // from the analysis. They pass through FileActivity() because they are + // leaf nodes with no children. + } +} + +// TestFileActivity_RealChildrenTriggerIsSubdir shows that when a child path has +// HasSuccessfulAccess=true (i.e., the file actually exists), the parent IS +// correctly marked IsSubdir and excluded from the result. +func TestFileActivity_RealChildrenTriggerIsSubdir(t *testing.T) { + app := newTestApp() + statNum := lookupStatCallNum() + + // Parent directory stat — success + app.processFileActivity(&syscallEvent{ + returned: true, pid: 1, callNum: statNum, + retVal: 0, pathParam: "/usr/lib/bar", + }) + + // Real child file — success + app.processFileActivity(&syscallEvent{ + returned: true, pid: 1, callNum: statNum, + retVal: 0, pathParam: "/usr/lib/bar/__init__.py", + }) + + result := app.FileActivity() + + // The parent directory SHOULD be filtered out (IsSubdir=true) + if _, found := result["/usr/lib/bar"]; found { + t.Error("/usr/lib/bar should be excluded by FileActivity() — " + + "a real child with HasSuccessfulAccess=true should trigger IsSubdir") + } + + // The child file should be present + if _, found := result["/usr/lib/bar/__init__.py"]; !found { + t.Error("/usr/lib/bar/__init__.py should be in FileActivity() result") + } +} + +// TestFileActivity_GhostLeafPathsPassThrough demonstrates side effect #2: +// ghost leaf paths (no children of their own) survive FileActivity() filtering +// and appear in the final report, even though they represent non-existent files. +func TestFileActivity_GhostLeafPathsPassThrough(t *testing.T) { + app := newTestApp() + statNum := lookupStatCallNum() + + // A ghost leaf path with no parent or children in fsActivity + app.processFileActivity(&syscallEvent{ + returned: true, pid: 1, callNum: statNum, + retVal: 0xFFFFFFFE, pathParam: "/usr/lib/nonexistent.so", // ENOENT + }) + + result := app.FileActivity() + + fsa, found := result["/usr/lib/nonexistent.so"] + if !found { + t.Fatal("ghost leaf path was filtered out by FileActivity() — " + + "expected it to pass through (this IS a known side effect)") + } + + if fsa.HasSuccessfulAccess { + t.Error("ghost leaf path should have HasSuccessfulAccess=false") + } +} + +// TestFileActivity_MixedGhostAndRealChildren tests the realistic Python import +// scenario: a package directory has both ghost probes (ENOENT) and one real file +// (success). The parent should be marked IsSubdir because a real child exists. +func TestFileActivity_MixedGhostAndRealChildren(t *testing.T) { + app := newTestApp() + statNum := lookupStatCallNum() + + // Parent directory + app.processFileActivity(&syscallEvent{ + returned: true, pid: 1, callNum: statNum, + retVal: 0, pathParam: "/usr/lib/pkg", + }) + + // Ghost probes — ENOENT + app.processFileActivity(&syscallEvent{ + returned: true, pid: 1, callNum: statNum, + retVal: 0xFFFFFFFE, pathParam: "/usr/lib/pkg/__init__.cpython-311.so", + }) + app.processFileActivity(&syscallEvent{ + returned: true, pid: 1, callNum: statNum, + retVal: 0xFFFFFFFE, pathParam: "/usr/lib/pkg/__init__.so", + }) + + // Real file — success + app.processFileActivity(&syscallEvent{ + returned: true, pid: 1, callNum: statNum, + retVal: 0, pathParam: "/usr/lib/pkg/__init__.py", + }) + + result := app.FileActivity() + + // Parent should be excluded (real child exists) + if _, found := result["/usr/lib/pkg"]; found { + t.Error("/usr/lib/pkg should be excluded — real child __init__.py exists") + } + + // Real child should be present + if _, found := result["/usr/lib/pkg/__init__.py"]; !found { + t.Error("/usr/lib/pkg/__init__.py should be in result") + } + + // Ghost children should also be present (side effect #2) + for _, p := range []string{ + "/usr/lib/pkg/__init__.cpython-311.so", + "/usr/lib/pkg/__init__.so", + } { + if fsa, found := result[p]; found { + if fsa.HasSuccessfulAccess { + t.Errorf("ghost path %s should have HasSuccessfulAccess=false", p) + } + } + } +} + +// ============================================================================= +// Bug detection tests +// +// These tests verify whether the three possible open bugs identified in the +// analysis document are live. Each test is structured so that: +// - If the bug IS LIVE, the test documents the buggy behavior (passes but +// logs what's wrong via t.Log). +// - If the bug is FIXED in the future, the test will fail, signaling that +// the fix landed and the test expectations should be updated. +// ============================================================================= + +// BUG #1: Ghost leaf paths leak into the final FSActivity report. +// +// FileActivity() only filters paths with IsSubdir=true. Ghost leaf paths +// (ENOENT stat results with no children) have HasSuccessfulAccess=false but +// are never filtered. They appear in Report.FSActivity and get passed to +// prepareArtifact() for files that don't exist on disk. +// +// This test simulates a Python import that probes 3 non-existent paths and +// finds 1 real file. It checks whether the ghost paths leak into the report. +func TestBug1_GhostLeafPathsLeakIntoReport(t *testing.T) { + app := newTestApp() + statNum := lookupStatCallNum() + + // Python probes these — all return ENOENT (file not found) + ghostPaths := []string{ + "/usr/lib/mypackage/__init__.cpython-311.so", + "/usr/lib/mypackage/__init__.so", + "/usr/lib/mypackage/__init__.abi3.so", + } + for _, p := range ghostPaths { + app.processFileActivity(&syscallEvent{ + returned: true, pid: 1, callNum: statNum, + retVal: 0xFFFFFFFE, pathParam: p, // ENOENT + }) + } + + // One real file found + app.processFileActivity(&syscallEvent{ + returned: true, pid: 1, callNum: statNum, + retVal: 0, pathParam: "/usr/lib/mypackage/__init__.py", + }) + + result := app.FileActivity() + + // Count how many ghost (non-existent) paths leaked into the report + leaked := 0 + for _, p := range ghostPaths { + if fsa, found := result[p]; found { + leaked++ + if fsa.HasSuccessfulAccess { + t.Errorf("ghost path %s has HasSuccessfulAccess=true (unexpected)", p) + } + } + } + + if leaked > 0 { + // BUG IS LIVE: ghost paths leak into the report. + // When this bug is fixed, this branch will stop being reached and the + // test will fail at the assertion below — update it to expect 0. + t.Logf("BUG #1 IS LIVE: %d/%d ghost leaf paths leaked into FileActivity() report", + leaked, len(ghostPaths)) + if leaked != len(ghostPaths) { + t.Errorf("expected all %d ghost paths to leak (or none if fixed), got %d", + len(ghostPaths), leaked) + } + } else { + // BUG IS FIXED: no ghost paths in the report. + t.Log("BUG #1 IS FIXED: ghost leaf paths are no longer in the report") + } +} + +// BUG #2: Ghost paths are published as MDETypeArtifact events. +// +// processFileActivity() publishes a MonitorDataEvent for every path that passes +// the OKReturnStatus gate, including ENOENT ghost paths. Event stream consumers +// receive artifact events for files that don't exist. +func TestBug2_GhostPathsPublishedAsArtifactEvents(t *testing.T) { + pub := &mockPublisher{} + app := newTestAppWithPublisher(pub) + statNum := lookupStatCallNum() + + // Ghost path — ENOENT + app.processFileActivity(&syscallEvent{ + returned: true, pid: 1, callNum: statNum, + retVal: 0xFFFFFFFE, pathParam: "/usr/lib/ghost.so", // ENOENT + }) + + // Real path — success + app.processFileActivity(&syscallEvent{ + returned: true, pid: 1, callNum: statNum, + retVal: 0, pathParam: "/usr/lib/real.so", + }) + + // Check which paths got published + ghostPublished := false + realPublished := false + for _, ev := range pub.events { + if ev.Artifact == "/usr/lib/ghost.so" { + ghostPublished = true + } + if ev.Artifact == "/usr/lib/real.so" { + realPublished = true + } + } + + if !realPublished { + t.Error("real path /usr/lib/real.so was NOT published as an event (unexpected)") + } + + if ghostPublished { + // BUG IS LIVE: ghost paths are published as artifact events. + t.Log("BUG #2 IS LIVE: ghost path /usr/lib/ghost.so was published as " + + "an MDETypeArtifact event despite the file not existing on disk") + } else { + // BUG IS FIXED: ghost paths are not published. + t.Log("BUG #2 IS FIXED: ghost paths are no longer published as artifact events") + } +} + +// BUG #3: OKReturnStatus/FailedReturnStatus naming trap causes regressions. +// +// This test reproduces the exact regression introduced by kilo-code-bot in +// commit 3f8dcb47: using p.OKReturnStatus(e.retVal) for the HasSuccessfulAccess +// condition instead of e.retVal == 0. Because OKReturnStatus accepts ENOENT, +// ghost paths get HasSuccessfulAccess=true, which breaks the namespace package +// fix — ghost children trigger IsSubdir on the parent directory. +// +// The test simulates what WOULD happen if someone made this mistake again, +// by directly manipulating HasSuccessfulAccess to match what OKReturnStatus +// would produce, and showing the cascade failure. +func TestBug3_OKReturnStatusMisuseBreaksNamespacePackages(t *testing.T) { + statNum := lookupStatCallNum() + statProcessor, found := syscallProcessors[int(statNum)] + if !found { + t.Fatal("stat processor not found in syscallProcessors") + } + + // --- Scenario: namespace package directory with only ghost children --- + // Current (correct) behavior: use e.retVal == 0 for HasSuccessfulAccess + t.Run("correct_behavior_retVal_eq_0", func(t *testing.T) { + app := newTestApp() + + // Parent directory — success + app.processFileActivity(&syscallEvent{ + returned: true, pid: 1, callNum: statNum, + retVal: 0, pathParam: "/usr/lib/nspkg", + }) + // Ghost child — ENOENT + app.processFileActivity(&syscallEvent{ + returned: true, pid: 1, callNum: statNum, + retVal: 0xFFFFFFFE, pathParam: "/usr/lib/nspkg/__init__.py", + }) + + result := app.FileActivity() + if _, found := result["/usr/lib/nspkg"]; !found { + t.Error("namespace package directory was incorrectly excluded") + } + }) + + // Simulated regression: if HasSuccessfulAccess used OKReturnStatus instead + // of retVal == 0, ENOENT ghost children would get HasSuccessfulAccess=true, + // triggering IsSubdir on the parent and excluding it. + t.Run("regression_if_OKReturnStatus_used_for_HasSuccessfulAccess", func(t *testing.T) { + app := newTestApp() + + // Parent directory — success + app.fsActivity["/usr/lib/nspkg"] = &report.FSActivityInfo{ + OpsAll: 1, OpsCheckFile: 1, + HasSuccessfulAccess: true, + Pids: map[int]struct{}{1: {}}, + Syscalls: map[int]struct{}{int(statNum): {}}, + } + + // Ghost child — simulate what happens if HasSuccessfulAccess used + // OKReturnStatus: ENOENT (-2) returns true from OKReturnStatus, so + // HasSuccessfulAccess would be set to true (THIS IS THE BUG). + enoentRetVal := uint64(0xFFFFFFFE) // -2 + buggyHasSuccessfulAccess := statProcessor.OKReturnStatus(enoentRetVal) + // Verify the premise: OKReturnStatus does accept ENOENT + if !buggyHasSuccessfulAccess { + t.Fatal("test premise failed: OKReturnStatus should accept ENOENT") + } + + app.fsActivity["/usr/lib/nspkg/__init__.py"] = &report.FSActivityInfo{ + OpsAll: 1, OpsCheckFile: 1, + HasSuccessfulAccess: buggyHasSuccessfulAccess, // true — THE BUG + Pids: map[int]struct{}{1: {}}, + Syscalls: map[int]struct{}{int(statNum): {}}, + } + + result := app.FileActivity() + + if _, found := result["/usr/lib/nspkg"]; found { + t.Log("BUG #3 CONFIRMED: if OKReturnStatus were used for " + + "HasSuccessfulAccess, this namespace package directory would " + + "survive (but only because there's no real child to trigger IsSubdir)") + } else { + // The parent was excluded — the ghost child with + // HasSuccessfulAccess=true triggered IsSubdir. + t.Log("BUG #3 IS LIVE (naming trap): using OKReturnStatus for " + + "HasSuccessfulAccess causes ghost children to trigger IsSubdir, " + + "excluding the namespace package directory from the slim image. " + + "This is the exact regression from commit 3f8dcb47.") + } + + // The key assertion: the current code does NOT use OKReturnStatus for + // HasSuccessfulAccess, so verify the correct path still works. + correctApp := newTestApp() + correctApp.processFileActivity(&syscallEvent{ + returned: true, pid: 1, callNum: statNum, + retVal: 0, pathParam: "/usr/lib/nspkg2", + }) + correctApp.processFileActivity(&syscallEvent{ + returned: true, pid: 1, callNum: statNum, + retVal: 0xFFFFFFFE, pathParam: "/usr/lib/nspkg2/__init__.py", + }) + correctResult := correctApp.FileActivity() + if _, found := correctResult["/usr/lib/nspkg2"]; !found { + t.Error("REGRESSION: current code is excluding namespace package " + + "directories — the HasSuccessfulAccess condition may have been " + + "changed to use OKReturnStatus again") + } + }) +} From b45ab6f5d6918207e0bdda59bdacdc4a0647e08b Mon Sep 17 00:00:00 2001 From: Adam Shaver Date: Thu, 2 Apr 2026 23:40:37 +0000 Subject: [PATCH 19/19] Revert OKReturnStatus to success-only and remove HasSuccessfulAccess MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OKReturnStatus for checkFileSyscallProcessor was broadened in 9aca3e93 to accept ENOENT (-2) and ENOTDIR (-20) for Python import tracking. However, ghost paths (non-existent files) that entered fsActivity via this gate were never used by the artifact pipeline — prepareArtifact() discards them at os.Lstat(). Meanwhile they caused three side effects: 1. Ghost leaf paths leaked into Report.FSActivity and triggered prepareArtifact() calls for non-existent files 2. Ghost paths were published as MDETypeArtifact events 3. The permissive OKReturnStatus semantics caused a naming trap that led to a regression (3f8dcb47) where HasSuccessfulAccess was set using OKReturnStatus, breaking namespace package directories This reverts OKReturnStatus to the upstream original (retVal == 0), removes the HasSuccessfulAccess field and its plumbing (no longer needed since only real paths enter fsActivity), and simplifies the FileActivity() radix tree walk. Tests updated to verify: ENOENT paths are rejected, no ghost paths in reports, namespace package directories are preserved, IsSubdir works correctly for real children. --- pkg/monitor/ptrace/ptrace.go | 36 +-- pkg/monitor/ptrace/ptrace_test.go | 479 ++++++------------------------ pkg/report/container_report.go | 11 +- 3 files changed, 110 insertions(+), 416 deletions(-) diff --git a/pkg/monitor/ptrace/ptrace.go b/pkg/monitor/ptrace/ptrace.go index 0b15edead..673eb12ca 100644 --- a/pkg/monitor/ptrace/ptrace.go +++ b/pkg/monitor/ptrace/ptrace.go @@ -286,9 +286,6 @@ func (app *App) processFileActivity(e *syscallEvent) { fsa.OpsAll++ fsa.Pids[e.pid] = struct{}{} fsa.Syscalls[int(e.callNum)] = struct{}{} - if e.retVal == 0 || p.SyscallType() == OpenFileType { - fsa.HasSuccessfulAccess = true - } if processor, found := syscallProcessors[int(e.callNum)]; found { switch processor.SyscallType() { @@ -298,11 +295,10 @@ func (app *App) processFileActivity(e *syscallEvent) { } } else { fsa := &report.FSActivityInfo{ - OpsAll: 1, - OpsCheckFile: 1, - HasSuccessfulAccess: e.retVal == 0 || p.SyscallType() == OpenFileType, - Pids: map[int]struct{}{}, - Syscalls: map[int]struct{}{}, + OpsAll: 1, + OpsCheckFile: 1, + Pids: map[int]struct{}{}, + Syscalls: map[int]struct{}{}, } fsa.Pids[e.pid] = struct{}{} @@ -508,21 +504,8 @@ func (app *App) FileActivity() map[string]*report.FSActivityInfo { return false } - adata, ok := av.(*report.FSActivityInfo) - if !ok { - return false - } - - // Only mark as subdirectory if the child path was actually - // found on disk. ENOENT "ghost" children (e.g., Python probing - // for __init__.so/.py in a namespace package directory) must - // not cause the parent directory to be excluded. - if adata.HasSuccessfulAccess { - wdata.IsSubdir = true - return true - } - - return false + wdata.IsSubdir = true + return true } t.WalkPrefix(wkey, walkAfter) @@ -1248,12 +1231,7 @@ func (ref *checkFileSyscallProcessor) OKCall(cstate *syscallState) bool { } func (ref *checkFileSyscallProcessor) OKReturnStatus(retVal uint64) bool { - // Accept successful stat calls (0) and also failed attempts that indicate - // the application was looking for the file. This is important for Python - // imports which check multiple locations before finding the right file. - // Track ENOENT (file not found) and ENOTDIR (not a directory) in addition to success. - intRetVal := getIntVal(retVal) - return intRetVal == 0 || intRetVal == -2 || intRetVal == -20 // 0=success, -2=ENOENT, -20=ENOTDIR + return retVal == 0 } func (ref *checkFileSyscallProcessor) EventOnCall() bool { diff --git a/pkg/monitor/ptrace/ptrace_test.go b/pkg/monitor/ptrace/ptrace_test.go index 09814c818..9852bb780 100644 --- a/pkg/monitor/ptrace/ptrace_test.go +++ b/pkg/monitor/ptrace/ptrace_test.go @@ -5,26 +5,10 @@ import ( log "github.com/sirupsen/logrus" - "github.com/mintoolkit/mint/pkg/mondel" "github.com/mintoolkit/mint/pkg/report" "github.com/mintoolkit/mint/pkg/system" ) -// mockPublisher captures MonitorDataEvents published during processFileActivity. -type mockPublisher struct { - events []*report.MonitorDataEvent -} - -func (m *mockPublisher) Publish(event *report.MonitorDataEvent) error { - m.events = append(m.events, event) - return nil -} - -func (m *mockPublisher) Stop() {} - -// Verify mockPublisher satisfies the interface. -var _ mondel.Publisher = (*mockPublisher)(nil) - func TestGetIntVal(t *testing.T) { tt := []struct { input uint64 @@ -45,6 +29,9 @@ func TestGetIntVal(t *testing.T) { } } +// TestCheckFileSyscallProcessorOKReturnStatus verifies that OKReturnStatus +// only accepts successful stat calls (retVal == 0). ENOENT, ENOTDIR, and +// other errors are rejected — only paths that actually exist get tracked. func TestCheckFileSyscallProcessorOKReturnStatus(t *testing.T) { processor := &checkFileSyscallProcessor{ syscallProcessorCore: &syscallProcessorCore{}, @@ -58,37 +45,32 @@ func TestCheckFileSyscallProcessorOKReturnStatus(t *testing.T) { { retVal: 0, expected: true, - desc: "success (0)", + desc: "success (0) - file exists, should be tracked", }, { retVal: 0xFFFFFFFE, // -2 as uint64 - expected: true, - desc: "ENOENT (-2) - file not found, should be tracked", + expected: false, + desc: "ENOENT (-2) - file not found, should NOT be tracked", }, { retVal: 0xFFFFFFEC, // -20 as uint64 - expected: true, - desc: "ENOTDIR (-20) - not a directory, should be tracked", + expected: false, + desc: "ENOTDIR (-20) - not a directory, should NOT be tracked", }, { retVal: 0xFFFFFFFF, // -1 as uint64 expected: false, - desc: "EPERM (-1) - should not be tracked", + desc: "EPERM (-1) - should NOT be tracked", }, { retVal: 0xFFFFFFFD, // -3 as uint64 expected: false, - desc: "ESRCH (-3) - should not be tracked", - }, - { - retVal: 0xFFFFFFED, // -19 as uint64 - expected: false, - desc: "ENODEV (-19) - should not be tracked", + desc: "ESRCH (-3) - should NOT be tracked", }, { retVal: 1, expected: false, - desc: "positive return value - should not be tracked", + desc: "positive return value - should NOT be tracked", }, } @@ -137,12 +119,10 @@ func TestCheckFileSyscallProcessorFailedReturnStatus(t *testing.T) { } } -// TestOKReturnStatusAndFailedReturnStatusOverlap demonstrates that for -// checkFileSyscallProcessor, OKReturnStatus and FailedReturnStatus are NOT -// complementary. ENOENT (-2) and ENOTDIR (-20) return true for BOTH methods. -// This is by design: OKReturnStatus means "this path is interesting enough to -// track" while FailedReturnStatus means "the syscall itself did not succeed." -func TestOKReturnStatusAndFailedReturnStatusOverlap(t *testing.T) { +// TestOKReturnStatusAndFailedReturnStatusAreComplements verifies that after +// reverting OKReturnStatus to success-only, OKReturnStatus and +// FailedReturnStatus are now proper complements (no overlap). +func TestOKReturnStatusAndFailedReturnStatusAreComplements(t *testing.T) { processor := &checkFileSyscallProcessor{ syscallProcessorCore: &syscallProcessorCore{}, } @@ -157,25 +137,25 @@ func TestOKReturnStatusAndFailedReturnStatusOverlap(t *testing.T) { retVal: 0, okReturn: true, failReturn: false, - desc: "success (0): OK=true, Failed=false — no overlap", + desc: "success (0)", }, { retVal: 0xFFFFFFFE, // -2 - okReturn: true, + okReturn: false, failReturn: true, - desc: "ENOENT (-2): OK=true AND Failed=true — OVERLAP (path tracked but file doesn't exist)", + desc: "ENOENT (-2)", }, { retVal: 0xFFFFFFEC, // -20 - okReturn: true, + okReturn: false, failReturn: true, - desc: "ENOTDIR (-20): OK=true AND Failed=true — OVERLAP (path tracked but not a directory)", + desc: "ENOTDIR (-20)", }, { retVal: 0xFFFFFFFF, // -1 okReturn: false, failReturn: true, - desc: "EPERM (-1): OK=false, Failed=true — no overlap", + desc: "EPERM (-1)", }, } @@ -190,6 +170,10 @@ func TestOKReturnStatusAndFailedReturnStatusOverlap(t *testing.T) { t.Errorf("[%s] FailedReturnStatus(0x%x) = %v, want %v", test.desc, test.retVal, failResult, test.failReturn) } + if okResult == failResult { + t.Errorf("[%s] OKReturnStatus and FailedReturnStatus should be "+ + "complements but both returned %v", test.desc, okResult) + } } } @@ -202,16 +186,6 @@ func newTestApp() *App { } } -// newTestAppWithPublisher creates a test App with a mock event publisher, -// so we can capture MonitorDataEvents emitted by processFileActivity. -func newTestAppWithPublisher(pub mondel.Publisher) *App { - return &App{ - del: pub, - fsActivity: map[string]*report.FSActivityInfo{}, - logger: log.WithField("test", true), - } -} - // lookupStatCallNum returns the syscall number for "stat" (or "newfstatat" on // architectures that don't have "stat"). Panics if neither is found, since // init() should have registered them. @@ -234,14 +208,13 @@ func lookupOpenatCallNum() uint32 { return num } -// TestProcessFileActivity_GhostPathsRecorded shows that stat calls returning -// ENOENT (-2) still get recorded in fsActivity (because OKReturnStatus accepts -// ENOENT), but with HasSuccessfulAccess=false. -func TestProcessFileActivity_GhostPathsRecorded(t *testing.T) { +// TestProcessFileActivity_ENOENTNotRecorded verifies that stat calls returning +// ENOENT (-2) are NOT recorded in fsActivity. Only successful calls (retVal==0) +// pass the OKReturnStatus gate. +func TestProcessFileActivity_ENOENTNotRecorded(t *testing.T) { app := newTestApp() statNum := lookupStatCallNum() - // Simulate a stat call that returned ENOENT (file not found) app.processFileActivity(&syscallEvent{ returned: true, pid: 1, @@ -250,29 +223,18 @@ func TestProcessFileActivity_GhostPathsRecorded(t *testing.T) { pathParam: "/usr/lib/foo/__init__.cpython-311.so", }) - fsa, found := app.fsActivity["/usr/lib/foo/__init__.cpython-311.so"] - if !found { - t.Fatal("ghost path (ENOENT stat) was NOT recorded in fsActivity — " + - "OKReturnStatus should accept ENOENT to track Python import probes") - } - - if fsa.HasSuccessfulAccess { - t.Error("ghost path should have HasSuccessfulAccess=false, got true — " + - "ENOENT paths should not be marked as successfully accessed") - } - - if fsa.OpsAll != 1 { - t.Errorf("expected OpsAll=1, got %d", fsa.OpsAll) + if _, found := app.fsActivity["/usr/lib/foo/__init__.cpython-311.so"]; found { + t.Error("ENOENT stat path should NOT be recorded — " + + "OKReturnStatus should only accept retVal == 0") } } -// TestProcessFileActivity_SuccessPathsRecorded shows that stat calls returning -// success (0) are recorded with HasSuccessfulAccess=true. +// TestProcessFileActivity_SuccessPathsRecorded verifies that stat calls +// returning success (0) are recorded in fsActivity. func TestProcessFileActivity_SuccessPathsRecorded(t *testing.T) { app := newTestApp() statNum := lookupStatCallNum() - // Simulate a stat call that succeeded (file exists) app.processFileActivity(&syscallEvent{ returned: true, pid: 1, @@ -281,24 +243,17 @@ func TestProcessFileActivity_SuccessPathsRecorded(t *testing.T) { pathParam: "/usr/lib/foo/__init__.py", }) - fsa, found := app.fsActivity["/usr/lib/foo/__init__.py"] - if !found { + if _, found := app.fsActivity["/usr/lib/foo/__init__.py"]; !found { t.Fatal("successful stat path was NOT recorded in fsActivity") } - - if !fsa.HasSuccessfulAccess { - t.Error("successful stat path should have HasSuccessfulAccess=true, got false") - } } -// TestProcessFileActivity_OpenFileAlwaysSuccessful shows that for OpenFileType -// syscalls (e.g., openat), HasSuccessfulAccess is always true when the path -// passes OKReturnStatus (which requires fd >= 0 for open-type calls). -func TestProcessFileActivity_OpenFileAlwaysSuccessful(t *testing.T) { +// TestProcessFileActivity_OpenFileSuccessRecorded verifies that openat calls +// with a successful fd (>= 0) are recorded. +func TestProcessFileActivity_OpenFileSuccessRecorded(t *testing.T) { app := newTestApp() openatNum := lookupOpenatCallNum() - // Simulate an openat call that succeeded (returned fd=3) app.processFileActivity(&syscallEvent{ returned: true, pid: 1, @@ -307,18 +262,13 @@ func TestProcessFileActivity_OpenFileAlwaysSuccessful(t *testing.T) { pathParam: "/etc/config.json", }) - fsa, found := app.fsActivity["/etc/config.json"] - if !found { + if _, found := app.fsActivity["/etc/config.json"]; !found { t.Fatal("successful openat path was NOT recorded in fsActivity") } - - if !fsa.HasSuccessfulAccess { - t.Error("successful openat path should have HasSuccessfulAccess=true") - } } -// TestProcessFileActivity_EPERMNotRecorded shows that stat calls returning -// EPERM (-1) are NOT recorded, because OKReturnStatus rejects them. +// TestProcessFileActivity_EPERMNotRecorded verifies that stat calls returning +// EPERM (-1) are NOT recorded. func TestProcessFileActivity_EPERMNotRecorded(t *testing.T) { app := newTestApp() statNum := lookupStatCallNum() @@ -332,34 +282,30 @@ func TestProcessFileActivity_EPERMNotRecorded(t *testing.T) { }) if _, found := app.fsActivity["/root/secret"]; found { - t.Error("EPERM stat path should NOT be recorded — OKReturnStatus rejects -1") + t.Error("EPERM stat path should NOT be recorded") } } -// TestFileActivity_GhostChildrenDontTriggerIsSubdir demonstrates the fix from -// da8eb2c7: ghost children (ENOENT stat results) with HasSuccessfulAccess=false -// do NOT cause the parent directory to be marked IsSubdir=true and excluded. -// -// Scenario: Python namespace package — directory /usr/lib/foo/ is stat'd -// successfully, but probes for __init__.*.so and __init__.py all return ENOENT. -// The directory must NOT be excluded. -func TestFileActivity_GhostChildrenDontTriggerIsSubdir(t *testing.T) { +// TestFileActivity_NamespacePackageDirectoryPreserved tests that a namespace +// package directory (no __init__.py) is preserved in FileActivity() when only +// the directory itself has a successful stat. Since ENOENT probes are no longer +// tracked, there are no ghost children to cause IsSubdir false positives. +func TestFileActivity_NamespacePackageDirectoryPreserved(t *testing.T) { app := newTestApp() statNum := lookupStatCallNum() - // Parent directory stat — success + // Only the directory stat succeeds — ENOENT probes are not tracked app.processFileActivity(&syscallEvent{ returned: true, pid: 1, callNum: statNum, retVal: 0, pathParam: "/usr/lib/foo", }) - // Ghost children — ENOENT probes for __init__ variants - ghostPaths := []string{ + // These ENOENT probes should be silently ignored by the gate + for _, p := range []string{ "/usr/lib/foo/__init__.cpython-311.so", "/usr/lib/foo/__init__.so", "/usr/lib/foo/__init__.py", - } - for _, p := range ghostPaths { + } { app.processFileActivity(&syscallEvent{ returned: true, pid: 1, callNum: statNum, retVal: 0xFFFFFFFE, pathParam: p, // ENOENT @@ -368,33 +314,28 @@ func TestFileActivity_GhostChildrenDontTriggerIsSubdir(t *testing.T) { result := app.FileActivity() - // The parent directory must NOT be filtered out if _, found := result["/usr/lib/foo"]; !found { - t.Error("/usr/lib/foo was excluded by FileActivity() — " + - "ghost children with HasSuccessfulAccess=false should not trigger IsSubdir") + t.Error("/usr/lib/foo was excluded — namespace package directory " + + "should be preserved since no ghost children can trigger IsSubdir") } - // Ghost children should still be present (they are leaf nodes, not IsSubdir) - for _, p := range ghostPaths { - if fsa, found := result[p]; found { - if fsa.HasSuccessfulAccess { - t.Errorf("ghost path %s should have HasSuccessfulAccess=false", p) - } + // Verify no ghost paths leaked into the result + if len(result) != 1 { + t.Errorf("expected exactly 1 entry (the directory), got %d", len(result)) + for k := range result { + t.Logf(" result key: %s", k) } - // Note: ghost paths ARE expected in the result — this is side effect #2 - // from the analysis. They pass through FileActivity() because they are - // leaf nodes with no children. } } -// TestFileActivity_RealChildrenTriggerIsSubdir shows that when a child path has -// HasSuccessfulAccess=true (i.e., the file actually exists), the parent IS -// correctly marked IsSubdir and excluded from the result. +// TestFileActivity_RealChildrenTriggerIsSubdir verifies that when a child path +// has a successful stat (file exists), the parent directory is correctly marked +// IsSubdir and excluded from the result. func TestFileActivity_RealChildrenTriggerIsSubdir(t *testing.T) { app := newTestApp() statNum := lookupStatCallNum() - // Parent directory stat — success + // Parent directory — success app.processFileActivity(&syscallEvent{ returned: true, pid: 1, callNum: statNum, retVal: 0, pathParam: "/usr/lib/bar", @@ -408,48 +349,53 @@ func TestFileActivity_RealChildrenTriggerIsSubdir(t *testing.T) { result := app.FileActivity() - // The parent directory SHOULD be filtered out (IsSubdir=true) if _, found := result["/usr/lib/bar"]; found { - t.Error("/usr/lib/bar should be excluded by FileActivity() — " + - "a real child with HasSuccessfulAccess=true should trigger IsSubdir") + t.Error("/usr/lib/bar should be excluded — " + + "a real child should trigger IsSubdir") } - // The child file should be present if _, found := result["/usr/lib/bar/__init__.py"]; !found { - t.Error("/usr/lib/bar/__init__.py should be in FileActivity() result") + t.Error("/usr/lib/bar/__init__.py should be in result") } } -// TestFileActivity_GhostLeafPathsPassThrough demonstrates side effect #2: -// ghost leaf paths (no children of their own) survive FileActivity() filtering -// and appear in the final report, even though they represent non-existent files. -func TestFileActivity_GhostLeafPathsPassThrough(t *testing.T) { +// TestFileActivity_NoGhostPathsInReport verifies that ENOENT stat results +// never appear in the FileActivity() output. +func TestFileActivity_NoGhostPathsInReport(t *testing.T) { app := newTestApp() statNum := lookupStatCallNum() - // A ghost leaf path with no parent or children in fsActivity + // Mix of successful and failed stats + app.processFileActivity(&syscallEvent{ + returned: true, pid: 1, callNum: statNum, + retVal: 0, pathParam: "/usr/lib/real.so", + }) + app.processFileActivity(&syscallEvent{ + returned: true, pid: 1, callNum: statNum, + retVal: 0xFFFFFFFE, pathParam: "/usr/lib/ghost.so", // ENOENT + }) app.processFileActivity(&syscallEvent{ returned: true, pid: 1, callNum: statNum, - retVal: 0xFFFFFFFE, pathParam: "/usr/lib/nonexistent.so", // ENOENT + retVal: 0xFFFFFFEC, pathParam: "/usr/lib/ghost2.so", // ENOTDIR }) result := app.FileActivity() - fsa, found := result["/usr/lib/nonexistent.so"] - if !found { - t.Fatal("ghost leaf path was filtered out by FileActivity() — " + - "expected it to pass through (this IS a known side effect)") + if _, found := result["/usr/lib/real.so"]; !found { + t.Error("real path should be in result") } - - if fsa.HasSuccessfulAccess { - t.Error("ghost leaf path should have HasSuccessfulAccess=false") + if _, found := result["/usr/lib/ghost.so"]; found { + t.Error("ENOENT ghost path should NOT be in result") + } + if _, found := result["/usr/lib/ghost2.so"]; found { + t.Error("ENOTDIR ghost path should NOT be in result") } } -// TestFileActivity_MixedGhostAndRealChildren tests the realistic Python import -// scenario: a package directory has both ghost probes (ENOENT) and one real file -// (success). The parent should be marked IsSubdir because a real child exists. -func TestFileActivity_MixedGhostAndRealChildren(t *testing.T) { +// TestFileActivity_MixedRealAndGhostChildren tests the realistic Python +// import scenario: a package directory has both ENOENT probes and one real +// file. Only the real file and its IsSubdir effect should matter. +func TestFileActivity_MixedRealAndGhostChildren(t *testing.T) { app := newTestApp() statNum := lookupStatCallNum() @@ -459,7 +405,7 @@ func TestFileActivity_MixedGhostAndRealChildren(t *testing.T) { retVal: 0, pathParam: "/usr/lib/pkg", }) - // Ghost probes — ENOENT + // ENOENT probes — should be silently ignored app.processFileActivity(&syscallEvent{ returned: true, pid: 1, callNum: statNum, retVal: 0xFFFFFFFE, pathParam: "/usr/lib/pkg/__init__.cpython-311.so", @@ -477,9 +423,10 @@ func TestFileActivity_MixedGhostAndRealChildren(t *testing.T) { result := app.FileActivity() - // Parent should be excluded (real child exists) + // Parent should be excluded (real child exists and triggers IsSubdir) if _, found := result["/usr/lib/pkg"]; found { - t.Error("/usr/lib/pkg should be excluded — real child __init__.py exists") + t.Error("/usr/lib/pkg should be excluded — real child __init__.py " + + "triggers IsSubdir") } // Real child should be present @@ -487,241 +434,11 @@ func TestFileActivity_MixedGhostAndRealChildren(t *testing.T) { t.Error("/usr/lib/pkg/__init__.py should be in result") } - // Ghost children should also be present (side effect #2) - for _, p := range []string{ - "/usr/lib/pkg/__init__.cpython-311.so", - "/usr/lib/pkg/__init__.so", - } { - if fsa, found := result[p]; found { - if fsa.HasSuccessfulAccess { - t.Errorf("ghost path %s should have HasSuccessfulAccess=false", p) - } - } - } -} - -// ============================================================================= -// Bug detection tests -// -// These tests verify whether the three possible open bugs identified in the -// analysis document are live. Each test is structured so that: -// - If the bug IS LIVE, the test documents the buggy behavior (passes but -// logs what's wrong via t.Log). -// - If the bug is FIXED in the future, the test will fail, signaling that -// the fix landed and the test expectations should be updated. -// ============================================================================= - -// BUG #1: Ghost leaf paths leak into the final FSActivity report. -// -// FileActivity() only filters paths with IsSubdir=true. Ghost leaf paths -// (ENOENT stat results with no children) have HasSuccessfulAccess=false but -// are never filtered. They appear in Report.FSActivity and get passed to -// prepareArtifact() for files that don't exist on disk. -// -// This test simulates a Python import that probes 3 non-existent paths and -// finds 1 real file. It checks whether the ghost paths leak into the report. -func TestBug1_GhostLeafPathsLeakIntoReport(t *testing.T) { - app := newTestApp() - statNum := lookupStatCallNum() - - // Python probes these — all return ENOENT (file not found) - ghostPaths := []string{ - "/usr/lib/mypackage/__init__.cpython-311.so", - "/usr/lib/mypackage/__init__.so", - "/usr/lib/mypackage/__init__.abi3.so", - } - for _, p := range ghostPaths { - app.processFileActivity(&syscallEvent{ - returned: true, pid: 1, callNum: statNum, - retVal: 0xFFFFFFFE, pathParam: p, // ENOENT - }) - } - - // One real file found - app.processFileActivity(&syscallEvent{ - returned: true, pid: 1, callNum: statNum, - retVal: 0, pathParam: "/usr/lib/mypackage/__init__.py", - }) - - result := app.FileActivity() - - // Count how many ghost (non-existent) paths leaked into the report - leaked := 0 - for _, p := range ghostPaths { - if fsa, found := result[p]; found { - leaked++ - if fsa.HasSuccessfulAccess { - t.Errorf("ghost path %s has HasSuccessfulAccess=true (unexpected)", p) - } - } - } - - if leaked > 0 { - // BUG IS LIVE: ghost paths leak into the report. - // When this bug is fixed, this branch will stop being reached and the - // test will fail at the assertion below — update it to expect 0. - t.Logf("BUG #1 IS LIVE: %d/%d ghost leaf paths leaked into FileActivity() report", - leaked, len(ghostPaths)) - if leaked != len(ghostPaths) { - t.Errorf("expected all %d ghost paths to leak (or none if fixed), got %d", - len(ghostPaths), leaked) - } - } else { - // BUG IS FIXED: no ghost paths in the report. - t.Log("BUG #1 IS FIXED: ghost leaf paths are no longer in the report") - } -} - -// BUG #2: Ghost paths are published as MDETypeArtifact events. -// -// processFileActivity() publishes a MonitorDataEvent for every path that passes -// the OKReturnStatus gate, including ENOENT ghost paths. Event stream consumers -// receive artifact events for files that don't exist. -func TestBug2_GhostPathsPublishedAsArtifactEvents(t *testing.T) { - pub := &mockPublisher{} - app := newTestAppWithPublisher(pub) - statNum := lookupStatCallNum() - - // Ghost path — ENOENT - app.processFileActivity(&syscallEvent{ - returned: true, pid: 1, callNum: statNum, - retVal: 0xFFFFFFFE, pathParam: "/usr/lib/ghost.so", // ENOENT - }) - - // Real path — success - app.processFileActivity(&syscallEvent{ - returned: true, pid: 1, callNum: statNum, - retVal: 0, pathParam: "/usr/lib/real.so", - }) - - // Check which paths got published - ghostPublished := false - realPublished := false - for _, ev := range pub.events { - if ev.Artifact == "/usr/lib/ghost.so" { - ghostPublished = true - } - if ev.Artifact == "/usr/lib/real.so" { - realPublished = true + // No ghost paths should be present + if len(result) != 1 { + t.Errorf("expected exactly 1 entry (__init__.py), got %d", len(result)) + for k := range result { + t.Logf(" result key: %s", k) } } - - if !realPublished { - t.Error("real path /usr/lib/real.so was NOT published as an event (unexpected)") - } - - if ghostPublished { - // BUG IS LIVE: ghost paths are published as artifact events. - t.Log("BUG #2 IS LIVE: ghost path /usr/lib/ghost.so was published as " + - "an MDETypeArtifact event despite the file not existing on disk") - } else { - // BUG IS FIXED: ghost paths are not published. - t.Log("BUG #2 IS FIXED: ghost paths are no longer published as artifact events") - } -} - -// BUG #3: OKReturnStatus/FailedReturnStatus naming trap causes regressions. -// -// This test reproduces the exact regression introduced by kilo-code-bot in -// commit 3f8dcb47: using p.OKReturnStatus(e.retVal) for the HasSuccessfulAccess -// condition instead of e.retVal == 0. Because OKReturnStatus accepts ENOENT, -// ghost paths get HasSuccessfulAccess=true, which breaks the namespace package -// fix — ghost children trigger IsSubdir on the parent directory. -// -// The test simulates what WOULD happen if someone made this mistake again, -// by directly manipulating HasSuccessfulAccess to match what OKReturnStatus -// would produce, and showing the cascade failure. -func TestBug3_OKReturnStatusMisuseBreaksNamespacePackages(t *testing.T) { - statNum := lookupStatCallNum() - statProcessor, found := syscallProcessors[int(statNum)] - if !found { - t.Fatal("stat processor not found in syscallProcessors") - } - - // --- Scenario: namespace package directory with only ghost children --- - // Current (correct) behavior: use e.retVal == 0 for HasSuccessfulAccess - t.Run("correct_behavior_retVal_eq_0", func(t *testing.T) { - app := newTestApp() - - // Parent directory — success - app.processFileActivity(&syscallEvent{ - returned: true, pid: 1, callNum: statNum, - retVal: 0, pathParam: "/usr/lib/nspkg", - }) - // Ghost child — ENOENT - app.processFileActivity(&syscallEvent{ - returned: true, pid: 1, callNum: statNum, - retVal: 0xFFFFFFFE, pathParam: "/usr/lib/nspkg/__init__.py", - }) - - result := app.FileActivity() - if _, found := result["/usr/lib/nspkg"]; !found { - t.Error("namespace package directory was incorrectly excluded") - } - }) - - // Simulated regression: if HasSuccessfulAccess used OKReturnStatus instead - // of retVal == 0, ENOENT ghost children would get HasSuccessfulAccess=true, - // triggering IsSubdir on the parent and excluding it. - t.Run("regression_if_OKReturnStatus_used_for_HasSuccessfulAccess", func(t *testing.T) { - app := newTestApp() - - // Parent directory — success - app.fsActivity["/usr/lib/nspkg"] = &report.FSActivityInfo{ - OpsAll: 1, OpsCheckFile: 1, - HasSuccessfulAccess: true, - Pids: map[int]struct{}{1: {}}, - Syscalls: map[int]struct{}{int(statNum): {}}, - } - - // Ghost child — simulate what happens if HasSuccessfulAccess used - // OKReturnStatus: ENOENT (-2) returns true from OKReturnStatus, so - // HasSuccessfulAccess would be set to true (THIS IS THE BUG). - enoentRetVal := uint64(0xFFFFFFFE) // -2 - buggyHasSuccessfulAccess := statProcessor.OKReturnStatus(enoentRetVal) - // Verify the premise: OKReturnStatus does accept ENOENT - if !buggyHasSuccessfulAccess { - t.Fatal("test premise failed: OKReturnStatus should accept ENOENT") - } - - app.fsActivity["/usr/lib/nspkg/__init__.py"] = &report.FSActivityInfo{ - OpsAll: 1, OpsCheckFile: 1, - HasSuccessfulAccess: buggyHasSuccessfulAccess, // true — THE BUG - Pids: map[int]struct{}{1: {}}, - Syscalls: map[int]struct{}{int(statNum): {}}, - } - - result := app.FileActivity() - - if _, found := result["/usr/lib/nspkg"]; found { - t.Log("BUG #3 CONFIRMED: if OKReturnStatus were used for " + - "HasSuccessfulAccess, this namespace package directory would " + - "survive (but only because there's no real child to trigger IsSubdir)") - } else { - // The parent was excluded — the ghost child with - // HasSuccessfulAccess=true triggered IsSubdir. - t.Log("BUG #3 IS LIVE (naming trap): using OKReturnStatus for " + - "HasSuccessfulAccess causes ghost children to trigger IsSubdir, " + - "excluding the namespace package directory from the slim image. " + - "This is the exact regression from commit 3f8dcb47.") - } - - // The key assertion: the current code does NOT use OKReturnStatus for - // HasSuccessfulAccess, so verify the correct path still works. - correctApp := newTestApp() - correctApp.processFileActivity(&syscallEvent{ - returned: true, pid: 1, callNum: statNum, - retVal: 0, pathParam: "/usr/lib/nspkg2", - }) - correctApp.processFileActivity(&syscallEvent{ - returned: true, pid: 1, callNum: statNum, - retVal: 0xFFFFFFFE, pathParam: "/usr/lib/nspkg2/__init__.py", - }) - correctResult := correctApp.FileActivity() - if _, found := correctResult["/usr/lib/nspkg2"]; !found { - t.Error("REGRESSION: current code is excluding namespace package " + - "directories — the HasSuccessfulAccess condition may have been " + - "changed to use OKReturnStatus again") - } - }) } diff --git a/pkg/report/container_report.go b/pkg/report/container_report.go index e9096b008..45a8438bf 100644 --- a/pkg/report/container_report.go +++ b/pkg/report/container_report.go @@ -121,12 +121,11 @@ type PtMonitorReport struct { } type FSActivityInfo struct { - OpsAll uint64 `json:"ops_all"` - OpsCheckFile uint64 `json:"ops_checkfile"` - Syscalls map[int]struct{} `json:"syscalls"` - Pids map[int]struct{} `json:"pids"` - IsSubdir bool `json:"is_subdir"` - HasSuccessfulAccess bool `json:"has_successful_access,omitempty"` + OpsAll uint64 `json:"ops_all"` + OpsCheckFile uint64 `json:"ops_checkfile"` + Syscalls map[int]struct{} `json:"syscalls"` + Pids map[int]struct{} `json:"pids"` + IsSubdir bool `json:"is_subdir"` } // ArtifactProps contains various file system artifact properties