From e8f62dd164cf7f0a13fdd170b862830ebe3f4cd0 Mon Sep 17 00:00:00 2001 From: Davi De Castro Reis Date: Fri, 19 Jun 2026 06:19:21 -0700 Subject: [PATCH 1/3] loader: memoize includes to avoid re-expanding diamond include graphs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ApplyInclude re-parses and recursively re-expands an included file once per include path that reaches it. When the same file is reached through more than one path (a "diamond" in the include graph) this is quadratic-to-exponential: a 24-level doubling graph loads the leaf 2^24 times. Monorepos that aggregate per-target / per-project compose fragments hit this in practice (an ~80-service federation took ~55s in `docker compose config`). Memoize each loaded include model for the duration of a single load, keyed on every input that determines it — resolved paths, working dir, project dir, and effective environment — and hand out a deep copy on each hit. The merge into the parent (importResources) still runs for every occurrence, so a same-file `extends` in the including file still resolves and the result is identical to loading each time; only the parse + recursive expansion is shared. Keying on the working dir matters: the same file reached through two parents can have a different relative base, yielding models with different relative paths; reusing across bases would let the caller rebase an already-resolved path. Cycle-safe: an include cycle is intrinsic to a node's subtree, so it is detected on the node's first load, before it can be cached. Adds a deep-diamond regression test (times out without the cache) and a benchmark. Signed-off-by: Davi de Castro Reis --- loader/include.go | 130 ++++++++++++++++++++++++++++++++++++++++- loader/include_test.go | 45 ++++++++++++++ 2 files changed, 172 insertions(+), 3 deletions(-) diff --git a/loader/include.go b/loader/include.go index ff310447..0232a622 100644 --- a/loader/include.go +++ b/loader/include.go @@ -18,10 +18,14 @@ package loader import ( "context" + "crypto/sha256" + "encoding/hex" "fmt" "os" "path/filepath" + "sort" "strings" + "sync" "github.com/compose-spec/compose-go/v2/dotenv" interp "github.com/compose-spec/compose-go/v2/interpolation" @@ -30,6 +34,114 @@ import ( "github.com/compose-spec/compose-go/v2/types" ) +// includeCache memoizes loaded include models for the duration of a single +// project load. A file reached through more than one include path (a "diamond" +// in the include graph) was previously parsed and recursively expanded once per +// path, which is quadratic-to-exponential on deep graphs. The cache parses and +// expands each distinct include only once. +// +// The key captures everything that determines the loaded model — the resolved +// file paths, the project directory, and the effective environment — so a cache +// hit is equivalent to a fresh load even when the same file is included with a +// different env_file or project_directory. Each consumer gets a fresh deep copy, +// so importResources (and later normalization) never mutates a cached entry or a +// sibling branch that shares it. +// +// Cycle-safe: an include cycle is intrinsic to a node's subtree (the back-edge +// is in the fixed set of files the node includes), so it is detected on the +// node's first load, which fails before the node can be cached. +type includeCache struct { + mu sync.Mutex + entries map[string]map[string]any +} + +type includeCacheKey struct{} + +// getOrCreateIncludeCache returns the include cache carried by ctx, creating one +// (and a derived context) on first use so that all sibling and descendant +// includes of a single load share it. +func getOrCreateIncludeCache(ctx context.Context) (*includeCache, context.Context) { + if c, ok := ctx.Value(includeCacheKey{}).(*includeCache); ok { + return c, ctx + } + c := &includeCache{entries: map[string]map[string]any{}} + return c, context.WithValue(ctx, includeCacheKey{}, c) +} + +func (c *includeCache) get(key string) (map[string]any, bool) { + c.mu.Lock() + defer c.mu.Unlock() + if m, ok := c.entries[key]; ok { + return deepCopyMapping(m), true + } + return nil, false +} + +func (c *includeCache) put(key string, model map[string]any) { + c.mu.Lock() + defer c.mu.Unlock() + c.entries[key] = deepCopyMapping(model) +} + +// includeKey hashes the inputs that determine an included model. Two include +// entries with the same key load identical content — including identical +// relative paths, so a cached model is reuse-safe in the caller's context. +// +// workingDir (the relative base the included model's paths are resolved against) +// is part of the key: the same file reached through two include parents can have +// a different relative base (e.g. "a/b" vs "b"), which yields models with +// different relative paths. Keying on it avoids reusing a model whose paths the +// caller would then rebase incorrectly. +func includeKey(paths []string, workingDir, projectDir string, env types.Mapping) string { + h := sha256.New() + for _, p := range paths { + _, _ = h.Write([]byte(p)) + _, _ = h.Write([]byte{0}) + } + _, _ = h.Write([]byte{1}) + _, _ = h.Write([]byte(workingDir)) + _, _ = h.Write([]byte{1}) + _, _ = h.Write([]byte(projectDir)) + _, _ = h.Write([]byte{1}) + keys := make([]string, 0, len(env)) + for k := range env { + keys = append(keys, k) + } + sort.Strings(keys) + for _, k := range keys { + _, _ = h.Write([]byte(k)) + _, _ = h.Write([]byte{0}) + _, _ = h.Write([]byte(env[k])) + _, _ = h.Write([]byte{0}) + } + return hex.EncodeToString(h.Sum(nil)) +} + +// deepCopyMapping returns a deep copy of a generic YAML mapping (the shape of a +// not-yet-typed compose model: nested map[string]any / []any / scalars). +func deepCopyMapping(m map[string]any) map[string]any { + out := make(map[string]any, len(m)) + for k, v := range m { + out[k] = deepCopyValue(v) + } + return out +} + +func deepCopyValue(v any) any { + switch t := v.(type) { + case map[string]any: + return deepCopyMapping(t) + case []any: + out := make([]any, len(t)) + for i, e := range t { + out[i] = deepCopyValue(e) + } + return out + default: + return v + } +} + // loadIncludeConfig parse the required config from raw yaml func loadIncludeConfig(source any) ([]types.IncludeConfig, error) { if source == nil { @@ -57,6 +169,8 @@ func ApplyInclude(ctx context.Context, workingDir string, environment types.Mapp return err } + cache, ctx := getOrCreateIncludeCache(ctx) + for _, r := range includeConfig { for _, listener := range options.Listeners { listener("include", map[string]any{ @@ -151,9 +265,19 @@ func ApplyInclude(ctx context.Context, workingDir string, environment types.Mapp LookupValue: config.LookupEnv, TypeCastMapping: options.Interpolate.TypeCastMapping, } - imported, err := loadYamlModel(ctx, config, loadOptions, &cycleTracker{}, included) - if err != nil { - return err + // Memoize by the inputs that determine the loaded model so a file + // reached through several include paths is parsed and expanded once. + // The merge into `model` still runs for every occurrence (a copy is + // handed out), so any same-file `extends` in the including file still + // resolves and the result is identical to loading it each time. + key := includeKey(r.Path, config.WorkingDir, r.ProjectDirectory, config.Environment) + imported, ok := cache.get(key) + if !ok { + imported, err = loadYamlModel(ctx, config, loadOptions, &cycleTracker{}, included) + if err != nil { + return err + } + cache.put(key, imported) } err = importResources(imported, model, processor) if err != nil { diff --git a/loader/include_test.go b/loader/include_test.go index 775fa88c..8de36680 100644 --- a/loader/include_test.go +++ b/loader/include_test.go @@ -18,6 +18,7 @@ package loader import ( "context" + "fmt" "os" "path/filepath" "runtime" @@ -243,3 +244,47 @@ func createFileSubDir(t *testing.T, rootDir, subDir, content, fileName string) { path := filepath.Join(subDirPath, fileName) assert.NilError(t, os.WriteFile(path, []byte(content), 0o600)) } + +// TestIncludeDiamondDedup builds a deep "diamond" include graph where every +// level includes the next level twice. Without include memoization the leaf is +// loaded 2^depth times (exponential); the cache loads each distinct file once. +// A depth that is trivial when deduplicated (and astronomically large when not) +// makes this both a correctness and a non-flaky performance regression test. +func TestIncludeDiamondDedup(t *testing.T) { + dir := t.TempDir() + const depth = 24 // 2^24 ~= 16.7M leaf loads without dedup + for i := 0; i < depth; i++ { + content := fmt.Sprintf("include:\n - path: ./level%d.yaml\n - path: ./level%d.yaml\n", i+1, i+1) + assert.NilError(t, os.WriteFile(filepath.Join(dir, fmt.Sprintf("level%d.yaml", i)), []byte(content), 0o600)) + } + leaf := "services:\n leaf:\n image: busybox\n" + assert.NilError(t, os.WriteFile(filepath.Join(dir, fmt.Sprintf("level%d.yaml", depth)), []byte(leaf), 0o600)) + + p, err := LoadWithContext(context.TODO(), types.ConfigDetails{ + WorkingDir: dir, + ConfigFiles: []types.ConfigFile{{Filename: filepath.Join(dir, "level0.yaml")}}, + }, withProjectName("diamond", true)) + assert.NilError(t, err) + _, err = p.GetService("leaf") + assert.NilError(t, err) +} + +func BenchmarkIncludeDiamond(b *testing.B) { + dir := b.TempDir() + const depth = 16 + for i := 0; i < depth; i++ { + content := fmt.Sprintf("include:\n - path: ./level%d.yaml\n - path: ./level%d.yaml\n", i+1, i+1) + _ = os.WriteFile(filepath.Join(dir, fmt.Sprintf("level%d.yaml", i)), []byte(content), 0o600) + } + _ = os.WriteFile(filepath.Join(dir, fmt.Sprintf("level%d.yaml", depth)), []byte("services:\n leaf:\n image: busybox\n"), 0o600) + b.ResetTimer() + for n := 0; n < b.N; n++ { + _, err := LoadWithContext(context.TODO(), types.ConfigDetails{ + WorkingDir: dir, + ConfigFiles: []types.ConfigFile{{Filename: filepath.Join(dir, "level0.yaml")}}, + }, withProjectName("diamond", true)) + if err != nil { + b.Fatal(err) + } + } +} From 52aa7aedd27a8726bc6607eed0de64a9282b8b37 Mon Sep 17 00:00:00 2001 From: Davi De Castro Reis Date: Tue, 30 Jun 2026 13:04:15 -0300 Subject: [PATCH 2/3] loader: address include-memoization review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three points from @glours's review of #886: - Cache key: length/count-prefix every field in includeKey so an env value containing a separator byte (NUL et al., legal in .env / process env) can no longer impersonate a boundary and collide two distinct (paths, env) tuples onto one entry. Documents why Substitute/TypeCastMapping are excluded. Adds TestIncludeKeyNoCollision (incl. the reviewer's NUL toy example). - Listener contract: a cache hit skipped loadYamlModel and silently dropped the "extends" (and nested "include") events emitted while expanding the subtree, changing emitted counts by include topology. Now each entry records the events from its first load and replays them, deep-copied, on every cache hit, so a hit is indistinguishable from a fresh load to a listener. Recording is skipped entirely when no listeners are registered — otherwise a doubling include graph would replay an exponential number of internal events, reintroducing the blow-up the cache removes. Adds TestIncludeDiamondListener (asserts 1 without the replay, 2 with it). - Diamond test: run the load off the test goroutine and select on a 30s timer (loader never checks ctx.Done(), so a context timeout cannot interrupt a regressed exponential load); a regression now fails fast with a clear message instead of hanging until the 10-minute go test default. Co-Authored-By: Claude Opus 4.8 (1M context) --- loader/include.go | 115 ++++++++++++++++++++++++++++++++--------- loader/include_test.go | 111 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 199 insertions(+), 27 deletions(-) diff --git a/loader/include.go b/loader/include.go index 0232a622..6ee8ee62 100644 --- a/loader/include.go +++ b/loader/include.go @@ -50,9 +50,31 @@ import ( // Cycle-safe: an include cycle is intrinsic to a node's subtree (the back-edge // is in the fixed set of files the node includes), so it is detected on the // node's first load, which fails before the node can be cached. +// +// Listener fidelity: loadYamlModel emits listener events (notably "extends", and +// nested "include") for the subtree it expands. Memoizing that call would drop +// those events on every cache hit, silently changing the public Listener +// contract (e.g. an extends-counting consumer would see a count that depends on +// include topology). Each entry therefore records the events emitted on first +// load and replays them, in order, on every cache hit — see ApplyInclude. type includeCache struct { mu sync.Mutex - entries map[string]map[string]any + entries map[string]includeCacheEntry +} + +// includeCacheEntry is a memoized include: the expanded model plus the listener +// events emitted while expanding it, replayed on cache hits to keep per-include +// listeners firing once per occurrence. +type includeCacheEntry struct { + model map[string]any + events []recordedEvent +} + +// recordedEvent is a single listener event captured during an include load so it +// can be replayed verbatim on a later cache hit. +type recordedEvent struct { + event string + metadata map[string]any } type includeCacheKey struct{} @@ -64,23 +86,23 @@ func getOrCreateIncludeCache(ctx context.Context) (*includeCache, context.Contex if c, ok := ctx.Value(includeCacheKey{}).(*includeCache); ok { return c, ctx } - c := &includeCache{entries: map[string]map[string]any{}} + c := &includeCache{entries: map[string]includeCacheEntry{}} return c, context.WithValue(ctx, includeCacheKey{}, c) } -func (c *includeCache) get(key string) (map[string]any, bool) { +func (c *includeCache) get(key string) (includeCacheEntry, bool) { c.mu.Lock() defer c.mu.Unlock() - if m, ok := c.entries[key]; ok { - return deepCopyMapping(m), true + if e, ok := c.entries[key]; ok { + return includeCacheEntry{model: deepCopyMapping(e.model), events: e.events}, true } - return nil, false + return includeCacheEntry{}, false } -func (c *includeCache) put(key string, model map[string]any) { +func (c *includeCache) put(key string, model map[string]any, events []recordedEvent) { c.mu.Lock() defer c.mu.Unlock() - c.entries[key] = deepCopyMapping(model) + c.entries[key] = includeCacheEntry{model: deepCopyMapping(model), events: events} } // includeKey hashes the inputs that determine an included model. Two include @@ -92,27 +114,41 @@ func (c *includeCache) put(key string, model map[string]any) { // a different relative base (e.g. "a/b" vs "b"), which yields models with // different relative paths. Keying on it avoids reusing a model whose paths the // caller would then rebase incorrectly. +// +// Encoding: every field is length-prefixed and each variable-length section is +// count-prefixed. A bare separator byte is not safe here — types.Mapping is +// populated from .env files / process env, where any byte (NUL included) is a +// legal key or value, so a value could otherwise impersonate a separator and +// two distinct (paths, env) tuples could hash to the same key, serving a wrong +// cached model with no error surfaced. Length/count prefixes make the byte +// stream uniquely decodable, so distinct inputs always produce distinct keys. +// +// Note: Substitute and TypeCastMapping are intentionally excluded from the key. +// They are invariant across includes within a single Load (cloned unchanged from +// the top-level options at the call site). If a future option lets them vary per +// include, they must be folded into the key. func includeKey(paths []string, workingDir, projectDir string, env types.Mapping) string { h := sha256.New() + writeInt := func(n int) { _, _ = fmt.Fprintf(h, "%d:", n) } + write := func(s string) { + _, _ = fmt.Fprintf(h, "%d:", len(s)) + _, _ = h.Write([]byte(s)) + } + writeInt(len(paths)) for _, p := range paths { - _, _ = h.Write([]byte(p)) - _, _ = h.Write([]byte{0}) + write(p) } - _, _ = h.Write([]byte{1}) - _, _ = h.Write([]byte(workingDir)) - _, _ = h.Write([]byte{1}) - _, _ = h.Write([]byte(projectDir)) - _, _ = h.Write([]byte{1}) + write(workingDir) + write(projectDir) keys := make([]string, 0, len(env)) for k := range env { keys = append(keys, k) } sort.Strings(keys) + writeInt(len(keys)) for _, k := range keys { - _, _ = h.Write([]byte(k)) - _, _ = h.Write([]byte{0}) - _, _ = h.Write([]byte(env[k])) - _, _ = h.Write([]byte{0}) + write(k) + write(env[k]) } return hex.EncodeToString(h.Sum(nil)) } @@ -271,15 +307,46 @@ func ApplyInclude(ctx context.Context, workingDir string, environment types.Mapp // handed out), so any same-file `extends` in the including file still // resolves and the result is identical to loading it each time. key := includeKey(r.Path, config.WorkingDir, r.ProjectDirectory, config.Environment) - imported, ok := cache.get(key) - if !ok { - imported, err = loadYamlModel(ctx, config, loadOptions, &cycleTracker{}, included) + entry, ok := cache.get(key) + switch { + case ok: + // Replay the events the first load emitted so per-occurrence + // listeners (e.g. "extends", nested "include") fire for this + // traversal too — a cache hit must be indistinguishable from a + // fresh load to a listener. A fresh copy of each metadata map is + // handed out, matching the per-load isolation of a real expansion. + for _, ev := range entry.events { + options.ProcessEvent(ev.event, deepCopyMapping(ev.metadata)) + } + case len(loadOptions.Listeners) == 0: + // No listeners: there is no event contract to preserve, so skip + // recording entirely. This keeps memoization cheap on the common + // listener-free path; recording here would make a doubling include + // graph emit (and replay) an exponential number of internal events, + // reintroducing the very blow-up the cache exists to remove. + entry.model, err = loadYamlModel(ctx, config, loadOptions, &cycleTracker{}, included) + if err != nil { + return err + } + cache.put(key, entry.model, nil) + default: + // Record every event emitted while expanding this subtree so it can + // be replayed on later cache hits. The recorder runs alongside the + // real listeners, which still fire live on this first occurrence; + // nested includes clone these options and so feed this recorder too, + // making the recording the subtree's full event stream. + var recorded []recordedEvent + loadOptions.Listeners = append(append([]Listener{}, loadOptions.Listeners...), + func(event string, metadata map[string]any) { + recorded = append(recorded, recordedEvent{event: event, metadata: deepCopyMapping(metadata)}) + }) + entry.model, err = loadYamlModel(ctx, config, loadOptions, &cycleTracker{}, included) if err != nil { return err } - cache.put(key, imported) + cache.put(key, entry.model, recorded) } - err = importResources(imported, model, processor) + err = importResources(entry.model, model, processor) if err != nil { return err } diff --git a/loader/include_test.go b/loader/include_test.go index 8de36680..4fbea853 100644 --- a/loader/include_test.go +++ b/loader/include_test.go @@ -23,6 +23,7 @@ import ( "path/filepath" "runtime" "testing" + "time" "github.com/compose-spec/compose-go/v2/types" "gotest.tools/v3/assert" @@ -260,13 +261,117 @@ func TestIncludeDiamondDedup(t *testing.T) { leaf := "services:\n leaf:\n image: busybox\n" assert.NilError(t, os.WriteFile(filepath.Join(dir, fmt.Sprintf("level%d.yaml", depth)), []byte(leaf), 0o600)) + type result struct { + p *types.Project + err error + } + + // loader doesn't check ctx.Done() during expansion, so a context timeout + // wouldn't interrupt a regressed (exponential) load — the goroutine would + // stay parked and the test would hang until the 10-minute `go test` default, + // failing with a generic timeout. Running the load off the test goroutine and + // selecting on a separate timer gives a fast, descriptive failure instead. The + // leaked goroutine on timeout is harmless: t.Fatal ends the test immediately. + timeout, cancel := context.WithTimeout(t.Context(), 30*time.Second) + defer cancel() + + done := make(chan result, 1) + go func() { + p, err := LoadWithContext(t.Context(), types.ConfigDetails{ + WorkingDir: dir, + ConfigFiles: []types.ConfigFile{{Filename: filepath.Join(dir, "level0.yaml")}}, + }, withProjectName("diamond", true)) + done <- result{p, err} + }() + + select { + case r := <-done: + assert.NilError(t, r.err) + _, err := r.p.GetService("leaf") + assert.NilError(t, err) + case <-timeout.Done(): + t.Fatal("diamond include did not complete within 30s — include memoization likely regressed") + } +} + +// TestIncludeDiamondListener pins the public Listener contract under include +// memoization: a listener event emitted while expanding an included file must +// fire once per include occurrence, even when the file is served from the +// include cache. shared.yaml is reached through both a.yaml and b.yaml (a +// diamond) and carries one `extends`; a faithful load emits "extends" twice +// (once per path), exactly as loading shared.yaml twice without a cache would. +// Memoizing the load must replay the recorded event on the cache hit rather than +// silently dropping it — otherwise the emitted count would depend on include +// topology. Without the recordings replayed in ApplyInclude this asserts 1. +func TestIncludeDiamondListener(t *testing.T) { + dir := t.TempDir() + createFile(t, dir, "include:\n - path: ./a.yaml\n - path: ./b.yaml\n", "root.yaml") + createFile(t, dir, "include:\n - path: ./shared.yaml\n", "a.yaml") + createFile(t, dir, "include:\n - path: ./shared.yaml\n", "b.yaml") + createFile(t, dir, "services:\n base:\n image: alpine\n derived:\n extends: base\n command: echo\n", "shared.yaml") + + var extendsCount, includeCount int p, err := LoadWithContext(context.TODO(), types.ConfigDetails{ WorkingDir: dir, - ConfigFiles: []types.ConfigFile{{Filename: filepath.Join(dir, "level0.yaml")}}, - }, withProjectName("diamond", true)) + ConfigFiles: []types.ConfigFile{{Filename: filepath.Join(dir, "root.yaml")}}, + }, func(options *Options) { + options.SkipNormalization = true + options.ResolvePaths = true + options.SetProjectName("diamond-listener", true) + options.Listeners = []Listener{ + func(event string, _ map[string]any) { + switch event { + case "extends": + extendsCount++ + case "include": + includeCount++ + } + }, + } + }) assert.NilError(t, err) - _, err = p.GetService("leaf") + _, err = p.GetService("derived") assert.NilError(t, err) + + // One "extends" per traversal of shared.yaml (via a.yaml and via b.yaml). + assert.Equal(t, extendsCount, 2) + // "include" fires per occurrence in the outer loop (root→a, root→b, a→shared, + // b→shared); it is unaffected by the cache and pins that as the baseline. + assert.Equal(t, includeCount, 4) +} + +// TestIncludeKeyNoCollision pins the length/count-prefixed encoding of the +// include cache key: distinct (paths, workingDir, projectDir, env) tuples must +// never hash to the same key. A bare-separator encoding can collide when a value +// contains the separator byte (env comes from .env files / process env, where +// any byte is legal) or when a field's content spills across a positional +// boundary. A collision would serve a wrong cached model with no error surfaced. +func TestIncludeKeyNoCollision(t *testing.T) { + base := includeKey([]string{"compose.yaml"}, "/wd", "/pd", types.Mapping{"A": "B"}) + + cases := map[string]string{ + // Reviewer's NUL toy example: both serialize identically under a bare + // NUL separator, but the keys must differ. + "nul in key vs value a": includeKey([]string{"compose.yaml"}, "/wd", "/pd", types.Mapping{"A\x00B": "X"}), + "nul in key vs value b": includeKey([]string{"compose.yaml"}, "/wd", "/pd", types.Mapping{"A": "B\x00X"}), + // Field content that could impersonate an adjacent field/boundary. + "path absorbs workingdir": includeKey([]string{"compose.yaml", "/wd"}, "/pd", "", types.Mapping{}), + "env absorbs fields": includeKey([]string{"compose.yaml"}, "/wd", "/pd", types.Mapping{"X": "Y"}), + // Plain distinct inputs. + "different env value": includeKey([]string{"compose.yaml"}, "/wd", "/pd", types.Mapping{"A": "C"}), + "different workingdir": includeKey([]string{"compose.yaml"}, "/other", "/pd", types.Mapping{"A": "B"}), + } + + seen := map[string]string{base: "base"} + for name, key := range cases { + if prev, ok := seen[key]; ok { + t.Fatalf("include key collision between %q and %q", prev, name) + } + seen[key] = name + } + + // Identical inputs must produce identical keys (the cache hit path). + assert.Equal(t, base, includeKey([]string{"compose.yaml"}, "/wd", "/pd", types.Mapping{"A": "B"})) } func BenchmarkIncludeDiamond(b *testing.B) { From 7b4c015b650bbf615b71d831193b75df7dd84034 Mon Sep 17 00:00:00 2001 From: Davi De Castro Reis Date: Tue, 30 Jun 2026 16:51:04 -0300 Subject: [PATCH 3/3] loader: tighten include-memoization comments, drop unused benchmark Rewrite comments to describe only HEAD state without paraphrasing the code, and remove the leftover BenchmarkIncludeDiamond (TestIncludeDiamondDedup already guards the perf regression). Co-Authored-By: Claude Opus 4.8 (1M context) --- loader/include.go | 106 ++++++++++++++--------------------------- loader/include_test.go | 72 ++++++++-------------------- 2 files changed, 57 insertions(+), 121 deletions(-) diff --git a/loader/include.go b/loader/include.go index 6ee8ee62..c42b445c 100644 --- a/loader/include.go +++ b/loader/include.go @@ -34,44 +34,28 @@ import ( "github.com/compose-spec/compose-go/v2/types" ) -// includeCache memoizes loaded include models for the duration of a single -// project load. A file reached through more than one include path (a "diamond" -// in the include graph) was previously parsed and recursively expanded once per -// path, which is quadratic-to-exponential on deep graphs. The cache parses and -// expands each distinct include only once. +// includeCache memoizes include models for the duration of a single project +// load, so a file reached through several include paths (a "diamond" in the +// include graph) is parsed and expanded once rather than once per path. Entries +// are keyed by every input that determines the model (see includeKey) and handed +// out as deep copies, so importResources and later normalization never mutate a +// cached entry or a sibling branch sharing it. // -// The key captures everything that determines the loaded model — the resolved -// file paths, the project directory, and the effective environment — so a cache -// hit is equivalent to a fresh load even when the same file is included with a -// different env_file or project_directory. Each consumer gets a fresh deep copy, -// so importResources (and later normalization) never mutates a cached entry or a -// sibling branch that shares it. -// -// Cycle-safe: an include cycle is intrinsic to a node's subtree (the back-edge -// is in the fixed set of files the node includes), so it is detected on the -// node's first load, which fails before the node can be cached. -// -// Listener fidelity: loadYamlModel emits listener events (notably "extends", and -// nested "include") for the subtree it expands. Memoizing that call would drop -// those events on every cache hit, silently changing the public Listener -// contract (e.g. an extends-counting consumer would see a count that depends on -// include topology). Each entry therefore records the events emitted on first -// load and replays them, in order, on every cache hit — see ApplyInclude. +// An include cycle is intrinsic to a node's subtree, so it is detected on the +// node's first load — which fails before the node is cached — and a cyclic node +// is never served from cache. type includeCache struct { mu sync.Mutex entries map[string]includeCacheEntry } -// includeCacheEntry is a memoized include: the expanded model plus the listener -// events emitted while expanding it, replayed on cache hits to keep per-include -// listeners firing once per occurrence. +// includeCacheEntry pairs a memoized model with the listener events emitted +// while expanding it, replayed on each cache hit (see ApplyInclude). type includeCacheEntry struct { model map[string]any events []recordedEvent } -// recordedEvent is a single listener event captured during an include load so it -// can be replayed verbatim on a later cache hit. type recordedEvent struct { event string metadata map[string]any @@ -79,9 +63,8 @@ type recordedEvent struct { type includeCacheKey struct{} -// getOrCreateIncludeCache returns the include cache carried by ctx, creating one -// (and a derived context) on first use so that all sibling and descendant -// includes of a single load share it. +// getOrCreateIncludeCache returns the cache carried by ctx, creating one on first +// use so every include in a single load shares it. func getOrCreateIncludeCache(ctx context.Context) (*includeCache, context.Context) { if c, ok := ctx.Value(includeCacheKey{}).(*includeCache); ok { return c, ctx @@ -105,28 +88,20 @@ func (c *includeCache) put(key string, model map[string]any, events []recordedEv c.entries[key] = includeCacheEntry{model: deepCopyMapping(model), events: events} } -// includeKey hashes the inputs that determine an included model. Two include -// entries with the same key load identical content — including identical -// relative paths, so a cached model is reuse-safe in the caller's context. +// includeKey hashes the inputs that fully determine an included model, so two +// entries with the same key are interchangeable in the caller's context. // -// workingDir (the relative base the included model's paths are resolved against) -// is part of the key: the same file reached through two include parents can have -// a different relative base (e.g. "a/b" vs "b"), which yields models with -// different relative paths. Keying on it avoids reusing a model whose paths the -// caller would then rebase incorrectly. +// workingDir is part of the key: the same file reached through two parents can +// have a different relative base (e.g. "a/b" vs "b") and so resolve to different +// relative paths; sharing across bases would let the caller rebase them wrongly. // -// Encoding: every field is length-prefixed and each variable-length section is -// count-prefixed. A bare separator byte is not safe here — types.Mapping is -// populated from .env files / process env, where any byte (NUL included) is a -// legal key or value, so a value could otherwise impersonate a separator and -// two distinct (paths, env) tuples could hash to the same key, serving a wrong -// cached model with no error surfaced. Length/count prefixes make the byte -// stream uniquely decodable, so distinct inputs always produce distinct keys. +// Fields are length-prefixed and variable-length sections count-prefixed so the +// encoding is unambiguous. types.Mapping values may contain any byte, so a bare +// separator would let one tuple impersonate another and collide onto a wrong +// entry. // -// Note: Substitute and TypeCastMapping are intentionally excluded from the key. -// They are invariant across includes within a single Load (cloned unchanged from -// the top-level options at the call site). If a future option lets them vary per -// include, they must be folded into the key. +// Substitute and TypeCastMapping are excluded: they are invariant across includes +// within a load. A future option that varies them per include must fold them in. func includeKey(paths []string, workingDir, projectDir string, env types.Mapping) string { h := sha256.New() writeInt := func(n int) { _, _ = fmt.Fprintf(h, "%d:", n) } @@ -301,40 +276,33 @@ func ApplyInclude(ctx context.Context, workingDir string, environment types.Mapp LookupValue: config.LookupEnv, TypeCastMapping: options.Interpolate.TypeCastMapping, } - // Memoize by the inputs that determine the loaded model so a file - // reached through several include paths is parsed and expanded once. - // The merge into `model` still runs for every occurrence (a copy is - // handed out), so any same-file `extends` in the including file still - // resolves and the result is identical to loading it each time. + // importResources below runs for every occurrence on a handed-out copy, + // so a same-file extends in the including file still resolves and the + // result matches loading the file each time; only parse and expansion + // are shared. key := includeKey(r.Path, config.WorkingDir, r.ProjectDirectory, config.Environment) entry, ok := cache.get(key) switch { case ok: - // Replay the events the first load emitted so per-occurrence - // listeners (e.g. "extends", nested "include") fire for this - // traversal too — a cache hit must be indistinguishable from a - // fresh load to a listener. A fresh copy of each metadata map is - // handed out, matching the per-load isolation of a real expansion. + // Replay so per-occurrence listeners (extends, nested include) fire + // on this traversal too: a cache hit must look identical to a fresh + // load. for _, ev := range entry.events { options.ProcessEvent(ev.event, deepCopyMapping(ev.metadata)) } case len(loadOptions.Listeners) == 0: - // No listeners: there is no event contract to preserve, so skip - // recording entirely. This keeps memoization cheap on the common - // listener-free path; recording here would make a doubling include - // graph emit (and replay) an exponential number of internal events, - // reintroducing the very blow-up the cache exists to remove. + // No listeners, no event contract to preserve. Skipping the recorder + // is required, not just an optimization: a doubling include graph + // would otherwise record and replay an exponential number of events. entry.model, err = loadYamlModel(ctx, config, loadOptions, &cycleTracker{}, included) if err != nil { return err } cache.put(key, entry.model, nil) default: - // Record every event emitted while expanding this subtree so it can - // be replayed on later cache hits. The recorder runs alongside the - // real listeners, which still fire live on this first occurrence; - // nested includes clone these options and so feed this recorder too, - // making the recording the subtree's full event stream. + // Record this subtree's events for replay on later hits. The recorder + // runs alongside the live listeners and, through the cloned options of + // nested includes, captures the whole subtree. var recorded []recordedEvent loadOptions.Listeners = append(append([]Listener{}, loadOptions.Listeners...), func(event string, metadata map[string]any) { diff --git a/loader/include_test.go b/loader/include_test.go index 4fbea853..b902e393 100644 --- a/loader/include_test.go +++ b/loader/include_test.go @@ -246,11 +246,9 @@ func createFileSubDir(t *testing.T, rootDir, subDir, content, fileName string) { assert.NilError(t, os.WriteFile(path, []byte(content), 0o600)) } -// TestIncludeDiamondDedup builds a deep "diamond" include graph where every -// level includes the next level twice. Without include memoization the leaf is -// loaded 2^depth times (exponential); the cache loads each distinct file once. -// A depth that is trivial when deduplicated (and astronomically large when not) -// makes this both a correctness and a non-flaky performance regression test. +// TestIncludeDiamondDedup builds a deep diamond include graph (every level +// includes the next twice). The cache must load each distinct file once; without +// it the leaf loads 2^depth times. Doubles as a non-flaky perf regression guard. func TestIncludeDiamondDedup(t *testing.T) { dir := t.TempDir() const depth = 24 // 2^24 ~= 16.7M leaf loads without dedup @@ -266,12 +264,10 @@ func TestIncludeDiamondDedup(t *testing.T) { err error } - // loader doesn't check ctx.Done() during expansion, so a context timeout - // wouldn't interrupt a regressed (exponential) load — the goroutine would - // stay parked and the test would hang until the 10-minute `go test` default, - // failing with a generic timeout. Running the load off the test goroutine and - // selecting on a separate timer gives a fast, descriptive failure instead. The - // leaked goroutine on timeout is harmless: t.Fatal ends the test immediately. + // loader doesn't check ctx.Done() during expansion, so a context timeout can't + // interrupt a regressed load; run it off the test goroutine and select on a + // timer for a fast, descriptive failure. The leaked goroutine is harmless — + // t.Fatal ends the test. timeout, cancel := context.WithTimeout(t.Context(), 30*time.Second) defer cancel() @@ -294,15 +290,10 @@ func TestIncludeDiamondDedup(t *testing.T) { } } -// TestIncludeDiamondListener pins the public Listener contract under include -// memoization: a listener event emitted while expanding an included file must -// fire once per include occurrence, even when the file is served from the -// include cache. shared.yaml is reached through both a.yaml and b.yaml (a -// diamond) and carries one `extends`; a faithful load emits "extends" twice -// (once per path), exactly as loading shared.yaml twice without a cache would. -// Memoizing the load must replay the recorded event on the cache hit rather than -// silently dropping it — otherwise the emitted count would depend on include -// topology. Without the recordings replayed in ApplyInclude this asserts 1. +// TestIncludeDiamondListener pins the Listener contract under memoization: +// shared.yaml is reached through both a.yaml and b.yaml and carries one extends, +// so the "extends" event must fire once per path (twice) — the cache hit replays +// it rather than dropping it. Without the replay this asserts 1. func TestIncludeDiamondListener(t *testing.T) { dir := t.TempDir() createFile(t, dir, "include:\n - path: ./a.yaml\n - path: ./b.yaml\n", "root.yaml") @@ -333,28 +324,25 @@ func TestIncludeDiamondListener(t *testing.T) { _, err = p.GetService("derived") assert.NilError(t, err) - // One "extends" per traversal of shared.yaml (via a.yaml and via b.yaml). + // One extends per traversal of shared.yaml (via a.yaml and via b.yaml). assert.Equal(t, extendsCount, 2) - // "include" fires per occurrence in the outer loop (root→a, root→b, a→shared, - // b→shared); it is unaffected by the cache and pins that as the baseline. + // include fires per occurrence regardless of the cache: root→a, root→b, + // a→shared, b→shared. assert.Equal(t, includeCount, 4) } -// TestIncludeKeyNoCollision pins the length/count-prefixed encoding of the -// include cache key: distinct (paths, workingDir, projectDir, env) tuples must -// never hash to the same key. A bare-separator encoding can collide when a value -// contains the separator byte (env comes from .env files / process env, where -// any byte is legal) or when a field's content spills across a positional -// boundary. A collision would serve a wrong cached model with no error surfaced. +// TestIncludeKeyNoCollision: distinct (paths, workingDir, projectDir, env) tuples +// must never hash to the same key. A bare separator collides when an env value +// contains the separator byte or a field spills across a positional boundary; a +// collision would serve a wrong cached model silently. func TestIncludeKeyNoCollision(t *testing.T) { base := includeKey([]string{"compose.yaml"}, "/wd", "/pd", types.Mapping{"A": "B"}) cases := map[string]string{ - // Reviewer's NUL toy example: both serialize identically under a bare - // NUL separator, but the keys must differ. + // NUL in key vs value: identical under a bare separator, distinct keys required. "nul in key vs value a": includeKey([]string{"compose.yaml"}, "/wd", "/pd", types.Mapping{"A\x00B": "X"}), "nul in key vs value b": includeKey([]string{"compose.yaml"}, "/wd", "/pd", types.Mapping{"A": "B\x00X"}), - // Field content that could impersonate an adjacent field/boundary. + // Field content that could impersonate an adjacent field. "path absorbs workingdir": includeKey([]string{"compose.yaml", "/wd"}, "/pd", "", types.Mapping{}), "env absorbs fields": includeKey([]string{"compose.yaml"}, "/wd", "/pd", types.Mapping{"X": "Y"}), // Plain distinct inputs. @@ -373,23 +361,3 @@ func TestIncludeKeyNoCollision(t *testing.T) { // Identical inputs must produce identical keys (the cache hit path). assert.Equal(t, base, includeKey([]string{"compose.yaml"}, "/wd", "/pd", types.Mapping{"A": "B"})) } - -func BenchmarkIncludeDiamond(b *testing.B) { - dir := b.TempDir() - const depth = 16 - for i := 0; i < depth; i++ { - content := fmt.Sprintf("include:\n - path: ./level%d.yaml\n - path: ./level%d.yaml\n", i+1, i+1) - _ = os.WriteFile(filepath.Join(dir, fmt.Sprintf("level%d.yaml", i)), []byte(content), 0o600) - } - _ = os.WriteFile(filepath.Join(dir, fmt.Sprintf("level%d.yaml", depth)), []byte("services:\n leaf:\n image: busybox\n"), 0o600) - b.ResetTimer() - for n := 0; n < b.N; n++ { - _, err := LoadWithContext(context.TODO(), types.ConfigDetails{ - WorkingDir: dir, - ConfigFiles: []types.ConfigFile{{Filename: filepath.Join(dir, "level0.yaml")}}, - }, withProjectName("diamond", true)) - if err != nil { - b.Fatal(err) - } - } -}