-
Notifications
You must be signed in to change notification settings - Fork 156
loader: memoize includes to avoid re-expanding diamond include graphs #886
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,10 +18,14 @@ package loader | |
|
|
||
| import ( | ||
| "context" | ||
| "crypto/sha256" | ||
| "encoding/hex" | ||
| "fmt" | ||
| "os" | ||
| "path/filepath" | ||
| "sort" | ||
| "strings" | ||
| "sync" | ||
|
|
||
| "github.com/compose-spec/compose-go/v2/dotenv" | ||
| interp "github.com/compose-spec/compose-go/v2/interpolation" | ||
|
|
@@ -30,6 +34,114 @@ import ( | |
| "github.com/compose-spec/compose-go/v2/types" | ||
| ) | ||
|
|
||
| // includeCache memoizes loaded include models for the duration of a single | ||
| // project load. A file reached through more than one include path (a "diamond" | ||
| // in the include graph) was previously parsed and recursively expanded once per | ||
| // path, which is quadratic-to-exponential on deep graphs. The cache parses and | ||
| // expands each distinct include only once. | ||
| // | ||
| // The key captures everything that determines the loaded model — the resolved | ||
| // file paths, the project directory, and the effective environment — so a cache | ||
| // hit is equivalent to a fresh load even when the same file is included with a | ||
| // different env_file or project_directory. Each consumer gets a fresh deep copy, | ||
| // so importResources (and later normalization) never mutates a cached entry or a | ||
| // sibling branch that shares it. | ||
| // | ||
| // Cycle-safe: an include cycle is intrinsic to a node's subtree (the back-edge | ||
| // is in the fixed set of files the node includes), so it is detected on the | ||
| // node's first load, which fails before the node can be cached. | ||
| type includeCache struct { | ||
| mu sync.Mutex | ||
| entries map[string]map[string]any | ||
| } | ||
|
|
||
| type includeCacheKey struct{} | ||
|
|
||
| // getOrCreateIncludeCache returns the include cache carried by ctx, creating one | ||
| // (and a derived context) on first use so that all sibling and descendant | ||
| // includes of a single load share it. | ||
| func getOrCreateIncludeCache(ctx context.Context) (*includeCache, context.Context) { | ||
| if c, ok := ctx.Value(includeCacheKey{}).(*includeCache); ok { | ||
| return c, ctx | ||
| } | ||
| c := &includeCache{entries: map[string]map[string]any{}} | ||
| return c, context.WithValue(ctx, includeCacheKey{}, c) | ||
| } | ||
|
|
||
| func (c *includeCache) get(key string) (map[string]any, bool) { | ||
| c.mu.Lock() | ||
| defer c.mu.Unlock() | ||
| if m, ok := c.entries[key]; ok { | ||
| return deepCopyMapping(m), true | ||
| } | ||
| return nil, false | ||
| } | ||
|
|
||
| func (c *includeCache) put(key string, model map[string]any) { | ||
| c.mu.Lock() | ||
| defer c.mu.Unlock() | ||
| c.entries[key] = deepCopyMapping(model) | ||
| } | ||
|
|
||
| // includeKey hashes the inputs that determine an included model. Two include | ||
| // entries with the same key load identical content — including identical | ||
| // relative paths, so a cached model is reuse-safe in the caller's context. | ||
| // | ||
| // workingDir (the relative base the included model's paths are resolved against) | ||
| // is part of the key: the same file reached through two include parents can have | ||
| // a different relative base (e.g. "a/b" vs "b"), which yields models with | ||
| // different relative paths. Keying on it avoids reusing a model whose paths the | ||
| // caller would then rebase incorrectly. | ||
| func includeKey(paths []string, workingDir, projectDir string, env types.Mapping) string { | ||
| h := sha256.New() | ||
| for _, p := range paths { | ||
| _, _ = h.Write([]byte(p)) | ||
| _, _ = h.Write([]byte{0}) | ||
| } | ||
| _, _ = h.Write([]byte{1}) | ||
| _, _ = h.Write([]byte(workingDir)) | ||
| _, _ = h.Write([]byte{1}) | ||
| _, _ = h.Write([]byte(projectDir)) | ||
| _, _ = h.Write([]byte{1}) | ||
| keys := make([]string, 0, len(env)) | ||
| for k := range env { | ||
| keys = append(keys, k) | ||
| } | ||
| sort.Strings(keys) | ||
| for _, k := range keys { | ||
| _, _ = h.Write([]byte(k)) | ||
| _, _ = h.Write([]byte{0}) | ||
| _, _ = h.Write([]byte(env[k])) | ||
| _, _ = h.Write([]byte{0}) | ||
| } | ||
| return hex.EncodeToString(h.Sum(nil)) | ||
| } | ||
|
|
||
| // deepCopyMapping returns a deep copy of a generic YAML mapping (the shape of a | ||
| // not-yet-typed compose model: nested map[string]any / []any / scalars). | ||
| func deepCopyMapping(m map[string]any) map[string]any { | ||
| out := make(map[string]any, len(m)) | ||
| for k, v := range m { | ||
| out[k] = deepCopyValue(v) | ||
| } | ||
| return out | ||
| } | ||
|
|
||
| func deepCopyValue(v any) any { | ||
| switch t := v.(type) { | ||
| case map[string]any: | ||
| return deepCopyMapping(t) | ||
| case []any: | ||
| out := make([]any, len(t)) | ||
| for i, e := range t { | ||
| out[i] = deepCopyValue(e) | ||
| } | ||
| return out | ||
| default: | ||
| return v | ||
| } | ||
| } | ||
|
|
||
| // loadIncludeConfig parse the required config from raw yaml | ||
| func loadIncludeConfig(source any) ([]types.IncludeConfig, error) { | ||
| if source == nil { | ||
|
|
@@ -57,6 +169,8 @@ func ApplyInclude(ctx context.Context, workingDir string, environment types.Mapp | |
| return err | ||
| } | ||
|
|
||
| cache, ctx := getOrCreateIncludeCache(ctx) | ||
|
|
||
| for _, r := range includeConfig { | ||
| for _, listener := range options.Listeners { | ||
| listener("include", map[string]any{ | ||
|
|
@@ -151,9 +265,19 @@ func ApplyInclude(ctx context.Context, workingDir string, environment types.Mapp | |
| LookupValue: config.LookupEnv, | ||
| TypeCastMapping: options.Interpolate.TypeCastMapping, | ||
| } | ||
| imported, err := loadYamlModel(ctx, config, loadOptions, &cycleTracker{}, included) | ||
| if err != nil { | ||
| return err | ||
| // Memoize by the inputs that determine the loaded model so a file | ||
| // reached through several include paths is parsed and expanded once. | ||
| // The merge into `model` still runs for every occurrence (a copy is | ||
| // handed out), so any same-file `extends` in the including file still | ||
| // resolves and the result is identical to loading it each time. | ||
| key := includeKey(r.Path, config.WorkingDir, r.ProjectDirectory, config.Environment) | ||
| imported, ok := cache.get(key) | ||
| if !ok { | ||
| imported, err = loadYamlModel(ctx, config, loadOptions, &cycleTracker{}, included) | ||
| if err != nil { | ||
| return err | ||
| } | ||
| cache.put(key, imported) | ||
|
Comment on lines
+274
to
+280
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is a silent change to the public When That isn't a cosmetic observability change:
I'd suggest preserving the contract by recording events on first load and replaying them on cache hit. |
||
| } | ||
| err = importResources(imported, model, processor) | ||
| if err != nil { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,6 +18,7 @@ package loader | |
|
|
||
| import ( | ||
| "context" | ||
| "fmt" | ||
| "os" | ||
| "path/filepath" | ||
| "runtime" | ||
|
|
@@ -243,3 +244,47 @@ func createFileSubDir(t *testing.T, rootDir, subDir, content, fileName string) { | |
| path := filepath.Join(subDirPath, fileName) | ||
| assert.NilError(t, os.WriteFile(path, []byte(content), 0o600)) | ||
| } | ||
|
|
||
| // TestIncludeDiamondDedup builds a deep "diamond" include graph where every | ||
| // level includes the next level twice. Without include memoization the leaf is | ||
| // loaded 2^depth times (exponential); the cache loads each distinct file once. | ||
| // A depth that is trivial when deduplicated (and astronomically large when not) | ||
| // makes this both a correctness and a non-flaky performance regression test. | ||
| func TestIncludeDiamondDedup(t *testing.T) { | ||
| dir := t.TempDir() | ||
| const depth = 24 // 2^24 ~= 16.7M leaf loads without dedup | ||
| for i := 0; i < depth; i++ { | ||
| content := fmt.Sprintf("include:\n - path: ./level%d.yaml\n - path: ./level%d.yaml\n", i+1, i+1) | ||
| assert.NilError(t, os.WriteFile(filepath.Join(dir, fmt.Sprintf("level%d.yaml", i)), []byte(content), 0o600)) | ||
| } | ||
| leaf := "services:\n leaf:\n image: busybox\n" | ||
| assert.NilError(t, os.WriteFile(filepath.Join(dir, fmt.Sprintf("level%d.yaml", depth)), []byte(leaf), 0o600)) | ||
|
|
||
| p, err := LoadWithContext(context.TODO(), types.ConfigDetails{ | ||
| WorkingDir: dir, | ||
| ConfigFiles: []types.ConfigFile{{Filename: filepath.Join(dir, "level0.yaml")}}, | ||
| }, withProjectName("diamond", true)) | ||
| assert.NilError(t, err) | ||
| _, err = p.GetService("leaf") | ||
| assert.NilError(t, err) | ||
| } | ||
|
Comment on lines
+253
to
+270
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice regression test, but the only assertions are A fast, descriptive fail requires running the load in a goroutine and selecting on a timeout. The leaked goroutine on timeout is fine, type result struct {
p *types.Project
err error
}
// LoaderWithContext don't properly support context today, so the timeout below is consumed by the
// select, not by LoadWithContext. Two separate contexts keep that intent clear.
timeout, cancel := context.WithTimeout(t.Context(), 5*time.Second)
defer cancel()
done := make(chan result, 1)
go func() {
p, err := LoadWithContext(t.Context(), types.ConfigDetails{
WorkingDir: dir,
ConfigFiles: []types.ConfigFile{{Filename: filepath.Join(dir, "level0.yaml")}},
}, withProjectName("diamond", true))
done <- result{p, err}
}()
select {
case r := <-done:
assert.NilError(t, r.err)
_, err := r.p.GetService("leaf")
assert.NilError(t, err)
case <-timeout.Done():
t.Fatal("diamond include did not complete within 5s — cache likely not working")
}5s is generous; the current run completes in ~0.18s on my machine. |
||
|
|
||
| func BenchmarkIncludeDiamond(b *testing.B) { | ||
| dir := b.TempDir() | ||
| const depth = 16 | ||
| for i := 0; i < depth; i++ { | ||
| content := fmt.Sprintf("include:\n - path: ./level%d.yaml\n - path: ./level%d.yaml\n", i+1, i+1) | ||
| _ = os.WriteFile(filepath.Join(dir, fmt.Sprintf("level%d.yaml", i)), []byte(content), 0o600) | ||
| } | ||
| _ = os.WriteFile(filepath.Join(dir, fmt.Sprintf("level%d.yaml", depth)), []byte("services:\n leaf:\n image: busybox\n"), 0o600) | ||
| b.ResetTimer() | ||
| for n := 0; n < b.N; n++ { | ||
| _, err := LoadWithContext(context.TODO(), types.ConfigDetails{ | ||
| WorkingDir: dir, | ||
| ConfigFiles: []types.ConfigFile{{Filename: filepath.Join(dir, "level0.yaml")}}, | ||
| }, withProjectName("diamond", true)) | ||
| if err != nil { | ||
| b.Fatal(err) | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The single-byte separators written here (
[]byte{0}on lines 99, 113, 115 and[]byte{1}on lines 101, 103, 105) aren't unambiguous against env values that contain those bytes.types.Mappingismap[string]stringpopulated from.envfiles /process env, where embedded NULs aren't forbidden. Two distinct
(paths, env)tuples can then produce the same byte stream and hash to the same key, wrong cached model served, no error surfaced.Toy example:
env = {"A\x00B": "X"}andenv = {"A": "B\x00X"}both serialize to…A\x00B\x00X\x00through the loop on lines 111-116.Length-prefixing each field removes the ambiguity:
While you're here, a short comment explaining why Substitute / TypeCastMapping are intentionally excluded from the key (invariant across includes within a single load) would help a future contributor who adds a per-subtree option not silently introduce collisions, like