Skip to content

Commit 0db26d7

Browse files
tmshortclaude
andauthored
OCPBUGS-78787: fix(operator-controller): clean up orphaned temp dirs in catalog cache (#2574)
filesystemCache.writeFS creates a temp dir (.{catalog}-{random}) and renames it into place atomically. If the process is interrupted before the rename, the temp dir persists. Each restart adds another, eventually filling the disk. Additionally, writeFS had no defer os.RemoveAll(tmpDir), so any error during WalkMetasReader or the rename step also left the temp dir behind — no process kill required. Two fixes: - Add defer os.RemoveAll(tmpDir) so errors during normal operation clean up. - Add removeOrphanedTempDirs, called at the start of writeFS (under the write mutex), to clean up dirs orphaned by a previous process run. This bounds worst-case accumulation to one orphaned dir per catalog regardless of restart rate. Signed-off-by: Todd Short <tshort@redhat.com> Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent dd1c319 commit 0db26d7

2 files changed

Lines changed: 53 additions & 0 deletions

File tree

internal/operator-controller/catalogmetadata/cache/cache.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"io/fs"
77
"os"
88
"path/filepath"
9+
"strings"
910
"sync"
1011

1112
"github.com/operator-framework/operator-registry/alpha/declcfg"
@@ -75,10 +76,15 @@ func (fsc *filesystemCache) Put(catalogName, resolvedRef string, source io.Reade
7576
func (fsc *filesystemCache) writeFS(catalogName string, source io.Reader) (fs.FS, error) {
7677
cacheDir := fsc.cacheDir(catalogName)
7778

79+
if err := fsc.removeOrphanedTempDirs(catalogName); err != nil {
80+
return nil, err
81+
}
82+
7883
tmpDir, err := os.MkdirTemp(fsc.cachePath, fmt.Sprintf(".%s-", catalogName))
7984
if err != nil {
8085
return nil, fmt.Errorf("error creating temporary directory to unpack catalog metadata: %v", err)
8186
}
87+
defer os.RemoveAll(tmpDir)
8288

8389
if err := declcfg.WalkMetasReader(source, func(meta *declcfg.Meta, err error) error {
8490
if err != nil {
@@ -164,3 +170,26 @@ func (fsc *filesystemCache) Remove(catalogName string) error {
164170
func (fsc *filesystemCache) cacheDir(catalogName string) string {
165171
return filepath.Join(fsc.cachePath, catalogName)
166172
}
173+
174+
// removeOrphanedTempDirs removes temporary staging directories left behind by a
175+
// previous writeFS call for the given catalog that was interrupted before the
176+
// rename (e.g. pod eviction or crash). Temp dirs use the prefix ".{catalogName}-"
177+
// as created by os.MkdirTemp. This method must be called while the write lock is held.
178+
func (fsc *filesystemCache) removeOrphanedTempDirs(catalogName string) error {
179+
entries, err := os.ReadDir(fsc.cachePath)
180+
if os.IsNotExist(err) {
181+
return nil
182+
}
183+
if err != nil {
184+
return fmt.Errorf("error reading cache directory: %w", err)
185+
}
186+
prefix := fmt.Sprintf(".%s-", catalogName)
187+
for _, entry := range entries {
188+
if strings.HasPrefix(entry.Name(), prefix) {
189+
if err := os.RemoveAll(filepath.Join(fsc.cachePath, entry.Name())); err != nil {
190+
return fmt.Errorf("error removing orphaned temp directory %q: %w", entry.Name(), err)
191+
}
192+
}
193+
}
194+
return nil
195+
}

internal/operator-controller/catalogmetadata/cache/cache_test.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,30 @@ func TestFilesystemCacheRemove(t *testing.T) {
169169
assert.NoDirExists(t, catalogCachePath)
170170
}
171171

172+
func TestFilesystemCachePutCleansOrphanedTempDirs(t *testing.T) {
173+
const catalogName = "test-catalog"
174+
cacheDir := t.TempDir()
175+
c := cache.NewFilesystemCache(cacheDir)
176+
177+
// Simulate temp dirs left behind by a previous interrupted Put for this catalog.
178+
orphan1 := filepath.Join(cacheDir, ".test-catalog-1234567890")
179+
orphan2 := filepath.Join(cacheDir, ".test-catalog-9876543210")
180+
require.NoError(t, os.MkdirAll(orphan1, 0700))
181+
require.NoError(t, os.MkdirAll(orphan2, 0700))
182+
183+
// A temp dir for a different catalog should NOT be removed.
184+
otherOrphan := filepath.Join(cacheDir, ".other-catalog-1111111111")
185+
require.NoError(t, os.MkdirAll(otherOrphan, 0700))
186+
187+
_, err := c.Put(catalogName, "fake/catalog@sha256:fakesha", defaultContent(), nil)
188+
require.NoError(t, err)
189+
190+
assert.NoDirExists(t, orphan1, "orphaned temp dir for catalog should have been removed")
191+
assert.NoDirExists(t, orphan2, "orphaned temp dir for catalog should have been removed")
192+
assert.DirExists(t, otherOrphan, "temp dir for a different catalog should not be removed")
193+
assert.DirExists(t, filepath.Join(cacheDir, catalogName), "real cache dir should exist")
194+
}
195+
172196
func equalFilesystems(expected, actual fs.FS) error {
173197
normalizeJSON := func(data []byte) []byte {
174198
var v interface{}

0 commit comments

Comments
 (0)