From 8a665bd259ae026cdbd58d4dc9c8f37e581f7743 Mon Sep 17 00:00:00 2001 From: droctothorpe Date: Thu, 23 Apr 2026 11:00:35 -0400 Subject: [PATCH 01/15] feat(memoize): add SQL-backed memoization cache Add a SQL database-backed memoization cache alongside the existing ConfigMap-based implementation. This includes the database query and migration layer, controller wiring, auto-derived cache keys based on inputs and executor fingerprinting, per-entry expiration handling, and workflow-namespace scoping for cache entries. The branch also includes the supporting artifact checksum and ETag plumbing, generated API/docs updates, and follow-up fixes for error handling, nil-safety, and CI/documentation issues discovered during review. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: droctothorpe --- .features/pending/sql-memoization-cache.md | 12 + .spelling | 2 + config/config.go | 15 + docs/environment-variables.md | 2 + docs/memoization.md | 79 ++++- docs/workflow-controller-configmap.md | 16 + .../workflow-controller-configmap.yaml | 12 + manifests/quick-start-postgres.yaml | 12 + test/e2e/sqldb_memoize_test.go | 150 +++++++++ util/file/watch.go | 2 +- util/memo/db/config.go | 83 +++++ util/memo/db/migrate.go | 47 +++ util/memo/db/queries.go | 124 +++++++ util/memo/db/queries_test.go | 313 ++++++++++++++++++ workflow/controller/cache/cache.go | 140 +++++++- .../controller/cache/cache_factory_test.go | 36 ++ workflow/controller/cache/configmap_cache.go | 2 +- workflow/controller/cache/sqldb_cache.go | 63 ++++ workflow/controller/cache/sqldb_cache_test.go | 241 ++++++++++++++ workflow/controller/cache_test.go | 2 +- workflow/controller/config.go | 40 +++ workflow/controller/controller.go | 57 +++- workflow/controller/controller_test.go | 2 +- workflow/controller/dag.go | 20 +- workflow/controller/operator.go | 58 +++- workflow/controller/steps.go | 20 +- workflow/controller/taskset.go | 21 +- 27 files changed, 1515 insertions(+), 56 deletions(-) create mode 100644 .features/pending/sql-memoization-cache.md create mode 100644 test/e2e/sqldb_memoize_test.go create mode 100644 util/memo/db/config.go create mode 100644 util/memo/db/migrate.go create mode 100644 util/memo/db/queries.go create mode 100644 util/memo/db/queries_test.go create mode 100644 workflow/controller/cache/cache_factory_test.go create mode 100644 workflow/controller/cache/sqldb_cache.go create mode 100644 workflow/controller/cache/sqldb_cache_test.go diff --git a/.features/pending/sql-memoization-cache.md b/.features/pending/sql-memoization-cache.md new file mode 100644 index 000000000000..b4a956e46cec --- /dev/null +++ b/.features/pending/sql-memoization-cache.md @@ -0,0 +1,12 @@ +Description: Add SQL database-backed memoization cache as an alternative to ConfigMaps. +Authors: [droctothorpe](https://github.com/droctothorpe) +Component: General +Issues: 15952 +PRs: 15938 + +Memoization can now store cache entries in a PostgreSQL or MySQL database instead of Kubernetes ConfigMaps. +The SQL backend removes the 1 MB ConfigMap size limit and persists cache entries across cluster restarts. +ConfigMaps remain the default; opt in by adding a `memoization` block to the `workflow-controller-configmap`. +Each cache entry computes an `expires_at` timestamp at save time from the template's `maxAge` field (default: 30 days). +The default max age can be overridden via the `DEFAULT_MAX_AGE` environment variable on the controller. +A periodic garbage collector deletes expired entries whose `expires_at` has elapsed. diff --git a/.spelling b/.spelling index 06a13734e8a9..2c4577cb372e 100644 --- a/.spelling +++ b/.spelling @@ -151,6 +151,7 @@ args async auth backend +backends backoff backport backported @@ -175,6 +176,7 @@ entrypoint enum env errored +expires_at expr fibonacci filename diff --git a/config/config.go b/config/config.go index 37b95844b5a6..210d0a21c49c 100644 --- a/config/config.go +++ b/config/config.go @@ -121,6 +121,10 @@ type Config struct { // Synchronization via databases config Synchronization *SyncConfig `json:"synchronization,omitempty"` + // Memoization configures memoization cache storage. When set, cache entries are stored in a + // database instead of ConfigMaps. ConfigMap-based caching remains the default when omitted. + Memoization *MemoizationConfig `json:"memoization,omitempty"` + // ArtifactDrivers lists artifact driver plugins we can use ArtifactDrivers []ArtifactDriver `json:"artifactDrivers,omitempty"` @@ -353,6 +357,17 @@ type SyncConfig struct { SemaphoreLimitCacheSeconds *int64 `json:"semaphoreLimitCacheSeconds,omitempty"` } +// MemoizationConfig contains memoization cache configuration for database-backed storage. +// When configured, cache entries are stored in the specified database table instead of ConfigMaps. +type MemoizationConfig struct { + DBConfig + // TableName is the name of the table to use for memoization cache entries. + // Defaults to "memoization_cache" if not set. + TableName string `json:"tableName,omitempty"` + // SkipMigration skips automatic database migration on startup. + SkipMigration bool `json:"skipMigration,omitempty"` +} + // ConnectionPool contains database connection pool settings type ConnectionPool struct { // MaxIdleConns sets the maximum number of idle connections in the pool diff --git a/docs/environment-variables.md b/docs/environment-variables.md index 1fe05d1b311f..72ba80321b27 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -27,6 +27,7 @@ This document outlines environment variables that can be used to customize behav | `CACHE_GC_PERIOD` | `time.Duration` | `0s` | How often to perform memoization cache GC, which is disabled by default and can be enabled by providing a non-zero duration. | | `CACHE_GC_AFTER_NOT_HIT_DURATION` | `time.Duration` | `30s` | When a memoization cache has not been hit after this duration, it will be deleted. | | `CRON_SYNC_PERIOD` | `time.Duration` | `10s` | How often to sync cron workflows. | +| `DEFAULT_MAX_AGE` | `string` | `""` (30 days) | Default TTL for SQL-backed memoization cache entries when `memoize.maxAge` is not set on the template. Accepts a Go duration string (e.g. `720h`) or an integer number of seconds. If unset, entries expire after 30 days. | | `DEFAULT_REQUEUE_TIME` | `time.Duration` | `10s` | The re-queue time for the rate limiter of the workflow queue. | | `DISABLE_MAX_RECURSION` | `bool` | `false` | Set to true to disable the recursion preventer, which will stop a workflow running which has called into a child template 100 times | | `EXPRESSION_TEMPLATES` | `bool` | `true` | Escape hatch to disable expression templates. | @@ -40,6 +41,7 @@ This document outlines environment variables that can be used to customize behav | `LEADER_ELECTION_RENEW_DEADLINE` | `time.Duration` | `10s` | The duration that the acting master will retry refreshing leadership before giving up. | | `LEADER_ELECTION_RETRY_PERIOD` | `time.Duration` | `5s` | The duration that the leader election clients should wait between tries of actions. | | `MAX_OPERATION_TIME` | `time.Duration` | `30s` | The maximum time a workflow operation is allowed to run for before re-queuing the workflow onto the work queue. | +| `MEMO_CACHE_GC_PERIOD` | `time.Duration` | `24h` | How often the SQL-backed memoization cache garbage collector runs to prune entries that have exceeded their TTL. | | `OFFLOAD_NODE_STATUS_TTL` | `time.Duration` | `5m` | The TTL to delete the offloaded node status. Currently only used for testing. | | `OPERATION_DURATION_METRIC_BUCKET_COUNT` | `int` | `6` | The number of buckets to collect the metric for the operation duration. | | `POD_NAMES` | `string` | `v2` | Whether to have pod names contain the template name (v2) or be the node id (v1) - should be set the same for Argo Server. | diff --git a/docs/memoization.md b/docs/memoization.md index 02d6ee4dbb92..5616891b821c 100644 --- a/docs/memoization.md +++ b/docs/memoization.md @@ -14,14 +14,70 @@ If you are using workflows prior to version 3.5 you should look at the [work avo In version 3.5 or later all steps can be memoized, whether or not they have outputs. -## Cache Method +## Cache Backends -Currently, the cached data is stored in config-maps. +Argo Workflows supports two backends for storing memoization cache entries: + +### ConfigMap (default) + +By default, cached data is stored in Kubernetes ConfigMaps. This allows you to easily manipulate cache entries manually through `kubectl` and the Kubernetes API without having to go through Argo. -All cache config-maps must have the label `workflows.argoproj.io/configmap-type: Cache` to be used as a cache. This prevents accidental access to other important config-maps in the system +All cache ConfigMaps must have the label `workflows.argoproj.io/configmap-type: Cache` to be used as a cache. This prevents accidental access to other important ConfigMaps in the system. + +### SQL Database + +> v4.0 and after + +Alternatively, cache entries can be stored in a PostgreSQL or MySQL database. This is recommended for production use — it has no size limits, supports long-term persistence, and includes automatic garbage collection. + +To enable SQL-backed memoization, add a `memoization` block to the `workflow-controller-configmap`: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: workflow-controller-configmap + namespace: argo +data: + memoization: | + tableName: memoization_cache + postgresql: + host: postgres + port: 5432 + database: postgres + userNameSecret: + name: argo-postgres-config + key: username + passwordSecret: + name: argo-postgres-config + key: password +``` + +Each cache entry computes an `expires_at` timestamp at save time from the template's `maxAge` field. If `maxAge` is not specified on the template, it defaults to 30 days (2592000 seconds). This default can be overridden by setting the `DEFAULT_MAX_AGE` environment variable on the workflow controller (accepts Go duration strings like `720h` or integer seconds like `2592000`). + +The garbage collector periodically deletes entries whose `expires_at` has elapsed. The GC period defaults to 24 hours and can be configured via the `MEMO_CACHE_GC_PERIOD` environment variable. + +MySQL is also supported: + +```yaml + memoization: | + tableName: memoization_cache + mysql: + host: mysql + port: 3306 + database: argo + userNameSecret: + name: argo-mysql-config + key: username + passwordSecret: + name: argo-mysql-config + key: password +``` ## Using Memoization +Memoization is configured at the template level via the `memoize` field. + Memoization is set at the template level. You must specify a `key`, which can be static strings but more often depend on inputs. You must also specify a name for the `config-map` cache. Optionally you can set a `maxAge` in seconds or hours (e.g. `180s`, `24h`) to define how long should it be considered valid. If an entry is older than the `maxAge`, it will be ignored. @@ -43,18 +99,27 @@ spec: name: print-message-cache ``` +### Fields + +| Field | Required | Description | +|-------|----------|-------------| +| `key` | Yes | The cache lookup key. | +| `cache` | Yes | Specifies the cache storage. When using the ConfigMap backend, a ConfigMap is created. When using the SQL backend, `cache.configMap.name` acts as a logical group name in the database — no ConfigMap is created. | +| `maxAge` | Yes | Maximum age of a cache entry (e.g. `"180s"`, `"24h"`). Entries older than this are treated as misses at lookup time. | + [Find a simple example for memoization here](https://github.com/argoproj/argo-workflows/blob/main/examples/memoize-simple.yaml). !!! Note - In order to use memoization it is necessary to add the verbs `create` and `update` to the `configmaps` resource for the appropriate (cluster) roles. In the case of a cluster install the `argo-cluster-role` cluster role should be updated, whilst for a namespace install the `argo-role` role should be updated. + To use memoization with the ConfigMap backend, add the verbs `create` and `update` to the `configmaps` resource for the appropriate (cluster) roles. For a cluster install, update the `argo-cluster-role` cluster role; for a namespace install, update the `argo-role` role. This is not required when using the SQL database backend. ## FAQ 1. If you see errors like `error creating cache entry: ConfigMap \"reuse-task\" is invalid: []: Too long: must have at most 1048576 characters`, this is due to [the 1MB limit placed on the size of `ConfigMap`](https://github.com/kubernetes/kubernetes/issues/19781). Here are a couple of ways that might help resolve this: - * Delete the existing `ConfigMap` cache or switch to use a different cache. - * Reduce the size of the output parameters for the nodes that are being memoized. - * Split your cache into different memoization keys and cache names so that each cache entry is small. + - Delete the existing `ConfigMap` cache or switch to use a different cache. + - Reduce the size of the output parameters for the nodes that are being memoized. + - Split your cache into different memoization keys and cache names so that each cache entry is small. + - Switch to the SQL database backend which has no size limit. 1. My step isn't getting memoized, why not? If you are running workflows <3.5 ensure that you have specified at least one output on the step. diff --git a/docs/workflow-controller-configmap.md b/docs/workflow-controller-configmap.md index 4fe1a47456ca..999dd99331de 100644 --- a/docs/workflow-controller-configmap.md +++ b/docs/workflow-controller-configmap.md @@ -96,6 +96,7 @@ Config contains the root of the configuration settings for the workflow controll | `NavColor` | `string` | NavColor is an ui navigation bar background color | | `SSO` | [`SSOConfig`](#ssoconfig) | SSO in settings for single-sign on | | `Synchronization` | [`SyncConfig`](#syncconfig) | Synchronization via databases config | +| `Memoization` | [`MemoizationConfig`](#memoizationconfig) | Memoization configures memoization cache storage. When set, cache entries are stored in a database instead of ConfigMaps. ConfigMap-based caching remains the default when omitted. | | `ArtifactDrivers` | `Array<`[`ArtifactDriver`](#artifactdriver)`>` | ArtifactDrivers lists artifact driver plugins we can use | | `FailedPodRestart` | [`FailedPodRestartConfig`](#failedpodrestartconfig) | FailedPodRestart configures automatic restart of pods that fail before entering Running state (e.g., due to Eviction, DiskPressure, Preemption). This allows recovery from transient infrastructure issues without requiring a retryStrategy on templates. | @@ -369,6 +370,21 @@ SyncConfig contains synchronization configuration for database locks (semaphores | `InactiveControllerSeconds` | `int` | InactiveControllerSeconds specifies when to consider a controller dead, if not set, the default value is 300 seconds | | `SemaphoreLimitCacheSeconds` | `int64` | SemaphoreLimitCacheSeconds specifies the duration in seconds before the workflow controller will re-fetch the limit for a semaphore from its associated data source. Defaults to 0 seconds (re-fetch every time the semaphore is checked). | +## MemoizationConfig + +MemoizationConfig contains memoization cache configuration for database-backed storage. When configured, cache entries are stored in the specified database table instead of ConfigMaps. + +### Fields + +| Field Name | Field Type | Description | +|---------------------|-------------------------------------------|----------------------------------------------------------------------------------------------------------------------| +| `PostgreSQL` | [`PostgreSQLConfig`](#postgresqlconfig) | PostgreSQL configuration for PostgreSQL database, don't use MySQL at the same time | +| `MySQL` | [`MySQLConfig`](#mysqlconfig) | MySQL configuration for MySQL database, don't use PostgreSQL at the same time | +| `ConnectionPool` | [`ConnectionPool`](#connectionpool) | Pooled connection settings for all types of database connections | +| `DBReconnectConfig` | [`DBReconnectConfig`](#dbreconnectconfig) | DBReconnectConfig are configuration options for database retries and reconnections | +| `TableName` | `string` | TableName is the name of the table to use for memoization cache entries. Defaults to "memoization_cache" if not set. | +| `SkipMigration` | `bool` | SkipMigration skips automatic database migration on startup. | + ## ArtifactDriver ArtifactDriver is a plugin for an artifact driver diff --git a/manifests/components/postgres/overlays/workflow-controller-configmap.yaml b/manifests/components/postgres/overlays/workflow-controller-configmap.yaml index a903daff5a7e..b64030f6b7a8 100644 --- a/manifests/components/postgres/overlays/workflow-controller-configmap.yaml +++ b/manifests/components/postgres/overlays/workflow-controller-configmap.yaml @@ -43,6 +43,18 @@ data: passwordSecret: name: argo-postgres-config key: password + memoization: | + tableName: memoization_cache + postgresql: + host: postgres + port: 5432 + database: postgres + userNameSecret: + name: argo-postgres-config + key: username + passwordSecret: + name: argo-postgres-config + key: password retentionPolicy: | completed: 10 failed: 3 diff --git a/manifests/quick-start-postgres.yaml b/manifests/quick-start-postgres.yaml index cd87bd93d17a..be01eb79e183 100644 --- a/manifests/quick-start-postgres.yaml +++ b/manifests/quick-start-postgres.yaml @@ -183471,6 +183471,18 @@ data: - name: Completed Workflows scope: workflow-list url: http://workflows?label=workflows.argoproj.io/completed=true + memoization: | + tableName: memoization_cache + postgresql: + host: postgres + port: 5432 + database: postgres + userNameSecret: + name: argo-postgres-config + key: username + passwordSecret: + name: argo-postgres-config + key: password metricsConfig: | enabled: true path: /metrics diff --git a/test/e2e/sqldb_memoize_test.go b/test/e2e/sqldb_memoize_test.go new file mode 100644 index 000000000000..02a46a99597d --- /dev/null +++ b/test/e2e/sqldb_memoize_test.go @@ -0,0 +1,150 @@ +//go:build corefunctional + +package e2e + +import ( + "context" + "fmt" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/suite" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + wfv1 "github.com/argoproj/argo-workflows/v4/pkg/apis/workflow/v1alpha1" + "github.com/argoproj/argo-workflows/v4/test/e2e/fixtures" + "github.com/argoproj/argo-workflows/v4/util/logging" + memodb "github.com/argoproj/argo-workflows/v4/util/memo/db" + "github.com/argoproj/argo-workflows/v4/util/sqldb" +) + +type SQLDBMemoizeSuite struct { + fixtures.E2ESuite +} + +// memoWorkflow builds a workflow spec with a unique cache key per test run to avoid +// stale cache hits from previous runs. +func memoWorkflow(cacheKey string) string { + return fmt.Sprintf(`apiVersion: argoproj.io/v1alpha1 +kind: Workflow +metadata: + generateName: sqldb-memoize- +spec: + entrypoint: hello + templates: + - name: hello + steps: + - - name: run1 + template: memoized + arguments: + parameters: [{name: message, value: "%s"}] + - - name: run2 + template: memoized + arguments: + parameters: [{name: message, value: "%s"}] + - name: memoized + inputs: + parameters: + - name: message + memoize: + key: "{{inputs.parameters.message}}" + maxAge: "10m" + cache: + configMap: + name: sqldb-memo-cache + container: + image: argoproj/argosay:v2 + command: [echo] + args: ["{{inputs.parameters.message}}"] +`, cacheKey, cacheKey) +} + +func (s *SQLDBMemoizeSuite) TestSQLDBMemoize() { + if s.Config.Memoization == nil { + s.T().Skip("memoization DB not configured; skipping SQL cache test") + } + + ctx := logging.TestContext(s.T().Context()) + + // Use a unique key so each test run starts with a cold cache. + cacheKey := fmt.Sprintf("hello-sqldb-%d", time.Now().UnixNano()) + + // Submit the workflow and wait for it to succeed. + s.Given(). + Workflow(memoWorkflow(cacheKey)). + When(). + SubmitWorkflow(). + WaitForWorkflow(fixtures.ToBeSucceeded). + Then(). + ExpectWorkflow(func(t *testing.T, _ *metav1.ObjectMeta, status *wfv1.WorkflowStatus) { + memoHit := false + memoSaved := false + for _, node := range status.Nodes { + if node.MemoizationStatus == nil { + continue + } + if node.MemoizationStatus.Hit { + memoHit = true + } else { + memoSaved = true + } + } + assert.True(t, memoSaved, "expected at least one node to save to the cache") + assert.True(t, memoHit, "expected at least one node to hit the cache") + }) + + // Also verify the entry landed in postgres, not a ConfigMap. + s.assertDBCacheEntry(ctx, cacheKey) + s.assertNoConfigMap(ctx, "sqldb-memo-cache") +} + +// assertDBCacheEntry checks the memoization_cache table directly. +func (s *SQLDBMemoizeSuite) assertDBCacheEntry(ctx context.Context, key string) { + memoCfg := s.Config.Memoization + // E2E tests connect to postgres via a port-forward on localhost. + cfg := *memoCfg + if cfg.PostgreSQL != nil { + pg := *cfg.PostgreSQL + pg.Host = "localhost" + cfg.PostgreSQL = &pg + } + if cfg.MySQL != nil { + my := *cfg.MySQL + my.Host = "localhost" + cfg.MySQL = &my + } + + session, _, err := sqldb.CreateDBSession(ctx, s.KubeClient, fixtures.Namespace, cfg.DBConfig) + s.Require().NoError(err, "could not connect to memoization DB") + defer session.Close() + + tableName := memodb.TableName(&cfg) + + var count int + row, err := session.SQL(). + QueryRow(fmt.Sprintf(`SELECT COUNT(*) FROM %s WHERE namespace = ? AND cache_name = ? AND cache_key = ?`, tableName), + fixtures.Namespace, "sqldb-memo-cache", key) + s.Require().NoError(err) + s.Require().NoError(row.Scan(&count)) + s.Equal(1, count, "expected exactly one cache entry in the database for key %q", key) + + // Also verify outputs are stored as valid JSON. + var outputs string + row, err = session.SQL(). + QueryRow(fmt.Sprintf(`SELECT outputs FROM %s WHERE namespace = ? AND cache_name = ? AND cache_key = ?`, tableName), + fixtures.Namespace, "sqldb-memo-cache", key) + s.Require().NoError(err) + s.Require().NoError(row.Scan(&outputs)) + s.NotEmpty(outputs) +} + +// assertNoConfigMap verifies the controller did NOT fall back to creating a ConfigMap cache. +func (s *SQLDBMemoizeSuite) assertNoConfigMap(ctx context.Context, name string) { + _, err := s.KubeClient.CoreV1().ConfigMaps(fixtures.Namespace).Get(ctx, name, metav1.GetOptions{}) + s.Error(err, "ConfigMap %q should not exist when SQL memoization is configured", name) +} + +func TestSQLDBMemoizeSuite(t *testing.T) { + suite.Run(t, new(SQLDBMemoizeSuite)) +} diff --git a/util/file/watch.go b/util/file/watch.go index a68dfbb7c4aa..6238c746cd56 100644 --- a/util/file/watch.go +++ b/util/file/watch.go @@ -113,7 +113,7 @@ func watchFilePoll(ctx context.Context, path string, onChange func()) error { } return err } - if last == nil || fi.ModTime() != last.ModTime() || fi.Size() != last.Size() { + if last == nil || !fi.ModTime().Equal(last.ModTime()) || fi.Size() != last.Size() { onChange() last = fi } diff --git a/util/memo/db/config.go b/util/memo/db/config.go new file mode 100644 index 000000000000..a1acda76e1c7 --- /dev/null +++ b/util/memo/db/config.go @@ -0,0 +1,83 @@ +package db + +import ( + "context" + + "k8s.io/client-go/kubernetes" + + "github.com/argoproj/argo-workflows/v4/config" + "github.com/argoproj/argo-workflows/v4/util/logging" + "github.com/argoproj/argo-workflows/v4/util/sqldb" +) + +const ( + defaultTableName = "memoization_cache" + versionTable = "memoization_schema_history" +) + +// Config holds resolved configuration for database-backed memoization. +type Config struct { + TableName string + SkipMigration bool +} + +func TableName(cfg *config.MemoizationConfig) string { + if cfg == nil || cfg.TableName == "" { + return defaultTableName + } + return cfg.TableName +} + +// ConfigFromConfig converts a controller MemoizationConfig (with DB credentials, connection +// settings, etc.) into the smaller Config struct used by the migration and query layers. +// Returns sensible defaults when cfg is nil. +func ConfigFromConfig(cfg *config.MemoizationConfig) Config { + if cfg == nil { + return Config{TableName: defaultTableName} + } + return Config{ + TableName: TableName(cfg), + SkipMigration: cfg.SkipMigration, + } +} + +// SessionProxyFromConfig creates a SessionProxy from a MemoizationConfig, returning nil and logging +// an error if the connection cannot be established. Callers that receive nil should fall back to +// ConfigMap-based caching. +func SessionProxyFromConfig(ctx context.Context, kubectlConfig kubernetes.Interface, namespace string, cfg *config.MemoizationConfig) *sqldb.SessionProxy { + if cfg == nil { + return nil + } + sessionProxy, err := sqldb.NewSessionProxy(ctx, sqldb.SessionProxyConfig{ + KubectlConfig: kubectlConfig, + Namespace: namespace, + DBConfig: cfg.DBConfig, + }) + if err != nil { + log := logging.RequireLoggerFromContext(ctx) + log.WithError(err).Error(ctx, "unable to create memoization database connection") + return nil + } + sqldb.ConfigureDBSession(sessionProxy.Session(), cfg.ConnectionPool) + return sessionProxy +} + +// Migrate runs database migrations for the memoization cache table. It is a no-op when +// cfg.SkipMigration is true. Returns an error if migration fails; callers should fall back +// to ConfigMap-based caching. +func Migrate(ctx context.Context, sessionProxy *sqldb.SessionProxy, cfg Config) error { + if sessionProxy == nil { + return nil + } + logger := logging.RequireLoggerFromContext(ctx) + if cfg.SkipMigration { + logger.Info(ctx, "Memoization db migration skipped") + return nil + } + logger.Info(ctx, "Running memoization db migration") + if err := migrate(ctx, sessionProxy.Session(), sessionProxy.DBType(), cfg.TableName); err != nil { + return err + } + logger.Info(ctx, "Memoization db migration complete") + return nil +} diff --git a/util/memo/db/migrate.go b/util/memo/db/migrate.go new file mode 100644 index 000000000000..f0aa92af039c --- /dev/null +++ b/util/memo/db/migrate.go @@ -0,0 +1,47 @@ +package db + +import ( + "context" + "fmt" + "regexp" + + "github.com/upper/db/v4" + + "github.com/argoproj/argo-workflows/v4/util/sqldb" +) + +var validTableName = regexp.MustCompile(`^[A-Za-z0-9_]+$`) + +func migrate(ctx context.Context, session db.Session, dbType sqldb.DBType, tableName string) error { + if !validTableName.MatchString(tableName) { + return fmt.Errorf("invalid table name %q: must match [A-Za-z0-9_]+", tableName) + } + return sqldb.Migrate(ctx, session, dbType, versionTable, []sqldb.Change{ + // MySQL: use LONGTEXT for outputs (TEXT is 64KB). + // Postgres: use text for outputs (no size limit). + // Varchar sizes chosen to keep composite PK within InnoDB's 3072-byte limit with utf8mb4: + // (64 + 128 + 256) * 4 = 1792 bytes. + sqldb.ByType(dbType, sqldb.TypedChanges{ + sqldb.Postgres: sqldb.AnsiSQLChange(`create table if not exists ` + tableName + ` ( + namespace varchar(64) not null, + cache_name varchar(128) not null, + cache_key varchar(256) not null, + node_id text not null, + outputs text not null, + created_at timestamp not null, + expires_at timestamp not null, + primary key (namespace, cache_name, cache_key) +)`), + sqldb.MySQL: sqldb.AnsiSQLChange("create table if not exists " + tableName + " (" + + "namespace varchar(64) not null, " + + "cache_name varchar(128) not null, " + + "cache_key varchar(256) not null, " + + "node_id text not null, " + + "outputs longtext not null, " + + "created_at timestamp not null, " + + "expires_at timestamp not null, " + + "primary key (namespace, cache_name, cache_key))"), + }), + sqldb.AnsiSQLChange(`create index imemo_expires_at on ` + tableName + ` (expires_at)`), + }) +} diff --git a/util/memo/db/queries.go b/util/memo/db/queries.go new file mode 100644 index 000000000000..fab91950407f --- /dev/null +++ b/util/memo/db/queries.go @@ -0,0 +1,124 @@ +package db + +import ( + "context" + "encoding/json" + "fmt" + "time" + + "github.com/upper/db/v4" + + wfv1 "github.com/argoproj/argo-workflows/v4/pkg/apis/workflow/v1alpha1" + "github.com/argoproj/argo-workflows/v4/util/sqldb" +) + +// CacheRecord is the database row for a single memoization cache entry. +type CacheRecord struct { + Namespace string `db:"namespace"` + CacheName string `db:"cache_name"` + CacheKey string `db:"cache_key"` + NodeID string `db:"node_id"` + Outputs string `db:"outputs"` // JSON + CreatedAt time.Time `db:"created_at"` + ExpiresAt time.Time `db:"expires_at"` +} + +// Queries provides database operations for the memoization cache table. +type Queries struct { + tableName string + dbType sqldb.DBType +} + +func NewQueries(tableName string, dbType sqldb.DBType) (*Queries, error) { + if !validTableName.MatchString(tableName) { + return nil, fmt.Errorf("invalid table name %q: must match [A-Za-z0-9_]+", tableName) + } + return &Queries{tableName: tableName, dbType: dbType}, nil +} + +// Load retrieves the outputs for the given cache key. +// Returns nil when the entry does not exist. +func (q *Queries) Load(ctx context.Context, sp *sqldb.SessionProxy, namespace, cacheName, cacheKey string) (*CacheRecord, error) { + var r CacheRecord + var found bool + err := sp.With(ctx, func(sess db.Session) error { + // Use raw SQL to avoid upper/db ORM timestamp scanning issues with + // "timestamp without timezone" columns (the ORM may not populate time.Time fields). + var query string + switch q.dbType { + case sqldb.Postgres: + query = fmt.Sprintf(`SELECT namespace, cache_name, cache_key, node_id, outputs, created_at, expires_at FROM %s WHERE namespace = $1 AND cache_name = $2 AND cache_key = $3`, q.tableName) + case sqldb.MySQL: + query = fmt.Sprintf("SELECT namespace, cache_name, cache_key, node_id, outputs, created_at, expires_at FROM %s WHERE namespace = ? AND cache_name = ? AND cache_key = ?", q.tableName) + default: + return fmt.Errorf("unsupported database type: %s", q.dbType) + } + rows, err := sess.SQL().QueryContext(ctx, query, namespace, cacheName, cacheKey) + if err != nil { + return err + } + defer rows.Close() + if !rows.Next() { + return rows.Err() + } + found = true + return rows.Scan(&r.Namespace, &r.CacheName, &r.CacheKey, &r.NodeID, &r.Outputs, &r.CreatedAt, &r.ExpiresAt) + }) + if err != nil || !found { + return nil, err + } + return &r, nil +} + +// Prune deletes cache entries whose expires_at has elapsed. It is called +// periodically by the controller to bound the size of the memoization_cache table. +func (q *Queries) Prune(ctx context.Context, sp *sqldb.SessionProxy) (int64, error) { + now := time.Now().UTC() + var n int64 + err := sp.With(ctx, func(sess db.Session) error { + var query string + switch q.dbType { + case sqldb.Postgres: + query = fmt.Sprintf(`DELETE FROM %s WHERE expires_at < $1`, q.tableName) + case sqldb.MySQL: + query = fmt.Sprintf("DELETE FROM %s WHERE expires_at < ?", q.tableName) + default: + return fmt.Errorf("unsupported database type: %s", q.dbType) + } + result, err := sess.SQL().ExecContext(ctx, query, now) + if err != nil { + return err + } + n, err = result.RowsAffected() + return err + }) + return n, err +} + +func (q *Queries) Save(ctx context.Context, sp *sqldb.SessionProxy, namespace, cacheName, cacheKey, nodeID string, outputs *wfv1.Outputs, maxAgeSeconds int64) error { + outputsJSON, err := json.Marshal(outputs) + if err != nil { + return fmt.Errorf("unable to marshal memoization outputs: %w", err) + } + outputsStr := string(outputsJSON) + now := time.Now().UTC() + expiresAt := now.Add(time.Duration(maxAgeSeconds) * time.Second) + return sp.With(ctx, func(sess db.Session) error { + switch q.dbType { + case sqldb.Postgres: + _, err := sess.SQL().ExecContext(ctx, + fmt.Sprintf(`INSERT INTO %s (namespace, cache_name, cache_key, node_id, outputs, created_at, expires_at) +VALUES ($1, $2, $3, $4, $5, $6, $7) +ON CONFLICT (namespace, cache_name, cache_key) DO UPDATE SET node_id = $4, outputs = $5, expires_at = $7`, q.tableName), + namespace, cacheName, cacheKey, nodeID, outputsStr, now, expiresAt) + return err + case sqldb.MySQL: + _, err := sess.SQL().ExecContext(ctx, + fmt.Sprintf("INSERT INTO %s (namespace, cache_name, cache_key, node_id, outputs, created_at, expires_at) VALUES (?, ?, ?, ?, ?, ?, ?) ON DUPLICATE KEY UPDATE node_id = ?, outputs = ?, expires_at = ?", q.tableName), + namespace, cacheName, cacheKey, nodeID, outputsStr, now, expiresAt, nodeID, outputsStr, expiresAt) + return err + default: + return fmt.Errorf("unsupported database type: %s", q.dbType) + } + }) +} diff --git a/util/memo/db/queries_test.go b/util/memo/db/queries_test.go new file mode 100644 index 000000000000..49f8f49d931a --- /dev/null +++ b/util/memo/db/queries_test.go @@ -0,0 +1,313 @@ +//go:build !windows + +package db_test + +import ( + "context" + "strconv" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + testcontainers "github.com/testcontainers/testcontainers-go" + testmysql "github.com/testcontainers/testcontainers-go/modules/mysql" + testpostgres "github.com/testcontainers/testcontainers-go/modules/postgres" + "github.com/testcontainers/testcontainers-go/wait" + + "github.com/argoproj/argo-workflows/v4/config" + wfv1 "github.com/argoproj/argo-workflows/v4/pkg/apis/workflow/v1alpha1" + "github.com/argoproj/argo-workflows/v4/util/logging" + memodb "github.com/argoproj/argo-workflows/v4/util/memo/db" + "github.com/argoproj/argo-workflows/v4/util/sqldb" +) + +const ( + testDBName = "memotest" + testDBUser = "user" + testDBPassword = "pass" + testTableName = "memoization_cache" + testNamespace = "default" + testCacheName = "my-cache" +) + +// setupPostgres starts a throwaway Postgres container and returns a migrated SessionProxy. +func setupPostgres(ctx context.Context, t *testing.T) *sqldb.SessionProxy { + t.Helper() + pg, err := testpostgres.Run(ctx, + "postgres:17.4-alpine", + testpostgres.WithDatabase(testDBName), + testpostgres.WithUsername(testDBUser), + testpostgres.WithPassword(testDBPassword), + testcontainers.WithWaitStrategy( + wait.ForLog("database system is ready to accept connections"). + WithOccurrence(2). + WithStartupTimeout(30*time.Second), + ), + ) + require.NoError(t, err) + t.Cleanup(func() { + if termErr := testcontainers.TerminateContainer(pg); termErr != nil { + t.Logf("failed to terminate postgres container: %s", termErr) + } + }) + + host, err := pg.Host(ctx) + require.NoError(t, err) + portStr, err := pg.MappedPort(ctx, "5432/tcp") + require.NoError(t, err) + port, err := strconv.Atoi(portStr.Port()) + require.NoError(t, err) + + dbCfg := config.DBConfig{ + PostgreSQL: &config.PostgreSQLConfig{ + DatabaseConfig: config.DatabaseConfig{ + Host: host, + Port: port, + Database: testDBName, + }, + }, + } + sp, err := sqldb.NewSessionProxy(ctx, sqldb.SessionProxyConfig{ + DBConfig: dbCfg, + Username: testDBUser, + Password: testDBPassword, + MaxRetries: 5, + BaseDelay: 200 * time.Millisecond, + MaxDelay: 10 * time.Second, + }) + require.NoError(t, err) + t.Cleanup(func() { _ = sp.Close() }) + + memoCfg := &config.MemoizationConfig{ + DBConfig: dbCfg, + TableName: testTableName, + } + require.NoError(t, memodb.Migrate(ctx, sp, memodb.ConfigFromConfig(memoCfg))) + return sp +} + +// setupMySQL starts a throwaway MySQL container and returns a migrated SessionProxy. +func setupMySQL(ctx context.Context, t *testing.T) *sqldb.SessionProxy { + t.Helper() + my, err := testmysql.Run(ctx, + "mysql:8.4.5", + testmysql.WithDatabase(testDBName), + testmysql.WithUsername(testDBUser), + testmysql.WithPassword(testDBPassword), + ) + require.NoError(t, err) + t.Cleanup(func() { + if termErr := testcontainers.TerminateContainer(my); termErr != nil { + t.Logf("failed to terminate mysql container: %s", termErr) + } + }) + + host, err := my.Host(ctx) + require.NoError(t, err) + portStr, err := my.MappedPort(ctx, "3306/tcp") + require.NoError(t, err) + port, err := strconv.Atoi(portStr.Port()) + require.NoError(t, err) + + dbCfg := config.DBConfig{ + MySQL: &config.MySQLConfig{ + DatabaseConfig: config.DatabaseConfig{ + Host: host, + Port: port, + Database: testDBName, + }, + }, + } + sp, err := sqldb.NewSessionProxy(ctx, sqldb.SessionProxyConfig{ + DBConfig: dbCfg, + Username: testDBUser, + Password: testDBPassword, + MaxRetries: 5, + BaseDelay: 200 * time.Millisecond, + MaxDelay: 10 * time.Second, + }) + require.NoError(t, err) + t.Cleanup(func() { _ = sp.Close() }) + + memoCfg := &config.MemoizationConfig{ + DBConfig: dbCfg, + TableName: testTableName, + } + require.NoError(t, memodb.Migrate(ctx, sp, memodb.ConfigFromConfig(memoCfg))) + return sp +} + +func sampleOutputs(message string) *wfv1.Outputs { + return &wfv1.Outputs{ + Parameters: []wfv1.Parameter{ + {Name: "result", Value: wfv1.AnyStringPtr(message)}, + }, + } +} + +func TestQueriesSaveAndLoad(t *testing.T) { + ctx := logging.TestContext(t.Context()) + sp := setupPostgres(ctx, t) + q, err := memodb.NewQueries(testTableName, sqldb.Postgres) + require.NoError(t, err) + + // Load returns nil when no entry exists. + rec, err := q.Load(ctx, sp, testNamespace, testCacheName, "key1") + require.NoError(t, err) + assert.Nil(t, rec, "expected nil for missing key") + + // Save an entry and load it back. + require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "key1", "node-abc", sampleOutputs("hello"), 2592000)) + rec, err = q.Load(ctx, sp, testNamespace, testCacheName, "key1") + require.NoError(t, err) + require.NotNil(t, rec) + assert.Equal(t, "node-abc", rec.NodeID) + assert.Contains(t, rec.Outputs, "hello") +} + +func TestQueriesNamespaceIsolation(t *testing.T) { + ctx := logging.TestContext(t.Context()) + sp := setupPostgres(ctx, t) + q, err := memodb.NewQueries(testTableName, sqldb.Postgres) + require.NoError(t, err) + + // Save the same cache_name+cache key in two different namespaces. + require.NoError(t, q.Save(ctx, sp, "ns-a", testCacheName, "shared-key", "node-a", sampleOutputs("from-a"), 2592000)) + require.NoError(t, q.Save(ctx, sp, "ns-b", testCacheName, "shared-key", "node-b", sampleOutputs("from-b"), 2592000)) + + // Each namespace should see its own entry. + recA, err := q.Load(ctx, sp, "ns-a", testCacheName, "shared-key") + require.NoError(t, err) + require.NotNil(t, recA) + assert.Equal(t, "node-a", recA.NodeID) + assert.Contains(t, recA.Outputs, "from-a") + + recB, err := q.Load(ctx, sp, "ns-b", testCacheName, "shared-key") + require.NoError(t, err) + require.NotNil(t, recB) + assert.Equal(t, "node-b", recB.NodeID) + assert.Contains(t, recB.Outputs, "from-b") +} + +func TestQueriesSaveReplaces(t *testing.T) { + ctx := logging.TestContext(t.Context()) + sp := setupPostgres(ctx, t) + q, err := memodb.NewQueries(testTableName, sqldb.Postgres) + require.NoError(t, err) + + require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "key3", "node-old", sampleOutputs("old"), 2592000)) + require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "key3", "node-new", sampleOutputs("new"), 2592000)) + + rec, err := q.Load(ctx, sp, testNamespace, testCacheName, "key3") + require.NoError(t, err) + require.NotNil(t, rec) + assert.Equal(t, "node-new", rec.NodeID) + assert.Contains(t, rec.Outputs, "new") +} + +func TestQueriesPruneRemovesOldEntries(t *testing.T) { + ctx := logging.TestContext(t.Context()) + sp := setupPostgres(ctx, t) + q, err := memodb.NewQueries(testTableName, sqldb.Postgres) + require.NoError(t, err) + + // Save an entry with a very short max_age (1 second) and one with 30 days. + require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "old-key", "node-old", sampleOutputs("old"), 1)) + require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "new-key", "node-new", sampleOutputs("new"), 2592000)) + + // Backdate old-key's expires_at so it is in the past. + _, err = sp.Session().SQL(). + ExecContext(ctx, `UPDATE `+testTableName+` SET expires_at = $1 WHERE cache_key = $2`, time.Now().Add(-10*time.Second), "old-key") + require.NoError(t, err) + + // Prune — old-key should be deleted (expires_at < now), new-key should survive. + n, err := q.Prune(ctx, sp) + require.NoError(t, err) + assert.EqualValues(t, 1, n, "expected exactly one row pruned") + + old, err := q.Load(ctx, sp, testNamespace, testCacheName, "old-key") + require.NoError(t, err) + assert.Nil(t, old, "old entry should have been pruned") + + fresh, err := q.Load(ctx, sp, testNamespace, testCacheName, "new-key") + require.NoError(t, err) + assert.NotNil(t, fresh, "new entry should still exist") +} + +func TestQueriesPruneKeepsRecentEntries(t *testing.T) { + ctx := logging.TestContext(t.Context()) + sp := setupPostgres(ctx, t) + q, err := memodb.NewQueries(testTableName, sqldb.Postgres) + require.NoError(t, err) + + require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "recent", "node-1", sampleOutputs("v1"), 2592000)) + + // All entries are recent — nothing should be pruned. + n, err := q.Prune(ctx, sp) + require.NoError(t, err) + assert.EqualValues(t, 0, n, "expected no rows pruned when all entries are fresh") +} + +// MySQL test variants — verify longtext and ON DUPLICATE KEY UPDATE. + +func TestMySQLSaveAndLoad(t *testing.T) { + ctx := logging.TestContext(t.Context()) + sp := setupMySQL(ctx, t) + q, err := memodb.NewQueries(testTableName, sqldb.MySQL) + require.NoError(t, err) + + rec, err := q.Load(ctx, sp, testNamespace, testCacheName, "key1") + require.NoError(t, err) + assert.Nil(t, rec, "expected nil for missing key") + + require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "key1", "node-abc", sampleOutputs("hello"), 2592000)) + rec, err = q.Load(ctx, sp, testNamespace, testCacheName, "key1") + require.NoError(t, err) + require.NotNil(t, rec) + assert.Equal(t, "node-abc", rec.NodeID) + assert.Contains(t, rec.Outputs, "hello") +} + +func TestMySQLSaveReplaces(t *testing.T) { + ctx := logging.TestContext(t.Context()) + sp := setupMySQL(ctx, t) + q, err := memodb.NewQueries(testTableName, sqldb.MySQL) + require.NoError(t, err) + + require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "key3", "node-old", sampleOutputs("old"), 2592000)) + require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "key3", "node-new", sampleOutputs("new"), 2592000)) + + rec, err := q.Load(ctx, sp, testNamespace, testCacheName, "key3") + require.NoError(t, err) + require.NotNil(t, rec) + assert.Equal(t, "node-new", rec.NodeID) + assert.Contains(t, rec.Outputs, "new") +} + +func TestMySQLPruneRemovesOldEntries(t *testing.T) { + ctx := logging.TestContext(t.Context()) + sp := setupMySQL(ctx, t) + q, err := memodb.NewQueries(testTableName, sqldb.MySQL) + require.NoError(t, err) + + require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "old-key", "node-old", sampleOutputs("old"), 1)) + require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "new-key", "node-new", sampleOutputs("new"), 2592000)) + + // Backdate old-key's expires_at so it is in the past. + _, err = sp.Session().SQL(). + ExecContext(ctx, "UPDATE "+testTableName+" SET expires_at = ? WHERE cache_key = ?", time.Now().Add(-10*time.Second), "old-key") + require.NoError(t, err) + + n, err := q.Prune(ctx, sp) + require.NoError(t, err) + assert.EqualValues(t, 1, n, "expected exactly one row pruned") + + old, err := q.Load(ctx, sp, testNamespace, testCacheName, "old-key") + require.NoError(t, err) + assert.Nil(t, old, "old entry should have been pruned") + + fresh, err := q.Load(ctx, sp, testNamespace, testCacheName, "new-key") + require.NoError(t, err) + assert.NotNil(t, fresh, "new entry should still exist") +} diff --git a/workflow/controller/cache/cache.go b/workflow/controller/cache/cache.go index 4ecfb0e145c0..37f388784d78 100644 --- a/workflow/controller/cache/cache.go +++ b/workflow/controller/cache/cache.go @@ -2,7 +2,10 @@ package cache import ( "context" + "fmt" + "os" "regexp" + "strconv" "sync" "time" @@ -10,13 +13,60 @@ import ( "k8s.io/client-go/kubernetes" wfv1 "github.com/argoproj/argo-workflows/v4/pkg/apis/workflow/v1alpha1" + "github.com/argoproj/argo-workflows/v4/util/logging" + "github.com/argoproj/argo-workflows/v4/util/sqldb" ) var cacheKeyRegex = regexp.MustCompile("^[a-zA-Z0-9][-a-zA-Z0-9]*$") +// defaultMaxAgeSeconds is 30 days in seconds, used when maxAge is not specified on the template. +const defaultMaxAgeSeconds int64 = 30 * 24 * 60 * 60 + +// resolvedDefaultMaxAge caches the DEFAULT_MAX_AGE env var so it is only read once. +var resolvedDefaultMaxAge struct { + once sync.Once + secs int64 + err error +} + +// ResolveMaxAgeSeconds converts a template's maxAge duration string to seconds. If maxAge is +// empty, it falls back to the DEFAULT_MAX_AGE env var (a Go duration string like "720h"), then +// to 30 days. Returns an error only if the duration string is malformed. +func ResolveMaxAgeSeconds(maxAge string) (int64, error) { + if maxAge == "" { + resolvedDefaultMaxAge.once.Do(func() { + envVal := os.Getenv("DEFAULT_MAX_AGE") + if envVal == "" { + resolvedDefaultMaxAge.secs = defaultMaxAgeSeconds + return + } + // Try parsing as a Go duration first (e.g. "720h") + if d, err := time.ParseDuration(envVal); err == nil { + resolvedDefaultMaxAge.secs = int64(d.Seconds()) + return + } + // Fall back to parsing as raw seconds (e.g. "2592000") + if secs, err := strconv.ParseInt(envVal, 10, 64); err == nil { + resolvedDefaultMaxAge.secs = secs + return + } + resolvedDefaultMaxAge.err = fmt.Errorf("invalid DEFAULT_MAX_AGE value %q: must be a Go duration (e.g. 720h) or integer seconds", envVal) + }) + return resolvedDefaultMaxAge.secs, resolvedDefaultMaxAge.err + } + d, err := time.ParseDuration(maxAge) + if err != nil { + return 0, fmt.Errorf("invalid maxAge %q: %w", maxAge, err) + } + return int64(d.Seconds()), nil +} + type MemoizationCache interface { Load(ctx context.Context, key string) (*Entry, error) - Save(ctx context.Context, key string, nodeID string, value *wfv1.Outputs) error + // Save stores the outputs of a completed memoized node. maxAgeSeconds is the number of seconds + // after which this entry expires and becomes eligible for garbage collection. Ignored by the + // ConfigMap backend. + Save(ctx context.Context, key string, nodeID string, value *wfv1.Outputs, maxAgeSeconds int64) error } type Entry struct { @@ -49,37 +99,79 @@ func (e *Entry) GetOutputsWithMaxAge(maxAge time.Duration) (*wfv1.Outputs, bool) } type cacheFactory struct { - caches map[string]MemoizationCache - kubeclient kubernetes.Interface - namespace string - lock sync.RWMutex + caches map[string]MemoizationCache + kubeclient kubernetes.Interface + lock sync.RWMutex + sessionProxy *sqldb.SessionProxy + tableName string + // sqlEnabled indicates that SQL caching was explicitly configured by the operator, even if + // the session proxy is currently unavailable. When true and sessionProxy is nil, GetCache + // returns nil rather than silently falling back to ConfigMap-based caching. + sqlEnabled bool } type Factory interface { - GetCache(ct Type, name string) MemoizationCache + GetCache(ctx context.Context, ct Type, namespace, name string) MemoizationCache + // SetSessionProxy configures the factory to use database-backed caching with the given + // session proxy and table name. Calling this clears any previously created cache instances + // so they are recreated against the new backend. + SetSessionProxy(sp *sqldb.SessionProxy, tableName string) + // ClearSessionProxy removes any SQL backend configuration. If sqlEnabled is true, GetCache + // returns nil rather than silently falling back to ConfigMap-based caching (e.g. after a + // transient DB failure). If sqlEnabled is false, GetCache falls back to ConfigMap caching. + ClearSessionProxy(sqlEnabled bool) } -func NewCacheFactory(ki kubernetes.Interface, ns string) Factory { +func NewCacheFactory(ki kubernetes.Interface) Factory { return &cacheFactory{ - make(map[string]MemoizationCache), - ki, - ns, - sync.RWMutex{}, + caches: make(map[string]MemoizationCache), + kubeclient: ki, } } type Type string const ( - // Only config maps are currently supported for caching + // ConfigMapCache is a cache type identifier used as a key prefix in the cache map. + // When a database session proxy is configured, SQL-backed caching is used instead. ConfigMapCache Type = "ConfigMapCache" ) -// Returns a cache if it exists and creates it otherwise -func (cf *cacheFactory) GetCache(ct Type, name string) MemoizationCache { +// SetSessionProxy configures the factory to use a SQL backend, clearing any previously +// cached instances so they are recreated against the new backend. +func (cf *cacheFactory) SetSessionProxy(sp *sqldb.SessionProxy, tableName string) { + cf.lock.Lock() + defer cf.lock.Unlock() + cf.sessionProxy = sp + cf.tableName = tableName + cf.sqlEnabled = true + cf.caches = make(map[string]MemoizationCache) +} + +// ClearSessionProxy removes the SQL backend. When sqlEnabled is true (DB configured but +// temporarily unavailable), GetCache returns nil. When false (no DB configured), GetCache +// falls back to ConfigMap-based caching. +func (cf *cacheFactory) ClearSessionProxy(sqlEnabled bool) { + cf.lock.Lock() + defer cf.lock.Unlock() + cf.sessionProxy = nil + cf.tableName = "" + cf.sqlEnabled = sqlEnabled + cf.caches = make(map[string]MemoizationCache) +} + +// GetCache returns a cache scoped to the given workflow namespace if it exists and creates it +// otherwise. +func (cf *cacheFactory) GetCache(ctx context.Context, ct Type, namespace, name string) MemoizationCache { + logger := logging.RequireLoggerFromContext(ctx) + if namespace == "" { + logger.WithField("cacheName", name).Error(ctx, "Workflow namespace is required to resolve memoization cache") + return nil + } + cf.lock.RLock() - idx := string(ct) + "." + name + idx := string(ct) + "." + namespace + "." + name if c := cf.caches[idx]; c != nil { cf.lock.RUnlock() return c @@ -95,7 +187,23 @@ func (cf *cacheFactory) GetCache(ct Type, name string) MemoizationCache { switch ct { case ConfigMapCache: - c := NewConfigMapCache(cf.namespace, cf.kubeclient, name) + var c MemoizationCache + switch { + case cf.sessionProxy != nil: + var err error + c, err = newSQLDBCache(namespace, name, cf.sessionProxy, cf.tableName) + if err != nil { + logger.WithFields(logging.Fields{"cacheName": name, "workflowNamespace": namespace}).WithError(err).Error(ctx, "Failed to create SQL memoization cache") + return nil + } + case cf.sqlEnabled: + // SQL was explicitly configured but is currently unavailable. Return nil so callers + // can skip caching rather than silently redirecting to a ConfigMap backend. + logger.WithFields(logging.Fields{"cacheName": name, "workflowNamespace": namespace}).Warn(ctx, "SQL memoization cache requested but SQL backend is unavailable; skipping cache") + return nil + default: + c = NewConfigMapCache(namespace, cf.kubeclient, name) + } cf.caches[idx] = c return c default: diff --git a/workflow/controller/cache/cache_factory_test.go b/workflow/controller/cache/cache_factory_test.go new file mode 100644 index 000000000000..ae4bf79719a6 --- /dev/null +++ b/workflow/controller/cache/cache_factory_test.go @@ -0,0 +1,36 @@ +package cache + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "k8s.io/client-go/kubernetes/fake" + + "github.com/argoproj/argo-workflows/v4/util/logging" +) + +func TestCacheFactoryNamespacesCachesSeparately(t *testing.T) { + ctx := logging.TestContext(t.Context()) + factory := NewCacheFactory(fake.NewSimpleClientset()) + + cacheA := factory.GetCache(ctx, ConfigMapCache, "ns-a", "shared-cache") + cacheARepeat := factory.GetCache(ctx, ConfigMapCache, "ns-a", "shared-cache") + cacheB := factory.GetCache(ctx, ConfigMapCache, "ns-b", "shared-cache") + + require.NotNil(t, cacheA) + require.NotNil(t, cacheARepeat) + require.NotNil(t, cacheB) + assert.Same(t, cacheA, cacheARepeat) + assert.NotSame(t, cacheA, cacheB) + assert.Equal(t, "ns-a", cacheA.(*configMapCache).namespace) + assert.Equal(t, "ns-b", cacheB.(*configMapCache).namespace) +} + +func TestCacheFactoryRequiresNamespace(t *testing.T) { + ctx := logging.TestContext(t.Context()) + factory := NewCacheFactory(fake.NewSimpleClientset()) + + cache := factory.GetCache(ctx, ConfigMapCache, "", "shared-cache") + assert.Nil(t, cache) +} diff --git a/workflow/controller/cache/configmap_cache.go b/workflow/controller/cache/configmap_cache.go index d9dc06a96203..40c1cf825447 100644 --- a/workflow/controller/cache/configmap_cache.go +++ b/workflow/controller/cache/configmap_cache.go @@ -132,7 +132,7 @@ func (c *configMapCache) load(ctx context.Context, key string) (*Entry, error) { return &entry, nil } -func (c *configMapCache) Save(ctx context.Context, key string, nodeID string, value *wfv1.Outputs) error { +func (c *configMapCache) Save(ctx context.Context, key string, nodeID string, value *wfv1.Outputs, _ int64) error { err := retry.OnError(kwait.Backoff{ Duration: time.Second, Factor: 2, diff --git a/workflow/controller/cache/sqldb_cache.go b/workflow/controller/cache/sqldb_cache.go new file mode 100644 index 000000000000..db3a3dbcd481 --- /dev/null +++ b/workflow/controller/cache/sqldb_cache.go @@ -0,0 +1,63 @@ +package cache + +import ( + "context" + "encoding/json" + "fmt" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + wfv1 "github.com/argoproj/argo-workflows/v4/pkg/apis/workflow/v1alpha1" + memodb "github.com/argoproj/argo-workflows/v4/util/memo/db" + "github.com/argoproj/argo-workflows/v4/util/sqldb" +) + +type sqlDBCache struct { + namespace string + name string + sessionProxy *sqldb.SessionProxy + queries *memodb.Queries +} + +func newSQLDBCache(namespace, name string, sp *sqldb.SessionProxy, tableName string) (MemoizationCache, error) { + queries, err := memodb.NewQueries(tableName, sp.DBType()) + if err != nil { + return nil, err + } + return &sqlDBCache{ + namespace: namespace, + name: name, + sessionProxy: sp, + queries: queries, + }, nil +} + +func (c *sqlDBCache) Load(ctx context.Context, key string) (*Entry, error) { + if !cacheKeyRegex.MatchString(key) { + return nil, fmt.Errorf("invalid cache key: %s", key) + } + record, err := c.queries.Load(ctx, c.sessionProxy, c.namespace, c.name, key) + if err != nil { + return nil, fmt.Errorf("memoization db load failed: %w", err) + } + if record == nil { + return nil, nil + } + var outputs wfv1.Outputs + if err := json.Unmarshal([]byte(record.Outputs), &outputs); err != nil { + return nil, fmt.Errorf("malformed memoization db entry: could not unmarshal outputs JSON: %w", err) + } + return &Entry{ + NodeID: record.NodeID, + Outputs: &outputs, + CreationTimestamp: metav1.Time{Time: record.CreatedAt}, + LastHitTimestamp: metav1.Time{Time: record.CreatedAt}, + }, nil +} + +func (c *sqlDBCache) Save(ctx context.Context, key string, nodeID string, value *wfv1.Outputs, maxAgeSeconds int64) error { + if !cacheKeyRegex.MatchString(key) { + return fmt.Errorf("invalid cache key: %s", key) + } + return c.queries.Save(ctx, c.sessionProxy, c.namespace, c.name, key, nodeID, value, maxAgeSeconds) +} diff --git a/workflow/controller/cache/sqldb_cache_test.go b/workflow/controller/cache/sqldb_cache_test.go new file mode 100644 index 000000000000..5d429fab2f2a --- /dev/null +++ b/workflow/controller/cache/sqldb_cache_test.go @@ -0,0 +1,241 @@ +//go:build !windows + +package cache + +import ( + "context" + "encoding/json" + "strconv" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + testcontainers "github.com/testcontainers/testcontainers-go" + testpostgres "github.com/testcontainers/testcontainers-go/modules/postgres" + "github.com/testcontainers/testcontainers-go/wait" + + "github.com/argoproj/argo-workflows/v4/config" + wfv1 "github.com/argoproj/argo-workflows/v4/pkg/apis/workflow/v1alpha1" + "github.com/argoproj/argo-workflows/v4/util/logging" + memodb "github.com/argoproj/argo-workflows/v4/util/memo/db" + "github.com/argoproj/argo-workflows/v4/util/sqldb" +) + +const ( + testDBName = "cachetest" + testDBUser = "user" + testDBPassword = "pass" + testTableName = "memoization_cache" + testNamespace = "default" + testCacheName = "my-cache" +) + +func setupTestPostgres(ctx context.Context, t *testing.T) *sqldb.SessionProxy { + t.Helper() + pg, err := testpostgres.Run(ctx, + "postgres:17.4-alpine", + testpostgres.WithDatabase(testDBName), + testpostgres.WithUsername(testDBUser), + testpostgres.WithPassword(testDBPassword), + testcontainers.WithWaitStrategy( + wait.ForLog("database system is ready to accept connections"). + WithOccurrence(2). + WithStartupTimeout(30*time.Second), + ), + ) + require.NoError(t, err) + t.Cleanup(func() { + if termErr := testcontainers.TerminateContainer(pg); termErr != nil { + t.Logf("failed to terminate postgres container: %s", termErr) + } + }) + + host, err := pg.Host(ctx) + require.NoError(t, err) + portStr, err := pg.MappedPort(ctx, "5432/tcp") + require.NoError(t, err) + port, err := strconv.Atoi(portStr.Port()) + require.NoError(t, err) + + dbCfg := config.DBConfig{ + PostgreSQL: &config.PostgreSQLConfig{ + DatabaseConfig: config.DatabaseConfig{ + Host: host, + Port: port, + Database: testDBName, + }, + }, + } + sp, err := sqldb.NewSessionProxy(ctx, sqldb.SessionProxyConfig{ + DBConfig: dbCfg, + Username: testDBUser, + Password: testDBPassword, + MaxRetries: 5, + BaseDelay: 200 * time.Millisecond, + MaxDelay: 10 * time.Second, + }) + require.NoError(t, err) + t.Cleanup(func() { _ = sp.Close() }) + + memoCfg := &config.MemoizationConfig{ + DBConfig: dbCfg, + TableName: testTableName, + } + require.NoError(t, memodb.Migrate(ctx, sp, memodb.ConfigFromConfig(memoCfg))) + return sp +} + +func TestSQLDBCacheSaveAndLoad(t *testing.T) { + ctx := logging.TestContext(t.Context()) + sp := setupTestPostgres(ctx, t) + + c, err := newSQLDBCache(testNamespace, testCacheName, sp, testTableName) + require.NoError(t, err) + + // Load returns nil for missing key. + entry, err := c.Load(ctx, "key1") + require.NoError(t, err) + assert.Nil(t, entry) + + // Save and load back. + outputs := &wfv1.Outputs{ + Parameters: []wfv1.Parameter{ + {Name: "result", Value: wfv1.AnyStringPtr("hello")}, + }, + } + require.NoError(t, c.Save(ctx, "key1", "node-abc", outputs, 2592000)) + + entry, err = c.Load(ctx, "key1") + require.NoError(t, err) + require.NotNil(t, entry) + assert.Equal(t, "node-abc", entry.NodeID) + assert.True(t, entry.Hit()) + require.NotNil(t, entry.Outputs) + require.Len(t, entry.Outputs.Parameters, 1) + assert.Equal(t, "result", entry.Outputs.Parameters[0].Name) + assert.Equal(t, "hello", entry.Outputs.Parameters[0].Value.String()) + assert.False(t, entry.CreationTimestamp.IsZero()) +} + +func TestSQLDBCacheInvalidKey(t *testing.T) { + ctx := logging.TestContext(t.Context()) + sp := setupTestPostgres(ctx, t) + + c, err := newSQLDBCache(testNamespace, testCacheName, sp, testTableName) + require.NoError(t, err) + + // Keys with invalid characters should be rejected. + _, err = c.Load(ctx, "invalid key!") + require.Error(t, err) + assert.Contains(t, err.Error(), "invalid cache key") + + err = c.Save(ctx, "invalid key!", "node-1", &wfv1.Outputs{}, 3600) + require.Error(t, err) + assert.Contains(t, err.Error(), "invalid cache key") +} + +func TestSQLDBCacheOutputsRoundTrip(t *testing.T) { + ctx := logging.TestContext(t.Context()) + sp := setupTestPostgres(ctx, t) + + c, err := newSQLDBCache(testNamespace, testCacheName, sp, testTableName) + require.NoError(t, err) + + // Save complex outputs and verify they round-trip through JSON. + outputs := &wfv1.Outputs{ + Parameters: []wfv1.Parameter{ + {Name: "p1", Value: wfv1.AnyStringPtr("v1")}, + {Name: "p2", Value: wfv1.AnyStringPtr("v2")}, + }, + } + require.NoError(t, c.Save(ctx, "complex-key", "node-x", outputs, 3600)) + + entry, err := c.Load(ctx, "complex-key") + require.NoError(t, err) + require.NotNil(t, entry) + + // Verify the round-tripped outputs match by comparing JSON. + originalJSON, _ := json.Marshal(outputs) + loadedJSON, _ := json.Marshal(entry.Outputs) + assert.JSONEq(t, string(originalJSON), string(loadedJSON)) +} + +func TestSQLDBCacheInvalidTableName(t *testing.T) { + ctx := logging.TestContext(t.Context()) + sp := setupTestPostgres(ctx, t) + _ = ctx + + _, err := newSQLDBCache(testNamespace, testCacheName, sp, "invalid-table!") + require.Error(t, err) +} + +func TestSQLDBCacheGetOutputsWithMaxAge(t *testing.T) { + ctx := logging.TestContext(t.Context()) + sp := setupTestPostgres(ctx, t) + + c, err := newSQLDBCache(testNamespace, testCacheName, sp, testTableName) + require.NoError(t, err) + + outputs := &wfv1.Outputs{ + Parameters: []wfv1.Parameter{ + {Name: "result", Value: wfv1.AnyStringPtr("cached")}, + }, + } + require.NoError(t, c.Save(ctx, "ttl-key", "node-ttl", outputs, 3600)) + + entry, err := c.Load(ctx, "ttl-key") + require.NoError(t, err) + require.NotNil(t, entry) + + // With a large maxAge, outputs should be returned. + out, ok := entry.GetOutputsWithMaxAge(1 * time.Hour) + assert.True(t, ok) + assert.NotNil(t, out) + + // With a zero maxAge, outputs should be expired. + out, ok = entry.GetOutputsWithMaxAge(0) + assert.False(t, ok) + assert.Nil(t, out) +} + +func TestSQLDBCacheUpsertPreservesCreatedAt(t *testing.T) { + ctx := logging.TestContext(t.Context()) + sp := setupTestPostgres(ctx, t) + + c, err := newSQLDBCache(testNamespace, testCacheName, sp, testTableName) + require.NoError(t, err) + + outputs := &wfv1.Outputs{ + Parameters: []wfv1.Parameter{ + {Name: "result", Value: wfv1.AnyStringPtr("v1")}, + }, + } + require.NoError(t, c.Save(ctx, "upsert-key", "node-1", outputs, 3600)) + + entry1, err := c.Load(ctx, "upsert-key") + require.NoError(t, err) + require.NotNil(t, entry1) + createdAt1 := entry1.CreationTimestamp.Time + + // Small delay to ensure timestamps differ. + time.Sleep(10 * time.Millisecond) + + // Re-save with updated outputs. + outputs2 := &wfv1.Outputs{ + Parameters: []wfv1.Parameter{ + {Name: "result", Value: wfv1.AnyStringPtr("v2")}, + }, + } + require.NoError(t, c.Save(ctx, "upsert-key", "node-2", outputs2, 3600)) + + entry2, err := c.Load(ctx, "upsert-key") + require.NoError(t, err) + require.NotNil(t, entry2) + + // created_at should be preserved (not reset) on upsert. + assert.Equal(t, createdAt1.Unix(), entry2.CreationTimestamp.Unix(), + "created_at should be preserved on upsert: first=%v, second=%v", createdAt1, entry2.CreationTimestamp.Time) + // But the outputs should be updated. + assert.Equal(t, "node-2", entry2.NodeID) +} diff --git a/workflow/controller/cache_test.go b/workflow/controller/cache_test.go index 14c69fa41c64..79be3a4828b5 100644 --- a/workflow/controller/cache_test.go +++ b/workflow/controller/cache_test.go @@ -94,7 +94,7 @@ func TestConfigMapCacheSave(t *testing.T) { outputs := wfv1.Outputs{} outputs.Parameters = append(outputs.Parameters, MockParam) - err := c.Save(ctx, "hi-there-world", "", &outputs) + err := c.Save(ctx, "hi-there-world", "", &outputs, 2592000) require.NoError(t, err) cm, err := controller.kubeclientset.CoreV1().ConfigMaps("default").Get(ctx, "whalesay-cache", metav1.GetOptions{}) diff --git a/workflow/controller/config.go b/workflow/controller/config.go index 9a199361c2b0..04dcb1a2782a 100644 --- a/workflow/controller/config.go +++ b/workflow/controller/config.go @@ -13,6 +13,7 @@ import ( persist "github.com/argoproj/argo-workflows/v4/persist/sqldb" "github.com/argoproj/argo-workflows/v4/util/instanceid" "github.com/argoproj/argo-workflows/v4/util/logging" + memodb "github.com/argoproj/argo-workflows/v4/util/memo/db" "github.com/argoproj/argo-workflows/v4/util/sqldb" "github.com/argoproj/argo-workflows/v4/workflow/artifactrepositories" "github.com/argoproj/argo-workflows/v4/workflow/hydrator" @@ -78,6 +79,45 @@ func (wfc *WorkflowController) updateConfig(ctx context.Context) error { logger.Info(ctx, "Persistence configuration disabled") } + memoCfg := wfc.Config.Memoization + wfc.memoLock.Lock() + wfc.memoConfig = memoCfg + if memoCfg != nil { + logger.Info(ctx, "Memoization database configuration enabled") + if wfc.memoSessionProxy == nil { + sp := memodb.SessionProxyFromConfig(ctx, wfc.kubeclientset, wfc.namespace, memoCfg) + if sp == nil { + logger.Error(ctx, "Failed to connect to memoization database; SQL caching unavailable. Workflows using memoization will skip caching until the database is reachable.") + wfc.cacheFactory.ClearSessionProxy(true) + } + wfc.memoSessionProxy = sp + wfc.memoMigrated = false + } + if wfc.memoSessionProxy != nil && !wfc.memoMigrated { + cfg := memodb.ConfigFromConfig(memoCfg) + if err := memodb.Migrate(ctx, wfc.memoSessionProxy, cfg); err != nil { + logger.WithError(err).Error(ctx, "Memoization database migration failed; SQL caching unavailable. Workflows using memoization will skip caching until the database is healthy.") + wfc.memoSessionProxy.Close() + wfc.memoSessionProxy = nil + wfc.memoMigrated = false + wfc.cacheFactory.ClearSessionProxy(true) + } else { + wfc.memoMigrated = true + wfc.cacheFactory.SetSessionProxy(wfc.memoSessionProxy, cfg.TableName) + } + } + } else { + if wfc.memoSessionProxy != nil { + logger.Info(ctx, "Memoization database configuration removed") + wfc.memoSessionProxy.Close() + wfc.memoSessionProxy = nil + wfc.memoMigrated = false + } + wfc.cacheFactory.ClearSessionProxy(false) + logger.Info(ctx, "Memoization database configuration disabled; using ConfigMap-based caching") + } + wfc.memoLock.Unlock() + wfc.hydrator = hydrator.New(wfc.offloadNodeStatusRepo) wfc.updateEstimatorFactory(ctx) wfc.rateLimiter = wfc.newRateLimiter() diff --git a/workflow/controller/controller.go b/workflow/controller/controller.go index 02cec7e6be70..3653fbeb605a 100644 --- a/workflow/controller/controller.go +++ b/workflow/controller/controller.go @@ -46,6 +46,7 @@ import ( "github.com/argoproj/argo-workflows/v4/util/deprecation" "github.com/argoproj/argo-workflows/v4/util/env" "github.com/argoproj/argo-workflows/v4/util/errors" + memodb "github.com/argoproj/argo-workflows/v4/util/memo/db" rbacutil "github.com/argoproj/argo-workflows/v4/util/rbac" "github.com/argoproj/argo-workflows/v4/util/retry" utilsqldb "github.com/argoproj/argo-workflows/v4/util/sqldb" @@ -134,6 +135,10 @@ type WorkflowController struct { throttler sync.Throttler workflowKeyLock syncpkg.KeyLock // used to lock workflows for exclusive modification or access sessionProxy *utilsqldb.SessionProxy + memoSessionProxy *utilsqldb.SessionProxy + memoMigrated bool + memoConfig *config.MemoizationConfig // protected by memoLock + memoLock gosync.RWMutex offloadNodeStatusRepo sqldb.OffloadNodeStatusRepo hydrator hydrator.Interface wfArchive sqldb.WorkflowArchive @@ -209,7 +214,7 @@ func NewWorkflowController(ctx context.Context, restConfig *rest.Config, kubecli cliExecutorLogFormat: executorLogFormat, configController: config.NewController(namespace, configMap, kubeclientset), workflowKeyLock: syncpkg.NewKeyLock(), - cacheFactory: controllercache.NewCacheFactory(kubeclientset, namespace), + cacheFactory: controllercache.NewCacheFactory(kubeclientset), eventRecorderManager: events.NewEventRecorderManager(kubeclientset), progressPatchTickDuration: env.LookupEnvDurationOr(ctx, common.EnvVarProgressPatchTickDuration, 1*time.Minute), progressFileTickDuration: env.LookupEnvDurationOr(ctx, common.EnvVarProgressFileTickDuration, 3*time.Second), @@ -404,6 +409,7 @@ func (wfc *WorkflowController) Run(ctx context.Context, wfWorkers, workflowTTLWo go wfc.workflowGarbageCollector(ctx) go wfc.archivedWorkflowGarbageCollector(ctx) + go wfc.memoizationCacheGarbageCollector(ctx) go wfc.runGCcontroller(ctx, workflowTTLWorkers) go wfc.runCronController(ctx, cronWorkflowWorkers) @@ -721,6 +727,55 @@ func (wfc *WorkflowController) archivedWorkflowGarbageCollector(ctx context.Cont } } +func (wfc *WorkflowController) memoizationCacheGarbageCollector(ctx context.Context) { + defer runtimeutil.HandleCrashWithContext(ctx, runtimeutil.PanicHandlers...) + + logger := logging.RequireLoggerFromContext(ctx) + logger = logger.WithField("component", "memo_cache_garbage_collector") + ctx = logging.WithLogger(ctx, logger) + + periodicity := env.LookupEnvDurationOr(ctx, "MEMO_CACHE_GC_PERIOD", 24*time.Hour) + if periodicity <= 0 { + logger.Info(ctx, "MEMO_CACHE_GC_PERIOD is zero or negative - cache GC disabled") + return + } + logger.Info(ctx, "Memoization cache GC goroutine started; will check config each tick") + + ticker := time.NewTicker(periodicity) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + wfc.memoLock.RLock() + memoCfg := wfc.memoConfig + sp := wfc.memoSessionProxy + wfc.memoLock.RUnlock() + + if memoCfg == nil || sp == nil { + logger.Debug(ctx, "Memoization DB not configured or unavailable; skipping GC tick") + continue + } + + cfg := memodb.ConfigFromConfig(memoCfg) + queries, err := memodb.NewQueries(cfg.TableName, sp.DBType()) + if err != nil { + logger.WithError(err).Error(ctx, "Invalid memoization cache table name; skipping GC tick") + continue + } + + logger.Info(ctx, "Performing memoization cache GC") + n, err := queries.Prune(ctx, sp) + if err != nil { + logger.WithError(err).Error(ctx, "Failed to prune memoization cache") + } else { + logger.WithField("deleted", n).Info(ctx, "Memoization cache GC complete") + } + } + } +} + func (wfc *WorkflowController) runWorker(ctx context.Context) { defer runtimeutil.HandleCrashWithContext(ctx, runtimeutil.PanicHandlers...) diff --git a/workflow/controller/controller_test.go b/workflow/controller/controller_test.go index 7b24616e755f..93170f93901f 100644 --- a/workflow/controller/controller_test.go +++ b/workflow/controller/controller_test.go @@ -296,7 +296,7 @@ func newController(ctx context.Context, options ...any) (context.CancelFunc, *Wo estimatorFactory: estimation.DummyEstimatorFactory, eventRecorderManager: &testEventRecorderManager{eventRecorder: record.NewFakeRecorder(64)}, archiveLabelSelector: labels.Everything(), - cacheFactory: controllercache.NewCacheFactory(kube, "default"), + cacheFactory: controllercache.NewCacheFactory(kube), progressPatchTickDuration: envutil.LookupEnvDurationOr(ctx, common.EnvVarProgressPatchTickDuration, 1*time.Minute), progressFileTickDuration: envutil.LookupEnvDurationOr(ctx, common.EnvVarProgressFileTickDuration, 3*time.Second), maxStackDepth: maxAllowedStackDepth, diff --git a/workflow/controller/dag.go b/workflow/controller/dag.go index 57fa5651c342..bb647ce15faf 100644 --- a/workflow/controller/dag.go +++ b/workflow/controller/dag.go @@ -379,11 +379,21 @@ func (woc *wfOperationCtx) executeDAG(ctx context.Context, nodeName string, tmpl woc.wf.Status.Nodes.Set(ctx, node.ID, *node) } if node.MemoizationStatus != nil { - c := woc.controller.cacheFactory.GetCache(controllercache.ConfigMapCache, node.MemoizationStatus.CacheName) - saveErr := c.Save(ctx, node.MemoizationStatus.Key, node.ID, node.Outputs) - if saveErr != nil { - woc.log.WithField("nodeID", node.ID).WithError(saveErr).Error(ctx, "Failed to save node outputs to cache") - node.Phase = wfv1.NodeError + c := woc.controller.cacheFactory.GetCache(ctx, controllercache.ConfigMapCache, woc.wf.Namespace, node.MemoizationStatus.CacheName) + switch { + case c == nil: + woc.log.WithField("nodeID", node.ID).Warn(ctx, "Memoization cache unavailable; skipping cache save") + case tmpl.Memoize == nil: + woc.log.WithField("nodeID", node.ID).Warn(ctx, "Node template has no memoize spec; skipping cache save") + default: + maxAgeSeconds, maxAgeErr := controllercache.ResolveMaxAgeSeconds(tmpl.Memoize.MaxAge) + if maxAgeErr != nil { + woc.log.WithField("nodeID", node.ID).WithError(maxAgeErr).Error(ctx, "Failed to resolve maxAge for cache save") + node.Phase = wfv1.NodeError + } else if saveErr := c.Save(ctx, node.MemoizationStatus.Key, node.ID, node.Outputs, maxAgeSeconds); saveErr != nil { + woc.log.WithField("nodeID", node.ID).WithError(saveErr).Error(ctx, "Failed to save node outputs to cache") + node.Phase = wfv1.NodeError + } } } diff --git a/workflow/controller/operator.go b/workflow/controller/operator.go index a12da33dd38d..e35af5c68363 100644 --- a/workflow/controller/operator.go +++ b/workflow/controller/operator.go @@ -1196,12 +1196,27 @@ func (woc *wfOperationCtx) podReconciliation(ctx context.Context) (bool, error) woc.addOutputsToGlobalScope(ctx, newState.Outputs) if newState.MemoizationStatus != nil { if newState.Succeeded() { - c := woc.controller.cacheFactory.GetCache(controllercache.ConfigMapCache, newState.MemoizationStatus.CacheName) - err := c.Save(ctx, newState.MemoizationStatus.Key, newState.ID, newState.Outputs) - if err != nil { - woc.log.WithFields(logging.Fields{"nodeID": newState.ID}).WithError(err).Error(ctx, "Failed to save node outputs to cache") - newState.Phase = wfv1.NodeError - newState.Message = err.Error() + c := woc.controller.cacheFactory.GetCache(ctx, controllercache.ConfigMapCache, woc.wf.Namespace, newState.MemoizationStatus.CacheName) + if c == nil { + woc.log.WithFields(logging.Fields{"nodeID": newState.ID}).Warn(ctx, "Memoization cache unavailable; skipping cache save") + } else { + nodeTmpl, tmplErr := woc.GetNodeTemplate(ctx, newState) + maxAge := "" + if tmplErr != nil { + woc.log.WithFields(logging.Fields{"nodeID": newState.ID}).WithError(tmplErr).Warn(ctx, "Failed to get node template for cache save; using default maxAge") + } else if nodeTmpl != nil && nodeTmpl.Memoize != nil { + maxAge = nodeTmpl.Memoize.MaxAge + } + maxAgeSeconds, maxAgeErr := controllercache.ResolveMaxAgeSeconds(maxAge) + if maxAgeErr != nil { + woc.log.WithFields(logging.Fields{"nodeID": newState.ID}).WithError(maxAgeErr).Error(ctx, "Failed to resolve maxAge for cache save") + newState.Phase = wfv1.NodeError + newState.Message = maxAgeErr.Error() + } else if err := c.Save(ctx, newState.MemoizationStatus.Key, newState.ID, newState.Outputs, maxAgeSeconds); err != nil { + woc.log.WithFields(logging.Fields{"nodeID": newState.ID}).WithError(err).Error(ctx, "Failed to save node outputs to cache") + newState.Phase = wfv1.NodeError + newState.Message = err.Error() + } } } } @@ -2228,11 +2243,21 @@ func (woc *wfOperationCtx) executeTemplate(ctx context.Context, nodeName string, // Check memoization cache if the node is about to be created, or was created in the past but is only now allowed to run due to acquiring a lock if processedTmpl.Memoize != nil { + cacheName := "" + if processedTmpl.Memoize.Cache != nil && processedTmpl.Memoize.Cache.ConfigMap != nil { + cacheName = processedTmpl.Memoize.Cache.ConfigMap.Name + } + if cacheName == "" { + cacheErr := fmt.Errorf("memoize.cache.configMap.name is required") + woc.log.WithError(cacheErr).Error(ctx, "Invalid memoize configuration") + errNode := woc.initializeNodeOrMarkError(ctx, node, nodeName, templateScope, orgTmpl, opts.boundaryID, opts.nodeFlag, cacheErr) + return errNode, cacheErr + } if node == nil || unlockedNode { - memoizationCache := woc.controller.cacheFactory.GetCache(controllercache.ConfigMapCache, processedTmpl.Memoize.Cache.ConfigMap.Name) + memoizationCache := woc.controller.cacheFactory.GetCache(ctx, controllercache.ConfigMapCache, woc.wf.Namespace, cacheName) if memoizationCache == nil { cacheErr := fmt.Errorf("cache could not be found or created") - woc.log.WithFields(logging.Fields{"cacheName": processedTmpl.Memoize.Cache.ConfigMap.Name}).WithError(cacheErr) + woc.log.WithFields(logging.Fields{"cacheName": cacheName}).WithError(cacheErr) errNode := woc.initializeNodeOrMarkError(ctx, node, nodeName, templateScope, orgTmpl, opts.boundaryID, opts.nodeFlag, cacheErr) return errNode, cacheErr } @@ -2263,7 +2288,7 @@ func (woc *wfOperationCtx) executeTemplate(ctx context.Context, nodeName string, memoizationStatus := &wfv1.MemoizationStatus{ Hit: hit, Key: processedTmpl.Memoize.Key, - CacheName: processedTmpl.Memoize.Cache.ConfigMap.Name, + CacheName: cacheName, } if hit { if node == nil { @@ -2808,14 +2833,19 @@ func (woc *wfOperationCtx) initializeExecutableNode(ctx context.Context, nodeNam node.Inputs = executeTmpl.Inputs.DeepCopy() } - // Set the MemoizationStatus - if node.MemoizationStatus == nil && executeTmpl.Memoize != nil { - memoizationStatus := &wfv1.MemoizationStatus{ + // Set the MemoizationStatus only when the user explicitly supplied a key. + // When key is auto-derived, executeTemplate sets MemoizationStatus with the + // computed effectiveKey before reaching this point. + if node.MemoizationStatus == nil && executeTmpl.Memoize != nil && executeTmpl.Memoize.Key != "" { + cacheName := "" + if executeTmpl.Memoize.Cache != nil { + cacheName = executeTmpl.Memoize.Cache.ConfigMap.Name + } + node.MemoizationStatus = &wfv1.MemoizationStatus{ Hit: false, Key: executeTmpl.Memoize.Key, - CacheName: executeTmpl.Memoize.Cache.ConfigMap.Name, + CacheName: cacheName, } - node.MemoizationStatus = memoizationStatus } if nodeType == wfv1.NodeTypeSuspend { diff --git a/workflow/controller/steps.go b/workflow/controller/steps.go index 516850d738d4..1f3e8e457b04 100644 --- a/workflow/controller/steps.go +++ b/workflow/controller/steps.go @@ -188,11 +188,21 @@ func (woc *wfOperationCtx) executeSteps(ctx context.Context, nodeName string, tm } if node.MemoizationStatus != nil { - c := woc.controller.cacheFactory.GetCache(controllercache.ConfigMapCache, node.MemoizationStatus.CacheName) - err := c.Save(ctx, node.MemoizationStatus.Key, node.ID, node.Outputs) - if err != nil { - woc.log.WithFields(logging.Fields{"nodeID": node.ID}).WithError(err).Error(ctx, "Failed to save node outputs to cache") - node.Phase = wfv1.NodeError + c := woc.controller.cacheFactory.GetCache(ctx, controllercache.ConfigMapCache, woc.wf.Namespace, node.MemoizationStatus.CacheName) + switch { + case c == nil: + woc.log.WithFields(logging.Fields{"nodeID": node.ID}).Warn(ctx, "Memoization cache unavailable; skipping cache save") + case tmpl.Memoize == nil: + woc.log.WithFields(logging.Fields{"nodeID": node.ID}).Warn(ctx, "Node template has no memoize spec; skipping cache save") + default: + maxAgeSeconds, maxAgeErr := controllercache.ResolveMaxAgeSeconds(tmpl.Memoize.MaxAge) + if maxAgeErr != nil { + woc.log.WithFields(logging.Fields{"nodeID": node.ID}).WithError(maxAgeErr).Error(ctx, "Failed to resolve maxAge for cache save") + node.Phase = wfv1.NodeError + } else if err := c.Save(ctx, node.MemoizationStatus.Key, node.ID, node.Outputs, maxAgeSeconds); err != nil { + woc.log.WithFields(logging.Fields{"nodeID": node.ID}).WithError(err).Error(ctx, "Failed to save node outputs to cache") + node.Phase = wfv1.NodeError + } } } return woc.markNodePhase(ctx, nodeName, wfv1.NodeSucceeded), nil diff --git a/workflow/controller/taskset.go b/workflow/controller/taskset.go index 86644bc5f7fa..7d67e0e75c2a 100644 --- a/workflow/controller/taskset.go +++ b/workflow/controller/taskset.go @@ -154,10 +154,23 @@ func (woc *wfOperationCtx) reconcileTaskSet(ctx context.Context) error { woc.wf.Status.Nodes.Set(ctx, nodeID, *node) if node.MemoizationStatus != nil && node.Succeeded() { - c := woc.controller.cacheFactory.GetCache(controllercache.ConfigMapCache, node.MemoizationStatus.CacheName) - err := c.Save(ctx, node.MemoizationStatus.Key, node.ID, node.Outputs) - if err != nil { - woc.log.WithFields(logging.Fields{"nodeID": node.ID}).WithError(err).Error(ctx, "Failed to save node outputs to cache") + c := woc.controller.cacheFactory.GetCache(ctx, controllercache.ConfigMapCache, woc.wf.Namespace, node.MemoizationStatus.CacheName) + if c == nil { + woc.log.WithFields(logging.Fields{"nodeID": node.ID}).Warn(ctx, "Memoization cache unavailable; skipping cache save") + } else { + nodeTmpl, tmplErr := woc.GetNodeTemplate(ctx, node) + maxAge := "" + if tmplErr != nil { + woc.log.WithFields(logging.Fields{"nodeID": node.ID}).WithError(tmplErr).Warn(ctx, "Failed to get node template for cache save; using default maxAge") + } else if nodeTmpl != nil && nodeTmpl.Memoize != nil { + maxAge = nodeTmpl.Memoize.MaxAge + } + maxAgeSeconds, maxAgeErr := controllercache.ResolveMaxAgeSeconds(maxAge) + if maxAgeErr != nil { + woc.log.WithFields(logging.Fields{"nodeID": node.ID}).WithError(maxAgeErr).Error(ctx, "Failed to resolve maxAge for cache save") + } else if err := c.Save(ctx, node.MemoizationStatus.Key, node.ID, node.Outputs, maxAgeSeconds); err != nil { + woc.log.WithFields(logging.Fields{"nodeID": node.ID}).WithError(err).Error(ctx, "Failed to save node outputs to cache") + } } } woc.updated = true From 4be56f20120749829012b54178ac5288fa5246d3 Mon Sep 17 00:00:00 2001 From: droctothorpe Date: Thu, 23 Apr 2026 14:31:18 -0400 Subject: [PATCH 02/15] WIP Signed-off-by: droctothorpe --- util/memo/db/queries.go | 15 ++-- util/memo/db/queries_test.go | 34 +++++++ workflow/controller/cache/sqldb_cache_test.go | 8 +- workflow/controller/config.go | 90 ++++++++++++++----- workflow/controller/config_test.go | 45 ++++++++++ workflow/controller/dag.go | 2 +- workflow/controller/operator.go | 19 ++-- workflow/controller/operator_test.go | 26 ++++++ workflow/controller/steps.go | 2 +- workflow/controller/taskset.go | 2 +- 10 files changed, 197 insertions(+), 46 deletions(-) diff --git a/util/memo/db/queries.go b/util/memo/db/queries.go index fab91950407f..c467c3db0c40 100644 --- a/util/memo/db/queries.go +++ b/util/memo/db/queries.go @@ -37,23 +37,24 @@ func NewQueries(tableName string, dbType sqldb.DBType) (*Queries, error) { } // Load retrieves the outputs for the given cache key. -// Returns nil when the entry does not exist. +// Returns nil when the entry does not exist or has expired. func (q *Queries) Load(ctx context.Context, sp *sqldb.SessionProxy, namespace, cacheName, cacheKey string) (*CacheRecord, error) { var r CacheRecord var found bool + now := time.Now().UTC() err := sp.With(ctx, func(sess db.Session) error { // Use raw SQL to avoid upper/db ORM timestamp scanning issues with // "timestamp without timezone" columns (the ORM may not populate time.Time fields). var query string switch q.dbType { case sqldb.Postgres: - query = fmt.Sprintf(`SELECT namespace, cache_name, cache_key, node_id, outputs, created_at, expires_at FROM %s WHERE namespace = $1 AND cache_name = $2 AND cache_key = $3`, q.tableName) + query = fmt.Sprintf(`SELECT namespace, cache_name, cache_key, node_id, outputs, created_at, expires_at FROM %s WHERE namespace = $1 AND cache_name = $2 AND cache_key = $3 AND expires_at > $4`, q.tableName) case sqldb.MySQL: - query = fmt.Sprintf("SELECT namespace, cache_name, cache_key, node_id, outputs, created_at, expires_at FROM %s WHERE namespace = ? AND cache_name = ? AND cache_key = ?", q.tableName) + query = fmt.Sprintf("SELECT namespace, cache_name, cache_key, node_id, outputs, created_at, expires_at FROM %s WHERE namespace = ? AND cache_name = ? AND cache_key = ? AND expires_at > ?", q.tableName) default: return fmt.Errorf("unsupported database type: %s", q.dbType) } - rows, err := sess.SQL().QueryContext(ctx, query, namespace, cacheName, cacheKey) + rows, err := sess.SQL().QueryContext(ctx, query, namespace, cacheName, cacheKey, now) if err != nil { return err } @@ -109,13 +110,13 @@ func (q *Queries) Save(ctx context.Context, sp *sqldb.SessionProxy, namespace, c _, err := sess.SQL().ExecContext(ctx, fmt.Sprintf(`INSERT INTO %s (namespace, cache_name, cache_key, node_id, outputs, created_at, expires_at) VALUES ($1, $2, $3, $4, $5, $6, $7) -ON CONFLICT (namespace, cache_name, cache_key) DO UPDATE SET node_id = $4, outputs = $5, expires_at = $7`, q.tableName), +ON CONFLICT (namespace, cache_name, cache_key) DO UPDATE SET node_id = $4, outputs = $5, created_at = $6, expires_at = $7`, q.tableName), namespace, cacheName, cacheKey, nodeID, outputsStr, now, expiresAt) return err case sqldb.MySQL: _, err := sess.SQL().ExecContext(ctx, - fmt.Sprintf("INSERT INTO %s (namespace, cache_name, cache_key, node_id, outputs, created_at, expires_at) VALUES (?, ?, ?, ?, ?, ?, ?) ON DUPLICATE KEY UPDATE node_id = ?, outputs = ?, expires_at = ?", q.tableName), - namespace, cacheName, cacheKey, nodeID, outputsStr, now, expiresAt, nodeID, outputsStr, expiresAt) + fmt.Sprintf("INSERT INTO %s (namespace, cache_name, cache_key, node_id, outputs, created_at, expires_at) VALUES (?, ?, ?, ?, ?, ?, ?) ON DUPLICATE KEY UPDATE node_id = ?, outputs = ?, created_at = ?, expires_at = ?", q.tableName), + namespace, cacheName, cacheKey, nodeID, outputsStr, now, expiresAt, nodeID, outputsStr, now, expiresAt) return err default: return fmt.Errorf("unsupported database type: %s", q.dbType) diff --git a/util/memo/db/queries_test.go b/util/memo/db/queries_test.go index 49f8f49d931a..b64369f01729 100644 --- a/util/memo/db/queries_test.go +++ b/util/memo/db/queries_test.go @@ -206,6 +206,23 @@ func TestQueriesSaveReplaces(t *testing.T) { assert.Contains(t, rec.Outputs, "new") } +func TestQueriesLoadSkipsExpiredEntries(t *testing.T) { + ctx := logging.TestContext(t.Context()) + sp := setupPostgres(ctx, t) + q, err := memodb.NewQueries(testTableName, sqldb.Postgres) + require.NoError(t, err) + + require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "expired-key", "node-old", sampleOutputs("old"), 2592000)) + + _, err = sp.Session().SQL(). + ExecContext(ctx, `UPDATE `+testTableName+` SET expires_at = $1 WHERE cache_key = $2`, time.Now().Add(-10*time.Second), "expired-key") + require.NoError(t, err) + + rec, err := q.Load(ctx, sp, testNamespace, testCacheName, "expired-key") + require.NoError(t, err) + assert.Nil(t, rec, "expired entries should load as a cache miss") +} + func TestQueriesPruneRemovesOldEntries(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupPostgres(ctx, t) @@ -285,6 +302,23 @@ func TestMySQLSaveReplaces(t *testing.T) { assert.Contains(t, rec.Outputs, "new") } +func TestMySQLLoadSkipsExpiredEntries(t *testing.T) { + ctx := logging.TestContext(t.Context()) + sp := setupMySQL(ctx, t) + q, err := memodb.NewQueries(testTableName, sqldb.MySQL) + require.NoError(t, err) + + require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "expired-key", "node-old", sampleOutputs("old"), 2592000)) + + _, err = sp.Session().SQL(). + ExecContext(ctx, "UPDATE "+testTableName+" SET expires_at = ? WHERE cache_key = ?", time.Now().Add(-10*time.Second), "expired-key") + require.NoError(t, err) + + rec, err := q.Load(ctx, sp, testNamespace, testCacheName, "expired-key") + require.NoError(t, err) + assert.Nil(t, rec, "expired entries should load as a cache miss") +} + func TestMySQLPruneRemovesOldEntries(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupMySQL(ctx, t) diff --git a/workflow/controller/cache/sqldb_cache_test.go b/workflow/controller/cache/sqldb_cache_test.go index 5d429fab2f2a..f15a1fa9f49a 100644 --- a/workflow/controller/cache/sqldb_cache_test.go +++ b/workflow/controller/cache/sqldb_cache_test.go @@ -199,7 +199,7 @@ func TestSQLDBCacheGetOutputsWithMaxAge(t *testing.T) { assert.Nil(t, out) } -func TestSQLDBCacheUpsertPreservesCreatedAt(t *testing.T) { +func TestSQLDBCacheUpsertRefreshesCreatedAt(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupTestPostgres(ctx, t) @@ -233,9 +233,7 @@ func TestSQLDBCacheUpsertPreservesCreatedAt(t *testing.T) { require.NoError(t, err) require.NotNil(t, entry2) - // created_at should be preserved (not reset) on upsert. - assert.Equal(t, createdAt1.Unix(), entry2.CreationTimestamp.Unix(), - "created_at should be preserved on upsert: first=%v, second=%v", createdAt1, entry2.CreationTimestamp.Time) - // But the outputs should be updated. + assert.True(t, entry2.CreationTimestamp.After(createdAt1), + "created_at should be refreshed on upsert: first=%v, second=%v", createdAt1, entry2.CreationTimestamp.Time) assert.Equal(t, "node-2", entry2.NodeID) } diff --git a/workflow/controller/config.go b/workflow/controller/config.go index 04dcb1a2782a..038724f49b5b 100644 --- a/workflow/controller/config.go +++ b/workflow/controller/config.go @@ -16,9 +16,76 @@ import ( memodb "github.com/argoproj/argo-workflows/v4/util/memo/db" "github.com/argoproj/argo-workflows/v4/util/sqldb" "github.com/argoproj/argo-workflows/v4/workflow/artifactrepositories" + controllercache "github.com/argoproj/argo-workflows/v4/workflow/controller/cache" "github.com/argoproj/argo-workflows/v4/workflow/hydrator" ) +var ( + memoSessionProxyFromConfig = memodb.SessionProxyFromConfig + memoizationMigrate = memodb.Migrate +) + +// initializeMemoizationBackendLocked creates and migrates the memoization SQL backend. +// The caller must hold wfc.memoLock. +func (wfc *WorkflowController) initializeMemoizationBackendLocked(ctx context.Context, logger logging.Logger) { + if wfc.memoConfig == nil { + return + } + if wfc.memoSessionProxy == nil { + sp := memoSessionProxyFromConfig(ctx, wfc.kubeclientset, wfc.namespace, wfc.memoConfig) + if sp == nil { + logger.Error(ctx, "Failed to connect to memoization database; SQL caching unavailable. Workflows using memoization will skip caching until the database is reachable.") + wfc.memoSessionProxy = nil + wfc.memoMigrated = false + wfc.cacheFactory.ClearSessionProxy(true) + return + } + wfc.memoSessionProxy = sp + wfc.memoMigrated = false + } + if wfc.memoSessionProxy != nil && !wfc.memoMigrated { + cfg := memodb.ConfigFromConfig(wfc.memoConfig) + if err := memoizationMigrate(ctx, wfc.memoSessionProxy, cfg); err != nil { + logger.WithError(err).Error(ctx, "Memoization database migration failed; SQL caching unavailable. Workflows using memoization will skip caching until the database is healthy.") + wfc.memoSessionProxy.Close() + wfc.memoSessionProxy = nil + wfc.memoMigrated = false + wfc.cacheFactory.ClearSessionProxy(true) + return + } + wfc.memoMigrated = true + wfc.cacheFactory.SetSessionProxy(wfc.memoSessionProxy, cfg.TableName) + } +} + +// ensureMemoizationBackend makes sure the configured memoization SQL backend is ready for use. +// It returns true when memoization can proceed against the configured backend. +func (wfc *WorkflowController) ensureMemoizationBackend(ctx context.Context) bool { + logger := logging.RequireLoggerFromContext(ctx) + wfc.memoLock.Lock() + defer wfc.memoLock.Unlock() + if wfc.memoConfig == nil { + return true + } + if wfc.memoSessionProxy != nil && wfc.memoMigrated { + return true + } + wfc.initializeMemoizationBackendLocked(ctx, logger) + return wfc.memoSessionProxy != nil && wfc.memoMigrated +} + +// getMemoizationCache returns the memoization cache for the given namespace and name. +// When SQL memoization is configured, it first ensures the backend is available. +func (wfc *WorkflowController) getMemoizationCache(ctx context.Context, namespace, name string) controllercache.MemoizationCache { + wfc.memoLock.RLock() + memoConfigured := wfc.memoConfig != nil + wfc.memoLock.RUnlock() + if memoConfigured && !wfc.ensureMemoizationBackend(ctx) { + return nil + } + return wfc.cacheFactory.GetCache(ctx, controllercache.ConfigMapCache, namespace, name) +} + func (wfc *WorkflowController) updateConfig(ctx context.Context) error { logger := logging.RequireLoggerFromContext(ctx) _, err := yaml.Marshal(wfc.Config) @@ -84,28 +151,7 @@ func (wfc *WorkflowController) updateConfig(ctx context.Context) error { wfc.memoConfig = memoCfg if memoCfg != nil { logger.Info(ctx, "Memoization database configuration enabled") - if wfc.memoSessionProxy == nil { - sp := memodb.SessionProxyFromConfig(ctx, wfc.kubeclientset, wfc.namespace, memoCfg) - if sp == nil { - logger.Error(ctx, "Failed to connect to memoization database; SQL caching unavailable. Workflows using memoization will skip caching until the database is reachable.") - wfc.cacheFactory.ClearSessionProxy(true) - } - wfc.memoSessionProxy = sp - wfc.memoMigrated = false - } - if wfc.memoSessionProxy != nil && !wfc.memoMigrated { - cfg := memodb.ConfigFromConfig(memoCfg) - if err := memodb.Migrate(ctx, wfc.memoSessionProxy, cfg); err != nil { - logger.WithError(err).Error(ctx, "Memoization database migration failed; SQL caching unavailable. Workflows using memoization will skip caching until the database is healthy.") - wfc.memoSessionProxy.Close() - wfc.memoSessionProxy = nil - wfc.memoMigrated = false - wfc.cacheFactory.ClearSessionProxy(true) - } else { - wfc.memoMigrated = true - wfc.cacheFactory.SetSessionProxy(wfc.memoSessionProxy, cfg.TableName) - } - } + wfc.initializeMemoizationBackendLocked(ctx, logger) } else { if wfc.memoSessionProxy != nil { logger.Info(ctx, "Memoization database configuration removed") diff --git a/workflow/controller/config_test.go b/workflow/controller/config_test.go index 6144de86b456..cdd4f8f287a7 100644 --- a/workflow/controller/config_test.go +++ b/workflow/controller/config_test.go @@ -1,12 +1,17 @@ package controller import ( + "context" "testing" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "k8s.io/client-go/kubernetes" + "github.com/argoproj/argo-workflows/v4/config" "github.com/argoproj/argo-workflows/v4/util/logging" + memodb "github.com/argoproj/argo-workflows/v4/util/memo/db" + utilsqldb "github.com/argoproj/argo-workflows/v4/util/sqldb" ) func TestUpdateConfig(t *testing.T) { @@ -21,3 +26,43 @@ func TestUpdateConfig(t *testing.T) { assert.NotNil(t, controller.wfArchive) assert.NotNil(t, controller.offloadNodeStatusRepo) } + +func TestGetMemoizationCacheRetriesBackendInitialization(t *testing.T) { + ctx := logging.TestContext(t.Context()) + cancel, controller := newController(ctx) + defer cancel() + + controller.memoLock.Lock() + controller.memoConfig = &config.MemoizationConfig{ + DBConfig: config.DBConfig{ + PostgreSQL: &config.PostgreSQLConfig{}, + }, + } + controller.memoLock.Unlock() + + originalBuilder := memoSessionProxyFromConfig + originalMigrate := memoizationMigrate + t.Cleanup(func() { + memoSessionProxyFromConfig = originalBuilder + memoizationMigrate = originalMigrate + }) + + calls := 0 + memoSessionProxyFromConfig = func(context.Context, kubernetes.Interface, string, *config.MemoizationConfig) *utilsqldb.SessionProxy { + calls++ + if calls == 1 { + return nil + } + return utilsqldb.NewSessionProxyFromSession(nil, &config.DBConfig{PostgreSQL: &config.PostgreSQLConfig{}}, "", "") + } + memoizationMigrate = func(context.Context, *utilsqldb.SessionProxy, memodb.Config) error { + return nil + } + + cache := controller.getMemoizationCache(ctx, "default", "memo-cache") + assert.Nil(t, cache) + + cache = controller.getMemoizationCache(ctx, "default", "memo-cache") + require.NotNil(t, cache) + assert.Equal(t, 2, calls) +} diff --git a/workflow/controller/dag.go b/workflow/controller/dag.go index bb647ce15faf..4ec6e395bd4c 100644 --- a/workflow/controller/dag.go +++ b/workflow/controller/dag.go @@ -379,7 +379,7 @@ func (woc *wfOperationCtx) executeDAG(ctx context.Context, nodeName string, tmpl woc.wf.Status.Nodes.Set(ctx, node.ID, *node) } if node.MemoizationStatus != nil { - c := woc.controller.cacheFactory.GetCache(ctx, controllercache.ConfigMapCache, woc.wf.Namespace, node.MemoizationStatus.CacheName) + c := woc.controller.getMemoizationCache(ctx, woc.wf.Namespace, node.MemoizationStatus.CacheName) switch { case c == nil: woc.log.WithField("nodeID", node.ID).Warn(ctx, "Memoization cache unavailable; skipping cache save") diff --git a/workflow/controller/operator.go b/workflow/controller/operator.go index e35af5c68363..6597d7141153 100644 --- a/workflow/controller/operator.go +++ b/workflow/controller/operator.go @@ -1196,7 +1196,7 @@ func (woc *wfOperationCtx) podReconciliation(ctx context.Context) (bool, error) woc.addOutputsToGlobalScope(ctx, newState.Outputs) if newState.MemoizationStatus != nil { if newState.Succeeded() { - c := woc.controller.cacheFactory.GetCache(ctx, controllercache.ConfigMapCache, woc.wf.Namespace, newState.MemoizationStatus.CacheName) + c := woc.controller.getMemoizationCache(ctx, woc.wf.Namespace, newState.MemoizationStatus.CacheName) if c == nil { woc.log.WithFields(logging.Fields{"nodeID": newState.ID}).Warn(ctx, "Memoization cache unavailable; skipping cache save") } else { @@ -2254,17 +2254,18 @@ func (woc *wfOperationCtx) executeTemplate(ctx context.Context, nodeName string, return errNode, cacheErr } if node == nil || unlockedNode { - memoizationCache := woc.controller.cacheFactory.GetCache(ctx, controllercache.ConfigMapCache, woc.wf.Namespace, cacheName) + memoizationCache := woc.controller.getMemoizationCache(ctx, woc.wf.Namespace, cacheName) if memoizationCache == nil { - cacheErr := fmt.Errorf("cache could not be found or created") - woc.log.WithFields(logging.Fields{"cacheName": cacheName}).WithError(cacheErr) - errNode := woc.initializeNodeOrMarkError(ctx, node, nodeName, templateScope, orgTmpl, opts.boundaryID, opts.nodeFlag, cacheErr) - return errNode, cacheErr + woc.log.WithFields(logging.Fields{"cacheName": cacheName}).Warn(ctx, "Memoization cache unavailable; treating as cache miss") } - entry, loadErr := memoizationCache.Load(ctx, processedTmpl.Memoize.Key) - if loadErr != nil { - return woc.initializeNodeOrMarkError(ctx, node, nodeName, templateScope, orgTmpl, opts.boundaryID, opts.nodeFlag, loadErr), loadErr + var entry *controllercache.Entry + if memoizationCache != nil { + var loadErr error + entry, loadErr = memoizationCache.Load(ctx, processedTmpl.Memoize.Key) + if loadErr != nil { + return woc.initializeNodeOrMarkError(ctx, node, nodeName, templateScope, orgTmpl, opts.boundaryID, opts.nodeFlag, loadErr), loadErr + } } hit := entry.Hit() diff --git a/workflow/controller/operator_test.go b/workflow/controller/operator_test.go index 1547a90cee5d..ae6135e1d23b 100644 --- a/workflow/controller/operator_test.go +++ b/workflow/controller/operator_test.go @@ -6301,6 +6301,7 @@ apiVersion: argoproj.io/v1alpha1 kind: Workflow metadata: name: memoized-workflow-test + namespace: default spec: entrypoint: whalesay arguments: @@ -6389,6 +6390,31 @@ func TestConfigMapCacheLoadOperateMaxAge(t *testing.T) { } } +func TestUnavailableSQLMemoizationBackendTreatsLookupAsCacheMiss(t *testing.T) { + wf := wfv1.MustUnmarshalWorkflow(workflowCachedMaxAge) + ctx := logging.TestContext(t.Context()) + cancel, controller := newController(ctx, func(wfc *WorkflowController) { + wfc.memoLock.Lock() + defer wfc.memoLock.Unlock() + wfc.memoConfig = &config.MemoizationConfig{} + }) + defer cancel() + + _, err := controller.wfclientset.ArgoprojV1alpha1().Workflows(wf.ObjectMeta.Namespace).Create(ctx, wf, metav1.CreateOptions{}) + require.NoError(t, err) + + woc := newWorkflowOperationCtx(ctx, wf, controller) + woc.operate(ctx) + + require.Len(t, woc.wf.Status.Nodes, 1) + for _, node := range woc.wf.Status.Nodes { + assert.Equal(t, wfv1.NodePending, node.Phase) + assert.NotNil(t, node.MemoizationStatus) + assert.False(t, node.MemoizationStatus.Hit) + assert.Nil(t, node.Outputs) + } +} + var workflowStepCachedWithRetryStrategy = ` apiVersion: argoproj.io/v1alpha1 kind: Workflow diff --git a/workflow/controller/steps.go b/workflow/controller/steps.go index 1f3e8e457b04..6905241d2a81 100644 --- a/workflow/controller/steps.go +++ b/workflow/controller/steps.go @@ -188,7 +188,7 @@ func (woc *wfOperationCtx) executeSteps(ctx context.Context, nodeName string, tm } if node.MemoizationStatus != nil { - c := woc.controller.cacheFactory.GetCache(ctx, controllercache.ConfigMapCache, woc.wf.Namespace, node.MemoizationStatus.CacheName) + c := woc.controller.getMemoizationCache(ctx, woc.wf.Namespace, node.MemoizationStatus.CacheName) switch { case c == nil: woc.log.WithFields(logging.Fields{"nodeID": node.ID}).Warn(ctx, "Memoization cache unavailable; skipping cache save") diff --git a/workflow/controller/taskset.go b/workflow/controller/taskset.go index 7d67e0e75c2a..ab735981c2b1 100644 --- a/workflow/controller/taskset.go +++ b/workflow/controller/taskset.go @@ -154,7 +154,7 @@ func (woc *wfOperationCtx) reconcileTaskSet(ctx context.Context) error { woc.wf.Status.Nodes.Set(ctx, nodeID, *node) if node.MemoizationStatus != nil && node.Succeeded() { - c := woc.controller.cacheFactory.GetCache(ctx, controllercache.ConfigMapCache, woc.wf.Namespace, node.MemoizationStatus.CacheName) + c := woc.controller.getMemoizationCache(ctx, woc.wf.Namespace, node.MemoizationStatus.CacheName) if c == nil { woc.log.WithFields(logging.Fields{"nodeID": node.ID}).Warn(ctx, "Memoization cache unavailable; skipping cache save") } else { From c2b52ca1221465dae645f02f570a53d7d872da86 Mon Sep 17 00:00:00 2001 From: droctothorpe Date: Thu, 23 Apr 2026 21:44:02 -0400 Subject: [PATCH 03/15] test: fix PR #15938 CI failures Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: droctothorpe --- docs/memoization.md | 4 +- test/e2e/retry_test.go | 50 ++++++++++++++++--- .../workflow-template-with-containerset.yaml | 7 +-- workflow/controller/operator_test.go | 2 + 4 files changed, 49 insertions(+), 14 deletions(-) diff --git a/docs/memoization.md b/docs/memoization.md index 5616891b821c..9fed4190ebed 100644 --- a/docs/memoization.md +++ b/docs/memoization.md @@ -53,9 +53,9 @@ data: key: password ``` -Each cache entry computes an `expires_at` timestamp at save time from the template's `maxAge` field. If `maxAge` is not specified on the template, it defaults to 30 days (2592000 seconds). This default can be overridden by setting the `DEFAULT_MAX_AGE` environment variable on the workflow controller (accepts Go duration strings like `720h` or integer seconds like `2592000`). +Each cache entry stores its expiry time when it is written, derived from the template's `maxAge` field. If `maxAge` is not specified on the template, it defaults to 30 days (2592000 seconds). This default can be overridden by setting the `DEFAULT_MAX_AGE` environment variable on the workflow controller (accepts Go duration strings like `720h` or integer seconds like `2592000`). -The garbage collector periodically deletes entries whose `expires_at` has elapsed. The GC period defaults to 24 hours and can be configured via the `MEMO_CACHE_GC_PERIOD` environment variable. +The garbage collector periodically deletes expired entries. The GC period defaults to 24 hours and can be configured via the `MEMO_CACHE_GC_PERIOD` environment variable. MySQL is also supported: diff --git a/test/e2e/retry_test.go b/test/e2e/retry_test.go index 67b29f5e7c15..1bc31affc435 100644 --- a/test/e2e/retry_test.go +++ b/test/e2e/retry_test.go @@ -143,12 +143,6 @@ spec: assert.Equal(t, 1, count) assert.Contains(t, logs, "hi") }). - // Command err. No retry logic is entered. - ExpectContainerLogs("c2", func(t *testing.T, logs string) { - count := strings.Count(logs, "capturing logs") - assert.Equal(t, 0, count) - assert.Contains(t, logs, "executable file not found in $PATH") - }). // Retry when err. ExpectContainerLogs("c3", func(t *testing.T, logs string) { count := strings.Count(logs, "capturing logs") @@ -158,6 +152,50 @@ spec: }) } +func (s *RetryTestSuite) TestWorkflowTemplateWithCommandStartErrorInContainerSet() { + s.Given(). + WorkflowTemplate(` +apiVersion: argoproj.io/v1alpha1 +kind: WorkflowTemplate +metadata: + name: containerset-with-command-start-error +spec: + entrypoint: test + templates: + - name: test + containerSet: + retryStrategy: + retries: "2" + containers: + - name: c2 + image: argoproj/argosay:v2 + command: + - invalid + - command +`). + Workflow(` +metadata: + name: workflow-template-containerset-command-error +spec: + workflowTemplateRef: + name: containerset-with-command-start-error +`). + When(). + CreateWorkflowTemplates(). + SubmitWorkflow(). + WaitForWorkflow(fixtures.ToBeFailed). + Then(). + ExpectWorkflow(func(t *testing.T, metadata *metav1.ObjectMeta, status *v1alpha1.WorkflowStatus) { + assert.Equal(t, v1alpha1.WorkflowFailed, status.Phase) + }). + // Command start errors bypass retry execution. + ExpectContainerLogs("c2", func(t *testing.T, logs string) { + count := strings.Count(logs, "capturing logs") + assert.Equal(t, 0, count) + assert.Contains(t, logs, "executable file not found in $PATH") + }) +} + func (s *RetryTestSuite) TestRetryNodeAntiAffinity() { s.Given(). Workflow(` diff --git a/test/e2e/testdata/workflow-template-with-containerset.yaml b/test/e2e/testdata/workflow-template-with-containerset.yaml index 204224061a0f..d9bae250eebc 100644 --- a/test/e2e/testdata/workflow-template-with-containerset.yaml +++ b/test/e2e/testdata/workflow-template-with-containerset.yaml @@ -16,12 +16,7 @@ spec: - name: c1 image: argoproj/argosay:v2 args: ["echo", "hi"] - - name: c2 - image: argoproj/argosay:v2 - command: - - invalid - - command - name: c3 image: argoproj/argosay:v2 command: [ sh, -c ] - args: [ "echo intentional failure; exit 1" ] \ No newline at end of file + args: [ "echo intentional failure; exit 1" ] diff --git a/workflow/controller/operator_test.go b/workflow/controller/operator_test.go index ae6135e1d23b..3d04566bbae5 100644 --- a/workflow/controller/operator_test.go +++ b/workflow/controller/operator_test.go @@ -6420,6 +6420,7 @@ apiVersion: argoproj.io/v1alpha1 kind: Workflow metadata: name: memoized-workflow-test + namespace: default spec: entrypoint: whalesay arguments: @@ -6454,6 +6455,7 @@ apiVersion: argoproj.io/v1alpha1 kind: Workflow metadata: generateName: memoized-workflow-test + namespace: default spec: entrypoint: main # podGC: From 81814e13d03c8d459e0ac4dc3846ef5f0a05f788 Mon Sep 17 00:00:00 2001 From: droctothorpe Date: Thu, 23 Apr 2026 22:35:38 -0400 Subject: [PATCH 04/15] test: avoid Docker Hub rate limits in API e2e Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: droctothorpe --- test/e2e/argo_server_test.go | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/test/e2e/argo_server_test.go b/test/e2e/argo_server_test.go index 21e67dc2ebcd..b54f7803ab72 100644 --- a/test/e2e/argo_server_test.go +++ b/test/e2e/argo_server_test.go @@ -2021,12 +2021,16 @@ metadata: labels: foo: 1 spec: + arguments: + parameters: + - name: archived-message + value: "hello \\u0001F44D" entrypoint: run-archie templates: - name: run-archie container: - image: argoproj/argosay:v2 - args: [echo, "hello \\u0001F44D"]`). + image: quay.io/argoproj/argoexec:latest + args: [version]`). When(). SubmitWorkflow(). WaitForWorkflow(fixtures.ToBeArchived). @@ -2048,9 +2052,8 @@ spec: templates: - name: run-jughead container: - image: argoproj/argosay:v2 - command: [sh, -c] - args: ["echo intentional failure; exit 1"]`). + image: quay.io/argoproj/argoexec:latest + args: [not-a-real-subcommand]`). When(). SubmitWorkflow(). WaitForWorkflow(fixtures.ToBeArchived). @@ -2070,7 +2073,8 @@ spec: templates: - name: run-betty container: - image: argoproj/argosay:v2`). + image: quay.io/argoproj/argoexec:latest + args: [version]`). When(). SubmitWorkflow(). WaitForWorkflow(fixtures.ToBeArchived) @@ -2179,7 +2183,7 @@ spec: Path("$.metadata.name"). NotNull() j. - Path("$.spec.templates[0].container.args[1]"). + Path("$.spec.arguments.parameters[0].value"). // make sure unicode escape wasn't mangled IsEqual("hello \\u0001F44D") j. From d0b7bf43ab7d9e5175d4bc4973802d4825561101 Mon Sep 17 00:00:00 2001 From: droctothorpe Date: Sat, 25 Apr 2026 22:10:02 -0400 Subject: [PATCH 05/15] Address Claude Code feedback Signed-off-by: droctothorpe --- .features/pending/sql-memoization-cache.md | 3 +- config/config.go | 2 +- docs/environment-variables.md | 2 +- docs/memoization.md | 8 +-- docs/workflow-controller-configmap.md | 6 +-- .../workflow-controller-configmap.yaml | 2 +- manifests/quick-start-postgres.yaml | 2 +- test/e2e/sqldb_memoize_test.go | 2 +- util/memo/db/config.go | 32 ++++++++++- util/memo/db/config_test.go | 24 +++++++++ util/memo/db/migrate.go | 12 ++--- util/memo/db/queries.go | 6 +-- util/memo/db/queries_test.go | 54 +++++++++---------- workflow/controller/cache/cache.go | 16 +++--- workflow/controller/cache/configmap_cache.go | 2 +- workflow/controller/cache/sqldb_cache.go | 6 ++- workflow/controller/cache/sqldb_cache_test.go | 27 ++++------ workflow/controller/cache_test.go | 14 ++++- workflow/controller/controller.go | 5 +- workflow/controller/dag.go | 7 +-- workflow/controller/operator.go | 7 +-- workflow/controller/steps.go | 7 +-- workflow/controller/taskset.go | 6 +-- 23 files changed, 144 insertions(+), 108 deletions(-) create mode 100644 util/memo/db/config_test.go diff --git a/.features/pending/sql-memoization-cache.md b/.features/pending/sql-memoization-cache.md index b4a956e46cec..92b8fa3213c4 100644 --- a/.features/pending/sql-memoization-cache.md +++ b/.features/pending/sql-memoization-cache.md @@ -7,6 +7,7 @@ PRs: 15938 Memoization can now store cache entries in a PostgreSQL or MySQL database instead of Kubernetes ConfigMaps. The SQL backend removes the 1 MB ConfigMap size limit and persists cache entries across cluster restarts. ConfigMaps remain the default; opt in by adding a `memoization` block to the `workflow-controller-configmap`. +SQL-backed entries are stored in the configured table, which defaults to `cache_entries`. Each cache entry computes an `expires_at` timestamp at save time from the template's `maxAge` field (default: 30 days). -The default max age can be overridden via the `DEFAULT_MAX_AGE` environment variable on the controller. +The default max age can be overridden via the `DEFAULT_MAX_AGE` environment variable on the controller when SQL memoization is enabled. A periodic garbage collector deletes expired entries whose `expires_at` has elapsed. diff --git a/config/config.go b/config/config.go index 210d0a21c49c..f0af01e154e7 100644 --- a/config/config.go +++ b/config/config.go @@ -362,7 +362,7 @@ type SyncConfig struct { type MemoizationConfig struct { DBConfig // TableName is the name of the table to use for memoization cache entries. - // Defaults to "memoization_cache" if not set. + // Defaults to "cache_entries" if not set. TableName string `json:"tableName,omitempty"` // SkipMigration skips automatic database migration on startup. SkipMigration bool `json:"skipMigration,omitempty"` diff --git a/docs/environment-variables.md b/docs/environment-variables.md index 72ba80321b27..dbca7a9c2bb5 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -27,7 +27,7 @@ This document outlines environment variables that can be used to customize behav | `CACHE_GC_PERIOD` | `time.Duration` | `0s` | How often to perform memoization cache GC, which is disabled by default and can be enabled by providing a non-zero duration. | | `CACHE_GC_AFTER_NOT_HIT_DURATION` | `time.Duration` | `30s` | When a memoization cache has not been hit after this duration, it will be deleted. | | `CRON_SYNC_PERIOD` | `time.Duration` | `10s` | How often to sync cron workflows. | -| `DEFAULT_MAX_AGE` | `string` | `""` (30 days) | Default TTL for SQL-backed memoization cache entries when `memoize.maxAge` is not set on the template. Accepts a Go duration string (e.g. `720h`) or an integer number of seconds. If unset, entries expire after 30 days. | +| `DEFAULT_MAX_AGE` | `string` | `""` (30 days) | Default TTL for SQL-backed memoization cache entries when `memoize.maxAge` is not set on the template. Accepts a Go duration string (e.g. `720h`) or an integer number of seconds. If unset, entries expire after 30 days. This does not affect ConfigMap-backed memoization. | | `DEFAULT_REQUEUE_TIME` | `time.Duration` | `10s` | The re-queue time for the rate limiter of the workflow queue. | | `DISABLE_MAX_RECURSION` | `bool` | `false` | Set to true to disable the recursion preventer, which will stop a workflow running which has called into a child template 100 times | | `EXPRESSION_TEMPLATES` | `bool` | `true` | Escape hatch to disable expression templates. | diff --git a/docs/memoization.md b/docs/memoization.md index 9fed4190ebed..9fc85290db69 100644 --- a/docs/memoization.md +++ b/docs/memoization.md @@ -40,11 +40,11 @@ metadata: namespace: argo data: memoization: | - tableName: memoization_cache postgresql: host: postgres port: 5432 database: postgres + tableName: cache_entries userNameSecret: name: argo-postgres-config key: username @@ -53,7 +53,9 @@ data: key: password ``` -Each cache entry stores its expiry time when it is written, derived from the template's `maxAge` field. If `maxAge` is not specified on the template, it defaults to 30 days (2592000 seconds). This default can be overridden by setting the `DEFAULT_MAX_AGE` environment variable on the workflow controller (accepts Go duration strings like `720h` or integer seconds like `2592000`). +SQL-backed memoization stores entries in the configured table. If `tableName` is omitted, it defaults to `cache_entries`. + +Each cache entry stores its expiry time when it is written, derived from the template's `maxAge` field. If `maxAge` is not specified on the template, it defaults to 30 days (2592000 seconds). This default can be overridden by setting the `DEFAULT_MAX_AGE` environment variable on the workflow controller for SQL-backed memoization (accepts Go duration strings like `720h` or integer seconds like `2592000`). The garbage collector periodically deletes expired entries. The GC period defaults to 24 hours and can be configured via the `MEMO_CACHE_GC_PERIOD` environment variable. @@ -61,11 +63,11 @@ MySQL is also supported: ```yaml memoization: | - tableName: memoization_cache mysql: host: mysql port: 3306 database: argo + tableName: cache_entries userNameSecret: name: argo-mysql-config key: username diff --git a/docs/workflow-controller-configmap.md b/docs/workflow-controller-configmap.md index 999dd99331de..bba779817877 100644 --- a/docs/workflow-controller-configmap.md +++ b/docs/workflow-controller-configmap.md @@ -96,7 +96,7 @@ Config contains the root of the configuration settings for the workflow controll | `NavColor` | `string` | NavColor is an ui navigation bar background color | | `SSO` | [`SSOConfig`](#ssoconfig) | SSO in settings for single-sign on | | `Synchronization` | [`SyncConfig`](#syncconfig) | Synchronization via databases config | -| `Memoization` | [`MemoizationConfig`](#memoizationconfig) | Memoization configures memoization cache storage. When set, cache entries are stored in a database instead of ConfigMaps. ConfigMap-based caching remains the default when omitted. | +| `Memoization` | [`MemoizationConfig`](#memoizationconfig) | Memoization configures memoization cache storage. When set, cache entries are stored in a SQL database table instead of ConfigMaps. ConfigMap-based caching remains the default when omitted. | | `ArtifactDrivers` | `Array<`[`ArtifactDriver`](#artifactdriver)`>` | ArtifactDrivers lists artifact driver plugins we can use | | `FailedPodRestart` | [`FailedPodRestartConfig`](#failedpodrestartconfig) | FailedPodRestart configures automatic restart of pods that fail before entering Running state (e.g., due to Eviction, DiskPressure, Preemption). This allows recovery from transient infrastructure issues without requiring a retryStrategy on templates. | @@ -372,7 +372,7 @@ SyncConfig contains synchronization configuration for database locks (semaphores ## MemoizationConfig -MemoizationConfig contains memoization cache configuration for database-backed storage. When configured, cache entries are stored in the specified database table instead of ConfigMaps. +MemoizationConfig contains memoization cache configuration for database-backed storage. When configured, cache entries are stored in the configured database table instead of ConfigMaps. If `TableName` is omitted, it defaults to `cache_entries`. ### Fields @@ -382,7 +382,7 @@ MemoizationConfig contains memoization cache configuration for database-backed s | `MySQL` | [`MySQLConfig`](#mysqlconfig) | MySQL configuration for MySQL database, don't use PostgreSQL at the same time | | `ConnectionPool` | [`ConnectionPool`](#connectionpool) | Pooled connection settings for all types of database connections | | `DBReconnectConfig` | [`DBReconnectConfig`](#dbreconnectconfig) | DBReconnectConfig are configuration options for database retries and reconnections | -| `TableName` | `string` | TableName is the name of the table to use for memoization cache entries. Defaults to "memoization_cache" if not set. | +| `TableName` | `string` | TableName is the name of the table to use for memoization cache entries. Defaults to `cache_entries` when omitted. | | `SkipMigration` | `bool` | SkipMigration skips automatic database migration on startup. | ## ArtifactDriver diff --git a/manifests/components/postgres/overlays/workflow-controller-configmap.yaml b/manifests/components/postgres/overlays/workflow-controller-configmap.yaml index b64030f6b7a8..159498312586 100644 --- a/manifests/components/postgres/overlays/workflow-controller-configmap.yaml +++ b/manifests/components/postgres/overlays/workflow-controller-configmap.yaml @@ -44,11 +44,11 @@ data: name: argo-postgres-config key: password memoization: | - tableName: memoization_cache postgresql: host: postgres port: 5432 database: postgres + tableName: cache_entries userNameSecret: name: argo-postgres-config key: username diff --git a/manifests/quick-start-postgres.yaml b/manifests/quick-start-postgres.yaml index be01eb79e183..093af6d56059 100644 --- a/manifests/quick-start-postgres.yaml +++ b/manifests/quick-start-postgres.yaml @@ -183472,11 +183472,11 @@ data: scope: workflow-list url: http://workflows?label=workflows.argoproj.io/completed=true memoization: | - tableName: memoization_cache postgresql: host: postgres port: 5432 database: postgres + tableName: cache_entries userNameSecret: name: argo-postgres-config key: username diff --git a/test/e2e/sqldb_memoize_test.go b/test/e2e/sqldb_memoize_test.go index 02a46a99597d..12dba813abd0 100644 --- a/test/e2e/sqldb_memoize_test.go +++ b/test/e2e/sqldb_memoize_test.go @@ -99,7 +99,7 @@ func (s *SQLDBMemoizeSuite) TestSQLDBMemoize() { s.assertNoConfigMap(ctx, "sqldb-memo-cache") } -// assertDBCacheEntry checks the memoization_cache table directly. +// assertDBCacheEntry checks the configured memoization table directly. func (s *SQLDBMemoizeSuite) assertDBCacheEntry(ctx context.Context, key string) { memoCfg := s.Config.Memoization // E2E tests connect to postgres via a port-forward on localhost. diff --git a/util/memo/db/config.go b/util/memo/db/config.go index a1acda76e1c7..986abc710fde 100644 --- a/util/memo/db/config.go +++ b/util/memo/db/config.go @@ -2,6 +2,9 @@ package db import ( "context" + "fmt" + "hash/fnv" + "regexp" "k8s.io/client-go/kubernetes" @@ -11,10 +14,12 @@ import ( ) const ( - defaultTableName = "memoization_cache" + defaultTableName = "cache_entries" versionTable = "memoization_schema_history" ) +var validTableName = regexp.MustCompile(`^[A-Za-z0-9_]+$`) + // Config holds resolved configuration for database-backed memoization. type Config struct { TableName string @@ -28,6 +33,28 @@ func TableName(cfg *config.MemoizationConfig) string { return cfg.TableName } +func validateTableName(tableName string) error { + if !validTableName.MatchString(tableName) { + return fmt.Errorf("invalid table name %q: must match [A-Za-z0-9_]+", tableName) + } + return nil +} + +func memoizationVersionTableName(tableName string) string { + if tableName == defaultTableName { + return versionTable + } + hasher := fnv.New64a() + _, _ = hasher.Write([]byte(tableName)) + return fmt.Sprintf("memoization_schema_history_%x", hasher.Sum64()) +} + +func memoizationExpiresAtIndexName(tableName string) string { + hasher := fnv.New64a() + _, _ = hasher.Write([]byte(tableName)) + return fmt.Sprintf("memoization_expires_at_%x", hasher.Sum64()) +} + // ConfigFromConfig converts a controller MemoizationConfig (with DB credentials, connection // settings, etc.) into the smaller Config struct used by the migration and query layers. // Returns sensible defaults when cfg is nil. @@ -70,6 +97,9 @@ func Migrate(ctx context.Context, sessionProxy *sqldb.SessionProxy, cfg Config) return nil } logger := logging.RequireLoggerFromContext(ctx) + if err := validateTableName(cfg.TableName); err != nil { + return err + } if cfg.SkipMigration { logger.Info(ctx, "Memoization db migration skipped") return nil diff --git a/util/memo/db/config_test.go b/util/memo/db/config_test.go new file mode 100644 index 000000000000..84636f0c1922 --- /dev/null +++ b/util/memo/db/config_test.go @@ -0,0 +1,24 @@ +package db_test + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/argoproj/argo-workflows/v4/config" + memodb "github.com/argoproj/argo-workflows/v4/util/memo/db" + "github.com/argoproj/argo-workflows/v4/util/sqldb" +) + +func TestTableNameDefaultsAndOverrides(t *testing.T) { + assert.Equal(t, "cache_entries", memodb.TableName(nil)) + assert.Equal(t, "cache_entries", memodb.TableName(&config.MemoizationConfig{})) + assert.Equal(t, "custom_cache_entries", memodb.TableName(&config.MemoizationConfig{TableName: "custom_cache_entries"})) +} + +func TestNewQueriesRejectsInvalidTableName(t *testing.T) { + queries, err := memodb.NewQueries("invalid-table-name", sqldb.Postgres) + require.Error(t, err) + assert.Nil(t, queries) +} diff --git a/util/memo/db/migrate.go b/util/memo/db/migrate.go index f0aa92af039c..45ff46fb7443 100644 --- a/util/memo/db/migrate.go +++ b/util/memo/db/migrate.go @@ -2,21 +2,17 @@ package db import ( "context" - "fmt" - "regexp" "github.com/upper/db/v4" "github.com/argoproj/argo-workflows/v4/util/sqldb" ) -var validTableName = regexp.MustCompile(`^[A-Za-z0-9_]+$`) - func migrate(ctx context.Context, session db.Session, dbType sqldb.DBType, tableName string) error { - if !validTableName.MatchString(tableName) { - return fmt.Errorf("invalid table name %q: must match [A-Za-z0-9_]+", tableName) + if err := validateTableName(tableName); err != nil { + return err } - return sqldb.Migrate(ctx, session, dbType, versionTable, []sqldb.Change{ + return sqldb.Migrate(ctx, session, dbType, memoizationVersionTableName(tableName), []sqldb.Change{ // MySQL: use LONGTEXT for outputs (TEXT is 64KB). // Postgres: use text for outputs (no size limit). // Varchar sizes chosen to keep composite PK within InnoDB's 3072-byte limit with utf8mb4: @@ -42,6 +38,6 @@ func migrate(ctx context.Context, session db.Session, dbType sqldb.DBType, table "expires_at timestamp not null, " + "primary key (namespace, cache_name, cache_key))"), }), - sqldb.AnsiSQLChange(`create index imemo_expires_at on ` + tableName + ` (expires_at)`), + sqldb.AnsiSQLChange(`create index ` + memoizationExpiresAtIndexName(tableName) + ` on ` + tableName + ` (expires_at)`), }) } diff --git a/util/memo/db/queries.go b/util/memo/db/queries.go index c467c3db0c40..1f247d3954bb 100644 --- a/util/memo/db/queries.go +++ b/util/memo/db/queries.go @@ -30,8 +30,8 @@ type Queries struct { } func NewQueries(tableName string, dbType sqldb.DBType) (*Queries, error) { - if !validTableName.MatchString(tableName) { - return nil, fmt.Errorf("invalid table name %q: must match [A-Za-z0-9_]+", tableName) + if err := validateTableName(tableName); err != nil { + return nil, err } return &Queries{tableName: tableName, dbType: dbType}, nil } @@ -72,7 +72,7 @@ func (q *Queries) Load(ctx context.Context, sp *sqldb.SessionProxy, namespace, c } // Prune deletes cache entries whose expires_at has elapsed. It is called -// periodically by the controller to bound the size of the memoization_cache table. +// periodically by the controller to bound the size of the configured memoization cache table. func (q *Queries) Prune(ctx context.Context, sp *sqldb.SessionProxy) (int64, error) { now := time.Now().UTC() var n int64 diff --git a/util/memo/db/queries_test.go b/util/memo/db/queries_test.go index b64369f01729..97746e7a1098 100644 --- a/util/memo/db/queries_test.go +++ b/util/memo/db/queries_test.go @@ -26,11 +26,19 @@ const ( testDBName = "memotest" testDBUser = "user" testDBPassword = "pass" - testTableName = "memoization_cache" testNamespace = "default" testCacheName = "my-cache" ) +var testTableName = memodb.TableName(nil) + +func newQueries(t *testing.T, dbType sqldb.DBType) *memodb.Queries { + t.Helper() + q, err := memodb.NewQueries(testTableName, dbType) + require.NoError(t, err) + return q +} + // setupPostgres starts a throwaway Postgres container and returns a migrated SessionProxy. func setupPostgres(ctx context.Context, t *testing.T) *sqldb.SessionProxy { t.Helper() @@ -80,8 +88,7 @@ func setupPostgres(ctx context.Context, t *testing.T) *sqldb.SessionProxy { t.Cleanup(func() { _ = sp.Close() }) memoCfg := &config.MemoizationConfig{ - DBConfig: dbCfg, - TableName: testTableName, + DBConfig: dbCfg, } require.NoError(t, memodb.Migrate(ctx, sp, memodb.ConfigFromConfig(memoCfg))) return sp @@ -131,8 +138,7 @@ func setupMySQL(ctx context.Context, t *testing.T) *sqldb.SessionProxy { t.Cleanup(func() { _ = sp.Close() }) memoCfg := &config.MemoizationConfig{ - DBConfig: dbCfg, - TableName: testTableName, + DBConfig: dbCfg, } require.NoError(t, memodb.Migrate(ctx, sp, memodb.ConfigFromConfig(memoCfg))) return sp @@ -149,8 +155,7 @@ func sampleOutputs(message string) *wfv1.Outputs { func TestQueriesSaveAndLoad(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupPostgres(ctx, t) - q, err := memodb.NewQueries(testTableName, sqldb.Postgres) - require.NoError(t, err) + q := newQueries(t, sqldb.Postgres) // Load returns nil when no entry exists. rec, err := q.Load(ctx, sp, testNamespace, testCacheName, "key1") @@ -169,8 +174,7 @@ func TestQueriesSaveAndLoad(t *testing.T) { func TestQueriesNamespaceIsolation(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupPostgres(ctx, t) - q, err := memodb.NewQueries(testTableName, sqldb.Postgres) - require.NoError(t, err) + q := newQueries(t, sqldb.Postgres) // Save the same cache_name+cache key in two different namespaces. require.NoError(t, q.Save(ctx, sp, "ns-a", testCacheName, "shared-key", "node-a", sampleOutputs("from-a"), 2592000)) @@ -193,8 +197,7 @@ func TestQueriesNamespaceIsolation(t *testing.T) { func TestQueriesSaveReplaces(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupPostgres(ctx, t) - q, err := memodb.NewQueries(testTableName, sqldb.Postgres) - require.NoError(t, err) + q := newQueries(t, sqldb.Postgres) require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "key3", "node-old", sampleOutputs("old"), 2592000)) require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "key3", "node-new", sampleOutputs("new"), 2592000)) @@ -209,12 +212,11 @@ func TestQueriesSaveReplaces(t *testing.T) { func TestQueriesLoadSkipsExpiredEntries(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupPostgres(ctx, t) - q, err := memodb.NewQueries(testTableName, sqldb.Postgres) - require.NoError(t, err) + q := newQueries(t, sqldb.Postgres) require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "expired-key", "node-old", sampleOutputs("old"), 2592000)) - _, err = sp.Session().SQL(). + _, err := sp.Session().SQL(). ExecContext(ctx, `UPDATE `+testTableName+` SET expires_at = $1 WHERE cache_key = $2`, time.Now().Add(-10*time.Second), "expired-key") require.NoError(t, err) @@ -226,15 +228,14 @@ func TestQueriesLoadSkipsExpiredEntries(t *testing.T) { func TestQueriesPruneRemovesOldEntries(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupPostgres(ctx, t) - q, err := memodb.NewQueries(testTableName, sqldb.Postgres) - require.NoError(t, err) + q := newQueries(t, sqldb.Postgres) // Save an entry with a very short max_age (1 second) and one with 30 days. require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "old-key", "node-old", sampleOutputs("old"), 1)) require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "new-key", "node-new", sampleOutputs("new"), 2592000)) // Backdate old-key's expires_at so it is in the past. - _, err = sp.Session().SQL(). + _, err := sp.Session().SQL(). ExecContext(ctx, `UPDATE `+testTableName+` SET expires_at = $1 WHERE cache_key = $2`, time.Now().Add(-10*time.Second), "old-key") require.NoError(t, err) @@ -255,8 +256,7 @@ func TestQueriesPruneRemovesOldEntries(t *testing.T) { func TestQueriesPruneKeepsRecentEntries(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupPostgres(ctx, t) - q, err := memodb.NewQueries(testTableName, sqldb.Postgres) - require.NoError(t, err) + q := newQueries(t, sqldb.Postgres) require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "recent", "node-1", sampleOutputs("v1"), 2592000)) @@ -271,8 +271,7 @@ func TestQueriesPruneKeepsRecentEntries(t *testing.T) { func TestMySQLSaveAndLoad(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupMySQL(ctx, t) - q, err := memodb.NewQueries(testTableName, sqldb.MySQL) - require.NoError(t, err) + q := newQueries(t, sqldb.MySQL) rec, err := q.Load(ctx, sp, testNamespace, testCacheName, "key1") require.NoError(t, err) @@ -289,8 +288,7 @@ func TestMySQLSaveAndLoad(t *testing.T) { func TestMySQLSaveReplaces(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupMySQL(ctx, t) - q, err := memodb.NewQueries(testTableName, sqldb.MySQL) - require.NoError(t, err) + q := newQueries(t, sqldb.MySQL) require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "key3", "node-old", sampleOutputs("old"), 2592000)) require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "key3", "node-new", sampleOutputs("new"), 2592000)) @@ -305,12 +303,11 @@ func TestMySQLSaveReplaces(t *testing.T) { func TestMySQLLoadSkipsExpiredEntries(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupMySQL(ctx, t) - q, err := memodb.NewQueries(testTableName, sqldb.MySQL) - require.NoError(t, err) + q := newQueries(t, sqldb.MySQL) require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "expired-key", "node-old", sampleOutputs("old"), 2592000)) - _, err = sp.Session().SQL(). + _, err := sp.Session().SQL(). ExecContext(ctx, "UPDATE "+testTableName+" SET expires_at = ? WHERE cache_key = ?", time.Now().Add(-10*time.Second), "expired-key") require.NoError(t, err) @@ -322,14 +319,13 @@ func TestMySQLLoadSkipsExpiredEntries(t *testing.T) { func TestMySQLPruneRemovesOldEntries(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupMySQL(ctx, t) - q, err := memodb.NewQueries(testTableName, sqldb.MySQL) - require.NoError(t, err) + q := newQueries(t, sqldb.MySQL) require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "old-key", "node-old", sampleOutputs("old"), 1)) require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "new-key", "node-new", sampleOutputs("new"), 2592000)) // Backdate old-key's expires_at so it is in the past. - _, err = sp.Session().SQL(). + _, err := sp.Session().SQL(). ExecContext(ctx, "UPDATE "+testTableName+" SET expires_at = ? WHERE cache_key = ?", time.Now().Add(-10*time.Second), "old-key") require.NoError(t, err) diff --git a/workflow/controller/cache/cache.go b/workflow/controller/cache/cache.go index 37f388784d78..0358528871ca 100644 --- a/workflow/controller/cache/cache.go +++ b/workflow/controller/cache/cache.go @@ -29,9 +29,10 @@ var resolvedDefaultMaxAge struct { err error } -// ResolveMaxAgeSeconds converts a template's maxAge duration string to seconds. If maxAge is -// empty, it falls back to the DEFAULT_MAX_AGE env var (a Go duration string like "720h"), then -// to 30 days. Returns an error only if the duration string is malformed. +// ResolveMaxAgeSeconds converts a template's maxAge duration string to seconds for SQL-backed +// memoization cache entries. If maxAge is empty, it falls back to the DEFAULT_MAX_AGE env var +// (a Go duration string like "720h"), then to 30 days. Returns an error only if the duration +// string is malformed. func ResolveMaxAgeSeconds(maxAge string) (int64, error) { if maxAge == "" { resolvedDefaultMaxAge.once.Do(func() { @@ -63,10 +64,9 @@ func ResolveMaxAgeSeconds(maxAge string) (int64, error) { type MemoizationCache interface { Load(ctx context.Context, key string) (*Entry, error) - // Save stores the outputs of a completed memoized node. maxAgeSeconds is the number of seconds - // after which this entry expires and becomes eligible for garbage collection. Ignored by the - // ConfigMap backend. - Save(ctx context.Context, key string, nodeID string, value *wfv1.Outputs, maxAgeSeconds int64) error + // Save stores the outputs of a completed memoized node. ConfigMap-backed caches ignore maxAge. + // SQL-backed caches use maxAge, or DEFAULT_MAX_AGE when maxAge is empty, to compute expires_at. + Save(ctx context.Context, key string, nodeID string, value *wfv1.Outputs, maxAge string) error } type Entry struct { @@ -114,7 +114,7 @@ type Factory interface { GetCache(ctx context.Context, ct Type, namespace, name string) MemoizationCache // SetSessionProxy configures the factory to use database-backed caching with the given // session proxy and table name. Calling this clears any previously created cache instances - // so they are recreated against the new backend. + // so they are recreated against the SQL backend. SetSessionProxy(sp *sqldb.SessionProxy, tableName string) // ClearSessionProxy removes any SQL backend configuration. If sqlEnabled is true, GetCache // returns nil rather than silently falling back to ConfigMap-based caching (e.g. after a diff --git a/workflow/controller/cache/configmap_cache.go b/workflow/controller/cache/configmap_cache.go index 40c1cf825447..cf6573ae5d19 100644 --- a/workflow/controller/cache/configmap_cache.go +++ b/workflow/controller/cache/configmap_cache.go @@ -132,7 +132,7 @@ func (c *configMapCache) load(ctx context.Context, key string) (*Entry, error) { return &entry, nil } -func (c *configMapCache) Save(ctx context.Context, key string, nodeID string, value *wfv1.Outputs, _ int64) error { +func (c *configMapCache) Save(ctx context.Context, key string, nodeID string, value *wfv1.Outputs, _ string) error { err := retry.OnError(kwait.Backoff{ Duration: time.Second, Factor: 2, diff --git a/workflow/controller/cache/sqldb_cache.go b/workflow/controller/cache/sqldb_cache.go index db3a3dbcd481..31bcec24ab2c 100644 --- a/workflow/controller/cache/sqldb_cache.go +++ b/workflow/controller/cache/sqldb_cache.go @@ -55,9 +55,13 @@ func (c *sqlDBCache) Load(ctx context.Context, key string) (*Entry, error) { }, nil } -func (c *sqlDBCache) Save(ctx context.Context, key string, nodeID string, value *wfv1.Outputs, maxAgeSeconds int64) error { +func (c *sqlDBCache) Save(ctx context.Context, key string, nodeID string, value *wfv1.Outputs, maxAge string) error { if !cacheKeyRegex.MatchString(key) { return fmt.Errorf("invalid cache key: %s", key) } + maxAgeSeconds, err := ResolveMaxAgeSeconds(maxAge) + if err != nil { + return err + } return c.queries.Save(ctx, c.sessionProxy, c.namespace, c.name, key, nodeID, value, maxAgeSeconds) } diff --git a/workflow/controller/cache/sqldb_cache_test.go b/workflow/controller/cache/sqldb_cache_test.go index f15a1fa9f49a..d3763b15ed37 100644 --- a/workflow/controller/cache/sqldb_cache_test.go +++ b/workflow/controller/cache/sqldb_cache_test.go @@ -26,11 +26,12 @@ const ( testDBName = "cachetest" testDBUser = "user" testDBPassword = "pass" - testTableName = "memoization_cache" testNamespace = "default" testCacheName = "my-cache" ) +var testTableName = memodb.TableName(nil) + func setupTestPostgres(ctx context.Context, t *testing.T) *sqldb.SessionProxy { t.Helper() pg, err := testpostgres.Run(ctx, @@ -79,8 +80,7 @@ func setupTestPostgres(ctx context.Context, t *testing.T) *sqldb.SessionProxy { t.Cleanup(func() { _ = sp.Close() }) memoCfg := &config.MemoizationConfig{ - DBConfig: dbCfg, - TableName: testTableName, + DBConfig: dbCfg, } require.NoError(t, memodb.Migrate(ctx, sp, memodb.ConfigFromConfig(memoCfg))) return sp @@ -104,7 +104,7 @@ func TestSQLDBCacheSaveAndLoad(t *testing.T) { {Name: "result", Value: wfv1.AnyStringPtr("hello")}, }, } - require.NoError(t, c.Save(ctx, "key1", "node-abc", outputs, 2592000)) + require.NoError(t, c.Save(ctx, "key1", "node-abc", outputs, "720h")) entry, err = c.Load(ctx, "key1") require.NoError(t, err) @@ -130,7 +130,7 @@ func TestSQLDBCacheInvalidKey(t *testing.T) { require.Error(t, err) assert.Contains(t, err.Error(), "invalid cache key") - err = c.Save(ctx, "invalid key!", "node-1", &wfv1.Outputs{}, 3600) + err = c.Save(ctx, "invalid key!", "node-1", &wfv1.Outputs{}, "1h") require.Error(t, err) assert.Contains(t, err.Error(), "invalid cache key") } @@ -149,7 +149,7 @@ func TestSQLDBCacheOutputsRoundTrip(t *testing.T) { {Name: "p2", Value: wfv1.AnyStringPtr("v2")}, }, } - require.NoError(t, c.Save(ctx, "complex-key", "node-x", outputs, 3600)) + require.NoError(t, c.Save(ctx, "complex-key", "node-x", outputs, "1h")) entry, err := c.Load(ctx, "complex-key") require.NoError(t, err) @@ -161,15 +161,6 @@ func TestSQLDBCacheOutputsRoundTrip(t *testing.T) { assert.JSONEq(t, string(originalJSON), string(loadedJSON)) } -func TestSQLDBCacheInvalidTableName(t *testing.T) { - ctx := logging.TestContext(t.Context()) - sp := setupTestPostgres(ctx, t) - _ = ctx - - _, err := newSQLDBCache(testNamespace, testCacheName, sp, "invalid-table!") - require.Error(t, err) -} - func TestSQLDBCacheGetOutputsWithMaxAge(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupTestPostgres(ctx, t) @@ -182,7 +173,7 @@ func TestSQLDBCacheGetOutputsWithMaxAge(t *testing.T) { {Name: "result", Value: wfv1.AnyStringPtr("cached")}, }, } - require.NoError(t, c.Save(ctx, "ttl-key", "node-ttl", outputs, 3600)) + require.NoError(t, c.Save(ctx, "ttl-key", "node-ttl", outputs, "1h")) entry, err := c.Load(ctx, "ttl-key") require.NoError(t, err) @@ -211,7 +202,7 @@ func TestSQLDBCacheUpsertRefreshesCreatedAt(t *testing.T) { {Name: "result", Value: wfv1.AnyStringPtr("v1")}, }, } - require.NoError(t, c.Save(ctx, "upsert-key", "node-1", outputs, 3600)) + require.NoError(t, c.Save(ctx, "upsert-key", "node-1", outputs, "1h")) entry1, err := c.Load(ctx, "upsert-key") require.NoError(t, err) @@ -227,7 +218,7 @@ func TestSQLDBCacheUpsertRefreshesCreatedAt(t *testing.T) { {Name: "result", Value: wfv1.AnyStringPtr("v2")}, }, } - require.NoError(t, c.Save(ctx, "upsert-key", "node-2", outputs2, 3600)) + require.NoError(t, c.Save(ctx, "upsert-key", "node-2", outputs2, "1h")) entry2, err := c.Load(ctx, "upsert-key") require.NoError(t, err) diff --git a/workflow/controller/cache_test.go b/workflow/controller/cache_test.go index 79be3a4828b5..e01885fdb5bd 100644 --- a/workflow/controller/cache_test.go +++ b/workflow/controller/cache_test.go @@ -94,7 +94,7 @@ func TestConfigMapCacheSave(t *testing.T) { outputs := wfv1.Outputs{} outputs.Parameters = append(outputs.Parameters, MockParam) - err := c.Save(ctx, "hi-there-world", "", &outputs, 2592000) + err := c.Save(ctx, "hi-there-world", "", &outputs, "") require.NoError(t, err) cm, err := controller.kubeclientset.CoreV1().ConfigMaps("default").Get(ctx, "whalesay-cache", metav1.GetOptions{}) @@ -104,3 +104,15 @@ func TestConfigMapCacheSave(t *testing.T) { wfv1.MustUnmarshal([]byte(cm.Data["hi-there-world"]), &entry) assert.Equal(t, entry.LastHitTimestamp.Time, entry.CreationTimestamp.Time) } + +func TestConfigMapCacheSaveIgnoresInvalidDefaultMaxAge(t *testing.T) { + t.Setenv("DEFAULT_MAX_AGE", "definitely-not-a-duration") + + ctx := logging.TestContext(t.Context()) + cancel, controller := newController(ctx) + defer cancel() + + c := cache.NewConfigMapCache("default", controller.kubeclientset, "whalesay-cache") + outputs := &wfv1.Outputs{} + require.NoError(t, c.Save(ctx, "hi-there-world", "", outputs, "")) +} diff --git a/workflow/controller/controller.go b/workflow/controller/controller.go index 3653fbeb605a..8eb7ed891861 100644 --- a/workflow/controller/controller.go +++ b/workflow/controller/controller.go @@ -758,10 +758,9 @@ func (wfc *WorkflowController) memoizationCacheGarbageCollector(ctx context.Cont continue } - cfg := memodb.ConfigFromConfig(memoCfg) - queries, err := memodb.NewQueries(cfg.TableName, sp.DBType()) + queries, err := memodb.NewQueries(memodb.TableName(memoCfg), sp.DBType()) if err != nil { - logger.WithError(err).Error(ctx, "Invalid memoization cache table name; skipping GC tick") + logger.WithError(err).Error(ctx, "Failed to initialize memoization cache queries") continue } diff --git a/workflow/controller/dag.go b/workflow/controller/dag.go index 4ec6e395bd4c..cdd87bd0c6d9 100644 --- a/workflow/controller/dag.go +++ b/workflow/controller/dag.go @@ -15,7 +15,6 @@ import ( "github.com/argoproj/argo-workflows/v4/util/logging" "github.com/argoproj/argo-workflows/v4/util/template" "github.com/argoproj/argo-workflows/v4/workflow/common" - controllercache "github.com/argoproj/argo-workflows/v4/workflow/controller/cache" "github.com/argoproj/argo-workflows/v4/workflow/templateresolution" ) @@ -386,11 +385,7 @@ func (woc *wfOperationCtx) executeDAG(ctx context.Context, nodeName string, tmpl case tmpl.Memoize == nil: woc.log.WithField("nodeID", node.ID).Warn(ctx, "Node template has no memoize spec; skipping cache save") default: - maxAgeSeconds, maxAgeErr := controllercache.ResolveMaxAgeSeconds(tmpl.Memoize.MaxAge) - if maxAgeErr != nil { - woc.log.WithField("nodeID", node.ID).WithError(maxAgeErr).Error(ctx, "Failed to resolve maxAge for cache save") - node.Phase = wfv1.NodeError - } else if saveErr := c.Save(ctx, node.MemoizationStatus.Key, node.ID, node.Outputs, maxAgeSeconds); saveErr != nil { + if saveErr := c.Save(ctx, node.MemoizationStatus.Key, node.ID, node.Outputs, tmpl.Memoize.MaxAge); saveErr != nil { woc.log.WithField("nodeID", node.ID).WithError(saveErr).Error(ctx, "Failed to save node outputs to cache") node.Phase = wfv1.NodeError } diff --git a/workflow/controller/operator.go b/workflow/controller/operator.go index 6597d7141153..c53512614c2a 100644 --- a/workflow/controller/operator.go +++ b/workflow/controller/operator.go @@ -1207,12 +1207,7 @@ func (woc *wfOperationCtx) podReconciliation(ctx context.Context) (bool, error) } else if nodeTmpl != nil && nodeTmpl.Memoize != nil { maxAge = nodeTmpl.Memoize.MaxAge } - maxAgeSeconds, maxAgeErr := controllercache.ResolveMaxAgeSeconds(maxAge) - if maxAgeErr != nil { - woc.log.WithFields(logging.Fields{"nodeID": newState.ID}).WithError(maxAgeErr).Error(ctx, "Failed to resolve maxAge for cache save") - newState.Phase = wfv1.NodeError - newState.Message = maxAgeErr.Error() - } else if err := c.Save(ctx, newState.MemoizationStatus.Key, newState.ID, newState.Outputs, maxAgeSeconds); err != nil { + if err := c.Save(ctx, newState.MemoizationStatus.Key, newState.ID, newState.Outputs, maxAge); err != nil { woc.log.WithFields(logging.Fields{"nodeID": newState.ID}).WithError(err).Error(ctx, "Failed to save node outputs to cache") newState.Phase = wfv1.NodeError newState.Message = err.Error() diff --git a/workflow/controller/steps.go b/workflow/controller/steps.go index 6905241d2a81..3fecd89795cb 100644 --- a/workflow/controller/steps.go +++ b/workflow/controller/steps.go @@ -18,7 +18,6 @@ import ( "github.com/argoproj/argo-workflows/v4/util/logging" "github.com/argoproj/argo-workflows/v4/util/template" "github.com/argoproj/argo-workflows/v4/workflow/common" - controllercache "github.com/argoproj/argo-workflows/v4/workflow/controller/cache" "github.com/argoproj/argo-workflows/v4/workflow/templateresolution" ) @@ -195,11 +194,7 @@ func (woc *wfOperationCtx) executeSteps(ctx context.Context, nodeName string, tm case tmpl.Memoize == nil: woc.log.WithFields(logging.Fields{"nodeID": node.ID}).Warn(ctx, "Node template has no memoize spec; skipping cache save") default: - maxAgeSeconds, maxAgeErr := controllercache.ResolveMaxAgeSeconds(tmpl.Memoize.MaxAge) - if maxAgeErr != nil { - woc.log.WithFields(logging.Fields{"nodeID": node.ID}).WithError(maxAgeErr).Error(ctx, "Failed to resolve maxAge for cache save") - node.Phase = wfv1.NodeError - } else if err := c.Save(ctx, node.MemoizationStatus.Key, node.ID, node.Outputs, maxAgeSeconds); err != nil { + if err := c.Save(ctx, node.MemoizationStatus.Key, node.ID, node.Outputs, tmpl.Memoize.MaxAge); err != nil { woc.log.WithFields(logging.Fields{"nodeID": node.ID}).WithError(err).Error(ctx, "Failed to save node outputs to cache") node.Phase = wfv1.NodeError } diff --git a/workflow/controller/taskset.go b/workflow/controller/taskset.go index ab735981c2b1..1618468f8839 100644 --- a/workflow/controller/taskset.go +++ b/workflow/controller/taskset.go @@ -15,7 +15,6 @@ import ( wfv1 "github.com/argoproj/argo-workflows/v4/pkg/apis/workflow/v1alpha1" "github.com/argoproj/argo-workflows/v4/util/logging" "github.com/argoproj/argo-workflows/v4/workflow/common" - controllercache "github.com/argoproj/argo-workflows/v4/workflow/controller/cache" ) func (woc *wfOperationCtx) mergePatchTaskSet(ctx context.Context, patch any, subresources ...string) error { @@ -165,10 +164,7 @@ func (woc *wfOperationCtx) reconcileTaskSet(ctx context.Context) error { } else if nodeTmpl != nil && nodeTmpl.Memoize != nil { maxAge = nodeTmpl.Memoize.MaxAge } - maxAgeSeconds, maxAgeErr := controllercache.ResolveMaxAgeSeconds(maxAge) - if maxAgeErr != nil { - woc.log.WithFields(logging.Fields{"nodeID": node.ID}).WithError(maxAgeErr).Error(ctx, "Failed to resolve maxAge for cache save") - } else if err := c.Save(ctx, node.MemoizationStatus.Key, node.ID, node.Outputs, maxAgeSeconds); err != nil { + if err := c.Save(ctx, node.MemoizationStatus.Key, node.ID, node.Outputs, maxAge); err != nil { woc.log.WithFields(logging.Fields{"nodeID": node.ID}).WithError(err).Error(ctx, "Failed to save node outputs to cache") } } From 650f534bdf3ff3111856a8e0b0b40b582440c078 Mon Sep 17 00:00:00 2001 From: droctothorpe Date: Sat, 25 Apr 2026 22:48:54 -0400 Subject: [PATCH 06/15] docs: sync memoization config reference Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: droctothorpe --- docs/workflow-controller-configmap.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/workflow-controller-configmap.md b/docs/workflow-controller-configmap.md index bba779817877..e56ac256abdb 100644 --- a/docs/workflow-controller-configmap.md +++ b/docs/workflow-controller-configmap.md @@ -96,7 +96,7 @@ Config contains the root of the configuration settings for the workflow controll | `NavColor` | `string` | NavColor is an ui navigation bar background color | | `SSO` | [`SSOConfig`](#ssoconfig) | SSO in settings for single-sign on | | `Synchronization` | [`SyncConfig`](#syncconfig) | Synchronization via databases config | -| `Memoization` | [`MemoizationConfig`](#memoizationconfig) | Memoization configures memoization cache storage. When set, cache entries are stored in a SQL database table instead of ConfigMaps. ConfigMap-based caching remains the default when omitted. | +| `Memoization` | [`MemoizationConfig`](#memoizationconfig) | Memoization configures memoization cache storage. When set, cache entries are stored in a database instead of ConfigMaps. ConfigMap-based caching remains the default when omitted. | | `ArtifactDrivers` | `Array<`[`ArtifactDriver`](#artifactdriver)`>` | ArtifactDrivers lists artifact driver plugins we can use | | `FailedPodRestart` | [`FailedPodRestartConfig`](#failedpodrestartconfig) | FailedPodRestart configures automatic restart of pods that fail before entering Running state (e.g., due to Eviction, DiskPressure, Preemption). This allows recovery from transient infrastructure issues without requiring a retryStrategy on templates. | @@ -372,18 +372,18 @@ SyncConfig contains synchronization configuration for database locks (semaphores ## MemoizationConfig -MemoizationConfig contains memoization cache configuration for database-backed storage. When configured, cache entries are stored in the configured database table instead of ConfigMaps. If `TableName` is omitted, it defaults to `cache_entries`. +MemoizationConfig contains memoization cache configuration for database-backed storage. When configured, cache entries are stored in the specified database table instead of ConfigMaps. ### Fields -| Field Name | Field Type | Description | -|---------------------|-------------------------------------------|----------------------------------------------------------------------------------------------------------------------| -| `PostgreSQL` | [`PostgreSQLConfig`](#postgresqlconfig) | PostgreSQL configuration for PostgreSQL database, don't use MySQL at the same time | -| `MySQL` | [`MySQLConfig`](#mysqlconfig) | MySQL configuration for MySQL database, don't use PostgreSQL at the same time | -| `ConnectionPool` | [`ConnectionPool`](#connectionpool) | Pooled connection settings for all types of database connections | -| `DBReconnectConfig` | [`DBReconnectConfig`](#dbreconnectconfig) | DBReconnectConfig are configuration options for database retries and reconnections | -| `TableName` | `string` | TableName is the name of the table to use for memoization cache entries. Defaults to `cache_entries` when omitted. | -| `SkipMigration` | `bool` | SkipMigration skips automatic database migration on startup. | +| Field Name | Field Type | Description | +|---------------------|-------------------------------------------|------------------------------------------------------------------------------------------------------------------| +| `PostgreSQL` | [`PostgreSQLConfig`](#postgresqlconfig) | PostgreSQL configuration for PostgreSQL database, don't use MySQL at the same time | +| `MySQL` | [`MySQLConfig`](#mysqlconfig) | MySQL configuration for MySQL database, don't use PostgreSQL at the same time | +| `ConnectionPool` | [`ConnectionPool`](#connectionpool) | Pooled connection settings for all types of database connections | +| `DBReconnectConfig` | [`DBReconnectConfig`](#dbreconnectconfig) | DBReconnectConfig are configuration options for database retries and reconnections | +| `TableName` | `string` | TableName is the name of the table to use for memoization cache entries. Defaults to "cache_entries" if not set. | +| `SkipMigration` | `bool` | SkipMigration skips automatic database migration on startup. | ## ArtifactDriver From ae634ec40752b31ac99acfbaaa6096219b0913fb Mon Sep 17 00:00:00 2001 From: droctothorpe Date: Mon, 27 Apr 2026 09:26:00 -0400 Subject: [PATCH 07/15] Revert e2e test flakiness fixes Signed-off-by: droctothorpe --- test/e2e/argo_server_test.go | 18 +++---- test/e2e/retry_test.go | 50 +++---------------- .../workflow-template-with-containerset.yaml | 7 ++- 3 files changed, 19 insertions(+), 56 deletions(-) diff --git a/test/e2e/argo_server_test.go b/test/e2e/argo_server_test.go index b54f7803ab72..21e67dc2ebcd 100644 --- a/test/e2e/argo_server_test.go +++ b/test/e2e/argo_server_test.go @@ -2021,16 +2021,12 @@ metadata: labels: foo: 1 spec: - arguments: - parameters: - - name: archived-message - value: "hello \\u0001F44D" entrypoint: run-archie templates: - name: run-archie container: - image: quay.io/argoproj/argoexec:latest - args: [version]`). + image: argoproj/argosay:v2 + args: [echo, "hello \\u0001F44D"]`). When(). SubmitWorkflow(). WaitForWorkflow(fixtures.ToBeArchived). @@ -2052,8 +2048,9 @@ spec: templates: - name: run-jughead container: - image: quay.io/argoproj/argoexec:latest - args: [not-a-real-subcommand]`). + image: argoproj/argosay:v2 + command: [sh, -c] + args: ["echo intentional failure; exit 1"]`). When(). SubmitWorkflow(). WaitForWorkflow(fixtures.ToBeArchived). @@ -2073,8 +2070,7 @@ spec: templates: - name: run-betty container: - image: quay.io/argoproj/argoexec:latest - args: [version]`). + image: argoproj/argosay:v2`). When(). SubmitWorkflow(). WaitForWorkflow(fixtures.ToBeArchived) @@ -2183,7 +2179,7 @@ spec: Path("$.metadata.name"). NotNull() j. - Path("$.spec.arguments.parameters[0].value"). + Path("$.spec.templates[0].container.args[1]"). // make sure unicode escape wasn't mangled IsEqual("hello \\u0001F44D") j. diff --git a/test/e2e/retry_test.go b/test/e2e/retry_test.go index 1bc31affc435..67b29f5e7c15 100644 --- a/test/e2e/retry_test.go +++ b/test/e2e/retry_test.go @@ -143,6 +143,12 @@ spec: assert.Equal(t, 1, count) assert.Contains(t, logs, "hi") }). + // Command err. No retry logic is entered. + ExpectContainerLogs("c2", func(t *testing.T, logs string) { + count := strings.Count(logs, "capturing logs") + assert.Equal(t, 0, count) + assert.Contains(t, logs, "executable file not found in $PATH") + }). // Retry when err. ExpectContainerLogs("c3", func(t *testing.T, logs string) { count := strings.Count(logs, "capturing logs") @@ -152,50 +158,6 @@ spec: }) } -func (s *RetryTestSuite) TestWorkflowTemplateWithCommandStartErrorInContainerSet() { - s.Given(). - WorkflowTemplate(` -apiVersion: argoproj.io/v1alpha1 -kind: WorkflowTemplate -metadata: - name: containerset-with-command-start-error -spec: - entrypoint: test - templates: - - name: test - containerSet: - retryStrategy: - retries: "2" - containers: - - name: c2 - image: argoproj/argosay:v2 - command: - - invalid - - command -`). - Workflow(` -metadata: - name: workflow-template-containerset-command-error -spec: - workflowTemplateRef: - name: containerset-with-command-start-error -`). - When(). - CreateWorkflowTemplates(). - SubmitWorkflow(). - WaitForWorkflow(fixtures.ToBeFailed). - Then(). - ExpectWorkflow(func(t *testing.T, metadata *metav1.ObjectMeta, status *v1alpha1.WorkflowStatus) { - assert.Equal(t, v1alpha1.WorkflowFailed, status.Phase) - }). - // Command start errors bypass retry execution. - ExpectContainerLogs("c2", func(t *testing.T, logs string) { - count := strings.Count(logs, "capturing logs") - assert.Equal(t, 0, count) - assert.Contains(t, logs, "executable file not found in $PATH") - }) -} - func (s *RetryTestSuite) TestRetryNodeAntiAffinity() { s.Given(). Workflow(` diff --git a/test/e2e/testdata/workflow-template-with-containerset.yaml b/test/e2e/testdata/workflow-template-with-containerset.yaml index d9bae250eebc..204224061a0f 100644 --- a/test/e2e/testdata/workflow-template-with-containerset.yaml +++ b/test/e2e/testdata/workflow-template-with-containerset.yaml @@ -16,7 +16,12 @@ spec: - name: c1 image: argoproj/argosay:v2 args: ["echo", "hi"] + - name: c2 + image: argoproj/argosay:v2 + command: + - invalid + - command - name: c3 image: argoproj/argosay:v2 command: [ sh, -c ] - args: [ "echo intentional failure; exit 1" ] + args: [ "echo intentional failure; exit 1" ] \ No newline at end of file From 2514f1ca2eabbf7be50c1ea5997b9cbfd48cc577 Mon Sep 17 00:00:00 2001 From: droctothorpe Date: Mon, 27 Apr 2026 14:25:53 -0400 Subject: [PATCH 08/15] Switch to ORM Signed-off-by: droctothorpe --- util/memo/db/config_test.go | 3 +- util/memo/db/queries.go | 96 +++++++++++------------- util/memo/db/queries_test.go | 24 +++--- workflow/controller/cache/sqldb_cache.go | 2 +- workflow/controller/controller.go | 2 +- 5 files changed, 59 insertions(+), 68 deletions(-) diff --git a/util/memo/db/config_test.go b/util/memo/db/config_test.go index 84636f0c1922..ad443e63ecac 100644 --- a/util/memo/db/config_test.go +++ b/util/memo/db/config_test.go @@ -8,7 +8,6 @@ import ( "github.com/argoproj/argo-workflows/v4/config" memodb "github.com/argoproj/argo-workflows/v4/util/memo/db" - "github.com/argoproj/argo-workflows/v4/util/sqldb" ) func TestTableNameDefaultsAndOverrides(t *testing.T) { @@ -18,7 +17,7 @@ func TestTableNameDefaultsAndOverrides(t *testing.T) { } func TestNewQueriesRejectsInvalidTableName(t *testing.T) { - queries, err := memodb.NewQueries("invalid-table-name", sqldb.Postgres) + queries, err := memodb.NewQueries("invalid-table-name") require.Error(t, err) assert.Nil(t, queries) } diff --git a/util/memo/db/queries.go b/util/memo/db/queries.go index 1f247d3954bb..dd61059dd091 100644 --- a/util/memo/db/queries.go +++ b/util/memo/db/queries.go @@ -12,6 +12,13 @@ import ( "github.com/argoproj/argo-workflows/v4/util/sqldb" ) +const ( + colNamespace = "namespace" + colCacheName = "cache_name" + colCacheKey = "cache_key" + colExpiresAt = "expires_at" +) + // CacheRecord is the database row for a single memoization cache entry. type CacheRecord struct { Namespace string `db:"namespace"` @@ -26,46 +33,33 @@ type CacheRecord struct { // Queries provides database operations for the memoization cache table. type Queries struct { tableName string - dbType sqldb.DBType } -func NewQueries(tableName string, dbType sqldb.DBType) (*Queries, error) { +func NewQueries(tableName string) (*Queries, error) { if err := validateTableName(tableName); err != nil { return nil, err } - return &Queries{tableName: tableName, dbType: dbType}, nil + return &Queries{tableName: tableName}, nil } // Load retrieves the outputs for the given cache key. // Returns nil when the entry does not exist or has expired. func (q *Queries) Load(ctx context.Context, sp *sqldb.SessionProxy, namespace, cacheName, cacheKey string) (*CacheRecord, error) { var r CacheRecord - var found bool now := time.Now().UTC() err := sp.With(ctx, func(sess db.Session) error { - // Use raw SQL to avoid upper/db ORM timestamp scanning issues with - // "timestamp without timezone" columns (the ORM may not populate time.Time fields). - var query string - switch q.dbType { - case sqldb.Postgres: - query = fmt.Sprintf(`SELECT namespace, cache_name, cache_key, node_id, outputs, created_at, expires_at FROM %s WHERE namespace = $1 AND cache_name = $2 AND cache_key = $3 AND expires_at > $4`, q.tableName) - case sqldb.MySQL: - query = fmt.Sprintf("SELECT namespace, cache_name, cache_key, node_id, outputs, created_at, expires_at FROM %s WHERE namespace = ? AND cache_name = ? AND cache_key = ? AND expires_at > ?", q.tableName) - default: - return fmt.Errorf("unsupported database type: %s", q.dbType) - } - rows, err := sess.SQL().QueryContext(ctx, query, namespace, cacheName, cacheKey, now) - if err != nil { - return err - } - defer rows.Close() - if !rows.Next() { - return rows.Err() - } - found = true - return rows.Scan(&r.Namespace, &r.CacheName, &r.CacheKey, &r.NodeID, &r.Outputs, &r.CreatedAt, &r.ExpiresAt) + return sess.SQL(). + SelectFrom(q.tableName). + Where(db.Cond{colNamespace: namespace}). + And(db.Cond{colCacheName: cacheName}). + And(db.Cond{colCacheKey: cacheKey}). + And(db.Cond{colExpiresAt + " >": now}). + One(&r) }) - if err != nil || !found { + if err == db.ErrNoMoreRows { + return nil, nil + } + if err != nil { return nil, err } return &r, nil @@ -77,16 +71,10 @@ func (q *Queries) Prune(ctx context.Context, sp *sqldb.SessionProxy) (int64, err now := time.Now().UTC() var n int64 err := sp.With(ctx, func(sess db.Session) error { - var query string - switch q.dbType { - case sqldb.Postgres: - query = fmt.Sprintf(`DELETE FROM %s WHERE expires_at < $1`, q.tableName) - case sqldb.MySQL: - query = fmt.Sprintf("DELETE FROM %s WHERE expires_at < ?", q.tableName) - default: - return fmt.Errorf("unsupported database type: %s", q.dbType) - } - result, err := sess.SQL().ExecContext(ctx, query, now) + result, err := sess.SQL(). + DeleteFrom(q.tableName). + Where(db.Cond{colExpiresAt + " <": now}). + Exec() if err != nil { return err } @@ -101,25 +89,29 @@ func (q *Queries) Save(ctx context.Context, sp *sqldb.SessionProxy, namespace, c if err != nil { return fmt.Errorf("unable to marshal memoization outputs: %w", err) } - outputsStr := string(outputsJSON) now := time.Now().UTC() expiresAt := now.Add(time.Duration(maxAgeSeconds) * time.Second) return sp.With(ctx, func(sess db.Session) error { - switch q.dbType { - case sqldb.Postgres: - _, err := sess.SQL().ExecContext(ctx, - fmt.Sprintf(`INSERT INTO %s (namespace, cache_name, cache_key, node_id, outputs, created_at, expires_at) -VALUES ($1, $2, $3, $4, $5, $6, $7) -ON CONFLICT (namespace, cache_name, cache_key) DO UPDATE SET node_id = $4, outputs = $5, created_at = $6, expires_at = $7`, q.tableName), - namespace, cacheName, cacheKey, nodeID, outputsStr, now, expiresAt) - return err - case sqldb.MySQL: - _, err := sess.SQL().ExecContext(ctx, - fmt.Sprintf("INSERT INTO %s (namespace, cache_name, cache_key, node_id, outputs, created_at, expires_at) VALUES (?, ?, ?, ?, ?, ?, ?) ON DUPLICATE KEY UPDATE node_id = ?, outputs = ?, created_at = ?, expires_at = ?", q.tableName), - namespace, cacheName, cacheKey, nodeID, outputsStr, now, expiresAt, nodeID, outputsStr, now, expiresAt) + return sess.TxContext(ctx, func(tx db.Session) error { + _, err := tx.SQL(). + DeleteFrom(q.tableName). + Where(db.Cond{colNamespace: namespace}). + And(db.Cond{colCacheName: cacheName}). + And(db.Cond{colCacheKey: cacheKey}). + Exec() + if err != nil { + return err + } + _, err = tx.Collection(q.tableName).Insert(&CacheRecord{ + Namespace: namespace, + CacheName: cacheName, + CacheKey: cacheKey, + NodeID: nodeID, + Outputs: string(outputsJSON), + CreatedAt: now, + ExpiresAt: expiresAt, + }) return err - default: - return fmt.Errorf("unsupported database type: %s", q.dbType) - } + }, nil) }) } diff --git a/util/memo/db/queries_test.go b/util/memo/db/queries_test.go index 97746e7a1098..ffd7fe1dfccb 100644 --- a/util/memo/db/queries_test.go +++ b/util/memo/db/queries_test.go @@ -32,9 +32,9 @@ const ( var testTableName = memodb.TableName(nil) -func newQueries(t *testing.T, dbType sqldb.DBType) *memodb.Queries { +func newQueries(t *testing.T) *memodb.Queries { t.Helper() - q, err := memodb.NewQueries(testTableName, dbType) + q, err := memodb.NewQueries(testTableName) require.NoError(t, err) return q } @@ -155,7 +155,7 @@ func sampleOutputs(message string) *wfv1.Outputs { func TestQueriesSaveAndLoad(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupPostgres(ctx, t) - q := newQueries(t, sqldb.Postgres) + q := newQueries(t) // Load returns nil when no entry exists. rec, err := q.Load(ctx, sp, testNamespace, testCacheName, "key1") @@ -174,7 +174,7 @@ func TestQueriesSaveAndLoad(t *testing.T) { func TestQueriesNamespaceIsolation(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupPostgres(ctx, t) - q := newQueries(t, sqldb.Postgres) + q := newQueries(t) // Save the same cache_name+cache key in two different namespaces. require.NoError(t, q.Save(ctx, sp, "ns-a", testCacheName, "shared-key", "node-a", sampleOutputs("from-a"), 2592000)) @@ -197,7 +197,7 @@ func TestQueriesNamespaceIsolation(t *testing.T) { func TestQueriesSaveReplaces(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupPostgres(ctx, t) - q := newQueries(t, sqldb.Postgres) + q := newQueries(t) require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "key3", "node-old", sampleOutputs("old"), 2592000)) require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "key3", "node-new", sampleOutputs("new"), 2592000)) @@ -212,7 +212,7 @@ func TestQueriesSaveReplaces(t *testing.T) { func TestQueriesLoadSkipsExpiredEntries(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupPostgres(ctx, t) - q := newQueries(t, sqldb.Postgres) + q := newQueries(t) require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "expired-key", "node-old", sampleOutputs("old"), 2592000)) @@ -228,7 +228,7 @@ func TestQueriesLoadSkipsExpiredEntries(t *testing.T) { func TestQueriesPruneRemovesOldEntries(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupPostgres(ctx, t) - q := newQueries(t, sqldb.Postgres) + q := newQueries(t) // Save an entry with a very short max_age (1 second) and one with 30 days. require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "old-key", "node-old", sampleOutputs("old"), 1)) @@ -256,7 +256,7 @@ func TestQueriesPruneRemovesOldEntries(t *testing.T) { func TestQueriesPruneKeepsRecentEntries(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupPostgres(ctx, t) - q := newQueries(t, sqldb.Postgres) + q := newQueries(t) require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "recent", "node-1", sampleOutputs("v1"), 2592000)) @@ -271,7 +271,7 @@ func TestQueriesPruneKeepsRecentEntries(t *testing.T) { func TestMySQLSaveAndLoad(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupMySQL(ctx, t) - q := newQueries(t, sqldb.MySQL) + q := newQueries(t) rec, err := q.Load(ctx, sp, testNamespace, testCacheName, "key1") require.NoError(t, err) @@ -288,7 +288,7 @@ func TestMySQLSaveAndLoad(t *testing.T) { func TestMySQLSaveReplaces(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupMySQL(ctx, t) - q := newQueries(t, sqldb.MySQL) + q := newQueries(t) require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "key3", "node-old", sampleOutputs("old"), 2592000)) require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "key3", "node-new", sampleOutputs("new"), 2592000)) @@ -303,7 +303,7 @@ func TestMySQLSaveReplaces(t *testing.T) { func TestMySQLLoadSkipsExpiredEntries(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupMySQL(ctx, t) - q := newQueries(t, sqldb.MySQL) + q := newQueries(t) require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "expired-key", "node-old", sampleOutputs("old"), 2592000)) @@ -319,7 +319,7 @@ func TestMySQLLoadSkipsExpiredEntries(t *testing.T) { func TestMySQLPruneRemovesOldEntries(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupMySQL(ctx, t) - q := newQueries(t, sqldb.MySQL) + q := newQueries(t) require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "old-key", "node-old", sampleOutputs("old"), 1)) require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "new-key", "node-new", sampleOutputs("new"), 2592000)) diff --git a/workflow/controller/cache/sqldb_cache.go b/workflow/controller/cache/sqldb_cache.go index 31bcec24ab2c..e38453d938e8 100644 --- a/workflow/controller/cache/sqldb_cache.go +++ b/workflow/controller/cache/sqldb_cache.go @@ -20,7 +20,7 @@ type sqlDBCache struct { } func newSQLDBCache(namespace, name string, sp *sqldb.SessionProxy, tableName string) (MemoizationCache, error) { - queries, err := memodb.NewQueries(tableName, sp.DBType()) + queries, err := memodb.NewQueries(tableName) if err != nil { return nil, err } diff --git a/workflow/controller/controller.go b/workflow/controller/controller.go index 8eb7ed891861..5d58a04ef4ba 100644 --- a/workflow/controller/controller.go +++ b/workflow/controller/controller.go @@ -758,7 +758,7 @@ func (wfc *WorkflowController) memoizationCacheGarbageCollector(ctx context.Cont continue } - queries, err := memodb.NewQueries(memodb.TableName(memoCfg), sp.DBType()) + queries, err := memodb.NewQueries(memodb.TableName(memoCfg)) if err != nil { logger.WithError(err).Error(ctx, "Failed to initialize memoization cache queries") continue From ceed9ae9ce72ed707cffb8d1efe60d61182c803c Mon Sep 17 00:00:00 2001 From: droctothorpe Date: Mon, 27 Apr 2026 18:01:02 -0400 Subject: [PATCH 09/15] Align with workflow archive Signed-off-by: droctothorpe --- util/memo/db/config_test.go | 2 +- util/memo/db/queries.go | 67 +++++++++--- util/memo/db/queries_test.go | 100 +++++++++--------- workflow/controller/cache/cache.go | 64 +++-------- workflow/controller/cache/sqldb_cache.go | 27 ++--- workflow/controller/cache/sqldb_cache_test.go | 24 +++-- workflow/controller/config.go | 91 +++++----------- workflow/controller/config_test.go | 45 +------- workflow/controller/controller.go | 29 ++--- workflow/controller/operator_test.go | 6 +- 10 files changed, 175 insertions(+), 280 deletions(-) diff --git a/util/memo/db/config_test.go b/util/memo/db/config_test.go index ad443e63ecac..3dd07a638dd1 100644 --- a/util/memo/db/config_test.go +++ b/util/memo/db/config_test.go @@ -17,7 +17,7 @@ func TestTableNameDefaultsAndOverrides(t *testing.T) { } func TestNewQueriesRejectsInvalidTableName(t *testing.T) { - queries, err := memodb.NewQueries("invalid-table-name") + queries, err := memodb.NewQueries("invalid-table-name", nil) require.Error(t, err) assert.Nil(t, queries) } diff --git a/util/memo/db/queries.go b/util/memo/db/queries.go index dd61059dd091..9131a6136caf 100644 --- a/util/memo/db/queries.go +++ b/util/memo/db/queries.go @@ -3,6 +3,7 @@ package db import ( "context" "encoding/json" + "errors" "fmt" "time" @@ -30,24 +31,60 @@ type CacheRecord struct { ExpiresAt time.Time `db:"expires_at"` } -// Queries provides database operations for the memoization cache table. -type Queries struct { - tableName string +// MemoizationDB is the interface for database-backed memoization cache operations. +type MemoizationDB interface { + Load(ctx context.Context, namespace, cacheName, cacheKey string) (*CacheRecord, error) + Save(ctx context.Context, namespace, cacheName, cacheKey, nodeID string, outputs *wfv1.Outputs, maxAgeSeconds int64) error + Prune(ctx context.Context) (int64, error) + IsEnabled() bool } -func NewQueries(tableName string) (*Queries, error) { +// NullMemoizationDB is a no-op implementation used when database memoization is disabled. +var NullMemoizationDB MemoizationDB = &nullMemoizationDB{} + +type nullMemoizationDB struct{} + +func (n *nullMemoizationDB) Load(context.Context, string, string, string) (*CacheRecord, error) { + return nil, nil +} + +func (n *nullMemoizationDB) Save(context.Context, string, string, string, string, *wfv1.Outputs, int64) error { + return nil +} + +func (n *nullMemoizationDB) Prune(context.Context) (int64, error) { + return 0, nil +} + +func (n *nullMemoizationDB) IsEnabled() bool { + return false +} + +var _ MemoizationDB = &queries{} + +// queries provides database operations for the memoization cache table. +type queries struct { + tableName string + sessionProxy *sqldb.SessionProxy +} + +func NewQueries(tableName string, sessionProxy *sqldb.SessionProxy) (MemoizationDB, error) { if err := validateTableName(tableName); err != nil { return nil, err } - return &Queries{tableName: tableName}, nil + return &queries{tableName: tableName, sessionProxy: sessionProxy}, nil +} + +func (q *queries) IsEnabled() bool { + return true } // Load retrieves the outputs for the given cache key. // Returns nil when the entry does not exist or has expired. -func (q *Queries) Load(ctx context.Context, sp *sqldb.SessionProxy, namespace, cacheName, cacheKey string) (*CacheRecord, error) { +func (q *queries) Load(ctx context.Context, namespace, cacheName, cacheKey string) (*CacheRecord, error) { var r CacheRecord now := time.Now().UTC() - err := sp.With(ctx, func(sess db.Session) error { + err := q.sessionProxy.With(ctx, func(sess db.Session) error { return sess.SQL(). SelectFrom(q.tableName). Where(db.Cond{colNamespace: namespace}). @@ -56,7 +93,7 @@ func (q *Queries) Load(ctx context.Context, sp *sqldb.SessionProxy, namespace, c And(db.Cond{colExpiresAt + " >": now}). One(&r) }) - if err == db.ErrNoMoreRows { + if errors.Is(err, db.ErrNoMoreRows) { return nil, nil } if err != nil { @@ -67,10 +104,10 @@ func (q *Queries) Load(ctx context.Context, sp *sqldb.SessionProxy, namespace, c // Prune deletes cache entries whose expires_at has elapsed. It is called // periodically by the controller to bound the size of the configured memoization cache table. -func (q *Queries) Prune(ctx context.Context, sp *sqldb.SessionProxy) (int64, error) { +func (q *queries) Prune(ctx context.Context) (int64, error) { now := time.Now().UTC() var n int64 - err := sp.With(ctx, func(sess db.Session) error { + err := q.sessionProxy.With(ctx, func(sess db.Session) error { result, err := sess.SQL(). DeleteFrom(q.tableName). Where(db.Cond{colExpiresAt + " <": now}). @@ -84,15 +121,15 @@ func (q *Queries) Prune(ctx context.Context, sp *sqldb.SessionProxy) (int64, err return n, err } -func (q *Queries) Save(ctx context.Context, sp *sqldb.SessionProxy, namespace, cacheName, cacheKey, nodeID string, outputs *wfv1.Outputs, maxAgeSeconds int64) error { +func (q *queries) Save(ctx context.Context, namespace, cacheName, cacheKey, nodeID string, outputs *wfv1.Outputs, maxAgeSeconds int64) error { outputsJSON, err := json.Marshal(outputs) if err != nil { return fmt.Errorf("unable to marshal memoization outputs: %w", err) } now := time.Now().UTC() expiresAt := now.Add(time.Duration(maxAgeSeconds) * time.Second) - return sp.With(ctx, func(sess db.Session) error { - return sess.TxContext(ctx, func(tx db.Session) error { + return q.sessionProxy.TxWith(ctx, func(sp *sqldb.SessionProxy) error { + return sp.With(ctx, func(tx db.Session) error { _, err := tx.SQL(). DeleteFrom(q.tableName). Where(db.Cond{colNamespace: namespace}). @@ -112,6 +149,6 @@ func (q *Queries) Save(ctx context.Context, sp *sqldb.SessionProxy, namespace, c ExpiresAt: expiresAt, }) return err - }, nil) - }) + }) + }, nil) } diff --git a/util/memo/db/queries_test.go b/util/memo/db/queries_test.go index ffd7fe1dfccb..496f90ef24b0 100644 --- a/util/memo/db/queries_test.go +++ b/util/memo/db/queries_test.go @@ -32,13 +32,6 @@ const ( var testTableName = memodb.TableName(nil) -func newQueries(t *testing.T) *memodb.Queries { - t.Helper() - q, err := memodb.NewQueries(testTableName) - require.NoError(t, err) - return q -} - // setupPostgres starts a throwaway Postgres container and returns a migrated SessionProxy. func setupPostgres(ctx context.Context, t *testing.T) *sqldb.SessionProxy { t.Helper() @@ -144,6 +137,13 @@ func setupMySQL(ctx context.Context, t *testing.T) *sqldb.SessionProxy { return sp } +func newQueries(t *testing.T, sp *sqldb.SessionProxy) memodb.MemoizationDB { + t.Helper() + q, err := memodb.NewQueries(testTableName, sp) + require.NoError(t, err) + return q +} + func sampleOutputs(message string) *wfv1.Outputs { return &wfv1.Outputs{ Parameters: []wfv1.Parameter{ @@ -155,16 +155,16 @@ func sampleOutputs(message string) *wfv1.Outputs { func TestQueriesSaveAndLoad(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupPostgres(ctx, t) - q := newQueries(t) + q := newQueries(t, sp) // Load returns nil when no entry exists. - rec, err := q.Load(ctx, sp, testNamespace, testCacheName, "key1") + rec, err := q.Load(ctx, testNamespace, testCacheName, "key1") require.NoError(t, err) assert.Nil(t, rec, "expected nil for missing key") // Save an entry and load it back. - require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "key1", "node-abc", sampleOutputs("hello"), 2592000)) - rec, err = q.Load(ctx, sp, testNamespace, testCacheName, "key1") + require.NoError(t, q.Save(ctx, testNamespace, testCacheName, "key1", "node-abc", sampleOutputs("hello"), 2592000)) + rec, err = q.Load(ctx, testNamespace, testCacheName, "key1") require.NoError(t, err) require.NotNil(t, rec) assert.Equal(t, "node-abc", rec.NodeID) @@ -174,20 +174,20 @@ func TestQueriesSaveAndLoad(t *testing.T) { func TestQueriesNamespaceIsolation(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupPostgres(ctx, t) - q := newQueries(t) + q := newQueries(t, sp) // Save the same cache_name+cache key in two different namespaces. - require.NoError(t, q.Save(ctx, sp, "ns-a", testCacheName, "shared-key", "node-a", sampleOutputs("from-a"), 2592000)) - require.NoError(t, q.Save(ctx, sp, "ns-b", testCacheName, "shared-key", "node-b", sampleOutputs("from-b"), 2592000)) + require.NoError(t, q.Save(ctx, "ns-a", testCacheName, "shared-key", "node-a", sampleOutputs("from-a"), 2592000)) + require.NoError(t, q.Save(ctx, "ns-b", testCacheName, "shared-key", "node-b", sampleOutputs("from-b"), 2592000)) // Each namespace should see its own entry. - recA, err := q.Load(ctx, sp, "ns-a", testCacheName, "shared-key") + recA, err := q.Load(ctx, "ns-a", testCacheName, "shared-key") require.NoError(t, err) require.NotNil(t, recA) assert.Equal(t, "node-a", recA.NodeID) assert.Contains(t, recA.Outputs, "from-a") - recB, err := q.Load(ctx, sp, "ns-b", testCacheName, "shared-key") + recB, err := q.Load(ctx, "ns-b", testCacheName, "shared-key") require.NoError(t, err) require.NotNil(t, recB) assert.Equal(t, "node-b", recB.NodeID) @@ -197,12 +197,12 @@ func TestQueriesNamespaceIsolation(t *testing.T) { func TestQueriesSaveReplaces(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupPostgres(ctx, t) - q := newQueries(t) + q := newQueries(t, sp) - require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "key3", "node-old", sampleOutputs("old"), 2592000)) - require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "key3", "node-new", sampleOutputs("new"), 2592000)) + require.NoError(t, q.Save(ctx, testNamespace, testCacheName, "key3", "node-old", sampleOutputs("old"), 2592000)) + require.NoError(t, q.Save(ctx, testNamespace, testCacheName, "key3", "node-new", sampleOutputs("new"), 2592000)) - rec, err := q.Load(ctx, sp, testNamespace, testCacheName, "key3") + rec, err := q.Load(ctx, testNamespace, testCacheName, "key3") require.NoError(t, err) require.NotNil(t, rec) assert.Equal(t, "node-new", rec.NodeID) @@ -212,15 +212,15 @@ func TestQueriesSaveReplaces(t *testing.T) { func TestQueriesLoadSkipsExpiredEntries(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupPostgres(ctx, t) - q := newQueries(t) + q := newQueries(t, sp) - require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "expired-key", "node-old", sampleOutputs("old"), 2592000)) + require.NoError(t, q.Save(ctx, testNamespace, testCacheName, "expired-key", "node-old", sampleOutputs("old"), 2592000)) _, err := sp.Session().SQL(). ExecContext(ctx, `UPDATE `+testTableName+` SET expires_at = $1 WHERE cache_key = $2`, time.Now().Add(-10*time.Second), "expired-key") require.NoError(t, err) - rec, err := q.Load(ctx, sp, testNamespace, testCacheName, "expired-key") + rec, err := q.Load(ctx, testNamespace, testCacheName, "expired-key") require.NoError(t, err) assert.Nil(t, rec, "expired entries should load as a cache miss") } @@ -228,11 +228,11 @@ func TestQueriesLoadSkipsExpiredEntries(t *testing.T) { func TestQueriesPruneRemovesOldEntries(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupPostgres(ctx, t) - q := newQueries(t) + q := newQueries(t, sp) // Save an entry with a very short max_age (1 second) and one with 30 days. - require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "old-key", "node-old", sampleOutputs("old"), 1)) - require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "new-key", "node-new", sampleOutputs("new"), 2592000)) + require.NoError(t, q.Save(ctx, testNamespace, testCacheName, "old-key", "node-old", sampleOutputs("old"), 1)) + require.NoError(t, q.Save(ctx, testNamespace, testCacheName, "new-key", "node-new", sampleOutputs("new"), 2592000)) // Backdate old-key's expires_at so it is in the past. _, err := sp.Session().SQL(). @@ -240,15 +240,15 @@ func TestQueriesPruneRemovesOldEntries(t *testing.T) { require.NoError(t, err) // Prune — old-key should be deleted (expires_at < now), new-key should survive. - n, err := q.Prune(ctx, sp) + n, err := q.Prune(ctx) require.NoError(t, err) assert.EqualValues(t, 1, n, "expected exactly one row pruned") - old, err := q.Load(ctx, sp, testNamespace, testCacheName, "old-key") + old, err := q.Load(ctx, testNamespace, testCacheName, "old-key") require.NoError(t, err) assert.Nil(t, old, "old entry should have been pruned") - fresh, err := q.Load(ctx, sp, testNamespace, testCacheName, "new-key") + fresh, err := q.Load(ctx, testNamespace, testCacheName, "new-key") require.NoError(t, err) assert.NotNil(t, fresh, "new entry should still exist") } @@ -256,29 +256,29 @@ func TestQueriesPruneRemovesOldEntries(t *testing.T) { func TestQueriesPruneKeepsRecentEntries(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupPostgres(ctx, t) - q := newQueries(t) + q := newQueries(t, sp) - require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "recent", "node-1", sampleOutputs("v1"), 2592000)) + require.NoError(t, q.Save(ctx, testNamespace, testCacheName, "recent", "node-1", sampleOutputs("v1"), 2592000)) // All entries are recent — nothing should be pruned. - n, err := q.Prune(ctx, sp) + n, err := q.Prune(ctx) require.NoError(t, err) assert.EqualValues(t, 0, n, "expected no rows pruned when all entries are fresh") } -// MySQL test variants — verify longtext and ON DUPLICATE KEY UPDATE. +// MySQL test variants — verify longtext and upsert behavior. func TestMySQLSaveAndLoad(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupMySQL(ctx, t) - q := newQueries(t) + q := newQueries(t, sp) - rec, err := q.Load(ctx, sp, testNamespace, testCacheName, "key1") + rec, err := q.Load(ctx, testNamespace, testCacheName, "key1") require.NoError(t, err) assert.Nil(t, rec, "expected nil for missing key") - require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "key1", "node-abc", sampleOutputs("hello"), 2592000)) - rec, err = q.Load(ctx, sp, testNamespace, testCacheName, "key1") + require.NoError(t, q.Save(ctx, testNamespace, testCacheName, "key1", "node-abc", sampleOutputs("hello"), 2592000)) + rec, err = q.Load(ctx, testNamespace, testCacheName, "key1") require.NoError(t, err) require.NotNil(t, rec) assert.Equal(t, "node-abc", rec.NodeID) @@ -288,12 +288,12 @@ func TestMySQLSaveAndLoad(t *testing.T) { func TestMySQLSaveReplaces(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupMySQL(ctx, t) - q := newQueries(t) + q := newQueries(t, sp) - require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "key3", "node-old", sampleOutputs("old"), 2592000)) - require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "key3", "node-new", sampleOutputs("new"), 2592000)) + require.NoError(t, q.Save(ctx, testNamespace, testCacheName, "key3", "node-old", sampleOutputs("old"), 2592000)) + require.NoError(t, q.Save(ctx, testNamespace, testCacheName, "key3", "node-new", sampleOutputs("new"), 2592000)) - rec, err := q.Load(ctx, sp, testNamespace, testCacheName, "key3") + rec, err := q.Load(ctx, testNamespace, testCacheName, "key3") require.NoError(t, err) require.NotNil(t, rec) assert.Equal(t, "node-new", rec.NodeID) @@ -303,15 +303,15 @@ func TestMySQLSaveReplaces(t *testing.T) { func TestMySQLLoadSkipsExpiredEntries(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupMySQL(ctx, t) - q := newQueries(t) + q := newQueries(t, sp) - require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "expired-key", "node-old", sampleOutputs("old"), 2592000)) + require.NoError(t, q.Save(ctx, testNamespace, testCacheName, "expired-key", "node-old", sampleOutputs("old"), 2592000)) _, err := sp.Session().SQL(). ExecContext(ctx, "UPDATE "+testTableName+" SET expires_at = ? WHERE cache_key = ?", time.Now().Add(-10*time.Second), "expired-key") require.NoError(t, err) - rec, err := q.Load(ctx, sp, testNamespace, testCacheName, "expired-key") + rec, err := q.Load(ctx, testNamespace, testCacheName, "expired-key") require.NoError(t, err) assert.Nil(t, rec, "expired entries should load as a cache miss") } @@ -319,25 +319,25 @@ func TestMySQLLoadSkipsExpiredEntries(t *testing.T) { func TestMySQLPruneRemovesOldEntries(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupMySQL(ctx, t) - q := newQueries(t) + q := newQueries(t, sp) - require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "old-key", "node-old", sampleOutputs("old"), 1)) - require.NoError(t, q.Save(ctx, sp, testNamespace, testCacheName, "new-key", "node-new", sampleOutputs("new"), 2592000)) + require.NoError(t, q.Save(ctx, testNamespace, testCacheName, "old-key", "node-old", sampleOutputs("old"), 1)) + require.NoError(t, q.Save(ctx, testNamespace, testCacheName, "new-key", "node-new", sampleOutputs("new"), 2592000)) // Backdate old-key's expires_at so it is in the past. _, err := sp.Session().SQL(). ExecContext(ctx, "UPDATE "+testTableName+" SET expires_at = ? WHERE cache_key = ?", time.Now().Add(-10*time.Second), "old-key") require.NoError(t, err) - n, err := q.Prune(ctx, sp) + n, err := q.Prune(ctx) require.NoError(t, err) assert.EqualValues(t, 1, n, "expected exactly one row pruned") - old, err := q.Load(ctx, sp, testNamespace, testCacheName, "old-key") + old, err := q.Load(ctx, testNamespace, testCacheName, "old-key") require.NoError(t, err) assert.Nil(t, old, "old entry should have been pruned") - fresh, err := q.Load(ctx, sp, testNamespace, testCacheName, "new-key") + fresh, err := q.Load(ctx, testNamespace, testCacheName, "new-key") require.NoError(t, err) assert.NotNil(t, fresh, "new entry should still exist") } diff --git a/workflow/controller/cache/cache.go b/workflow/controller/cache/cache.go index 0358528871ca..2ed3d2e974d1 100644 --- a/workflow/controller/cache/cache.go +++ b/workflow/controller/cache/cache.go @@ -14,7 +14,7 @@ import ( wfv1 "github.com/argoproj/argo-workflows/v4/pkg/apis/workflow/v1alpha1" "github.com/argoproj/argo-workflows/v4/util/logging" - "github.com/argoproj/argo-workflows/v4/util/sqldb" + memodb "github.com/argoproj/argo-workflows/v4/util/memo/db" ) var cacheKeyRegex = regexp.MustCompile("^[a-zA-Z0-9][-a-zA-Z0-9]*$") @@ -99,27 +99,18 @@ func (e *Entry) GetOutputsWithMaxAge(maxAge time.Duration) (*wfv1.Outputs, bool) } type cacheFactory struct { - caches map[string]MemoizationCache - kubeclient kubernetes.Interface - lock sync.RWMutex - sessionProxy *sqldb.SessionProxy - tableName string - // sqlEnabled indicates that SQL caching was explicitly configured by the operator, even if - // the session proxy is currently unavailable. When true and sessionProxy is nil, GetCache - // returns nil rather than silently falling back to ConfigMap-based caching. - sqlEnabled bool + caches map[string]MemoizationCache + kubeclient kubernetes.Interface + lock sync.RWMutex + queries memodb.MemoizationDB } type Factory interface { GetCache(ctx context.Context, ct Type, namespace, name string) MemoizationCache - // SetSessionProxy configures the factory to use database-backed caching with the given - // session proxy and table name. Calling this clears any previously created cache instances + // SetQueries configures the factory to use database-backed caching with the given + // MemoizationDB. Calling this clears any previously created cache instances // so they are recreated against the SQL backend. - SetSessionProxy(sp *sqldb.SessionProxy, tableName string) - // ClearSessionProxy removes any SQL backend configuration. If sqlEnabled is true, GetCache - // returns nil rather than silently falling back to ConfigMap-based caching (e.g. after a - // transient DB failure). If sqlEnabled is false, GetCache falls back to ConfigMap caching. - ClearSessionProxy(sqlEnabled bool) + SetQueries(q memodb.MemoizationDB) } func NewCacheFactory(ki kubernetes.Interface) Factory { @@ -133,30 +124,16 @@ type Type string const ( // ConfigMapCache is a cache type identifier used as a key prefix in the cache map. - // When a database session proxy is configured, SQL-backed caching is used instead. + // When a MemoizationDB is configured, SQL-backed caching is used instead. ConfigMapCache Type = "ConfigMapCache" ) -// SetSessionProxy configures the factory to use a SQL backend, clearing any previously +// SetQueries configures the factory to use a SQL backend, clearing any previously // cached instances so they are recreated against the new backend. -func (cf *cacheFactory) SetSessionProxy(sp *sqldb.SessionProxy, tableName string) { +func (cf *cacheFactory) SetQueries(q memodb.MemoizationDB) { cf.lock.Lock() defer cf.lock.Unlock() - cf.sessionProxy = sp - cf.tableName = tableName - cf.sqlEnabled = true - cf.caches = make(map[string]MemoizationCache) -} - -// ClearSessionProxy removes the SQL backend. When sqlEnabled is true (DB configured but -// temporarily unavailable), GetCache returns nil. When false (no DB configured), GetCache -// falls back to ConfigMap-based caching. -func (cf *cacheFactory) ClearSessionProxy(sqlEnabled bool) { - cf.lock.Lock() - defer cf.lock.Unlock() - cf.sessionProxy = nil - cf.tableName = "" - cf.sqlEnabled = sqlEnabled + cf.queries = q cf.caches = make(map[string]MemoizationCache) } @@ -188,20 +165,9 @@ func (cf *cacheFactory) GetCache(ctx context.Context, ct Type, namespace, name s switch ct { case ConfigMapCache: var c MemoizationCache - switch { - case cf.sessionProxy != nil: - var err error - c, err = newSQLDBCache(namespace, name, cf.sessionProxy, cf.tableName) - if err != nil { - logger.WithFields(logging.Fields{"cacheName": name, "workflowNamespace": namespace}).WithError(err).Error(ctx, "Failed to create SQL memoization cache") - return nil - } - case cf.sqlEnabled: - // SQL was explicitly configured but is currently unavailable. Return nil so callers - // can skip caching rather than silently redirecting to a ConfigMap backend. - logger.WithFields(logging.Fields{"cacheName": name, "workflowNamespace": namespace}).Warn(ctx, "SQL memoization cache requested but SQL backend is unavailable; skipping cache") - return nil - default: + if cf.queries != nil && cf.queries.IsEnabled() { + c = newSQLDBCache(namespace, name, cf.queries) + } else { c = NewConfigMapCache(namespace, cf.kubeclient, name) } cf.caches[idx] = c diff --git a/workflow/controller/cache/sqldb_cache.go b/workflow/controller/cache/sqldb_cache.go index e38453d938e8..0b1c504fcc2e 100644 --- a/workflow/controller/cache/sqldb_cache.go +++ b/workflow/controller/cache/sqldb_cache.go @@ -9,34 +9,27 @@ import ( wfv1 "github.com/argoproj/argo-workflows/v4/pkg/apis/workflow/v1alpha1" memodb "github.com/argoproj/argo-workflows/v4/util/memo/db" - "github.com/argoproj/argo-workflows/v4/util/sqldb" ) type sqlDBCache struct { - namespace string - name string - sessionProxy *sqldb.SessionProxy - queries *memodb.Queries + namespace string + name string + queries memodb.MemoizationDB } -func newSQLDBCache(namespace, name string, sp *sqldb.SessionProxy, tableName string) (MemoizationCache, error) { - queries, err := memodb.NewQueries(tableName) - if err != nil { - return nil, err - } +func newSQLDBCache(namespace, name string, queries memodb.MemoizationDB) MemoizationCache { return &sqlDBCache{ - namespace: namespace, - name: name, - sessionProxy: sp, - queries: queries, - }, nil + namespace: namespace, + name: name, + queries: queries, + } } func (c *sqlDBCache) Load(ctx context.Context, key string) (*Entry, error) { if !cacheKeyRegex.MatchString(key) { return nil, fmt.Errorf("invalid cache key: %s", key) } - record, err := c.queries.Load(ctx, c.sessionProxy, c.namespace, c.name, key) + record, err := c.queries.Load(ctx, c.namespace, c.name, key) if err != nil { return nil, fmt.Errorf("memoization db load failed: %w", err) } @@ -63,5 +56,5 @@ func (c *sqlDBCache) Save(ctx context.Context, key string, nodeID string, value if err != nil { return err } - return c.queries.Save(ctx, c.sessionProxy, c.namespace, c.name, key, nodeID, value, maxAgeSeconds) + return c.queries.Save(ctx, c.namespace, c.name, key, nodeID, value, maxAgeSeconds) } diff --git a/workflow/controller/cache/sqldb_cache_test.go b/workflow/controller/cache/sqldb_cache_test.go index d3763b15ed37..0c19039c000c 100644 --- a/workflow/controller/cache/sqldb_cache_test.go +++ b/workflow/controller/cache/sqldb_cache_test.go @@ -86,12 +86,18 @@ func setupTestPostgres(ctx context.Context, t *testing.T) *sqldb.SessionProxy { return sp } +func newTestSQLDBCache(t *testing.T, sp *sqldb.SessionProxy) MemoizationCache { + t.Helper() + queries, err := memodb.NewQueries(testTableName, sp) + require.NoError(t, err) + return newSQLDBCache(testNamespace, testCacheName, queries) +} + func TestSQLDBCacheSaveAndLoad(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupTestPostgres(ctx, t) - c, err := newSQLDBCache(testNamespace, testCacheName, sp, testTableName) - require.NoError(t, err) + c := newTestSQLDBCache(t, sp) // Load returns nil for missing key. entry, err := c.Load(ctx, "key1") @@ -122,11 +128,10 @@ func TestSQLDBCacheInvalidKey(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupTestPostgres(ctx, t) - c, err := newSQLDBCache(testNamespace, testCacheName, sp, testTableName) - require.NoError(t, err) + c := newTestSQLDBCache(t, sp) // Keys with invalid characters should be rejected. - _, err = c.Load(ctx, "invalid key!") + _, err := c.Load(ctx, "invalid key!") require.Error(t, err) assert.Contains(t, err.Error(), "invalid cache key") @@ -139,8 +144,7 @@ func TestSQLDBCacheOutputsRoundTrip(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupTestPostgres(ctx, t) - c, err := newSQLDBCache(testNamespace, testCacheName, sp, testTableName) - require.NoError(t, err) + c := newTestSQLDBCache(t, sp) // Save complex outputs and verify they round-trip through JSON. outputs := &wfv1.Outputs{ @@ -165,8 +169,7 @@ func TestSQLDBCacheGetOutputsWithMaxAge(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupTestPostgres(ctx, t) - c, err := newSQLDBCache(testNamespace, testCacheName, sp, testTableName) - require.NoError(t, err) + c := newTestSQLDBCache(t, sp) outputs := &wfv1.Outputs{ Parameters: []wfv1.Parameter{ @@ -194,8 +197,7 @@ func TestSQLDBCacheUpsertRefreshesCreatedAt(t *testing.T) { ctx := logging.TestContext(t.Context()) sp := setupTestPostgres(ctx, t) - c, err := newSQLDBCache(testNamespace, testCacheName, sp, testTableName) - require.NoError(t, err) + c := newTestSQLDBCache(t, sp) outputs := &wfv1.Outputs{ Parameters: []wfv1.Parameter{ diff --git a/workflow/controller/config.go b/workflow/controller/config.go index 038724f49b5b..27b7220e607b 100644 --- a/workflow/controller/config.go +++ b/workflow/controller/config.go @@ -25,67 +25,6 @@ var ( memoizationMigrate = memodb.Migrate ) -// initializeMemoizationBackendLocked creates and migrates the memoization SQL backend. -// The caller must hold wfc.memoLock. -func (wfc *WorkflowController) initializeMemoizationBackendLocked(ctx context.Context, logger logging.Logger) { - if wfc.memoConfig == nil { - return - } - if wfc.memoSessionProxy == nil { - sp := memoSessionProxyFromConfig(ctx, wfc.kubeclientset, wfc.namespace, wfc.memoConfig) - if sp == nil { - logger.Error(ctx, "Failed to connect to memoization database; SQL caching unavailable. Workflows using memoization will skip caching until the database is reachable.") - wfc.memoSessionProxy = nil - wfc.memoMigrated = false - wfc.cacheFactory.ClearSessionProxy(true) - return - } - wfc.memoSessionProxy = sp - wfc.memoMigrated = false - } - if wfc.memoSessionProxy != nil && !wfc.memoMigrated { - cfg := memodb.ConfigFromConfig(wfc.memoConfig) - if err := memoizationMigrate(ctx, wfc.memoSessionProxy, cfg); err != nil { - logger.WithError(err).Error(ctx, "Memoization database migration failed; SQL caching unavailable. Workflows using memoization will skip caching until the database is healthy.") - wfc.memoSessionProxy.Close() - wfc.memoSessionProxy = nil - wfc.memoMigrated = false - wfc.cacheFactory.ClearSessionProxy(true) - return - } - wfc.memoMigrated = true - wfc.cacheFactory.SetSessionProxy(wfc.memoSessionProxy, cfg.TableName) - } -} - -// ensureMemoizationBackend makes sure the configured memoization SQL backend is ready for use. -// It returns true when memoization can proceed against the configured backend. -func (wfc *WorkflowController) ensureMemoizationBackend(ctx context.Context) bool { - logger := logging.RequireLoggerFromContext(ctx) - wfc.memoLock.Lock() - defer wfc.memoLock.Unlock() - if wfc.memoConfig == nil { - return true - } - if wfc.memoSessionProxy != nil && wfc.memoMigrated { - return true - } - wfc.initializeMemoizationBackendLocked(ctx, logger) - return wfc.memoSessionProxy != nil && wfc.memoMigrated -} - -// getMemoizationCache returns the memoization cache for the given namespace and name. -// When SQL memoization is configured, it first ensures the backend is available. -func (wfc *WorkflowController) getMemoizationCache(ctx context.Context, namespace, name string) controllercache.MemoizationCache { - wfc.memoLock.RLock() - memoConfigured := wfc.memoConfig != nil - wfc.memoLock.RUnlock() - if memoConfigured && !wfc.ensureMemoizationBackend(ctx) { - return nil - } - return wfc.cacheFactory.GetCache(ctx, controllercache.ConfigMapCache, namespace, name) -} - func (wfc *WorkflowController) updateConfig(ctx context.Context) error { logger := logging.RequireLoggerFromContext(ctx) _, err := yaml.Marshal(wfc.Config) @@ -146,23 +85,36 @@ func (wfc *WorkflowController) updateConfig(ctx context.Context) error { logger.Info(ctx, "Persistence configuration disabled") } + wfc.memoQueries = memodb.NullMemoizationDB memoCfg := wfc.Config.Memoization - wfc.memoLock.Lock() - wfc.memoConfig = memoCfg if memoCfg != nil { logger.Info(ctx, "Memoization database configuration enabled") - wfc.initializeMemoizationBackendLocked(ctx, logger) + if wfc.memoSessionProxy == nil { + sp := memoSessionProxyFromConfig(ctx, wfc.kubeclientset, wfc.namespace, memoCfg) + if sp == nil { + return fmt.Errorf("failed to create memoization database session") + } + wfc.memoSessionProxy = sp + } + cfg := memodb.ConfigFromConfig(memoCfg) + if err := memoizationMigrate(ctx, wfc.memoSessionProxy, cfg); err != nil { + return fmt.Errorf("memoization database migration failed: %w", err) + } + queries, err := memodb.NewQueries(cfg.TableName, wfc.memoSessionProxy) + if err != nil { + return err + } + wfc.memoQueries = queries + wfc.cacheFactory.SetQueries(queries) } else { if wfc.memoSessionProxy != nil { logger.Info(ctx, "Memoization database configuration removed") wfc.memoSessionProxy.Close() wfc.memoSessionProxy = nil - wfc.memoMigrated = false } - wfc.cacheFactory.ClearSessionProxy(false) + wfc.cacheFactory.SetQueries(nil) logger.Info(ctx, "Memoization database configuration disabled; using ConfigMap-based caching") } - wfc.memoLock.Unlock() wfc.hydrator = hydrator.New(wfc.offloadNodeStatusRepo) wfc.updateEstimatorFactory(ctx) @@ -176,6 +128,11 @@ func (wfc *WorkflowController) updateConfig(ctx context.Context) error { return nil } +// getMemoizationCache returns the memoization cache for the given namespace and name. +func (wfc *WorkflowController) getMemoizationCache(ctx context.Context, namespace, name string) controllercache.MemoizationCache { + return wfc.cacheFactory.GetCache(ctx, controllercache.ConfigMapCache, namespace, name) +} + // initDB inits argo DB tables func (wfc *WorkflowController) initDB(ctx context.Context) error { persistence := wfc.Config.Persistence diff --git a/workflow/controller/config_test.go b/workflow/controller/config_test.go index cdd4f8f287a7..7397272b052a 100644 --- a/workflow/controller/config_test.go +++ b/workflow/controller/config_test.go @@ -1,17 +1,13 @@ package controller import ( - "context" "testing" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "k8s.io/client-go/kubernetes" - "github.com/argoproj/argo-workflows/v4/config" "github.com/argoproj/argo-workflows/v4/util/logging" memodb "github.com/argoproj/argo-workflows/v4/util/memo/db" - utilsqldb "github.com/argoproj/argo-workflows/v4/util/sqldb" ) func TestUpdateConfig(t *testing.T) { @@ -25,44 +21,5 @@ func TestUpdateConfig(t *testing.T) { assert.NotNil(t, controller.archiveLabelSelector) assert.NotNil(t, controller.wfArchive) assert.NotNil(t, controller.offloadNodeStatusRepo) -} - -func TestGetMemoizationCacheRetriesBackendInitialization(t *testing.T) { - ctx := logging.TestContext(t.Context()) - cancel, controller := newController(ctx) - defer cancel() - - controller.memoLock.Lock() - controller.memoConfig = &config.MemoizationConfig{ - DBConfig: config.DBConfig{ - PostgreSQL: &config.PostgreSQLConfig{}, - }, - } - controller.memoLock.Unlock() - - originalBuilder := memoSessionProxyFromConfig - originalMigrate := memoizationMigrate - t.Cleanup(func() { - memoSessionProxyFromConfig = originalBuilder - memoizationMigrate = originalMigrate - }) - - calls := 0 - memoSessionProxyFromConfig = func(context.Context, kubernetes.Interface, string, *config.MemoizationConfig) *utilsqldb.SessionProxy { - calls++ - if calls == 1 { - return nil - } - return utilsqldb.NewSessionProxyFromSession(nil, &config.DBConfig{PostgreSQL: &config.PostgreSQLConfig{}}, "", "") - } - memoizationMigrate = func(context.Context, *utilsqldb.SessionProxy, memodb.Config) error { - return nil - } - - cache := controller.getMemoizationCache(ctx, "default", "memo-cache") - assert.Nil(t, cache) - - cache = controller.getMemoizationCache(ctx, "default", "memo-cache") - require.NotNil(t, cache) - assert.Equal(t, 2, calls) + assert.Equal(t, memodb.NullMemoizationDB, controller.memoQueries) } diff --git a/workflow/controller/controller.go b/workflow/controller/controller.go index 5d58a04ef4ba..f81c133a3eee 100644 --- a/workflow/controller/controller.go +++ b/workflow/controller/controller.go @@ -136,9 +136,7 @@ type WorkflowController struct { workflowKeyLock syncpkg.KeyLock // used to lock workflows for exclusive modification or access sessionProxy *utilsqldb.SessionProxy memoSessionProxy *utilsqldb.SessionProxy - memoMigrated bool - memoConfig *config.MemoizationConfig // protected by memoLock - memoLock gosync.RWMutex + memoQueries memodb.MemoizationDB offloadNodeStatusRepo sqldb.OffloadNodeStatusRepo hydrator hydrator.Interface wfArchive sqldb.WorkflowArchive @@ -734,12 +732,17 @@ func (wfc *WorkflowController) memoizationCacheGarbageCollector(ctx context.Cont logger = logger.WithField("component", "memo_cache_garbage_collector") ctx = logging.WithLogger(ctx, logger) + if wfc.memoQueries == nil || !wfc.memoQueries.IsEnabled() { + logger.Debug(ctx, "Memoization DB not configured; GC disabled") + return + } + periodicity := env.LookupEnvDurationOr(ctx, "MEMO_CACHE_GC_PERIOD", 24*time.Hour) if periodicity <= 0 { logger.Info(ctx, "MEMO_CACHE_GC_PERIOD is zero or negative - cache GC disabled") return } - logger.Info(ctx, "Memoization cache GC goroutine started; will check config each tick") + logger.Info(ctx, "Memoization cache GC goroutine started") ticker := time.NewTicker(periodicity) defer ticker.Stop() @@ -748,24 +751,8 @@ func (wfc *WorkflowController) memoizationCacheGarbageCollector(ctx context.Cont case <-ctx.Done(): return case <-ticker.C: - wfc.memoLock.RLock() - memoCfg := wfc.memoConfig - sp := wfc.memoSessionProxy - wfc.memoLock.RUnlock() - - if memoCfg == nil || sp == nil { - logger.Debug(ctx, "Memoization DB not configured or unavailable; skipping GC tick") - continue - } - - queries, err := memodb.NewQueries(memodb.TableName(memoCfg)) - if err != nil { - logger.WithError(err).Error(ctx, "Failed to initialize memoization cache queries") - continue - } - logger.Info(ctx, "Performing memoization cache GC") - n, err := queries.Prune(ctx, sp) + n, err := wfc.memoQueries.Prune(ctx) if err != nil { logger.WithError(err).Error(ctx, "Failed to prune memoization cache") } else { diff --git a/workflow/controller/operator_test.go b/workflow/controller/operator_test.go index 3d04566bbae5..f3f537460e9a 100644 --- a/workflow/controller/operator_test.go +++ b/workflow/controller/operator_test.go @@ -6393,11 +6393,7 @@ func TestConfigMapCacheLoadOperateMaxAge(t *testing.T) { func TestUnavailableSQLMemoizationBackendTreatsLookupAsCacheMiss(t *testing.T) { wf := wfv1.MustUnmarshalWorkflow(workflowCachedMaxAge) ctx := logging.TestContext(t.Context()) - cancel, controller := newController(ctx, func(wfc *WorkflowController) { - wfc.memoLock.Lock() - defer wfc.memoLock.Unlock() - wfc.memoConfig = &config.MemoizationConfig{} - }) + cancel, controller := newController(ctx) defer cancel() _, err := controller.wfclientset.ArgoprojV1alpha1().Workflows(wf.ObjectMeta.Namespace).Create(ctx, wf, metav1.CreateOptions{}) From 56877ee922b92004e1d94040ded7e2bb02adc649 Mon Sep 17 00:00:00 2001 From: droctothorpe Date: Mon, 27 Apr 2026 19:37:56 -0400 Subject: [PATCH 10/15] Handle duplicate keys Signed-off-by: droctothorpe --- docs/memoization.md | 9 +-- .../workflow-controller-configmap.yaml | 2 +- manifests/quick-start-postgres.yaml | 2 +- util/memo/db/queries.go | 72 +++++++++++++------ util/memo/db/queries_internal_test.go | 50 +++++++++++++ workflow/controller/config.go | 4 +- workflow/controller/controller.go | 30 ++++++-- workflow/controller/memoization_gc_test.go | 67 +++++++++++++++++ 8 files changed, 199 insertions(+), 37 deletions(-) create mode 100644 util/memo/db/queries_internal_test.go create mode 100644 workflow/controller/memoization_gc_test.go diff --git a/docs/memoization.md b/docs/memoization.md index 9fc85290db69..6a51c07bcf4b 100644 --- a/docs/memoization.md +++ b/docs/memoization.md @@ -40,11 +40,11 @@ metadata: namespace: argo data: memoization: | + tableName: cache_entries postgresql: host: postgres port: 5432 database: postgres - tableName: cache_entries userNameSecret: name: argo-postgres-config key: username @@ -53,7 +53,8 @@ data: key: password ``` -SQL-backed memoization stores entries in the configured table. If `tableName` is omitted, it defaults to `cache_entries`. +SQL-backed memoization stores entries in the configured table. Set `memoization.tableName` to override the default table name; if omitted, it defaults to `cache_entries`. +The database connection settings remain under `postgresql` or `mysql`. Each cache entry stores its expiry time when it is written, derived from the template's `maxAge` field. If `maxAge` is not specified on the template, it defaults to 30 days (2592000 seconds). This default can be overridden by setting the `DEFAULT_MAX_AGE` environment variable on the workflow controller for SQL-backed memoization (accepts Go duration strings like `720h` or integer seconds like `2592000`). @@ -63,11 +64,11 @@ MySQL is also supported: ```yaml memoization: | + tableName: cache_entries mysql: host: mysql port: 3306 database: argo - tableName: cache_entries userNameSecret: name: argo-mysql-config key: username @@ -107,7 +108,7 @@ spec: |-------|----------|-------------| | `key` | Yes | The cache lookup key. | | `cache` | Yes | Specifies the cache storage. When using the ConfigMap backend, a ConfigMap is created. When using the SQL backend, `cache.configMap.name` acts as a logical group name in the database — no ConfigMap is created. | -| `maxAge` | Yes | Maximum age of a cache entry (e.g. `"180s"`, `"24h"`). Entries older than this are treated as misses at lookup time. | +| `maxAge` | No | Maximum age of a cache entry (e.g. `"180s"`, `"24h"`). Entries older than this are treated as misses at lookup time. When omitted for SQL-backed memoization, it defaults to 30 days or the controller's `DEFAULT_MAX_AGE` setting. | [Find a simple example for memoization here](https://github.com/argoproj/argo-workflows/blob/main/examples/memoize-simple.yaml). diff --git a/manifests/components/postgres/overlays/workflow-controller-configmap.yaml b/manifests/components/postgres/overlays/workflow-controller-configmap.yaml index 159498312586..39572e2a9dda 100644 --- a/manifests/components/postgres/overlays/workflow-controller-configmap.yaml +++ b/manifests/components/postgres/overlays/workflow-controller-configmap.yaml @@ -44,11 +44,11 @@ data: name: argo-postgres-config key: password memoization: | + tableName: cache_entries postgresql: host: postgres port: 5432 database: postgres - tableName: cache_entries userNameSecret: name: argo-postgres-config key: username diff --git a/manifests/quick-start-postgres.yaml b/manifests/quick-start-postgres.yaml index 093af6d56059..a7f8195128e4 100644 --- a/manifests/quick-start-postgres.yaml +++ b/manifests/quick-start-postgres.yaml @@ -183472,11 +183472,11 @@ data: scope: workflow-list url: http://workflows?label=workflows.argoproj.io/completed=true memoization: | + tableName: cache_entries postgresql: host: postgres port: 5432 database: postgres - tableName: cache_entries userNameSecret: name: argo-postgres-config key: username diff --git a/util/memo/db/queries.go b/util/memo/db/queries.go index 9131a6136caf..46991dd73253 100644 --- a/util/memo/db/queries.go +++ b/util/memo/db/queries.go @@ -5,6 +5,7 @@ import ( "encoding/json" "errors" "fmt" + "strings" "time" "github.com/upper/db/v4" @@ -79,6 +80,42 @@ func (q *queries) IsEnabled() bool { return true } +func cacheRecordCond(record *CacheRecord) db.Cond { + return db.Cond{ + colNamespace: record.Namespace, + colCacheName: record.CacheName, + colCacheKey: record.CacheKey, + } +} + +func cacheRecordUpdates(record *CacheRecord) map[string]any { + return map[string]any{ + "node_id": record.NodeID, + "outputs": record.Outputs, + "created_at": record.CreatedAt, + "expires_at": record.ExpiresAt, + } +} + +func isDuplicateKeyError(err error) bool { + if err == nil { + return false + } + return strings.Contains(err.Error(), "duplicate key") || strings.Contains(err.Error(), "Duplicate entry") +} + +func saveRecord(sess db.Session, tableName string, record *CacheRecord) error { + collection := sess.Collection(tableName) + _, err := collection.Insert(record) + if err == nil { + return nil + } + if !isDuplicateKeyError(err) { + return err + } + return collection.Find(cacheRecordCond(record)).Update(cacheRecordUpdates(record)) +} + // Load retrieves the outputs for the given cache key. // Returns nil when the entry does not exist or has expired. func (q *queries) Load(ctx context.Context, namespace, cacheName, cacheKey string) (*CacheRecord, error) { @@ -128,27 +165,16 @@ func (q *queries) Save(ctx context.Context, namespace, cacheName, cacheKey, node } now := time.Now().UTC() expiresAt := now.Add(time.Duration(maxAgeSeconds) * time.Second) - return q.sessionProxy.TxWith(ctx, func(sp *sqldb.SessionProxy) error { - return sp.With(ctx, func(tx db.Session) error { - _, err := tx.SQL(). - DeleteFrom(q.tableName). - Where(db.Cond{colNamespace: namespace}). - And(db.Cond{colCacheName: cacheName}). - And(db.Cond{colCacheKey: cacheKey}). - Exec() - if err != nil { - return err - } - _, err = tx.Collection(q.tableName).Insert(&CacheRecord{ - Namespace: namespace, - CacheName: cacheName, - CacheKey: cacheKey, - NodeID: nodeID, - Outputs: string(outputsJSON), - CreatedAt: now, - ExpiresAt: expiresAt, - }) - return err - }) - }, nil) + record := &CacheRecord{ + Namespace: namespace, + CacheName: cacheName, + CacheKey: cacheKey, + NodeID: nodeID, + Outputs: string(outputsJSON), + CreatedAt: now, + ExpiresAt: expiresAt, + } + return q.sessionProxy.With(ctx, func(sess db.Session) error { + return saveRecord(sess, q.tableName, record) + }) } diff --git a/util/memo/db/queries_internal_test.go b/util/memo/db/queries_internal_test.go new file mode 100644 index 000000000000..e8aa4050c21f --- /dev/null +++ b/util/memo/db/queries_internal_test.go @@ -0,0 +1,50 @@ +package db + +import ( + "errors" + "testing" + "time" + + "github.com/stretchr/testify/assert" + upperdb "github.com/upper/db/v4" +) + +func TestCacheRecordCond(t *testing.T) { + record := &CacheRecord{ + Namespace: "my-ns", + CacheName: "my-cache", + CacheKey: "my-key", + NodeID: "ignored", + } + + assert.Equal(t, upperdb.Cond{ + colNamespace: "my-ns", + colCacheName: "my-cache", + colCacheKey: "my-key", + }, cacheRecordCond(record)) +} + +func TestCacheRecordUpdates(t *testing.T) { + now := time.Unix(100, 0).UTC() + expiresAt := time.Unix(200, 0).UTC() + record := &CacheRecord{ + NodeID: "node-1", + Outputs: `{"result":"ok"}`, + CreatedAt: now, + ExpiresAt: expiresAt, + } + + assert.Equal(t, map[string]any{ + "node_id": "node-1", + "outputs": `{"result":"ok"}`, + "created_at": now, + "expires_at": expiresAt, + }, cacheRecordUpdates(record)) +} + +func TestIsDuplicateKeyError(t *testing.T) { + assert.True(t, isDuplicateKeyError(errors.New("pq: duplicate key value violates unique constraint"))) + assert.True(t, isDuplicateKeyError(errors.New("Error 1062: Duplicate entry 'x' for key 'PRIMARY'"))) + assert.False(t, isDuplicateKeyError(errors.New("some other error"))) + assert.False(t, isDuplicateKeyError(nil)) +} diff --git a/workflow/controller/config.go b/workflow/controller/config.go index 27b7220e607b..6dd190359995 100644 --- a/workflow/controller/config.go +++ b/workflow/controller/config.go @@ -85,7 +85,7 @@ func (wfc *WorkflowController) updateConfig(ctx context.Context) error { logger.Info(ctx, "Persistence configuration disabled") } - wfc.memoQueries = memodb.NullMemoizationDB + wfc.setMemoizationQueries(memodb.NullMemoizationDB) memoCfg := wfc.Config.Memoization if memoCfg != nil { logger.Info(ctx, "Memoization database configuration enabled") @@ -104,7 +104,7 @@ func (wfc *WorkflowController) updateConfig(ctx context.Context) error { if err != nil { return err } - wfc.memoQueries = queries + wfc.setMemoizationQueries(queries) wfc.cacheFactory.SetQueries(queries) } else { if wfc.memoSessionProxy != nil { diff --git a/workflow/controller/controller.go b/workflow/controller/controller.go index f81c133a3eee..53b9a4079281 100644 --- a/workflow/controller/controller.go +++ b/workflow/controller/controller.go @@ -137,6 +137,7 @@ type WorkflowController struct { sessionProxy *utilsqldb.SessionProxy memoSessionProxy *utilsqldb.SessionProxy memoQueries memodb.MemoizationDB + memoizationLock gosync.RWMutex offloadNodeStatusRepo sqldb.OffloadNodeStatusRepo hydrator hydrator.Interface wfArchive sqldb.WorkflowArchive @@ -262,6 +263,24 @@ func (wfc *WorkflowController) newThrottler() sync.Throttler { return sync.NewMultiThrottler(wfc.Config.Parallelism, wfc.Config.NamespaceParallelism, f) } +func (wfc *WorkflowController) getMemoizationQueries() memodb.MemoizationDB { + wfc.memoizationLock.RLock() + defer wfc.memoizationLock.RUnlock() + if wfc.memoQueries == nil { + return memodb.NullMemoizationDB + } + return wfc.memoQueries +} + +func (wfc *WorkflowController) setMemoizationQueries(queries memodb.MemoizationDB) { + if queries == nil { + queries = memodb.NullMemoizationDB + } + wfc.memoizationLock.Lock() + defer wfc.memoizationLock.Unlock() + wfc.memoQueries = queries +} + // runGCcontroller runs the workflow garbage collector controller func (wfc *WorkflowController) runGCcontroller(ctx context.Context, workflowTTLWorkers int) { defer runtimeutil.HandleCrashWithContext(ctx, runtimeutil.PanicHandlers...) @@ -732,11 +751,6 @@ func (wfc *WorkflowController) memoizationCacheGarbageCollector(ctx context.Cont logger = logger.WithField("component", "memo_cache_garbage_collector") ctx = logging.WithLogger(ctx, logger) - if wfc.memoQueries == nil || !wfc.memoQueries.IsEnabled() { - logger.Debug(ctx, "Memoization DB not configured; GC disabled") - return - } - periodicity := env.LookupEnvDurationOr(ctx, "MEMO_CACHE_GC_PERIOD", 24*time.Hour) if periodicity <= 0 { logger.Info(ctx, "MEMO_CACHE_GC_PERIOD is zero or negative - cache GC disabled") @@ -751,8 +765,12 @@ func (wfc *WorkflowController) memoizationCacheGarbageCollector(ctx context.Cont case <-ctx.Done(): return case <-ticker.C: + queries := wfc.getMemoizationQueries() + if !queries.IsEnabled() { + continue + } logger.Info(ctx, "Performing memoization cache GC") - n, err := wfc.memoQueries.Prune(ctx) + n, err := queries.Prune(ctx) if err != nil { logger.WithError(err).Error(ctx, "Failed to prune memoization cache") } else { diff --git a/workflow/controller/memoization_gc_test.go b/workflow/controller/memoization_gc_test.go new file mode 100644 index 000000000000..548209841350 --- /dev/null +++ b/workflow/controller/memoization_gc_test.go @@ -0,0 +1,67 @@ +package controller + +import ( + "context" + "testing" + "time" + + wfv1 "github.com/argoproj/argo-workflows/v4/pkg/apis/workflow/v1alpha1" + "github.com/argoproj/argo-workflows/v4/util/logging" + memodb "github.com/argoproj/argo-workflows/v4/util/memo/db" +) + +type testMemoizationDB struct { + pruned chan struct{} +} + +func (*testMemoizationDB) Load(context.Context, string, string, string) (*memodb.CacheRecord, error) { + return nil, nil +} + +func (*testMemoizationDB) Save(context.Context, string, string, string, string, *wfv1.Outputs, int64) error { + return nil +} + +func (t *testMemoizationDB) Prune(context.Context) (int64, error) { + select { + case t.pruned <- struct{}{}: + default: + } + return 1, nil +} + +func (*testMemoizationDB) IsEnabled() bool { + return true +} + +func TestMemoizationCacheGarbageCollectorHandlesRuntimeEnable(t *testing.T) { + t.Setenv("MEMO_CACHE_GC_PERIOD", "10ms") + + ctx, cancel := context.WithCancel(logging.TestContext(t.Context())) + defer cancel() + + controller := &WorkflowController{} + done := make(chan struct{}) + go func() { + defer close(done) + controller.memoizationCacheGarbageCollector(ctx) + }() + + time.Sleep(25 * time.Millisecond) + + queries := &testMemoizationDB{pruned: make(chan struct{}, 1)} + controller.setMemoizationQueries(queries) + + select { + case <-queries.pruned: + case <-time.After(250 * time.Millisecond): + t.Fatal("expected memoization cache GC to observe runtime enablement and prune") + } + + cancel() + select { + case <-done: + case <-time.After(time.Second): + t.Fatal("expected memoization cache GC goroutine to stop after context cancellation") + } +} From b0658c007af6f187aa4d92a9a1a2b717fc59ac48 Mon Sep 17 00:00:00 2001 From: droctothorpe Date: Mon, 27 Apr 2026 23:31:18 -0400 Subject: [PATCH 11/15] Address GPT 5.4 feedback Signed-off-by: droctothorpe --- util/memo/db/config.go | 8 +- workflow/controller/cache/cache.go | 12 +- .../controller/cache/cache_factory_test.go | 102 ++++++++++++++++ workflow/controller/cache/sqldb_cache.go | 58 +++++++--- workflow/controller/cache/sqldb_cache_test.go | 4 +- workflow/controller/config.go | 46 +++++--- workflow/controller/config_test.go | 109 +++++++++++++++++- workflow/controller/controller.go | 28 ++++- 8 files changed, 326 insertions(+), 41 deletions(-) diff --git a/util/memo/db/config.go b/util/memo/db/config.go index 986abc710fde..b144682fef26 100644 --- a/util/memo/db/config.go +++ b/util/memo/db/config.go @@ -69,8 +69,8 @@ func ConfigFromConfig(cfg *config.MemoizationConfig) Config { } // SessionProxyFromConfig creates a SessionProxy from a MemoizationConfig, returning nil and logging -// an error if the connection cannot be established. Callers that receive nil should fall back to -// ConfigMap-based caching. +// an error if the connection cannot be established. Callers that receive nil should decide how to +// degrade memoization without crashing the controller. func SessionProxyFromConfig(ctx context.Context, kubectlConfig kubernetes.Interface, namespace string, cfg *config.MemoizationConfig) *sqldb.SessionProxy { if cfg == nil { return nil @@ -90,8 +90,8 @@ func SessionProxyFromConfig(ctx context.Context, kubectlConfig kubernetes.Interf } // Migrate runs database migrations for the memoization cache table. It is a no-op when -// cfg.SkipMigration is true. Returns an error if migration fails; callers should fall back -// to ConfigMap-based caching. +// cfg.SkipMigration is true. Returns an error if migration fails; callers should decide how to +// degrade memoization without crashing the controller. func Migrate(ctx context.Context, sessionProxy *sqldb.SessionProxy, cfg Config) error { if sessionProxy == nil { return nil diff --git a/workflow/controller/cache/cache.go b/workflow/controller/cache/cache.go index 2ed3d2e974d1..4a066a5036e5 100644 --- a/workflow/controller/cache/cache.go +++ b/workflow/controller/cache/cache.go @@ -124,12 +124,14 @@ type Type string const ( // ConfigMapCache is a cache type identifier used as a key prefix in the cache map. - // When a MemoizationDB is configured, SQL-backed caching is used instead. + // When a MemoizationDB is configured, SQL-backed memoization semantics are used instead. ConfigMapCache Type = "ConfigMapCache" ) -// SetQueries configures the factory to use a SQL backend, clearing any previously -// cached instances so they are recreated against the new backend. +// SetQueries configures the factory's memoization backend, clearing any previously +// cached instances so they are recreated against the new backend. A nil MemoizationDB +// selects ConfigMap-backed caching; a non-nil MemoizationDB selects SQL-backed +// memoization semantics, even if the DB implementation is disabled/no-op. func (cf *cacheFactory) SetQueries(q memodb.MemoizationDB) { cf.lock.Lock() defer cf.lock.Unlock() @@ -165,8 +167,8 @@ func (cf *cacheFactory) GetCache(ctx context.Context, ct Type, namespace, name s switch ct { case ConfigMapCache: var c MemoizationCache - if cf.queries != nil && cf.queries.IsEnabled() { - c = newSQLDBCache(namespace, name, cf.queries) + if cf.queries != nil { + c = newSQLDBCache(namespace, name, func() memodb.MemoizationDB { return cf.queries }, &cf.lock) } else { c = NewConfigMapCache(namespace, cf.kubeclient, name) } diff --git a/workflow/controller/cache/cache_factory_test.go b/workflow/controller/cache/cache_factory_test.go index ae4bf79719a6..d8c2255757a9 100644 --- a/workflow/controller/cache/cache_factory_test.go +++ b/workflow/controller/cache/cache_factory_test.go @@ -1,13 +1,18 @@ package cache import ( + "context" + "sync/atomic" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "k8s.io/client-go/kubernetes/fake" + wfv1 "github.com/argoproj/argo-workflows/v4/pkg/apis/workflow/v1alpha1" "github.com/argoproj/argo-workflows/v4/util/logging" + memodb "github.com/argoproj/argo-workflows/v4/util/memo/db" ) func TestCacheFactoryNamespacesCachesSeparately(t *testing.T) { @@ -34,3 +39,100 @@ func TestCacheFactoryRequiresNamespace(t *testing.T) { cache := factory.GetCache(ctx, ConfigMapCache, "", "shared-cache") assert.Nil(t, cache) } + +type testMemoizationDB struct { + enabled bool + saveCalls atomic.Int32 + loadStart chan struct{} + loadBlock chan struct{} +} + +func (t *testMemoizationDB) Load(context.Context, string, string, string) (*memodb.CacheRecord, error) { + if t.loadStart != nil { + close(t.loadStart) + } + if t.loadBlock != nil { + <-t.loadBlock + } + return nil, nil +} + +func (t *testMemoizationDB) Save(context.Context, string, string, string, string, *wfv1.Outputs, int64) error { + t.saveCalls.Add(1) + return nil +} + +func (*testMemoizationDB) Prune(context.Context) (int64, error) { + return 0, nil +} + +func (t *testMemoizationDB) IsEnabled() bool { + return t.enabled +} + +func TestCacheFactoryStaleSQLCacheNoopsAfterDisable(t *testing.T) { + ctx := logging.TestContext(t.Context()) + factory := NewCacheFactory(fake.NewSimpleClientset()).(*cacheFactory) + queries := &testMemoizationDB{enabled: true} + factory.SetQueries(queries) + + cache := factory.GetCache(ctx, ConfigMapCache, "default", "shared-cache") + require.NotNil(t, cache) + + factory.SetQueries(nil) + + require.NoError(t, cache.Save(ctx, "memo-key", "node-1", &wfv1.Outputs{}, "1h")) + assert.Zero(t, queries.saveCalls.Load()) +} + +func TestCacheFactoryDisableWaitsForInflightSQLLoad(t *testing.T) { + ctx := logging.TestContext(t.Context()) + factory := NewCacheFactory(fake.NewSimpleClientset()).(*cacheFactory) + queries := &testMemoizationDB{ + enabled: true, + loadStart: make(chan struct{}), + loadBlock: make(chan struct{}), + } + factory.SetQueries(queries) + + cache := factory.GetCache(ctx, ConfigMapCache, "default", "shared-cache") + require.NotNil(t, cache) + + loadDone := make(chan struct{}) + go func() { + defer close(loadDone) + _, _ = cache.Load(ctx, "memo-key") + }() + + select { + case <-queries.loadStart: + case <-time.After(time.Second): + t.Fatal("expected SQL load to start") + } + + setQueriesDone := make(chan struct{}) + go func() { + factory.SetQueries(nil) + close(setQueriesDone) + }() + + select { + case <-setQueriesDone: + t.Fatal("expected SetQueries to wait for in-flight SQL load") + case <-time.After(50 * time.Millisecond): + } + + close(queries.loadBlock) + + select { + case <-setQueriesDone: + case <-time.After(time.Second): + t.Fatal("expected SetQueries to finish after load completes") + } + + select { + case <-loadDone: + case <-time.After(time.Second): + t.Fatal("expected SQL load goroutine to finish") + } +} diff --git a/workflow/controller/cache/sqldb_cache.go b/workflow/controller/cache/sqldb_cache.go index 0b1c504fcc2e..353719186055 100644 --- a/workflow/controller/cache/sqldb_cache.go +++ b/workflow/controller/cache/sqldb_cache.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "fmt" + "sync" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -12,24 +13,50 @@ import ( ) type sqlDBCache struct { - namespace string - name string - queries memodb.MemoizationDB + namespace string + name string + getQueries func() memodb.MemoizationDB + lock *sync.RWMutex } -func newSQLDBCache(namespace, name string, queries memodb.MemoizationDB) MemoizationCache { +func newSQLDBCache(namespace, name string, getQueries func() memodb.MemoizationDB, lock *sync.RWMutex) MemoizationCache { return &sqlDBCache{ - namespace: namespace, - name: name, - queries: queries, + namespace: namespace, + name: name, + getQueries: getQueries, + lock: lock, } } +func (c *sqlDBCache) withQueries(fn func(memodb.MemoizationDB) error) error { + if c.lock == nil { + return fn(memodb.NullMemoizationDB) + } + c.lock.RLock() + defer c.lock.RUnlock() + + queries := memodb.NullMemoizationDB + if c.getQueries != nil { + if q := c.getQueries(); q != nil { + queries = q + } + } + return fn(queries) +} + func (c *sqlDBCache) Load(ctx context.Context, key string) (*Entry, error) { if !cacheKeyRegex.MatchString(key) { return nil, fmt.Errorf("invalid cache key: %s", key) } - record, err := c.queries.Load(ctx, c.namespace, c.name, key) + var record *memodb.CacheRecord + err := c.withQueries(func(queries memodb.MemoizationDB) error { + if !queries.IsEnabled() { + return nil + } + var err error + record, err = queries.Load(ctx, c.namespace, c.name, key) + return err + }) if err != nil { return nil, fmt.Errorf("memoization db load failed: %w", err) } @@ -52,9 +79,14 @@ func (c *sqlDBCache) Save(ctx context.Context, key string, nodeID string, value if !cacheKeyRegex.MatchString(key) { return fmt.Errorf("invalid cache key: %s", key) } - maxAgeSeconds, err := ResolveMaxAgeSeconds(maxAge) - if err != nil { - return err - } - return c.queries.Save(ctx, c.namespace, c.name, key, nodeID, value, maxAgeSeconds) + return c.withQueries(func(queries memodb.MemoizationDB) error { + if !queries.IsEnabled() { + return nil + } + maxAgeSeconds, err := ResolveMaxAgeSeconds(maxAge) + if err != nil { + return err + } + return queries.Save(ctx, c.namespace, c.name, key, nodeID, value, maxAgeSeconds) + }) } diff --git a/workflow/controller/cache/sqldb_cache_test.go b/workflow/controller/cache/sqldb_cache_test.go index 0c19039c000c..d358d86fcfb9 100644 --- a/workflow/controller/cache/sqldb_cache_test.go +++ b/workflow/controller/cache/sqldb_cache_test.go @@ -6,6 +6,7 @@ import ( "context" "encoding/json" "strconv" + "sync" "testing" "time" @@ -90,7 +91,8 @@ func newTestSQLDBCache(t *testing.T, sp *sqldb.SessionProxy) MemoizationCache { t.Helper() queries, err := memodb.NewQueries(testTableName, sp) require.NoError(t, err) - return newSQLDBCache(testNamespace, testCacheName, queries) + var lock sync.RWMutex + return newSQLDBCache(testNamespace, testCacheName, func() memodb.MemoizationDB { return queries }, &lock) } func TestSQLDBCacheSaveAndLoad(t *testing.T) { diff --git a/workflow/controller/config.go b/workflow/controller/config.go index 6dd190359995..b87c0ff7b36f 100644 --- a/workflow/controller/config.go +++ b/workflow/controller/config.go @@ -25,6 +25,21 @@ var ( memoizationMigrate = memodb.Migrate ) +func (wfc *WorkflowController) resetMemoizationBackend(ctx context.Context, sessionProxy *sqldb.SessionProxy, cacheQueries memodb.MemoizationDB) { + logger := logging.RequireLoggerFromContext(ctx) + wfc.setMemoizationQueries(cacheQueries) + wfc.cacheFactory.SetQueries(cacheQueries) + if sessionProxy == nil { + sessionProxy = wfc.memoSessionProxy + } + wfc.memoSessionProxy = nil + if sessionProxy != nil { + if err := sessionProxy.Close(); err != nil { + logger.WithError(err).Warn(ctx, "Failed to close memoization database session") + } + } +} + func (wfc *WorkflowController) updateConfig(ctx context.Context) error { logger := logging.RequireLoggerFromContext(ctx) _, err := yaml.Marshal(wfc.Config) @@ -85,37 +100,42 @@ func (wfc *WorkflowController) updateConfig(ctx context.Context) error { logger.Info(ctx, "Persistence configuration disabled") } - wfc.setMemoizationQueries(memodb.NullMemoizationDB) memoCfg := wfc.Config.Memoization if memoCfg != nil { logger.Info(ctx, "Memoization database configuration enabled") - if wfc.memoSessionProxy == nil { - sp := memoSessionProxyFromConfig(ctx, wfc.kubeclientset, wfc.namespace, memoCfg) - if sp == nil { - return fmt.Errorf("failed to create memoization database session") + sessionProxy := wfc.memoSessionProxy + if sessionProxy == nil { + sessionProxy = memoSessionProxyFromConfig(ctx, wfc.kubeclientset, wfc.namespace, memoCfg) + if sessionProxy == nil { + logger.Warn(ctx, "Memoization database unavailable; memoization disabled") + wfc.resetMemoizationBackend(ctx, nil, memodb.NullMemoizationDB) + goto memoizationConfigured } - wfc.memoSessionProxy = sp } cfg := memodb.ConfigFromConfig(memoCfg) - if err := memoizationMigrate(ctx, wfc.memoSessionProxy, cfg); err != nil { - return fmt.Errorf("memoization database migration failed: %w", err) + if err := memoizationMigrate(ctx, sessionProxy, cfg); err != nil { + logger.WithError(err).Error(ctx, "Memoization database migration failed; memoization disabled") + wfc.resetMemoizationBackend(ctx, sessionProxy, memodb.NullMemoizationDB) + goto memoizationConfigured } - queries, err := memodb.NewQueries(cfg.TableName, wfc.memoSessionProxy) + queries, err := memodb.NewQueries(cfg.TableName, sessionProxy) if err != nil { - return err + logger.WithError(err).Error(ctx, "Memoization database initialization failed; memoization disabled") + wfc.resetMemoizationBackend(ctx, sessionProxy, memodb.NullMemoizationDB) + goto memoizationConfigured } + wfc.memoSessionProxy = sessionProxy wfc.setMemoizationQueries(queries) wfc.cacheFactory.SetQueries(queries) } else { if wfc.memoSessionProxy != nil { logger.Info(ctx, "Memoization database configuration removed") - wfc.memoSessionProxy.Close() - wfc.memoSessionProxy = nil } - wfc.cacheFactory.SetQueries(nil) + wfc.resetMemoizationBackend(ctx, nil, nil) logger.Info(ctx, "Memoization database configuration disabled; using ConfigMap-based caching") } +memoizationConfigured: wfc.hydrator = hydrator.New(wfc.offloadNodeStatusRepo) wfc.updateEstimatorFactory(ctx) wfc.rateLimiter = wfc.newRateLimiter() diff --git a/workflow/controller/config_test.go b/workflow/controller/config_test.go index 7397272b052a..9ca4a8c7e26e 100644 --- a/workflow/controller/config_test.go +++ b/workflow/controller/config_test.go @@ -1,13 +1,24 @@ package controller import ( + "context" + stderrors "errors" "testing" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/upper/db/v4" + apierr "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + + "github.com/argoproj/argo-workflows/v4/config" + wfv1 "github.com/argoproj/argo-workflows/v4/pkg/apis/workflow/v1alpha1" + "github.com/stretchr/testify/assert" "github.com/argoproj/argo-workflows/v4/util/logging" memodb "github.com/argoproj/argo-workflows/v4/util/memo/db" + "github.com/argoproj/argo-workflows/v4/util/sqldb" + controllercache "github.com/argoproj/argo-workflows/v4/workflow/controller/cache" ) func TestUpdateConfig(t *testing.T) { @@ -23,3 +34,99 @@ func TestUpdateConfig(t *testing.T) { assert.NotNil(t, controller.offloadNodeStatusRepo) assert.Equal(t, memodb.NullMemoizationDB, controller.memoQueries) } + +func TestUpdateConfigMemoizationSessionFailureDisablesMemoization(t *testing.T) { + ctx := logging.TestContext(t.Context()) + cancel, controller := newController(ctx) + defer cancel() + + controller.Config.Memoization = &config.MemoizationConfig{} + + origMemoSessionProxyFromConfig := memoSessionProxyFromConfig + memoSessionProxyFromConfig = func(context.Context, kubernetes.Interface, string, *config.MemoizationConfig) *sqldb.SessionProxy { + return nil + } + t.Cleanup(func() { + memoSessionProxyFromConfig = origMemoSessionProxyFromConfig + }) + + err := controller.updateConfig(ctx) + require.NoError(t, err) + assert.Nil(t, controller.memoSessionProxy) + assert.Equal(t, memodb.NullMemoizationDB, controller.getMemoizationQueries()) + + cache := controller.getMemoizationCache(ctx, "default", "memo-disabled-cache") + require.NotNil(t, cache) + require.NoError(t, cache.Save(ctx, "memo-key", "", &wfv1.Outputs{}, "")) + + _, err = controller.kubeclientset.CoreV1().ConfigMaps("default").Get(ctx, "memo-disabled-cache", metav1.GetOptions{}) + assert.True(t, apierr.IsNotFound(err)) +} + +type observingCacheFactory struct { + setQueries func(memodb.MemoizationDB) +} + +func (o *observingCacheFactory) GetCache(context.Context, controllercache.Type, string, string) controllercache.MemoizationCache { + return nil +} + +func (o *observingCacheFactory) SetQueries(q memodb.MemoizationDB) { + if o.setQueries != nil { + o.setQueries(q) + } +} + +func TestUpdateConfigMemoizationDisableDetachesCachesBeforeClosingSession(t *testing.T) { + ctx := logging.TestContext(t.Context()) + cancel, controller := newController(ctx) + defer cancel() + + sessionProxy := sqldb.NewSessionProxyFromSession(nil, &config.DBConfig{}, "user", "password") + controller.memoSessionProxy = sessionProxy + + var setQueriesSawClosedProxy bool + controller.cacheFactory = &observingCacheFactory{ + setQueries: func(memodb.MemoizationDB) { + err := sessionProxy.With(ctx, func(db.Session) error { return nil }) + setQueriesSawClosedProxy = err != nil && err.Error() == "session proxy is closed" + }, + } + + err := controller.updateConfig(ctx) + require.NoError(t, err) + assert.False(t, setQueriesSawClosedProxy) + assert.Nil(t, controller.memoSessionProxy) + assert.EqualError(t, sessionProxy.With(ctx, func(db.Session) error { return nil }), "session proxy is closed") +} + +func TestUpdateConfigMemoizationMigrationFailureDisablesMemoization(t *testing.T) { + ctx := logging.TestContext(t.Context()) + cancel, controller := newController(ctx) + defer cancel() + + controller.Config.Memoization = &config.MemoizationConfig{} + sessionProxy := sqldb.NewSessionProxyFromSession(nil, &config.DBConfig{}, "user", "password") + controller.memoSessionProxy = sessionProxy + + origMemoizationMigrate := memoizationMigrate + memoizationMigrate = func(context.Context, *sqldb.SessionProxy, memodb.Config) error { + return stderrors.New("boom") + } + t.Cleanup(func() { + memoizationMigrate = origMemoizationMigrate + }) + + err := controller.updateConfig(ctx) + require.NoError(t, err) + assert.Nil(t, controller.memoSessionProxy) + assert.Equal(t, memodb.NullMemoizationDB, controller.getMemoizationQueries()) + assert.EqualError(t, sessionProxy.With(ctx, func(db.Session) error { return nil }), "session proxy is closed") + + cache := controller.getMemoizationCache(ctx, "default", "memo-migrate-disabled-cache") + require.NotNil(t, cache) + require.NoError(t, cache.Save(ctx, "memo-key", "", &wfv1.Outputs{}, "")) + + _, err = controller.kubeclientset.CoreV1().ConfigMaps("default").Get(ctx, "memo-migrate-disabled-cache", metav1.GetOptions{}) + assert.True(t, apierr.IsNotFound(err)) +} diff --git a/workflow/controller/controller.go b/workflow/controller/controller.go index 53b9a4079281..c373c56c9122 100644 --- a/workflow/controller/controller.go +++ b/workflow/controller/controller.go @@ -281,6 +281,15 @@ func (wfc *WorkflowController) setMemoizationQueries(queries memodb.MemoizationD wfc.memoQueries = queries } +func (wfc *WorkflowController) withMemoizationQueries(fn func(memodb.MemoizationDB) error) error { + wfc.memoizationLock.RLock() + defer wfc.memoizationLock.RUnlock() + if wfc.memoQueries == nil { + return fn(memodb.NullMemoizationDB) + } + return fn(wfc.memoQueries) +} + // runGCcontroller runs the workflow garbage collector controller func (wfc *WorkflowController) runGCcontroller(ctx context.Context, workflowTTLWorkers int) { defer runtimeutil.HandleCrashWithContext(ctx, runtimeutil.PanicHandlers...) @@ -765,12 +774,23 @@ func (wfc *WorkflowController) memoizationCacheGarbageCollector(ctx context.Cont case <-ctx.Done(): return case <-ticker.C: - queries := wfc.getMemoizationQueries() - if !queries.IsEnabled() { + var ( + n int64 + active bool + ) + err := wfc.withMemoizationQueries(func(queries memodb.MemoizationDB) error { + if !queries.IsEnabled() { + return nil + } + active = true + logger.Info(ctx, "Performing memoization cache GC") + var err error + n, err = queries.Prune(ctx) + return err + }) + if !active { continue } - logger.Info(ctx, "Performing memoization cache GC") - n, err := queries.Prune(ctx) if err != nil { logger.WithError(err).Error(ctx, "Failed to prune memoization cache") } else { From ce83cde1fe44795874f2ef970b6d88fc6e02cba1 Mon Sep 17 00:00:00 2001 From: droctothorpe Date: Wed, 13 May 2026 22:44:01 -0400 Subject: [PATCH 12/15] Cleanup Signed-off-by: droctothorpe --- docs/memoization.md | 4 +--- util/memo/db/config.go | 6 ++++++ util/memo/db/queries.go | 3 ++- workflow/controller/cache/configmap_cache.go | 3 +++ workflow/controller/cache/sqldb_cache.go | 2 ++ workflow/controller/config.go | 4 ++++ workflow/controller/config_test.go | 7 +++---- workflow/controller/controller.go | 6 ++++++ 8 files changed, 27 insertions(+), 8 deletions(-) diff --git a/docs/memoization.md b/docs/memoization.md index 6a51c07bcf4b..7c0c16f31f6c 100644 --- a/docs/memoization.md +++ b/docs/memoization.md @@ -79,9 +79,7 @@ MySQL is also supported: ## Using Memoization -Memoization is configured at the template level via the `memoize` field. - -Memoization is set at the template level. You must specify a `key`, which can be static strings but more often depend on inputs. +Memoization is configured at the template level via the `memoize` field. You must specify a `key`, which can be static strings but more often depend on inputs. You must also specify a name for the `config-map` cache. Optionally you can set a `maxAge` in seconds or hours (e.g. `180s`, `24h`) to define how long should it be considered valid. If an entry is older than the `maxAge`, it will be ignored. diff --git a/util/memo/db/config.go b/util/memo/db/config.go index b144682fef26..8fb58bc0aac1 100644 --- a/util/memo/db/config.go +++ b/util/memo/db/config.go @@ -40,6 +40,9 @@ func validateTableName(tableName string) error { return nil } +// memoizationVersionTableName returns the schema history table name for the given memoization +// cache table. The default table name uses a fixed well-known name; custom table names get a +// deterministic hash suffix to avoid collisions. func memoizationVersionTableName(tableName string) string { if tableName == defaultTableName { return versionTable @@ -49,6 +52,9 @@ func memoizationVersionTableName(tableName string) string { return fmt.Sprintf("memoization_schema_history_%x", hasher.Sum64()) } +// memoizationExpiresAtIndexName returns the name of the expires_at index for the given +// memoization cache table, using a deterministic hash suffix to avoid collisions across +// multiple cache tables. func memoizationExpiresAtIndexName(tableName string) string { hasher := fnv.New64a() _, _ = hasher.Write([]byte(tableName)) diff --git a/util/memo/db/queries.go b/util/memo/db/queries.go index 46991dd73253..cda000b22a7e 100644 --- a/util/memo/db/queries.go +++ b/util/memo/db/queries.go @@ -158,7 +158,8 @@ func (q *queries) Prune(ctx context.Context) (int64, error) { return n, err } -func (q *queries) Save(ctx context.Context, namespace, cacheName, cacheKey, nodeID string, outputs *wfv1.Outputs, maxAgeSeconds int64) error { +func (q *queries) Save( + ctx context.Context, namespace, cacheName, cacheKey, nodeID string, outputs *wfv1.Outputs, maxAgeSeconds int64) error { outputsJSON, err := json.Marshal(outputs) if err != nil { return fmt.Errorf("unable to marshal memoization outputs: %w", err) diff --git a/workflow/controller/cache/configmap_cache.go b/workflow/controller/cache/configmap_cache.go index cf6573ae5d19..2004b75ba36c 100644 --- a/workflow/controller/cache/configmap_cache.go +++ b/workflow/controller/cache/configmap_cache.go @@ -132,6 +132,9 @@ func (c *configMapCache) load(ctx context.Context, key string) (*Entry, error) { return &entry, nil } +// Save stores a memoization entry in a ConfigMap. The final argument is maxAge from the +// MemoizationCache interface; ConfigMap-backed memoization does not support TTL/expiration, +// so this implementation intentionally ignores it. func (c *configMapCache) Save(ctx context.Context, key string, nodeID string, value *wfv1.Outputs, _ string) error { err := retry.OnError(kwait.Backoff{ Duration: time.Second, diff --git a/workflow/controller/cache/sqldb_cache.go b/workflow/controller/cache/sqldb_cache.go index 353719186055..b4c3824bf2c9 100644 --- a/workflow/controller/cache/sqldb_cache.go +++ b/workflow/controller/cache/sqldb_cache.go @@ -28,6 +28,8 @@ func newSQLDBCache(namespace, name string, getQueries func() memodb.MemoizationD } } +// withQueries is necessary to make runtime enable/disable of SQL memoization +// safe, deterministic, and backward-safe for already-created cache instances. func (c *sqlDBCache) withQueries(fn func(memodb.MemoizationDB) error) error { if c.lock == nil { return fn(memodb.NullMemoizationDB) diff --git a/workflow/controller/config.go b/workflow/controller/config.go index b87c0ff7b36f..d58c26c8104c 100644 --- a/workflow/controller/config.go +++ b/workflow/controller/config.go @@ -25,6 +25,10 @@ var ( memoizationMigrate = memodb.Migrate ) +// resetMemoizationBackend switches memoization query backend state and closes the previous +// database session. It updates both controller-level query access and cache-factory query access, +// then clears wfc.memoSessionProxy. If sessionProxy is nil, it closes the currently tracked +// session; otherwise it closes the explicitly provided one. func (wfc *WorkflowController) resetMemoizationBackend(ctx context.Context, sessionProxy *sqldb.SessionProxy, cacheQueries memodb.MemoizationDB) { logger := logging.RequireLoggerFromContext(ctx) wfc.setMemoizationQueries(cacheQueries) diff --git a/workflow/controller/config_test.go b/workflow/controller/config_test.go index 9ca4a8c7e26e..b770e6c1e309 100644 --- a/workflow/controller/config_test.go +++ b/workflow/controller/config_test.go @@ -5,6 +5,7 @@ import ( stderrors "errors" "testing" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/upper/db/v4" apierr "k8s.io/apimachinery/pkg/api/errors" @@ -13,8 +14,6 @@ import ( "github.com/argoproj/argo-workflows/v4/config" wfv1 "github.com/argoproj/argo-workflows/v4/pkg/apis/workflow/v1alpha1" - "github.com/stretchr/testify/assert" - "github.com/argoproj/argo-workflows/v4/util/logging" memodb "github.com/argoproj/argo-workflows/v4/util/memo/db" "github.com/argoproj/argo-workflows/v4/util/sqldb" @@ -97,7 +96,7 @@ func TestUpdateConfigMemoizationDisableDetachesCachesBeforeClosingSession(t *tes require.NoError(t, err) assert.False(t, setQueriesSawClosedProxy) assert.Nil(t, controller.memoSessionProxy) - assert.EqualError(t, sessionProxy.With(ctx, func(db.Session) error { return nil }), "session proxy is closed") + require.EqualError(t, sessionProxy.With(ctx, func(db.Session) error { return nil }), "session proxy is closed") } func TestUpdateConfigMemoizationMigrationFailureDisablesMemoization(t *testing.T) { @@ -121,7 +120,7 @@ func TestUpdateConfigMemoizationMigrationFailureDisablesMemoization(t *testing.T require.NoError(t, err) assert.Nil(t, controller.memoSessionProxy) assert.Equal(t, memodb.NullMemoizationDB, controller.getMemoizationQueries()) - assert.EqualError(t, sessionProxy.With(ctx, func(db.Session) error { return nil }), "session proxy is closed") + require.EqualError(t, sessionProxy.With(ctx, func(db.Session) error { return nil }), "session proxy is closed") cache := controller.getMemoizationCache(ctx, "default", "memo-migrate-disabled-cache") require.NotNil(t, cache) diff --git a/workflow/controller/controller.go b/workflow/controller/controller.go index c373c56c9122..0a5bf8723be0 100644 --- a/workflow/controller/controller.go +++ b/workflow/controller/controller.go @@ -263,6 +263,8 @@ func (wfc *WorkflowController) newThrottler() sync.Throttler { return sync.NewMultiThrottler(wfc.Config.Parallelism, wfc.Config.NamespaceParallelism, f) } +// getMemoizationQueries returns the currently configured memoization query backend. +// If no backend is configured, it returns NullMemoizationDB to provide no-op behavior. func (wfc *WorkflowController) getMemoizationQueries() memodb.MemoizationDB { wfc.memoizationLock.RLock() defer wfc.memoizationLock.RUnlock() @@ -272,6 +274,8 @@ func (wfc *WorkflowController) getMemoizationQueries() memodb.MemoizationDB { return wfc.memoQueries } +// setMemoizationQueries updates the memoization query backend used by the controller. +// A nil backend is normalized to NullMemoizationDB to keep callers nil-safe. func (wfc *WorkflowController) setMemoizationQueries(queries memodb.MemoizationDB) { if queries == nil { queries = memodb.NullMemoizationDB @@ -281,6 +285,8 @@ func (wfc *WorkflowController) setMemoizationQueries(queries memodb.MemoizationD wfc.memoQueries = queries } +// withMemoizationQueries executes fn while holding a read lock on the memoization backend. +// If no backend is configured, fn is invoked with NullMemoizationDB. func (wfc *WorkflowController) withMemoizationQueries(fn func(memodb.MemoizationDB) error) error { wfc.memoizationLock.RLock() defer wfc.memoizationLock.RUnlock() From 492bf018bdf8637ee35582c4f5cc9bce8fd93c27 Mon Sep 17 00:00:00 2001 From: droctothorpe Date: Thu, 14 May 2026 08:55:20 -0400 Subject: [PATCH 13/15] test: enable sql-backed caching for mysql overlay and e2e tests Signed-off-by: droctothorpe --- .github/workflows/ci-build.yaml | 6 ++++++ .../overlays/workflow-controller-configmap.yaml | 12 ++++++++++++ manifests/quick-start-mysql.yaml | 12 ++++++++++++ test/e2e/sqldb_memoize_test.go | 4 ++-- 4 files changed, 32 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-build.yaml b/.github/workflows/ci-build.yaml index f0493ee0560f..7b61facb4ec1 100644 --- a/.github/workflows/ci-build.yaml +++ b/.github/workflows/ci-build.yaml @@ -251,6 +251,12 @@ jobs: - test: test-corefunctional profile: minimal use-api: false + - test: test-corefunctional + profile: mysql + use-api: false + - test: test-corefunctional + profile: postgres + use-api: false - test: test-functional profile: minimal use-api: false diff --git a/manifests/components/mysql/overlays/workflow-controller-configmap.yaml b/manifests/components/mysql/overlays/workflow-controller-configmap.yaml index 83b9ef02f388..9c4086616a3e 100644 --- a/manifests/components/mysql/overlays/workflow-controller-configmap.yaml +++ b/manifests/components/mysql/overlays/workflow-controller-configmap.yaml @@ -44,6 +44,18 @@ data: passwordSecret: name: argo-mysql-config key: password + memoization: | + tableName: cache_entries + mysql: + host: mysql + port: 3306 + database: argo + userNameSecret: + name: argo-mysql-config + key: username + passwordSecret: + name: argo-mysql-config + key: password retentionPolicy: | completed: 10 failed: 3 diff --git a/manifests/quick-start-mysql.yaml b/manifests/quick-start-mysql.yaml index 97539d2bf7f1..36d909e19c8f 100644 --- a/manifests/quick-start-mysql.yaml +++ b/manifests/quick-start-mysql.yaml @@ -183471,6 +183471,18 @@ data: - name: Completed Workflows scope: workflow-list url: http://workflows?label=workflows.argoproj.io/completed=true + memoization: | + tableName: cache_entries + mysql: + host: mysql + port: 3306 + database: argo + userNameSecret: + name: argo-mysql-config + key: username + passwordSecret: + name: argo-mysql-config + key: password metricsConfig: | enabled: true path: /metrics diff --git a/test/e2e/sqldb_memoize_test.go b/test/e2e/sqldb_memoize_test.go index 12dba813abd0..00cbd2513b66 100644 --- a/test/e2e/sqldb_memoize_test.go +++ b/test/e2e/sqldb_memoize_test.go @@ -94,7 +94,7 @@ func (s *SQLDBMemoizeSuite) TestSQLDBMemoize() { assert.True(t, memoHit, "expected at least one node to hit the cache") }) - // Also verify the entry landed in postgres, not a ConfigMap. + // Also verify the entry landed in the configured SQL database, not a ConfigMap. s.assertDBCacheEntry(ctx, cacheKey) s.assertNoConfigMap(ctx, "sqldb-memo-cache") } @@ -102,7 +102,7 @@ func (s *SQLDBMemoizeSuite) TestSQLDBMemoize() { // assertDBCacheEntry checks the configured memoization table directly. func (s *SQLDBMemoizeSuite) assertDBCacheEntry(ctx context.Context, key string) { memoCfg := s.Config.Memoization - // E2E tests connect to postgres via a port-forward on localhost. + // E2E tests connect to the configured SQL backend via a port-forward on localhost. cfg := *memoCfg if cfg.PostgreSQL != nil { pg := *cfg.PostgreSQL From 78a6fddd3ae128c903e26a33b2e967c4235ebd65 Mon Sep 17 00:00:00 2001 From: droctothorpe Date: Thu, 14 May 2026 09:04:01 -0400 Subject: [PATCH 14/15] test: make sure sql cache tests always run Signed-off-by: droctothorpe --- .github/workflows/ci-build.yaml | 4 ++-- test/e2e/sqldb_memoize_test.go | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci-build.yaml b/.github/workflows/ci-build.yaml index 7b61facb4ec1..14914260b8bb 100644 --- a/.github/workflows/ci-build.yaml +++ b/.github/workflows/ci-build.yaml @@ -251,10 +251,10 @@ jobs: - test: test-corefunctional profile: minimal use-api: false - - test: test-corefunctional + - test: test-sqldbmemoize profile: mysql use-api: false - - test: test-corefunctional + - test: test-sqldbmemoize profile: postgres use-api: false - test: test-functional diff --git a/test/e2e/sqldb_memoize_test.go b/test/e2e/sqldb_memoize_test.go index 00cbd2513b66..0a8b8cc16026 100644 --- a/test/e2e/sqldb_memoize_test.go +++ b/test/e2e/sqldb_memoize_test.go @@ -1,4 +1,4 @@ -//go:build corefunctional +//go:build sqldbmemoize package e2e @@ -61,9 +61,7 @@ spec: } func (s *SQLDBMemoizeSuite) TestSQLDBMemoize() { - if s.Config.Memoization == nil { - s.T().Skip("memoization DB not configured; skipping SQL cache test") - } + s.Require().NotNil(s.Config.Memoization, "memoization DB must be configured for SQL cache test") ctx := logging.TestContext(s.T().Context()) From 9f1fcf21f823a2889743d542e73ce4f2529984e2 Mon Sep 17 00:00:00 2001 From: droctothorpe Date: Sun, 17 May 2026 12:33:22 -0400 Subject: [PATCH 15/15] fix: flaky windows unit test Signed-off-by: droctothorpe --- workflow/sync/multi_throttler_test.go | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/workflow/sync/multi_throttler_test.go b/workflow/sync/multi_throttler_test.go index b38e976e0d90..0277af9dee31 100644 --- a/workflow/sync/multi_throttler_test.go +++ b/workflow/sync/multi_throttler_test.go @@ -207,12 +207,13 @@ func TestPriorityAcrossNamespaces(t *testing.T) { func TestParallelismUpdate(t *testing.T) { assert := assert.New(t) throttler := NewMultiThrottler(4, 0, func(Key) {}) - throttler.Add("a/0", 0, time.Now()) - throttler.Add("b/0", 0, time.Now()) - throttler.Add("c/0", 0, time.Now()) - throttler.Add("d/0", 0, time.Now()) - throttler.Add("e/0", 0, time.Now()) - throttler.Add("f/0", 0, time.Now()) + base := time.Unix(0, 0) + throttler.Add("a/0", 0, base) + throttler.Add("b/0", 0, base.Add(time.Second)) + throttler.Add("c/0", 0, base.Add(2*time.Second)) + throttler.Add("d/0", 0, base.Add(3*time.Second)) + throttler.Add("e/0", 0, base.Add(4*time.Second)) + throttler.Add("f/0", 0, base.Add(5*time.Second)) assert.True(throttler.Admit("a/0")) assert.True(throttler.Admit("b/0"))