Skip to content

Commit f3e0692

Browse files
committed
fix: Limit concurrent schema cache loads
Triggering a schema cache reload immediately upon receipt of a notification by the listener leads to a thundering herd problem in a PostgREST cluster. This change limits the number of concurrent schema cache loading queries using advisory locks.
1 parent 7e9df64 commit f3e0692

3 files changed

Lines changed: 164 additions & 10 deletions

File tree

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ All notable changes to this project will be documented in this file. From versio
1313
- Optimize requests with `Prefer: count=exact` that do not use ranges or `db-max-rows` by @laurenceisla in #3957
1414
+ Removed unnecessary double count when building the `Content-Range`.
1515

16+
### Fixed
17+
18+
- Limit concurrent schema cache loads by @mkleczek in #4643
19+
1620
### Changed
1721

1822
- Log error when `db-schemas` config contains schema `pg_catalog` or `information_schema` by @taimoorzaeem in #4359

src/PostgREST/AppState.hs

Lines changed: 72 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1-
{-# LANGUAGE LambdaCase #-}
2-
{-# LANGUAGE NamedFieldPuns #-}
3-
{-# LANGUAGE RecordWildCards #-}
1+
{-# LANGUAGE LambdaCase #-}
2+
{-# LANGUAGE NamedFieldPuns #-}
3+
{-# LANGUAGE QuasiQuotes #-}
4+
{-# LANGUAGE RecordWildCards #-}
5+
{-# LANGUAGE TypeApplications #-}
46

57
module PostgREST.AppState
68
( AppState
@@ -36,7 +38,8 @@ import Data.Either.Combinators (whenLeft)
3638
import qualified Data.Text as T (unpack)
3739
import qualified Hasql.Pool as SQL
3840
import qualified Hasql.Pool.Config as SQL
39-
import qualified Hasql.Session as SQL
41+
import qualified Hasql.Session as SQL hiding (statement)
42+
import qualified Hasql.Transaction as SQL hiding (sql)
4043
import qualified Hasql.Transaction.Sessions as SQL
4144
import qualified Network.HTTP.Types.Status as HTTP
4245
import qualified Network.Socket as NS
@@ -73,9 +76,16 @@ import PostgREST.SchemaCache (SchemaCache (..),
7376
import PostgREST.SchemaCache.Identifiers (quoteQi)
7477
import PostgREST.Unix (createAndBindDomainSocket)
7578

76-
import Data.Streaming.Network (bindPortTCP, bindRandomPortTCP)
77-
import Data.String (IsString (..))
78-
import Protolude
79+
import Data.Streaming.Network (bindPortTCP,
80+
bindRandomPortTCP)
81+
import Data.String (IsString (..))
82+
import qualified Hasql.Decoders as HD
83+
import qualified Hasql.Encoders as HE
84+
import qualified Hasql.Statement as SQL
85+
import NeatInterpolation (trimming)
86+
import PostgREST.Metrics (MetricsState (connTrack))
87+
import Protolude
88+
7989

8090
data AppState = AppState
8191
-- | Database connection pool
@@ -360,7 +370,7 @@ getObserver = stateObserver
360370
-- + Because connections cache the pg catalog(see #2620)
361371
-- + For rapid recovery. Otherwise, the pool idle or lifetime timeout would have to be reached for new healthy connections to be acquired.
362372
retryingSchemaCacheLoad :: AppState -> IO ()
363-
retryingSchemaCacheLoad appState@AppState{stateObserver=observer, stateMainThreadId=mainThreadId} =
373+
retryingSchemaCacheLoad appState@AppState{stateObserver=observer, stateMainThreadId=mainThreadId, stateMetrics} =
364374
void $ retrying retryPolicy shouldRetry (\RetryStatus{rsIterNumber, rsPreviousDelay} -> do
365375
when (rsIterNumber > 0) $ do
366376
let delay = fromMaybe 0 rsPreviousDelay `div` oneSecondInUs
@@ -402,9 +412,25 @@ retryingSchemaCacheLoad appState@AppState{stateObserver=observer, stateMainThrea
402412
qSchemaCache :: IO (Maybe SchemaCache)
403413
qSchemaCache = do
404414
conf@AppConfig{..} <- getConfig appState
415+
-- Throttle concurrent schema cache loads, guarded by advisory locks.
416+
-- This is to prevent thundering herd problem on startup or when many PostgREST
417+
-- instances receive "reload schema" notifications at the same time
418+
-- schema reloading session + listener session
419+
-- See get_lock_sql for details of the algorithm.
420+
-- Here we calculate the number of open connections passed to the query.
421+
Metrics.ConnStats connected inUse <- Metrics.connectionCounts $ connTrack stateMetrics
422+
-- Determine whether schema cache loading will create a new session
423+
let
424+
scLoadingSessions = case (connected <= inUse, inUse >= configDbPoolSize) of
425+
(True, False) -> 1 -- all connections in use but pool not full - schema cache loading will create session
426+
_ -> 0
427+
withTxLock = SQL.statement
428+
(fromIntegral $ connected + scLoadingSessions)
429+
(SQL.Statement get_lock_sql get_lock_params HD.noResult configDbPreparedStatements)
430+
405431
(resultTime, result) <-
406432
let transaction = if configDbPreparedStatements then SQL.transaction else SQL.unpreparedTransaction in
407-
timeItT $ usePool appState (transaction SQL.ReadCommitted SQL.Read $ querySchemaCache conf)
433+
timeItT $ usePool appState (transaction SQL.ReadCommitted SQL.Read $ withTxLock *> querySchemaCache conf)
408434
case result of
409435
Left e -> do
410436
putSCacheStatus appState SCPending
@@ -422,6 +448,43 @@ retryingSchemaCacheLoad appState@AppState{stateObserver=observer, stateMainThrea
422448
observer $ SchemaCacheLoadedObs t
423449
putSCacheStatus appState SCLoaded
424450
return $ Just sCache
451+
where
452+
-- Recursive query that tries acquiring locks in order
453+
-- and waits for randomly selected lock if no attempt succeeded.
454+
-- It has a single parameter: this node open connection count.
455+
-- It is used to estimate the number of nodes
456+
-- by counting the number of active sessions for current session_user
457+
-- and dividing it by this node open connections.
458+
-- Assuming load is uniform among cluster nodes, all should have
459+
-- statistically the same number of open connections.
460+
-- Once the number of nodes is known we calculate the number
461+
-- of locks as ceil(log(2, number_of_nodes))
462+
get_lock_sql = encodeUtf8 [trimming|
463+
WITH RECURSIVE attempts AS (
464+
SELECT 1 AS lock_number, pg_try_advisory_xact_lock(lock_id, 1) AS success FROM parameters
465+
UNION ALL
466+
SELECT next_lock_number AS lock_number, pg_try_advisory_xact_lock(lock_id, next_lock_number) AS success
467+
FROM
468+
parameters CROSS JOIN LATERAL (
469+
SELECT lock_number + 1 AS next_lock_number FROM attempts
470+
WHERE NOT success AND lock_number < locks_count
471+
ORDER BY lock_number DESC
472+
LIMIT 1
473+
) AS previous_attempt
474+
),
475+
counts AS (
476+
SELECT round(log(2, round(count(*)::double precision/$$1)::numeric))::int AS locks_count
477+
FROM
478+
pg_stat_activity WHERE usename = SESSION_USER
479+
),
480+
parameters AS (
481+
SELECT locks_count, 50168275 AS lock_id FROM counts WHERE locks_count > 0
482+
)
483+
SELECT pg_advisory_xact_lock(lock_id, floor(random() * locks_count)::int + 1)
484+
FROM
485+
parameters WHERE NOT EXISTS (SELECT 1 FROM attempts WHERE success) |]
486+
487+
get_lock_params = HE.param (HE.nonNullable HE.int4)
425488

426489
shouldRetry :: RetryStatus -> (Maybe PgVersion, Maybe SchemaCache) -> IO Bool
427490
shouldRetry _ (pgVer, sCache) = do

test/io/test_io.py

Lines changed: 88 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"Unit tests for Input/Ouput of PostgREST seen as a black box."
22

3+
import contextlib
34
import os
45
import re
56
import signal
@@ -18,6 +19,7 @@
1819
sleep_until_postgrest_full_reload,
1920
sleep_until_postgrest_scache_reload,
2021
wait_until_exit,
22+
wait_until_status_code,
2123
)
2224

2325

@@ -1058,6 +1060,91 @@ def test_schema_cache_concurrent_notifications(slow_schema_cache_env):
10581060
assert response.status_code == 200
10591061

10601062

1063+
@pytest.mark.parametrize(
1064+
"instance_count, expected_concurrency", [(2, 2), (4, 3), (6, 4), (8, 4), (16, 5)]
1065+
)
1066+
def test_schema_cache_reload_throttled_with_advisory_locks(
1067+
instance_count, expected_concurrency, slow_schema_cache_env
1068+
):
1069+
"schema cache reloads should be throttled across instances if instance count > 10"
1070+
1071+
internal_sleep_ms = int(
1072+
slow_schema_cache_env["PGRST_INTERNAL_SCHEMA_CACHE_QUERY_SLEEP"]
1073+
)
1074+
lock_wait_threshold_ms = internal_sleep_ms * 2
1075+
query_log_pattern = re.compile(r"Schema cache queried in ([\d.]+) milliseconds")
1076+
1077+
def read_available_output_lines(postgrest):
1078+
try:
1079+
output = postgrest.process.stdout.read()
1080+
except BlockingIOError:
1081+
return []
1082+
1083+
if not output:
1084+
return []
1085+
return output.decode().splitlines()
1086+
1087+
with contextlib.ExitStack() as stack:
1088+
instances = [
1089+
stack.enter_context(
1090+
run(
1091+
env=slow_schema_cache_env,
1092+
wait_for_readiness=False,
1093+
wait_max_seconds=10,
1094+
)
1095+
)
1096+
for _ in range(instance_count)
1097+
]
1098+
1099+
for postgrest in instances:
1100+
wait_until_status_code(
1101+
postgrest.admin.baseurl + "/ready", max_seconds=10, status_code=200
1102+
)
1103+
1104+
# Drop startup logs so only reload logs are parsed.
1105+
for postgrest in instances:
1106+
read_available_output_lines(postgrest)
1107+
1108+
response = instances[0].session.get("/rpc/notify_pgrst")
1109+
assert response.status_code == 204
1110+
1111+
# Wait long enough for the lock-throttled cache reloads to finish.
1112+
time.sleep((internal_sleep_ms / 1000) * 2)
1113+
1114+
reload_durations_ms = []
1115+
for postgrest in instances:
1116+
output_lines = []
1117+
for _ in range(instance_count * 2):
1118+
output_lines.extend(read_available_output_lines(postgrest))
1119+
if any(query_log_pattern.search(line) for line in output_lines):
1120+
break
1121+
time.sleep(0.2)
1122+
1123+
durations = []
1124+
for line in output_lines:
1125+
match = query_log_pattern.search(line)
1126+
if match:
1127+
durations.append(float(match.group(1)))
1128+
1129+
assert durations
1130+
reload_durations_ms.append(max(durations))
1131+
1132+
assert len(reload_durations_ms) == instance_count
1133+
1134+
# 10 instances should be fast, remaining instances should be slow
1135+
assert (
1136+
instance_count
1137+
- len(
1138+
[
1139+
duration
1140+
for duration in reload_durations_ms
1141+
if duration > lock_wait_threshold_ms
1142+
]
1143+
)
1144+
== expected_concurrency
1145+
)
1146+
1147+
10611148
def test_schema_cache_query_sleep_logs(defaultenv):
10621149
"""Schema cache sleep should be reflected in the logged query duration."""
10631150

@@ -1691,7 +1778,7 @@ def test_requests_with_resource_embedding_wait_for_schema_cache_reload(defaulten
16911778
env = {
16921779
**defaultenv,
16931780
"PGRST_DB_POOL": "2",
1694-
"PGRST_INTERNAL_SCHEMA_CACHE_RELATIONSHIP_LOAD_SLEEP": "5100",
1781+
"PGRST_INTERNAL_SCHEMA_CACHE_RELATIONSHIP_LOAD_SLEEP": "5200",
16951782
}
16961783

16971784
with run(env=env, wait_max_seconds=30) as postgrest:

0 commit comments

Comments
 (0)