-
-
Notifications
You must be signed in to change notification settings - Fork 1.2k
fix: implement connection tracking in metrics #4672
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,7 +5,10 @@ | |
| -} | ||
| module PostgREST.Metrics | ||
| ( init | ||
| , ConnTrack | ||
| , ConnStats (..) | ||
| , MetricsState (..) | ||
| , connectionCounts | ||
| , observationMetrics | ||
| , metricsToText | ||
| ) where | ||
|
|
@@ -19,13 +22,18 @@ | |
|
|
||
| import PostgREST.Observation | ||
|
|
||
|
|
||
| import Protolude | ||
| import Control.Arrow ((&&&)) | ||
| import Data.Bitraversable (bisequenceA) | ||
| import Data.Tuple.Extra (both) | ||
| import Data.UUID (UUID) | ||
| import qualified Focus | ||
| import Protolude | ||
| import qualified StmHamt.SizedHamt as SH | ||
|
|
||
| data MetricsState = | ||
| MetricsState { | ||
| poolTimeouts :: Counter, | ||
| poolAvailable :: Gauge, | ||
| connTrack :: ConnTrack, | ||
| poolWaiting :: Gauge, | ||
| poolMaxSize :: Gauge, | ||
| schemaCacheLoads :: Vector Label1 Counter, | ||
|
|
@@ -40,7 +48,7 @@ | |
| whenM getRTSStatsEnabled $ void $ register PMG.ghcMetrics | ||
| metricState <- MetricsState <$> | ||
| register (counter (Info "pgrst_db_pool_timeouts_total" "The total number of pool connection timeouts")) <*> | ||
| register (gauge (Info "pgrst_db_pool_available" "Available connections in the pool")) <*> | ||
| register (Metric ((identity &&& dbPoolAvailable) <$> connectionTracker)) <*> | ||
| register (gauge (Info "pgrst_db_pool_waiting" "Requests waiting to acquire a pool connection")) <*> | ||
| register (gauge (Info "pgrst_db_pool_max" "Max pool connections")) <*> | ||
| register (vector "status" $ counter (Info "pgrst_schema_cache_loads_total" "The total number of times the schema cache was loaded")) <*> | ||
|
|
@@ -50,20 +58,28 @@ | |
| register (counter (Info "pgrst_jwt_cache_evictions_total" "The total number of JWT cache evictions")) | ||
| setGauge (poolMaxSize metricState) (fromIntegral configDbPoolSize) | ||
| pure metricState | ||
| where | ||
| dbPoolAvailable = (pure . noLabelsGroup (Info "pgrst_db_pool_available" "Available connections in the pool") GaugeType . calcAvailable <$>) . connectionCounts | ||
| where | ||
| calcAvailable = liftA2 (-) connected inUse | ||
| toSample name labels = Sample name labels . encodeUtf8 . show | ||
| noLabelsGroup info sampleType = SampleGroup info sampleType . pure . toSample (metricName info) mempty | ||
|
|
||
| -- Only some observations are used as metrics | ||
| observationMetrics :: MetricsState -> ObservationHandler | ||
| observationMetrics MetricsState{..} obs = case obs of | ||
| PoolAcqTimeoutObs -> do | ||
| incCounter poolTimeouts | ||
| (HasqlPoolObs (SQL.ConnectionObservation _ status)) -> case status of | ||
| SQL.ReadyForUseConnectionStatus _ -> do | ||
| incGauge poolAvailable | ||
| SQL.InUseConnectionStatus -> do | ||
| decGauge poolAvailable | ||
| SQL.TerminatedConnectionStatus _ -> do | ||
| decGauge poolAvailable | ||
| SQL.ConnectingConnectionStatus -> pure () | ||
| -- Handle pool observations with connection tracking | ||
| -- this is necessary because it is not possible | ||
| -- to accurately maintain open/in use connection counts | ||
| -- statelessly based only on pool observation events. | ||
| -- The reason is that hasql-pool emits TerminatedConnectionStatus | ||
| -- both for connections successfully established and failed when connecting. | ||
| -- When receiving TerminatedConnectionStatus we have to find out | ||
| -- if we can decrement established connection count. To do that we have to track | ||
| -- established connections. | ||
|
Comment on lines
+73
to
+81
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for the comment - this helps a lot. Q: Would this reasoning be different once we vendor hasql, including hasql-pool, and could change the respective events? Would this make the fix here simpler, because we could keep using the same events?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Yes. EDIT: I mean |
||
| (HasqlPoolObs sqlObs) -> trackConnections connTrack sqlObs | ||
| PoolRequest -> | ||
| incGauge poolWaiting | ||
| PoolRequestFullfilled -> | ||
|
|
@@ -81,3 +97,28 @@ | |
|
|
||
| metricsToText :: IO LBS.ByteString | ||
| metricsToText = exportMetricsAsText | ||
|
|
||
| data ConnStats = ConnStats { | ||
| connected :: Int, | ||
| inUse :: Int | ||
| } deriving (Eq, Show) | ||
|
|
||
| data ConnTrack = ConnTrack { connTrackConnected :: SH.SizedHamt UUID, connTrackInUse :: SH.SizedHamt UUID } | ||
|
|
||
| connectionTracker :: IO ConnTrack | ||
| connectionTracker = ConnTrack <$> SH.newIO <*> SH.newIO | ||
|
|
||
| trackConnections :: ConnTrack -> SQL.Observation -> IO () | ||
| trackConnections ConnTrack{..} (SQL.ConnectionObservation uuid status) = case status of | ||
| SQL.ReadyForUseConnectionStatus _ -> atomically $ | ||
| SH.insert identity uuid connTrackConnected *> | ||
| SH.focus Focus.delete identity uuid connTrackInUse | ||
| SQL.TerminatedConnectionStatus _ -> atomically $ | ||
| SH.focus Focus.delete identity uuid connTrackConnected *> | ||
| SH.focus Focus.delete identity uuid connTrackInUse | ||
| SQL.InUseConnectionStatus -> atomically $ | ||
| SH.insert identity uuid connTrackInUse | ||
| _ -> mempty | ||
|
|
||
| connectionCounts :: ConnTrack -> IO ConnStats | ||
| connectionCounts = atomically . fmap (uncurry ConnStats) . bisequenceA . both SH.size . (connTrackConnected &&& connTrackInUse) | ||
|
steve-chavez marked this conversation as resolved.
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,17 +6,21 @@ | |
|
|
||
| module Observation.MetricsSpec where | ||
|
|
||
| import Data.List (lookup) | ||
| import Network.Wai (Application) | ||
| import Data.List (lookup) | ||
| import qualified Hasql.Pool.Observation as SQL | ||
| import Network.Wai (Application) | ||
| import ObsHelper | ||
| import qualified PostgREST.AppState as AppState | ||
| import PostgREST.Config (AppConfig (configDbSchemas)) | ||
| import qualified PostgREST.Metrics as Metrics | ||
| import qualified PostgREST.AppState as AppState | ||
| import PostgREST.Config (AppConfig (configDbSchemas)) | ||
| import PostgREST.Metrics (ConnStats (..), | ||
| MetricsState (..), | ||
| connectionCounts) | ||
| import PostgREST.Observation | ||
| import Prometheus (getCounter, getVectorWith) | ||
| import Protolude | ||
| import Test.Hspec (SpecWith, describe, it) | ||
| import Test.Hspec.Wai (getState) | ||
| import Prometheus (getCounter, getVectorWith) | ||
| import Test.Hspec (SpecWith, describe, it) | ||
| import Test.Hspec.Wai (getState) | ||
|
|
||
| import Protolude | ||
|
|
||
| spec :: SpecWith (SpecState, Application) | ||
| spec = describe "Server started with metrics enabled" $ do | ||
|
|
@@ -71,9 +75,33 @@ spec = describe "Server started with metrics enabled" $ do | |
| -- (there should be none but we need to verify that) | ||
| threadDelay $ 1 * sec | ||
|
|
||
| it "Should track in use connections" $ do | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Q: Does this reproduce the problem on #4622? If not, could we add a new repro test or is that too complicated?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
As far as I understand it might be possible to reproduce once we have
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. For that perhaps you could restart #4674? Should be easier I think now that we don't have I also wonder if there's a simpler/focused way by just killing the connection like mentioned on #1766 (comment)
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. For a case like this, I wouldn't block on adding an exactly-reproducing test of the original issue: We really changed the entire approach to calculating this metric, from simply reacting to certain events to tracking each connection. I find it's unlikely that someone accidentally introduces a regression here. That being said, I think we absolutely need some commentary in
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Done (see inline comment in
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Yeah, I was thinking we should take this chance to add the required infra to test otherwise we'll end up with a "test debt" (like on #1766). But I guess we still have #4670 as another chance to test an unreliable network. So I won't block here if you approve here @wolfgangwalther
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
In light of #4672 (comment), I think there is a chance that we'll rewrite this whole thing again when we have
|
||
| SpecState{specAppState = appState, specMetrics = metrics, specObsChan} <- getState | ||
| let waitFor = waitForObs specObsChan | ||
|
|
||
| liftIO $ checkState' metrics [ | ||
| -- we expect in use connections to be the same once finished | ||
| inUseConnections (+ 0) | ||
| ] $ do | ||
| signal <- newEmptyMVar | ||
| -- make sure waiting thread is signaled | ||
| bracket_ (pure ()) (putMVar signal ()) $ | ||
| -- expecting one more connection in use | ||
| checkState' metrics [ | ||
| inUseConnections (+ 1) | ||
| ] $ do | ||
| -- start a thread hanging on a single connection until signaled | ||
| void $ forkIO $ void $ AppState.usePool appState $ liftIO (readMVar signal) | ||
| -- main thread waits for ConnectionObservation with InUseConnectionStatus | ||
| -- after which used connections count should be incremented | ||
| waitFor (1 * sec) "InUseConnectionStatus" $ \x -> [ o | o@(HasqlPoolObs (SQL.ConnectionObservation _ SQL.InUseConnectionStatus)) <- pure x] | ||
|
|
||
| -- hanging thread was signaled and should return the connection | ||
| waitFor (1 * sec) "ReadyForUseConnectionStatus" $ \x -> [ o | o@(HasqlPoolObs (SQL.ConnectionObservation _ (SQL.ReadyForUseConnectionStatus _))) <- pure x] | ||
|
Comment on lines
+79
to
+100
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm looking at this test-case, and it just feels like an imperative language like Python is much better to represent such a step-by-step test. I can't even make full sense of the test, but I believe pytest + toxiproxy + metrics endpoint should allow the same test, right? |
||
|
|
||
| where | ||
| -- prometheus-client api to handle vectors is convoluted | ||
| schemaCacheLoads label = expectField @"schemaCacheLoads" $ | ||
| fmap (maybe (0::Int) round . lookup label) . (`getVectorWith` getCounter) | ||
| inUseConnections = expectField @"connTrack" ((inUse <$>) . connectionCounts) | ||
| sec = 1000000 | ||
Uh oh!
There was an error while loading. Please reload this page.