PYTHON-5731 - Server selection deprioritization only for overload errors on replica sets (#2710)

NoahStapp · web-flow · commit 84814b2a7275 · 2026-02-23T13:18:24.000-05:00
diff --git a/pymongo/asynchronous/mongo_client.py b/pymongo/asynchronous/mongo_client.py
@@ -2825,7 +2825,11 @@ async def run(self) -> T:
                     if self._last_error is None:
                         self._last_error = exc
 
-                if self._server is not None:
+                if (
+                    self._server is not None
+                    and self._client.topology_description.topology_type_name == "Sharded"
+                    or exc.has_error_label("SystemOverloadedError")
+                ):
                     self._deprioritized_servers.append(self._server)
 
     def _is_not_eligible_for_retry(self) -> bool:
diff --git a/pymongo/synchronous/mongo_client.py b/pymongo/synchronous/mongo_client.py
@@ -2815,7 +2815,11 @@ def run(self) -> T:
                     if self._last_error is None:
                         self._last_error = exc
 
-                if self._server is not None:
+                if (
+                    self._server is not None
+                    and self._client.topology_description.topology_type_name == "Sharded"
+                    or exc.has_error_label("SystemOverloadedError")
+                ):
                     self._deprioritized_servers.append(self._server)
 
     def _is_not_eligible_for_retry(self) -> bool:
diff --git a/test/asynchronous/test_retryable_reads.py b/test/asynchronous/test_retryable_reads.py
@@ -261,6 +261,84 @@ async def test_retryable_reads_are_retried_on_the_same_implicit_session(self):
             self.assertEqual(command_docs[0]["lsid"], command_docs[1]["lsid"])
             self.assertIsNot(command_docs[0], command_docs[1])
 
+    @async_client_context.require_replica_set
+    @async_client_context.require_secondaries_count(1)
+    @async_client_context.require_failCommand_fail_point
+    @async_client_context.require_version_min(4, 4, 0)
+    async def test_03_01_retryable_reads_caused_by_overload_errors_are_retried_on_a_different_replicaset_server_when_one_is_available(
+        self
+    ):
+        listener = OvertCommandListener()
+
+        # 1. Create a client `client` with `retryReads=true`, `readPreference=primaryPreferred`, and command event monitoring enabled.
+        client = await self.async_rs_or_single_client(
+            event_listeners=[listener], retryReads=True, readPreference="primaryPreferred"
+        )
+
+        # 2. Configure a fail point with the RetryableError and SystemOverloadedError error labels.
+        command_args = {
+            "configureFailPoint": "failCommand",
+            "mode": {"times": 1},
+            "data": {
+                "failCommands": ["find"],
+                "errorLabels": ["RetryableError", "SystemOverloadedError"],
+                "errorCode": 6,
+            },
+        }
+        await async_set_fail_point(client, command_args)
+
+        # 3. Reset the command event monitor to clear the fail point command from its stored events.
+        listener.reset()
+
+        # 4. Execute a `find` command with `client`.
+        await client.t.t.find_one({})
+
+        # 5. Assert that one failed command event and one successful command event occurred.
+        self.assertEqual(len(listener.failed_events), 1)
+        self.assertEqual(len(listener.succeeded_events), 1)
+
+        # 6. Assert that both events occurred on different servers.
+        assert listener.failed_events[0].connection_id != listener.succeeded_events[0].connection_id
+
+    @async_client_context.require_replica_set
+    @async_client_context.require_secondaries_count(1)
+    @async_client_context.require_failCommand_fail_point
+    @async_client_context.require_version_min(4, 4, 0)
+    async def test_03_02_retryable_reads_caused_by_non_overload_errors_are_retried_on_the_same_replicaset_server(
+        self
+    ):
+        listener = OvertCommandListener()
+
+        # 1. Create a client `client` with `retryReads=true`, `readPreference=primaryPreferred`, and command event monitoring enabled.
+        client = await self.async_rs_or_single_client(
+            event_listeners=[listener], retryReads=True, readPreference="primaryPreferred"
+        )
+
+        # 2. Configure a fail point with the RetryableError error label.
+        command_args = {
+            "configureFailPoint": "failCommand",
+            "mode": {"times": 1},
+            "data": {
+                "failCommands": ["find"],
+                "errorLabels": ["RetryableError"],
+                "errorCode": 6,
+            },
+        }
+        await async_set_fail_point(client, command_args)
+
+        # 3. Reset the command event monitor to clear the fail point command from its stored events.
+        listener.reset()
+
+        # 4. Execute a `find` command with `client`.
+        await client.t.t.find_one({})
+
+        # 5. Assert that one failed command event and one successful command event occurred.
+        self.assertEqual(len(listener.failed_events), 1)
+        self.assertEqual(len(listener.succeeded_events), 1)
+
+        # 6. Assert that both events occurred the same server.
+        assert listener.failed_events[0].connection_id == listener.succeeded_events[0].connection_id
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/test_retryable_reads.py b/test/test_retryable_reads.py
@@ -259,6 +259,84 @@ def test_retryable_reads_are_retried_on_the_same_implicit_session(self):
             self.assertEqual(command_docs[0]["lsid"], command_docs[1]["lsid"])
             self.assertIsNot(command_docs[0], command_docs[1])
 
+    @client_context.require_replica_set
+    @client_context.require_secondaries_count(1)
+    @client_context.require_failCommand_fail_point
+    @client_context.require_version_min(4, 4, 0)
+    def test_03_01_retryable_reads_caused_by_overload_errors_are_retried_on_a_different_replicaset_server_when_one_is_available(
+        self
+    ):
+        listener = OvertCommandListener()
+
+        # 1. Create a client `client` with `retryReads=true`, `readPreference=primaryPreferred`, and command event monitoring enabled.
+        client = self.rs_or_single_client(
+            event_listeners=[listener], retryReads=True, readPreference="primaryPreferred"
+        )
+
+        # 2. Configure a fail point with the RetryableError and SystemOverloadedError error labels.
+        command_args = {
+            "configureFailPoint": "failCommand",
+            "mode": {"times": 1},
+            "data": {
+                "failCommands": ["find"],
+                "errorLabels": ["RetryableError", "SystemOverloadedError"],
+                "errorCode": 6,
+            },
+        }
+        set_fail_point(client, command_args)
+
+        # 3. Reset the command event monitor to clear the fail point command from its stored events.
+        listener.reset()
+
+        # 4. Execute a `find` command with `client`.
+        client.t.t.find_one({})
+
+        # 5. Assert that one failed command event and one successful command event occurred.
+        self.assertEqual(len(listener.failed_events), 1)
+        self.assertEqual(len(listener.succeeded_events), 1)
+
+        # 6. Assert that both events occurred on different servers.
+        assert listener.failed_events[0].connection_id != listener.succeeded_events[0].connection_id
+
+    @client_context.require_replica_set
+    @client_context.require_secondaries_count(1)
+    @client_context.require_failCommand_fail_point
+    @client_context.require_version_min(4, 4, 0)
+    def test_03_02_retryable_reads_caused_by_non_overload_errors_are_retried_on_the_same_replicaset_server(
+        self
+    ):
+        listener = OvertCommandListener()
+
+        # 1. Create a client `client` with `retryReads=true`, `readPreference=primaryPreferred`, and command event monitoring enabled.
+        client = self.rs_or_single_client(
+            event_listeners=[listener], retryReads=True, readPreference="primaryPreferred"
+        )
+
+        # 2. Configure a fail point with the RetryableError error label.
+        command_args = {
+            "configureFailPoint": "failCommand",
+            "mode": {"times": 1},
+            "data": {
+                "failCommands": ["find"],
+                "errorLabels": ["RetryableError"],
+                "errorCode": 6,
+            },
+        }
+        set_fail_point(client, command_args)
+
+        # 3. Reset the command event monitor to clear the fail point command from its stored events.
+        listener.reset()
+
+        # 4. Execute a `find` command with `client`.
+        client.t.t.find_one({})
+
+        # 5. Assert that one failed command event and one successful command event occurred.
+        self.assertEqual(len(listener.failed_events), 1)
+        self.assertEqual(len(listener.succeeded_events), 1)
+
+        # 6. Assert that both events occurred the same server.
+        assert listener.failed_events[0].connection_id == listener.succeeded_events[0].connection_id
+
 
 if __name__ == "__main__":
     unittest.main()