Fix a bug where servers could be marked as up when they were failing (#16506)

After this change, a server will only be reported as back online if it was previously having requests fail.
7 months ago · 77dfc1f939
--- a/changelog.d/16506.bugfix
+++ b/changelog.d/16506.bugfix
@@ -0,0 +1 @@
 Fix a bug introduced in Synapse 1.59.0 where servers would be incorrectly marked as available when a request resulted in an error.
--- a/synapse/util/retryutils.py
+++ b/synapse/util/retryutils.py
@@ -170,10 +170,10 @@ class RetryDestinationLimiter:
                database in milliseconds, or zero if the last request was
                successful.
            backoff_on_404: Back off if we get a 404

            backoff_on_failure: set to False if we should not increase the
                retry interval on a failure.

            notifier: A notifier used to mark servers as up.
            replication_client: A replication client used to mark servers as up.
            backoff_on_all_error_codes: Whether we should back off on any
                error code.
        """
@@ -237,6 +237,9 @@ class RetryDestinationLimiter:
            else:
                valid_err_code = False

        # Whether previous requests to the destination had been failing.
        previously_failing = bool(self.failure_ts)

        if success:
            # We connected successfully.
            if not self.retry_interval:
@@ -282,6 +285,9 @@ class RetryDestinationLimiter:
            if self.failure_ts is None:
                self.failure_ts = retry_last_ts

        # Whether the destination is still failing after the current request.
        currently_failing = bool(self.failure_ts)

        async def store_retry_timings() -> None:
            try:
                await self.store.set_destination_retry_timings(
@@ -291,17 +297,15 @@ class RetryDestinationLimiter:
                    self.retry_interval,
                )

                if self.notifier:
                    # Inform the relevant places that the remote server is back up.
                    self.notifier.notify_remote_server_up(self.destination)

                if self.replication_client:
                    # If we're on a worker we try and inform master about this. The
                    # replication client doesn't hook into the notifier to avoid
                    # infinite loops where we send a `REMOTE_SERVER_UP` command to
                    # master, which then echoes it back to us which in turn pokes
                    # the notifier.
                    self.replication_client.send_remote_server_up(self.destination)
                # If the server was previously failing, but is no longer.
                if previously_failing and not currently_failing:
                    if self.notifier:
                        # Inform the relevant places that the remote server is back up.
                        self.notifier.notify_remote_server_up(self.destination)

                    if self.replication_client:
                        # Inform other workers that the remote server is up.
                        self.replication_client.send_remote_server_up(self.destination)

            except Exception:
                logger.exception("Failed to store destination_retry_timings")
--- a/tests/util/test_retryutils.py
+++ b/tests/util/test_retryutils.py
@@ -11,6 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from unittest import mock

 from synapse.notifier import Notifier
 from synapse.replication.tcp.handler import ReplicationCommandHandler
 from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter

 from tests.unittest import HomeserverTestCase
@@ -109,6 +113,77 @@ class RetryLimiterTestCase(HomeserverTestCase):
        new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
        self.assertIsNone(new_timings)

    def test_notifier_replication(self) -> None:
        """Check when "remote server up" notifications are emitted.

        Drives a single retry limiter through four requests (success, failure,
        failure, success) and verifies that the notifier and the replication
        client are only poked on the final success, i.e. once the destination
        recovers after having failed.
        """
        destination = "test_dest"
        main_store = self.hs.get_datastores().main

        mock_notifier = mock.Mock(spec=Notifier)
        mock_replication = mock.Mock(spec=ReplicationCommandHandler)

        limiter = self.get_success(
            get_retry_limiter(
                destination,
                self.clock,
                main_store,
                notifier=mock_notifier,
                replication_client=mock_replication,
            )
        )

        def run_request(fail: bool) -> None:
            """Push one request through the limiter, raising inside it if `fail`."""
            self.pump(1)
            if fail:
                try:
                    with limiter:
                        raise AssertionError("argh")
                except AssertionError:
                    pass
            else:
                # Deliberately not wrapped in a try: an unexpected error on the
                # success path must propagate and fail the test.
                with limiter:
                    pass
            self.pump()

        def stored_timings() -> object:
            """Fetch whatever retry timings are currently stored for the destination."""
            return self.get_success(
                main_store.get_destination_retry_timings(destination)
            )

        def assert_no_server_up_notifications() -> None:
            """Neither the notifier nor the replication client has been poked."""
            mock_notifier.notify_remote_server_up.assert_not_called()
            mock_replication.send_remote_server_up.assert_not_called()

        # Phase 1: the destination has never failed, so a successful request
        # stores nothing and notifies nobody.
        run_request(fail=False)
        self.assertIsNone(stored_timings())
        assert_no_server_up_notifications()

        # Phases 2 and 3: two failing requests in a row. Each one leaves retry
        # timings in the store, but neither may produce a "server up"
        # notification.
        for _ in range(2):
            run_request(fail=True)
            # The exact retry timings are tested separately.
            self.assertIsNotNone(stored_timings())
            assert_no_server_up_notifications()

        # Phase 4: the destination recovers. The stored timings are cleared and
        # exactly one notification goes out on each channel.
        run_request(fail=False)
        self.assertIsNone(stored_timings())
        mock_notifier.notify_remote_server_up.assert_called_once_with(destination)
        mock_replication.send_remote_server_up.assert_called_once_with(destination)

    def test_max_retry_interval(self) -> None:
        """Test that `destination_max_retry_interval` setting works as expected"""
        store = self.hs.get_datastores().main