Adds three new configuration variables: * destination_min_retry_interval is identical to before (10m). * destination_retry_multiplier is now 2 instead of 5, so the maximum value is reached more slowly. * destination_max_retry_interval is one week instead of (essentially) infinity. Capping this value causes destinations to continue to be retried occasionally instead of being lost forever. The previous value was 2^62 milliseconds.
@@ -0,0 +1 @@ | |||
Allow for the configuration of the backoff algorithm for federation destinations. |
@@ -1242,6 +1242,14 @@ like sending a federation transaction. | |||
* `max_short_retries`: maximum number of retries for the short retry algo. Default to 3 attempts. | |||
* `max_long_retries`: maximum number of retries for the long retry algo. Default to 10 attempts. | |||
The following options control the retry logic when communicating with a specific homeserver destination. | |||
Unlike the previous configuration options, these values apply across all requests | |||
for a given destination and the state of the backoff is stored in the database. | |||
* `destination_min_retry_interval`: the initial backoff, after the first request fails. Defaults to 10m. | |||
* `destination_retry_multiplier`: how much we multiply the backoff by after each subsequent fail. Defaults to 2. | |||
* `destination_max_retry_interval`: a cap on the backoff. Defaults to a week. | |||
Example configuration: | |||
```yaml | |||
federation: | |||
@@ -1250,6 +1258,9 @@ federation: | |||
max_long_retry_delay: 100s | |||
max_short_retries: 5 | |||
max_long_retries: 20 | |||
destination_min_retry_interval: 30s | |||
destination_retry_multiplier: 5 | |||
destination_max_retry_interval: 12h | |||
``` | |||
--- | |||
## Caching | |||
@@ -65,5 +65,23 @@ class FederationConfig(Config): | |||
self.max_long_retries = federation_config.get("max_long_retries", 10) | |||
self.max_short_retries = federation_config.get("max_short_retries", 3) | |||
# Allow for the configuration of the backoff algorithm used
# when trying to reach an unavailable destination.
# Unlike the previous configuration options, these values apply
# across multiple requests, and the backoff state is stored in the database.
self.destination_min_retry_interval_ms = Config.parse_duration( | |||
federation_config.get("destination_min_retry_interval", "10m") | |||
) | |||
self.destination_retry_multiplier = federation_config.get( | |||
"destination_retry_multiplier", 2 | |||
) | |||
self.destination_max_retry_interval_ms = min( | |||
Config.parse_duration( | |||
federation_config.get("destination_max_retry_interval", "7d") | |||
), | |||
# Set a hard-limit to not overflow the database column. | |||
2**62, | |||
) | |||
_METRICS_FOR_DOMAINS_SCHEMA = {"type": "array", "items": {"type": "string"}} |
@@ -27,15 +27,6 @@ if TYPE_CHECKING: | |||
logger = logging.getLogger(__name__) | |||
# the initial backoff, after the first transaction fails | |||
MIN_RETRY_INTERVAL = 10 * 60 * 1000 | |||
# how much we multiply the backoff by after each subsequent fail | |||
RETRY_MULTIPLIER = 5 | |||
# a cap on the backoff. (Essentially none) | |||
MAX_RETRY_INTERVAL = 2**62 | |||
class NotRetryingDestination(Exception): | |||
def __init__(self, retry_last_ts: int, retry_interval: int, destination: str): | |||
@@ -169,6 +160,16 @@ class RetryDestinationLimiter: | |||
self.notifier = notifier | |||
self.replication_client = replication_client | |||
self.destination_min_retry_interval_ms = ( | |||
self.store.hs.config.federation.destination_min_retry_interval_ms | |||
) | |||
self.destination_retry_multiplier = ( | |||
self.store.hs.config.federation.destination_retry_multiplier | |||
) | |||
self.destination_max_retry_interval_ms = ( | |||
self.store.hs.config.federation.destination_max_retry_interval_ms | |||
) | |||
def __enter__(self) -> None: | |||
pass | |||
@@ -220,13 +221,15 @@ class RetryDestinationLimiter: | |||
# We couldn't connect. | |||
if self.retry_interval: | |||
self.retry_interval = int( | |||
self.retry_interval * RETRY_MULTIPLIER * random.uniform(0.8, 1.4) | |||
self.retry_interval | |||
* self.destination_retry_multiplier | |||
* random.uniform(0.8, 1.4) | |||
) | |||
if self.retry_interval >= MAX_RETRY_INTERVAL: | |||
self.retry_interval = MAX_RETRY_INTERVAL | |||
if self.retry_interval >= self.destination_max_retry_interval_ms: | |||
self.retry_interval = self.destination_max_retry_interval_ms | |||
else: | |||
self.retry_interval = MIN_RETRY_INTERVAL | |||
self.retry_interval = self.destination_min_retry_interval_ms | |||
logger.info( | |||
"Connection to %s was unsuccessful (%s(%s)); backoff now %i", | |||
@@ -17,7 +17,6 @@ from twisted.test.proto_helpers import MemoryReactor | |||
from synapse.server import HomeServer | |||
from synapse.storage.databases.main.transactions import DestinationRetryTimings | |||
from synapse.util import Clock | |||
from synapse.util.retryutils import MAX_RETRY_INTERVAL | |||
from tests.unittest import HomeserverTestCase | |||
@@ -57,8 +56,14 @@ class TransactionStoreTestCase(HomeserverTestCase): | |||
self.get_success(d) | |||
def test_large_destination_retry(self) -> None: | |||
max_retry_interval_ms = ( | |||
self.hs.config.federation.destination_max_retry_interval_ms | |||
) | |||
d = self.store.set_destination_retry_timings( | |||
"example.com", MAX_RETRY_INTERVAL, MAX_RETRY_INTERVAL, MAX_RETRY_INTERVAL | |||
"example.com", | |||
max_retry_interval_ms, | |||
max_retry_interval_ms, | |||
max_retry_interval_ms, | |||
) | |||
self.get_success(d) | |||
@@ -11,12 +11,7 @@ | |||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
# See the License for the specific language governing permissions and | |||
# limitations under the License. | |||
from synapse.util.retryutils import ( | |||
MIN_RETRY_INTERVAL, | |||
RETRY_MULTIPLIER, | |||
NotRetryingDestination, | |||
get_retry_limiter, | |||
) | |||
from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter | |||
from tests.unittest import HomeserverTestCase | |||
@@ -42,6 +37,11 @@ class RetryLimiterTestCase(HomeserverTestCase): | |||
limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store)) | |||
min_retry_interval_ms = ( | |||
self.hs.config.federation.destination_min_retry_interval_ms | |||
) | |||
retry_multiplier = self.hs.config.federation.destination_retry_multiplier | |||
self.pump(1) | |||
try: | |||
with limiter: | |||
@@ -57,7 +57,7 @@ class RetryLimiterTestCase(HomeserverTestCase): | |||
assert new_timings is not None | |||
self.assertEqual(new_timings.failure_ts, failure_ts) | |||
self.assertEqual(new_timings.retry_last_ts, failure_ts) | |||
self.assertEqual(new_timings.retry_interval, MIN_RETRY_INTERVAL) | |||
self.assertEqual(new_timings.retry_interval, min_retry_interval_ms) | |||
# now if we try again we should get a failure | |||
self.get_failure( | |||
@@ -68,7 +68,7 @@ class RetryLimiterTestCase(HomeserverTestCase): | |||
# advance the clock and try again | |||
# | |||
self.pump(MIN_RETRY_INTERVAL) | |||
self.pump(min_retry_interval_ms) | |||
limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store)) | |||
self.pump(1) | |||
@@ -87,16 +87,16 @@ class RetryLimiterTestCase(HomeserverTestCase): | |||
self.assertEqual(new_timings.failure_ts, failure_ts) | |||
self.assertEqual(new_timings.retry_last_ts, retry_ts) | |||
self.assertGreaterEqual( | |||
new_timings.retry_interval, MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 0.5 | |||
new_timings.retry_interval, min_retry_interval_ms * retry_multiplier * 0.5 | |||
) | |||
self.assertLessEqual( | |||
new_timings.retry_interval, MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0 | |||
new_timings.retry_interval, min_retry_interval_ms * retry_multiplier * 2.0 | |||
) | |||
# | |||
# one more go, with success | |||
# | |||
self.reactor.advance(MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0) | |||
self.reactor.advance(min_retry_interval_ms * retry_multiplier * 2.0) | |||
limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store)) | |||
self.pump(1) | |||