0
0
Fork 1
mirror of https://mau.dev/maunium/synapse.git synced 2024-12-14 07:33:47 +01:00

Avoid deep recursion in appservice recovery (#5885)

Hopefully, this will fix a stack overflow when recovering an appservice.

The recursion here leads to a huge chain of deferred callbacks, which then
overflows the stack when the chain completes. `inlineCallbacks` does a better
job of this if we use iteration instead of recursion.

Clean up the code a bit too, while we're there.
This commit is contained in:
Richard van der Hoff 2019-08-20 17:39:38 +01:00 committed by GitHub
parent c886f976e0
commit baa3f4a80d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 25 additions and 17 deletions

1
changelog.d/5885.bugfix Normal file
View file

@ -0,0 +1 @@
Fix stack overflow when recovering an appservice which had an outage.

View file

@ -224,7 +224,9 @@ class _Recoverer(object):
"as-recoverer-%s" % (self.service.id,), self.retry "as-recoverer-%s" % (self.service.id,), self.retry
) )
self.clock.call_later((2 ** self.backoff_counter), _retry) delay = 2 ** self.backoff_counter
logger.info("Scheduling retries on %s in %fs", self.service.id, delay)
self.clock.call_later(delay, _retry)
def _backoff(self): def _backoff(self):
# cap the backoff to be around 8.5min => (2^9) = 512 secs # cap the backoff to be around 8.5min => (2^9) = 512 secs
@ -234,25 +236,30 @@ class _Recoverer(object):
@defer.inlineCallbacks @defer.inlineCallbacks
def retry(self): def retry(self):
logger.info("Starting retries on %s", self.service.id)
try: try:
txn = yield self.store.get_oldest_unsent_txn(self.service) while True:
if txn: txn = yield self.store.get_oldest_unsent_txn(self.service)
if not txn:
# nothing left: we're done!
self.callback(self)
return
logger.info( logger.info(
"Retrying transaction %s for AS ID %s", txn.id, txn.service.id "Retrying transaction %s for AS ID %s", txn.id, txn.service.id
) )
sent = yield txn.send(self.as_api) sent = yield txn.send(self.as_api)
if sent: if not sent:
yield txn.complete(self.store) break
# reset the backoff counter and retry immediately
self.backoff_counter = 1
yield self.retry()
else:
self._backoff()
else:
self._set_service_recovered()
except Exception as e:
logger.exception(e)
self._backoff()
def _set_service_recovered(self): yield txn.complete(self.store)
self.callback(self)
# reset the backoff counter and then process the next transaction
self.backoff_counter = 1
except Exception:
logger.exception("Unexpected error running retries")
# we didn't manage to send all of the transactions before we got an error of
# some flavour: reschedule the next retry.
self._backoff()