Add metrics to track how often events are soft_failed (#10156)

Spawned from missing messages we were seeing on `matrix.org` from a
federated Gitter bridged room, https://gitlab.com/gitterHQ/webapp/-/issues/2770.
The underlying issue in Synapse is tracked by https://github.com/matrix-org/synapse/issues/10066
where the message and join event race and the message is `soft_failed` before the
`join` event reaches the remote federated server.

Fewer soft_failed events = better; usually this should only trigger for events
where people are doing bad things and trying to fuzz and fake everything.
This commit is contained in:
Eric Eastwood 2021-06-11 04:12:35 -05:00 committed by GitHub
parent e21c347332
commit b31daac01c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 8 additions and 0 deletions

1
changelog.d/10156.misc Normal file
View file

@ -0,0 +1 @@
Add `synapse_federation_soft_failed_events_total` metric to track how often events are soft failed.

View file

@ -33,6 +33,7 @@ from typing import (
)
import attr
from prometheus_client import Counter
from signedjson.key import decode_verify_key_bytes
from signedjson.sign import verify_signed_json
from unpaddedbase64 import decode_base64
@ -101,6 +102,11 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__)
# Prometheus counter of events received over federation that were marked as
# soft_failed. It is incremented once each time an event is soft-failed after
# its auth check raises AuthError.
soft_failed_event_counter = Counter(
"synapse_federation_soft_failed_events_total",
"Events received over federation that we marked as soft_failed",
)
@attr.s(slots=True)
class _NewEventInfo:
@ -2498,6 +2504,7 @@ class FederationHandler(BaseHandler):
event_auth.check(room_version_obj, event, auth_events=current_auth_events)
except AuthError as e:
logger.warning("Soft-failing %r because %s", event, e)
soft_failed_event_counter.inc()
event.internal_metadata.soft_failed = True
async def on_get_missing_events(