From be5e44378ff5d36985aaa444acafef23c34cedeb Mon Sep 17 00:00:00 2001
From: Half-Shot
Date: Thu, 8 Jun 2023 09:21:43 +0100
Subject: [PATCH] Add a metric for connection failures

---
 metrics.go | 14 ++++++++++++++
 user.go    |  3 +++
 2 files changed, 17 insertions(+)

diff --git a/metrics.go b/metrics.go
index 5613b50..6a93b04 100644
--- a/metrics.go
+++ b/metrics.go
@@ -52,6 +52,7 @@ type MetricsHandler struct {
 	countCollection prometheus.Histogram
 	disconnections *prometheus.CounterVec
 	incomingRetryReceipts *prometheus.CounterVec
+	connectionFailures *prometheus.CounterVec
 	puppetCount prometheus.Gauge
 	userCount prometheus.Gauge
 	messageCount prometheus.Gauge
@@ -101,6 +102,10 @@ func NewMetricsHandler(address string, log log.Logger, db *database.Database) *M
 			Name: "whatsapp_disconnections",
 			Help: "Number of times a Matrix user has been disconnected from WhatsApp",
 		}, []string{"user_id"}),
+		connectionFailures: promauto.NewCounterVec(prometheus.CounterOpts{
+			Name: "whatsapp_connection_failures",
+			Help: "Number of times a connection to WhatsApp has failed",
+		}, []string{"reason"}),
 		incomingRetryReceipts: promauto.NewCounterVec(prometheus.CounterOpts{
 			Name: "whatsapp_incoming_retry_receipts",
 			Help: "Number of times a remote WhatsApp user has requested a retry from the bridge. retry_count = 5 is usually the last attempt (and very likely means a failed message)",
@@ -173,6 +178,15 @@ func (mh *MetricsHandler) TrackDisconnection(userID id.UserID) {
 	mh.disconnections.With(prometheus.Labels{"user_id": string(userID)}).Inc()
 }
 
+// TrackConnectionFailure increments the connection failure counter for the
+// given reason label. It does nothing while the metrics handler is not running.
+func (mh *MetricsHandler) TrackConnectionFailure(reason string) {
+	if !mh.running {
+		return
+	}
+	mh.connectionFailures.With(prometheus.Labels{"reason": reason}).Inc()
+}
+
 func (mh *MetricsHandler) TrackRetryReceipt(count int, found bool) {
 	if !mh.running {
 		return
diff --git a/user.go b/user.go
index 3633be9..7957785 100644
--- a/user.go
+++ b/user.go
@@ -842,13 +842,16 @@ func (user *User) HandleEvent(event interface{}) {
 	case *events.ConnectFailure:
 		user.BridgeState.Send(status.BridgeState{StateEvent: status.StateUnknownError, Message: fmt.Sprintf("Unknown connection failure: %s", v.Reason)})
 		user.bridge.Metrics.TrackConnectionState(user.JID, false)
+		user.bridge.Metrics.TrackConnectionFailure(fmt.Sprintf("status-%d", v.Reason))
 	case *events.ClientOutdated:
 		user.log.Errorfln("Got a client outdated connect failure. The bridge is likely out of date, please update immediately.")
 		user.BridgeState.Send(status.BridgeState{StateEvent: status.StateUnknownError, Message: "Connect failure: 405 client outdated"})
 		user.bridge.Metrics.TrackConnectionState(user.JID, false)
+		user.bridge.Metrics.TrackConnectionFailure("client-outdated")
 	case *events.TemporaryBan:
 		user.BridgeState.Send(status.BridgeState{StateEvent: status.StateBadCredentials, Message: v.String()})
 		user.bridge.Metrics.TrackConnectionState(user.JID, false)
+		user.bridge.Metrics.TrackConnectionFailure("temporary-ban")
 	case *events.Disconnected:
 		// Don't send the normal transient disconnect state if we're already in a different transient disconnect state.
 		// TODO remove this if/when the phone offline state is moved to a sub-state of CONNECTED