Add a metric for connection failures

This commit is contained in:
Half-Shot 2023-06-08 09:21:43 +01:00
parent 9843ba048f
commit be5e44378f
2 changed files with 15 additions and 0 deletions

View file

@@ -52,6 +52,7 @@ type MetricsHandler struct {
countCollection prometheus.Histogram countCollection prometheus.Histogram
disconnections *prometheus.CounterVec disconnections *prometheus.CounterVec
incomingRetryReceipts *prometheus.CounterVec incomingRetryReceipts *prometheus.CounterVec
connectionFailures *prometheus.CounterVec
puppetCount prometheus.Gauge puppetCount prometheus.Gauge
userCount prometheus.Gauge userCount prometheus.Gauge
messageCount prometheus.Gauge messageCount prometheus.Gauge
@@ -101,6 +102,10 @@ func NewMetricsHandler(address string, log log.Logger, db *database.Database) *M
Name: "whatsapp_disconnections", Name: "whatsapp_disconnections",
Help: "Number of times a Matrix user has been disconnected from WhatsApp", Help: "Number of times a Matrix user has been disconnected from WhatsApp",
}, []string{"user_id"}), }, []string{"user_id"}),
connectionFailures: promauto.NewCounterVec(prometheus.CounterOpts{
Name: "whatsapp_connection_failures",
Help: "Number of times a connection has failed to whatsapp",
}, []string{"reason"}),
incomingRetryReceipts: promauto.NewCounterVec(prometheus.CounterOpts{ incomingRetryReceipts: promauto.NewCounterVec(prometheus.CounterOpts{
Name: "whatsapp_incoming_retry_receipts", Name: "whatsapp_incoming_retry_receipts",
Help: "Number of times a remote WhatsApp user has requested a retry from the bridge. retry_count = 5 is usually the last attempt (and very likely means a failed message)", Help: "Number of times a remote WhatsApp user has requested a retry from the bridge. retry_count = 5 is usually the last attempt (and very likely means a failed message)",
@@ -173,6 +178,13 @@ func (mh *MetricsHandler) TrackDisconnection(userID id.UserID) {
mh.disconnections.With(prometheus.Labels{"user_id": string(userID)}).Inc() mh.disconnections.With(prometheus.Labels{"user_id": string(userID)}).Inc()
} }
// TrackConnectionFailure increments the connection-failure counter for the
// series whose "reason" label equals the given reason string.
//
// It is a no-op when the metrics handler is not running.
func (mh *MetricsHandler) TrackConnectionFailure(reason string) {
	if mh.running {
		failureLabels := prometheus.Labels{"reason": reason}
		mh.connectionFailures.With(failureLabels).Inc()
	}
}
func (mh *MetricsHandler) TrackRetryReceipt(count int, found bool) { func (mh *MetricsHandler) TrackRetryReceipt(count int, found bool) {
if !mh.running { if !mh.running {
return return

View file

@@ -842,13 +842,16 @@ func (user *User) HandleEvent(event interface{}) {
case *events.ConnectFailure: case *events.ConnectFailure:
user.BridgeState.Send(status.BridgeState{StateEvent: status.StateUnknownError, Message: fmt.Sprintf("Unknown connection failure: %s", v.Reason)}) user.BridgeState.Send(status.BridgeState{StateEvent: status.StateUnknownError, Message: fmt.Sprintf("Unknown connection failure: %s", v.Reason)})
user.bridge.Metrics.TrackConnectionState(user.JID, false) user.bridge.Metrics.TrackConnectionState(user.JID, false)
user.bridge.Metrics.TrackConnectionFailure(fmt.Sprintf("status-%d", v.Reason))
case *events.ClientOutdated: case *events.ClientOutdated:
user.log.Errorfln("Got a client outdated connect failure. The bridge is likely out of date, please update immediately.") user.log.Errorfln("Got a client outdated connect failure. The bridge is likely out of date, please update immediately.")
user.BridgeState.Send(status.BridgeState{StateEvent: status.StateUnknownError, Message: "Connect failure: 405 client outdated"}) user.BridgeState.Send(status.BridgeState{StateEvent: status.StateUnknownError, Message: "Connect failure: 405 client outdated"})
user.bridge.Metrics.TrackConnectionState(user.JID, false) user.bridge.Metrics.TrackConnectionState(user.JID, false)
user.bridge.Metrics.TrackConnectionFailure("client-outdated")
case *events.TemporaryBan: case *events.TemporaryBan:
user.BridgeState.Send(status.BridgeState{StateEvent: status.StateBadCredentials, Message: v.String()}) user.BridgeState.Send(status.BridgeState{StateEvent: status.StateBadCredentials, Message: v.String()})
user.bridge.Metrics.TrackConnectionState(user.JID, false) user.bridge.Metrics.TrackConnectionState(user.JID, false)
user.bridge.Metrics.TrackConnectionFailure("temporary-ban")
case *events.Disconnected: case *events.Disconnected:
// Don't send the normal transient disconnect state if we're already in a different transient disconnect state. // Don't send the normal transient disconnect state if we're already in a different transient disconnect state.
// TODO remove this if/when the phone offline state is moved to a sub-state of CONNECTED // TODO remove this if/when the phone offline state is moved to a sub-state of CONNECTED