// mautrix-whatsapp - A Matrix-WhatsApp puppeting bridge.
// Copyright (C) 2021 Tulir Asokan
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package main
import (
"context"
"net/http"
"runtime/debug"
2022-01-28 14:06:19 +01:00
"strconv"
2021-10-27 16:48:19 +02:00
"sync"
2020-06-17 16:50:06 +02:00
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/prometheus/client_golang/prometheus/promhttp"
log "maunium.net/go/maulogger/v2"
2020-10-16 15:52:49 +02:00
2021-10-22 19:14:34 +02:00
"go.mau.fi/whatsmeow/types"
2021-02-17 00:21:30 +01:00
2020-06-17 16:50:06 +02:00
"maunium.net/go/mautrix/event"
2020-06-17 16:57:14 +02:00
"maunium.net/go/mautrix/id"
2020-06-17 16:50:06 +02:00
"maunium.net/go/mautrix-whatsapp/database"
)
type MetricsHandler struct {
db * database . Database
server * http . Server
log log . Logger
running bool
ctx context . Context
stopRecorder func ( )
2021-06-25 14:33:37 +02:00
matrixEventHandling * prometheus . HistogramVec
whatsappMessageAge prometheus . Histogram
whatsappMessageHandling * prometheus . HistogramVec
2020-06-17 16:50:06 +02:00
countCollection prometheus . Histogram
2020-06-17 16:57:14 +02:00
disconnections * prometheus . CounterVec
2022-01-28 14:06:19 +01:00
incomingRetryReceipts * prometheus . CounterVec
2023-06-08 10:21:43 +02:00
connectionFailures * prometheus . CounterVec
2020-06-17 16:50:06 +02:00
puppetCount prometheus . Gauge
userCount prometheus . Gauge
messageCount prometheus . Gauge
portalCount * prometheus . GaugeVec
encryptedGroupCount prometheus . Gauge
encryptedPrivateCount prometheus . Gauge
unencryptedGroupCount prometheus . Gauge
unencryptedPrivateCount prometheus . Gauge
2020-09-27 21:30:08 +02:00
2021-10-28 13:00:10 +02:00
connected prometheus . Gauge
connectedState map [ string ] bool
connectedStateLock sync . Mutex
loggedIn prometheus . Gauge
loggedInState map [ string ] bool
loggedInStateLock sync . Mutex
2020-06-17 16:50:06 +02:00
}
func NewMetricsHandler ( address string , log log . Logger , db * database . Database ) * MetricsHandler {
portalCount := promauto . NewGaugeVec ( prometheus . GaugeOpts {
Name : "whatsapp_portals_total" ,
Help : "Number of portal rooms on Matrix" ,
} , [ ] string { "type" , "encrypted" } )
return & MetricsHandler {
db : db ,
server : & http . Server { Addr : address , Handler : promhttp . Handler ( ) } ,
log : log ,
running : false ,
2021-06-25 14:33:37 +02:00
matrixEventHandling : promauto . NewHistogramVec ( prometheus . HistogramOpts {
2020-06-17 16:50:06 +02:00
Name : "matrix_event" ,
Help : "Time spent processing Matrix events" ,
} , [ ] string { "event_type" } ) ,
2021-06-25 14:33:37 +02:00
whatsappMessageAge : promauto . NewHistogram ( prometheus . HistogramOpts {
2021-06-30 14:13:48 +02:00
Name : "remote_event_age" ,
Help : "Age of messages received from WhatsApp" ,
2021-06-25 14:33:37 +02:00
Buckets : [ ] float64 { 1 , 2 , 3 , 5 , 7.5 , 10 , 20 , 30 , 60 } ,
} ) ,
whatsappMessageHandling : promauto . NewHistogramVec ( prometheus . HistogramOpts {
2021-06-30 14:13:48 +02:00
Name : "remote_event" ,
2021-06-25 14:33:37 +02:00
Help : "Time spent processing WhatsApp messages" ,
} , [ ] string { "message_type" } ) ,
2020-06-17 16:50:06 +02:00
countCollection : promauto . NewHistogram ( prometheus . HistogramOpts {
Name : "whatsapp_count_collection" ,
Help : "Time spent collecting the whatsapp_*_total metrics" ,
} ) ,
2020-06-17 16:57:14 +02:00
disconnections : promauto . NewCounterVec ( prometheus . CounterOpts {
Name : "whatsapp_disconnections" ,
Help : "Number of times a Matrix user has been disconnected from WhatsApp" ,
} , [ ] string { "user_id" } ) ,
2023-06-08 10:21:43 +02:00
connectionFailures : promauto . NewCounterVec ( prometheus . CounterOpts {
Name : "whatsapp_connection_failures" ,
Help : "Number of times a connection has failed to whatsapp" ,
} , [ ] string { "reason" } ) ,
2022-01-28 14:06:19 +01:00
incomingRetryReceipts : promauto . NewCounterVec ( prometheus . CounterOpts {
Name : "whatsapp_incoming_retry_receipts" ,
Help : "Number of times a remote WhatsApp user has requested a retry from the bridge. retry_count = 5 is usually the last attempt (and very likely means a failed message)" ,
} , [ ] string { "retry_count" , "message_found" } ) ,
2020-06-17 16:50:06 +02:00
puppetCount : promauto . NewGauge ( prometheus . GaugeOpts {
Name : "whatsapp_puppets_total" ,
Help : "Number of WhatsApp users bridged into Matrix" ,
} ) ,
userCount : promauto . NewGauge ( prometheus . GaugeOpts {
Name : "whatsapp_users_total" ,
Help : "Number of Matrix users using the bridge" ,
} ) ,
messageCount : promauto . NewGauge ( prometheus . GaugeOpts {
Name : "whatsapp_messages_total" ,
Help : "Number of messages bridged" ,
} ) ,
portalCount : portalCount ,
encryptedGroupCount : portalCount . With ( prometheus . Labels { "type" : "group" , "encrypted" : "true" } ) ,
encryptedPrivateCount : portalCount . With ( prometheus . Labels { "type" : "private" , "encrypted" : "true" } ) ,
unencryptedGroupCount : portalCount . With ( prometheus . Labels { "type" : "group" , "encrypted" : "false" } ) ,
unencryptedPrivateCount : portalCount . With ( prometheus . Labels { "type" : "private" , "encrypted" : "false" } ) ,
2020-09-27 21:30:08 +02:00
2020-10-16 15:52:49 +02:00
loggedIn : promauto . NewGauge ( prometheus . GaugeOpts {
2020-09-27 21:30:08 +02:00
Name : "bridge_logged_in" ,
Help : "Users logged into the bridge" ,
2020-10-16 15:52:49 +02:00
} ) ,
2021-10-22 19:14:34 +02:00
loggedInState : make ( map [ string ] bool ) ,
2020-10-16 15:52:49 +02:00
connected : promauto . NewGauge ( prometheus . GaugeOpts {
2020-09-27 21:30:08 +02:00
Name : "bridge_connected" ,
2020-10-16 15:52:49 +02:00
Help : "Bridge users connected to WhatsApp" ,
} ) ,
2021-10-22 19:14:34 +02:00
connectedState : make ( map [ string ] bool ) ,
2020-06-17 16:50:06 +02:00
}
}
2020-06-17 16:57:14 +02:00
// noop does nothing; the Track* helpers return it when metrics are disabled
// so callers can always invoke the returned function.
func noop() {
}
2021-06-25 14:33:37 +02:00
func ( mh * MetricsHandler ) TrackMatrixEvent ( eventType event . Type ) func ( ) {
2020-06-17 16:57:14 +02:00
if ! mh . running {
return noop
}
2020-06-17 16:50:06 +02:00
start := time . Now ( )
return func ( ) {
duration := time . Now ( ) . Sub ( start )
2021-06-25 14:33:37 +02:00
mh . matrixEventHandling .
2020-06-17 16:50:06 +02:00
With ( prometheus . Labels { "event_type" : eventType . Type } ) .
Observe ( duration . Seconds ( ) )
}
}
2021-10-22 19:14:34 +02:00
func ( mh * MetricsHandler ) TrackWhatsAppMessage ( timestamp time . Time , messageType string ) func ( ) {
2021-06-25 14:33:37 +02:00
if ! mh . running {
return noop
}
start := time . Now ( )
return func ( ) {
duration := time . Now ( ) . Sub ( start )
mh . whatsappMessageHandling .
With ( prometheus . Labels { "message_type" : messageType } ) .
Observe ( duration . Seconds ( ) )
2021-10-22 19:14:34 +02:00
mh . whatsappMessageAge . Observe ( time . Now ( ) . Sub ( timestamp ) . Seconds ( ) )
2021-06-25 14:33:37 +02:00
}
}
2020-06-17 16:57:14 +02:00
// TrackDisconnection increments the disconnection counter for userID.
// It does nothing while the handler is not running.
func (mh *MetricsHandler) TrackDisconnection(userID id.UserID) {
	if mh.running {
		labels := prometheus.Labels{"user_id": string(userID)}
		mh.disconnections.With(labels).Inc()
	}
}
2023-06-08 10:21:43 +02:00
// TrackConnectionFailure increments the connection failure counter for the
// given reason. It does nothing while the handler is not running.
func (mh *MetricsHandler) TrackConnectionFailure(reason string) {
	if mh.running {
		labels := prometheus.Labels{"reason": reason}
		mh.connectionFailures.With(labels).Inc()
	}
}
2022-01-28 14:06:19 +01:00
// TrackRetryReceipt increments the incoming retry receipt counter, labelled
// with the decimal retry count and whether the message was found.
// It does nothing while the handler is not running.
func (mh *MetricsHandler) TrackRetryReceipt(count int, found bool) {
	if mh.running {
		labels := prometheus.Labels{
			"retry_count":   strconv.Itoa(count),
			"message_found": strconv.FormatBool(found),
		}
		mh.incomingRetryReceipts.With(labels).Inc()
	}
}
2021-10-22 19:14:34 +02:00
func ( mh * MetricsHandler ) TrackLoginState ( jid types . JID , loggedIn bool ) {
2020-09-27 21:30:08 +02:00
if ! mh . running {
return
}
2021-10-27 16:48:19 +02:00
mh . loggedInStateLock . Lock ( )
defer mh . loggedInStateLock . Unlock ( )
2021-10-22 19:14:34 +02:00
currentVal , ok := mh . loggedInState [ jid . User ]
2020-10-16 15:52:49 +02:00
if ! ok || currentVal != loggedIn {
2021-10-22 19:14:34 +02:00
mh . loggedInState [ jid . User ] = loggedIn
2020-10-16 15:52:49 +02:00
if loggedIn {
mh . loggedIn . Inc ( )
} else {
mh . loggedIn . Dec ( )
}
2020-09-27 21:30:08 +02:00
}
}
2021-10-22 19:14:34 +02:00
func ( mh * MetricsHandler ) TrackConnectionState ( jid types . JID , connected bool ) {
2020-09-27 21:30:08 +02:00
if ! mh . running {
return
}
2021-10-27 16:48:19 +02:00
mh . connectedStateLock . Lock ( )
defer mh . connectedStateLock . Unlock ( )
2021-10-22 19:14:34 +02:00
currentVal , ok := mh . connectedState [ jid . User ]
2020-10-16 15:52:49 +02:00
if ! ok || currentVal != connected {
2021-10-22 19:14:34 +02:00
mh . connectedState [ jid . User ] = connected
2020-10-16 15:52:49 +02:00
if connected {
mh . connected . Inc ( )
} else {
mh . connected . Dec ( )
}
2020-09-27 21:30:08 +02:00
}
}
2020-06-17 16:50:06 +02:00
// updateStats refreshes the database-backed gauges (puppets, users, messages
// and the four portal categories) and records how long the collection took in
// the whatsapp_count_collection histogram. A failed query is logged as a
// warning and leaves the affected gauge(s) at their previous value; the
// remaining queries still run.
func (mh *MetricsHandler) updateStats() {
	start := time.Now()

	var puppetCount int
	err := mh.db.QueryRowContext(mh.ctx, "SELECT COUNT(*) FROM puppet").Scan(&puppetCount)
	if err != nil {
		mh.log.Warnln("Failed to scan number of puppets:", err)
	} else {
		mh.puppetCount.Set(float64(puppetCount))
	}

	var userCount int
	err = mh.db.QueryRowContext(mh.ctx, ` SELECT COUNT(*) FROM "user" `).Scan(&userCount)
	if err != nil {
		mh.log.Warnln("Failed to scan number of users:", err)
	} else {
		mh.userCount.Set(float64(userCount))
	}

	var messageCount int
	err = mh.db.QueryRowContext(mh.ctx, "SELECT COUNT(*) FROM message").Scan(&messageCount)
	if err != nil {
		mh.log.Warnln("Failed to scan number of messages:", err)
	} else {
		mh.messageCount.Set(float64(messageCount))
	}

	// Portals are classified by JID suffix: group chats end in @g.us and
	// private chats end in @s.whatsapp.net. Only portals that have a Matrix
	// room (non-empty mxid) are counted.
	var encryptedGroupCount, encryptedPrivateCount, unencryptedGroupCount, unencryptedPrivateCount int
	err = mh.db.QueryRowContext(mh.ctx, `
		SELECT
			COUNT(CASE WHEN jid LIKE '%@g.us' AND encrypted THEN 1 END) AS encrypted_group_portals,
			COUNT(CASE WHEN jid LIKE '%@s.whatsapp.net' AND encrypted THEN 1 END) AS encrypted_private_portals,
			COUNT(CASE WHEN jid LIKE '%@g.us' AND NOT encrypted THEN 1 END) AS unencrypted_group_portals,
			COUNT(CASE WHEN jid LIKE '%@s.whatsapp.net' AND NOT encrypted THEN 1 END) AS unencrypted_private_portals
		FROM portal WHERE mxid<>''
	`).Scan(&encryptedGroupCount, &encryptedPrivateCount, &unencryptedGroupCount, &unencryptedPrivateCount)
	if err != nil {
		mh.log.Warnln("Failed to scan number of portals:", err)
	} else {
		mh.encryptedGroupCount.Set(float64(encryptedGroupCount))
		mh.encryptedPrivateCount.Set(float64(encryptedPrivateCount))
		mh.unencryptedGroupCount.Set(float64(unencryptedGroupCount))
		// Each gauge is fed from its own scanned column; the unencrypted
		// private gauge must not reuse the encrypted private count.
		mh.unencryptedPrivateCount.Set(float64(unencryptedPrivateCount))
	}

	mh.countCollection.Observe(time.Since(start).Seconds())
}
// startUpdatingStats runs updateStats immediately and then once every ten
// seconds until mh.ctx is cancelled. It is meant to run in its own goroutine;
// a panic inside the loop is recovered and logged together with a stack
// trace.
func (mh *MetricsHandler) startUpdatingStats() {
	defer func() {
		if err := recover(); err != nil {
			mh.log.Fatalfln("Panic in metric updater: %v\n%s", err, string(debug.Stack()))
		}
	}()

	// Use an explicit Ticker rather than time.Tick so the timer is released
	// when the loop exits; this matters if the handler is started and stopped
	// more than once.
	ticker := time.NewTicker(10 * time.Second)
	defer ticker.Stop()

	for {
		mh.updateStats()
		select {
		case <-mh.ctx.Done():
			return
		case <-ticker.C:
		}
	}
}
func ( mh * MetricsHandler ) Start ( ) {
mh . running = true
mh . ctx , mh . stopRecorder = context . WithCancel ( context . Background ( ) )
go mh . startUpdatingStats ( )
err := mh . server . ListenAndServe ( )
mh . running = false
2020-07-30 17:08:26 +02:00
if err != nil && err != http . ErrServerClosed {
2020-06-17 16:50:06 +02:00
mh . log . Fatalln ( "Error in metrics listener:" , err )
}
}
// Stop cancels the stats updater and closes the HTTP server. It is a no-op
// when the handler is not running. A failure to close the server is logged
// as an error.
func (mh *MetricsHandler) Stop() {
	if mh.running {
		mh.stopRecorder()
		if closeErr := mh.server.Close(); closeErr != nil {
			mh.log.Errorln("Error closing metrics listener:", closeErr)
		}
	}
}