Fix potential autoreconnect problem and add exponential backoff

This commit is contained in:
Tulir Asokan 2019-05-28 14:09:49 +03:00
parent 498c0e4130
commit 7f0c67168c
3 changed files with 25 additions and 11 deletions

View file

@ -35,6 +35,7 @@ type BridgeConfig struct {
ConnectionTimeout int `yaml:"connection_timeout"` ConnectionTimeout int `yaml:"connection_timeout"`
MaxConnectionAttempts int `yaml:"max_connection_attempts"` MaxConnectionAttempts int `yaml:"max_connection_attempts"`
ConnectionRetryDelay int `yaml:"connection_retry_delay"`
ReportConnectionRetry bool `yaml:"report_connection_retry"` ReportConnectionRetry bool `yaml:"report_connection_retry"`
InitialChatSync int `yaml:"initial_chat_sync_count"` InitialChatSync int `yaml:"initial_chat_sync_count"`
@ -56,6 +57,7 @@ type BridgeConfig struct {
func (bc *BridgeConfig) setDefaults() { func (bc *BridgeConfig) setDefaults() {
bc.ConnectionTimeout = 20 bc.ConnectionTimeout = 20
bc.MaxConnectionAttempts = 3 bc.MaxConnectionAttempts = 3
bc.ConnectionRetryDelay = -1
bc.ReportConnectionRetry = true bc.ReportConnectionRetry = true
bc.InitialChatSync = 10 bc.InitialChatSync = 10

View file

@ -62,6 +62,9 @@ bridge:
connection_timeout: 20 connection_timeout: 20
# Maximum number of times to retry connecting on connection error. # Maximum number of times to retry connecting on connection error.
max_connection_attempts: 3 max_connection_attempts: 3
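# Number of seconds to wait between connection attempts.
# A negative value enables exponential backoff: the wait before each retry is
# (-connection_retry_delay + 1 + 2^attempts) seconds, where attempts is the number of
# failed reconnection attempts so far (e.g. with -1 the waits are 4, 6, 10, ... seconds).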
connection_retry_delay: -1
# Whether or not the bridge should send a notice to the user's management room when it retries connecting. # Whether or not the bridge should send a notice to the user's management room when it retries connecting.
# If false, it will only report when it stops retrying. # If false, it will only report when it stops retrying.
report_connection_retry: true report_connection_retry: true

31
user.go
View file

@ -348,23 +348,22 @@ func (user *User) HandleError(err error) {
if errors.Cause(err) != whatsapp.ErrInvalidWsData { if errors.Cause(err) != whatsapp.ErrInvalidWsData {
user.log.Errorln("WhatsApp error:", err) user.log.Errorln("WhatsApp error:", err)
} }
var msg string
if closed, ok := err.(*whatsapp.ErrConnectionClosed); ok { if closed, ok := err.(*whatsapp.ErrConnectionClosed); ok {
user.Connected = false user.Connected = false
if closed.Code == 1000 { if closed.Code == 1000 {
// Normal closure // Normal closure
return return
} }
user.ConnectionErrors++ go user.tryReconnect(fmt.Sprintf("Your WhatsApp connection was closed with websocket status code %d", closed.Code))
msg = fmt.Sprintf("Your WhatsApp connection was closed with websocket status code %d", closed.Code)
} else if failed, ok := err.(*whatsapp.ErrConnectionFailed); ok { } else if failed, ok := err.(*whatsapp.ErrConnectionFailed); ok {
user.Connected = false user.Connected = false
user.ConnectionErrors++ user.ConnectionErrors++
msg = fmt.Sprintf("Your WhatsApp connection failed: %v", failed.Err) go user.tryReconnect(fmt.Sprintf("Your WhatsApp connection failed: %v", failed.Err))
} else {
// Unknown error, probably mostly harmless
return
} }
// Otherwise unknown error, probably mostly harmless
}
func (user *User) tryReconnect(msg string) {
if user.ConnectionErrors > user.bridge.Config.Bridge.MaxConnectionAttempts { if user.ConnectionErrors > user.bridge.Config.Bridge.MaxConnectionAttempts {
content := format.RenderMarkdown(fmt.Sprintf("%s. Use the `reconnect` command to reconnect.", msg)) content := format.RenderMarkdown(fmt.Sprintf("%s. Use the `reconnect` command to reconnect.", msg))
_, _ = user.bridge.Bot.SendMessageEvent(user.ManagementRoom, mautrix.EventMessage, content) _, _ = user.bridge.Bot.SendMessageEvent(user.ManagementRoom, mautrix.EventMessage, content)
@ -375,9 +374,16 @@ func (user *User) HandleError(err error) {
// Don't want the same error to be repeated // Don't want the same error to be repeated
msg = "" msg = ""
} }
tries := 0 var tries uint
var exponentialBackoff bool
baseDelay := time.Duration(user.bridge.Config.Bridge.ConnectionRetryDelay)
if baseDelay < 0 {
exponentialBackoff = true
baseDelay = -baseDelay + 1
}
delay := baseDelay
for user.ConnectionErrors <= user.bridge.Config.Bridge.MaxConnectionAttempts { for user.ConnectionErrors <= user.bridge.Config.Bridge.MaxConnectionAttempts {
err = user.Conn.Restore() err := user.Conn.Restore()
if err == nil { if err == nil {
user.ConnectionErrors = 0 user.ConnectionErrors = 0
user.Connected = true user.Connected = true
@ -389,11 +395,14 @@ func (user *User) HandleError(err error) {
tries++ tries++
user.ConnectionErrors++ user.ConnectionErrors++
if user.ConnectionErrors <= user.bridge.Config.Bridge.MaxConnectionAttempts { if user.ConnectionErrors <= user.bridge.Config.Bridge.MaxConnectionAttempts {
if exponentialBackoff {
delay = (1 << tries) + baseDelay
}
if user.bridge.Config.Bridge.ReportConnectionRetry { if user.bridge.Config.Bridge.ReportConnectionRetry {
_, _ = user.bridge.Bot.SendNotice(user.ManagementRoom, _, _ = user.bridge.Bot.SendNotice(user.ManagementRoom,
fmt.Sprintf("Reconnection attempt failed: %v. Retrying in 10 seconds...", err)) fmt.Sprintf("Reconnection attempt failed: %v. Retrying in %d seconds...", err, delay))
} }
time.Sleep(10 * time.Second) time.Sleep(delay * time.Second)
} }
} }