Avoid excessive listing attempts in the daily sweep (#8081)

Add better dynamic timeouts for locks, also
add jitters before launching the daily sweep to ensure
that not all the servers in a distributed setup
are trying to hold locks to begin the sweep
round.

Also, add enough delay for incoming requests based
on totalSetCount*totalDriveCount.

A possible fix for #8071
This commit is contained in:
Harshavardhana 2019-08-19 08:22:32 -10:00 committed by GitHub
parent 60f52f461f
commit b3ca304c01
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 25 additions and 13 deletions

View file

@ -62,11 +62,11 @@ func (h *healRoutine) run() {
break
}
if globalHTTPServer != nil {
// Wait at max 1 minute for an inprogress request
// before proceeding to heal
waitCount := 60
// Wait at max 10 minute for an inprogress request before proceeding to heal
waitCount := 600
// Any requests in progress, delay the heal.
for globalHTTPServer.GetRequestCount() > 2 && waitCount > 0 {
for (globalHTTPServer.GetRequestCount() >= int32(globalXLSetCount*globalXLSetDriveCount)) &&
waitCount > 0 {
waitCount--
time.Sleep(1 * time.Second)
}

View file

@ -103,14 +103,12 @@ func startDailyLifecycle() {
}
}
var lifecycleTimeout = newDynamicTimeout(60*time.Second, time.Second)
func lifecycleRound(ctx context.Context, objAPI ObjectLayer) error {
zeroDuration := time.Millisecond
zeroDynamicTimeout := newDynamicTimeout(zeroDuration, zeroDuration)
// Lock to avoid concurrent lifecycle ops from other nodes
sweepLock := globalNSMutex.NewNSLock(ctx, "system", "daily-lifecycle-ops")
if err := sweepLock.GetLock(zeroDynamicTimeout); err != nil {
if err := sweepLock.GetLock(lifecycleTimeout); err != nil {
return err
}
defer sweepLock.Unlock()

View file

@ -18,6 +18,7 @@ package cmd
import (
"context"
"math/rand"
"sync"
"time"
@ -48,15 +49,14 @@ func copyDailySweepListeners() []chan string {
return listenersCopy
}
var sweepTimeout = newDynamicTimeout(60*time.Second, time.Second)
// sweepRound will list all objects, having read quorum or not and
// feeds to all listeners, such as the background healing
func sweepRound(ctx context.Context, objAPI ObjectLayer) error {
zeroDuration := time.Millisecond
zeroDynamicTimeout := newDynamicTimeout(zeroDuration, zeroDuration)
// General lock so we avoid parallel daily sweep by different instances.
sweepLock := globalNSMutex.NewNSLock(ctx, "system", "daily-sweep")
if err := sweepLock.GetLock(zeroDynamicTimeout); err != nil {
if err := sweepLock.GetLock(sweepTimeout); err != nil {
return err
}
defer sweepLock.Unlock()
@ -76,6 +76,17 @@ func sweepRound(ctx context.Context, objAPI ObjectLayer) error {
marker := ""
for {
if globalHTTPServer != nil {
// Wait at max 10 minute for an inprogress request before proceeding to heal
waitCount := 600
// Any requests in progress, delay the heal.
for (globalHTTPServer.GetRequestCount() >= int32(globalXLSetCount*globalXLSetDriveCount)) &&
waitCount > 0 {
waitCount--
time.Sleep(1 * time.Second)
}
}
res, err := objAPI.ListObjectsHeal(ctx, bucket.Name, "", marker, "", 1000)
if err != nil {
continue
@ -119,6 +130,9 @@ func dailySweeper() {
break
}
// Start with random sleep time, so as to avoid "synchronous checks" between servers
time.Sleep(time.Duration(rand.Float64() * float64(time.Hour)))
// Perform a sweep round each month
for {
if time.Since(lastSweepTime) < 30*24*time.Hour {