Cluster healthcheck improvements (#10408)

- Do not fail the healthcheck if the heal status
  could not be obtained from one of the nodes;
  if many nodes fail, report this as a
  catastrophic error instead.
- Add an "x-minio-write-quorum" response header to match
  the write tolerance supported by the server.
- Admin info now reports whether a drive is healing:
  madmin.Disk.Healing is set to true
  while madmin.Disk.State stays "ok".
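
For orchestration tooling, the effect of the first two bullets shows up on the cluster health endpoint. Below is a minimal probe sketch, assuming the usual /minio/health/cluster path and a placeholder host; the X-Minio-Write-Quorum and X-Minio-Healing-Drives header names come from the handler change further down:

    package main

    import (
        "fmt"
        "net/http"
    )

    func main() {
        // maintenance=true asks the server to also consider drives that are
        // still healing before declaring this node safe to take down.
        resp, err := http.Get("http://minio.example.com:9000/minio/health/cluster?maintenance=true")
        if err != nil {
            fmt.Println("cluster unreachable:", err)
            return
        }
        defer resp.Body.Close()

        // X-Minio-Write-Quorum is the header added by this commit;
        // X-Minio-Healing-Drives is only set when the cluster reports unhealthy.
        fmt.Println("status:", resp.StatusCode)
        fmt.Println("write quorum:", resp.Header.Get("X-Minio-Write-Quorum"))
        fmt.Println("healing drives:", resp.Header.Get("X-Minio-Healing-Drives"))
    }

A 200 response means the cluster holds write quorum; the write-quorum header reflects how many drives per erasure set the server expects to keep for writes.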
Harshavardhana 2020-09-02 22:54:56 -07:00 committed by GitHub
parent 650dccfa9e
commit 8a291e1dc0
5 changed files with 49 additions and 25 deletions

@@ -799,14 +799,12 @@ func (a adminAPIHandlers) HealHandler(w http.ResponseWriter, r *http.Request) {
     keepConnLive(w, r, respCh)
 }
-func getAggregatedBackgroundHealState(ctx context.Context, failOnErr bool) (madmin.BgHealState, error) {
+func getAggregatedBackgroundHealState(ctx context.Context) (madmin.BgHealState, error) {
     var bgHealStates []madmin.BgHealState
     localHealState, ok := getLocalBackgroundHealStatus()
     if !ok {
-        if failOnErr {
-            return madmin.BgHealState{}, errServerNotInitialized
-        }
+        return madmin.BgHealState{}, errServerNotInitialized
     }
     // Get local heal status first
@@ -815,14 +813,16 @@ func getAggregatedBackgroundHealState(ctx context.Context, failOnErr bool) (madm
     if globalIsDistErasure {
         // Get heal status from other peers
         peersHealStates, nerrs := globalNotificationSys.BackgroundHealStatus()
+        var errCount int
         for _, nerr := range nerrs {
             if nerr.Err != nil {
-                if failOnErr {
-                    return madmin.BgHealState{}, nerr.Err
-                }
                 logger.LogIf(ctx, nerr.Err)
+                errCount++
             }
         }
+        if errCount == len(nerrs) {
+            return madmin.BgHealState{}, fmt.Errorf("all remote servers failed to report heal status, cluster is unhealthy")
+        }
         bgHealStates = append(bgHealStates, peersHealStates...)
     }
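
The hunk above moves the peer aggregation from failing on the first peer error to tolerating partial failures and only treating a unanimous failure as catastrophic. A self-contained sketch of that pattern; the peer names and the fetchStatus helper are illustrative placeholders, not MinIO APIs:

    package main

    import (
        "errors"
        "fmt"
    )

    // fetchStatus stands in for asking one peer for its heal status.
    func fetchStatus(peer string) (string, error) {
        if peer == "node3" {
            return "", errors.New("connection refused")
        }
        return "healing: 0 drives", nil
    }

    func aggregate(peers []string) ([]string, error) {
        var statuses []string
        var errCount int
        for _, peer := range peers {
            st, err := fetchStatus(peer)
            if err != nil {
                // Log and keep going instead of failing the whole healthcheck.
                fmt.Println("peer error:", peer, err)
                errCount++
                continue
            }
            statuses = append(statuses, st)
        }
        if errCount == len(peers) {
            // Every peer failed: report a catastrophic, cluster-wide error.
            return nil, errors.New("all remote servers failed to report heal status")
        }
        return statuses, nil
    }

    func main() {
        statuses, err := aggregate([]string{"node1", "node2", "node3"})
        fmt.Println(statuses, err)
    }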
@@ -868,7 +868,12 @@ func (a adminAPIHandlers) BackgroundHealStatusHandler(w http.ResponseWriter, r *
         return
     }
-    aggregateHealStateResult, _ := getAggregatedBackgroundHealState(r.Context(), false)
+    aggregateHealStateResult, err := getAggregatedBackgroundHealState(r.Context())
+    if err != nil {
+        writeErrorResponseJSON(ctx, w, toAdminAPIErr(ctx, err), r.URL)
+        return
+    }
     if err := json.NewEncoder(w).Encode(aggregateHealStateResult); err != nil {
         writeErrorResponseJSON(ctx, w, toAdminAPIErr(ctx, err), r.URL)
         return
@@ -1489,20 +1494,34 @@ func (a adminAPIHandlers) ServerInfoHandler(w http.ResponseWriter, r *http.Reque
         Notifications: notifyTarget,
     }
+    // Collect any disk healing.
+    healing, _ := getAggregatedBackgroundHealState(ctx)
+    healDisks := make(map[string]struct{}, len(healing.HealDisks))
+    for _, disk := range healing.HealDisks {
+        healDisks[disk] = struct{}{}
+    }
     // find all disks which belong to each respective endpoints
     for i := range servers {
         for _, disk := range storageInfo.Disks {
             if strings.Contains(disk.Endpoint, servers[i].Endpoint) {
+                if _, ok := healDisks[disk.Endpoint]; ok {
+                    disk.Healing = true
+                }
                 servers[i].Disks = append(servers[i].Disks, disk)
             }
         }
     }
     // add all the disks local to this server.
     for _, disk := range storageInfo.Disks {
         if disk.DrivePath == "" && disk.Endpoint == "" {
             continue
         }
         if disk.Endpoint == disk.DrivePath {
+            if _, ok := healDisks[disk.Endpoint]; ok {
+                disk.Healing = true
+            }
             servers[len(servers)-1].Disks = append(servers[len(servers)-1].Disks, disk)
         }
     }

@@ -2045,17 +2045,18 @@ func (z *erasureZones) Health(ctx context.Context, opts HealthOptions) HealthRes
     reqInfo := (&logger.ReqInfo{}).AppendTags("maintenance", strconv.FormatBool(opts.Maintenance))
-    parityDrives := globalStorageClass.GetParityForSC(storageclass.STANDARD)
-    diskCount := z.SetDriveCount()
-    if parityDrives == 0 {
-        parityDrives = getDefaultParityBlocks(diskCount)
-    }
-    dataDrives := diskCount - parityDrives
-    writeQuorum := dataDrives
-    if dataDrives == parityDrives {
-        writeQuorum++
-    }
     for zoneIdx := range erasureSetUpCount {
+        parityDrives := globalStorageClass.GetParityForSC(storageclass.STANDARD)
+        diskCount := z.zones[zoneIdx].setDriveCount
+        if parityDrives == 0 {
+            parityDrives = getDefaultParityBlocks(diskCount)
+        }
+        dataDrives := diskCount - parityDrives
+        writeQuorum := dataDrives
+        if dataDrives == parityDrives {
+            writeQuorum++
+        }
         for setIdx := range erasureSetUpCount[zoneIdx] {
             if erasureSetUpCount[zoneIdx][setIdx] < writeQuorum {
                 logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
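
The quorum arithmetic is now computed per zone from that zone's own set drive count. A standalone sketch of the rule, where the default parity of half the set size is an assumption standing in for getDefaultParityBlocks:

    package main

    import "fmt"

    // writeQuorum mirrors the rule in the hunk above: quorum equals the data
    // drives, plus one when data and parity drive counts are equal.
    func writeQuorum(setDriveCount, parityDrives int) int {
        if parityDrives == 0 {
            parityDrives = setDriveCount / 2 // assumed default parity
        }
        dataDrives := setDriveCount - parityDrives
        quorum := dataDrives
        if dataDrives == parityDrives {
            quorum++
        }
        return quorum
    }

    func main() {
        fmt.Println(writeQuorum(16, 0)) // 9: 8 data + 8 parity, so quorum is data+1
        fmt.Println(writeQuorum(16, 4)) // 12: 12 data drives, 4 parity drives
    }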
@@ -2075,14 +2076,15 @@ func (z *erasureZones) Health(ctx context.Context, opts HealthOptions) HealthRes
     // to look at the healing side of the code.
     if !opts.Maintenance {
         return HealthResult{
-            Healthy: true,
+            Healthy:     true,
+            WriteQuorum: writeQuorum,
         }
     }
     // check if local disks are being healed, if they are being healed
     // we need to tell healthy status as 'false' so that this server
     // is not taken down for maintenance
-    aggHealStateResult, err := getAggregatedBackgroundHealState(ctx, true)
+    aggHealStateResult, err := getAggregatedBackgroundHealState(ctx)
     if err != nil {
         logger.LogIf(logger.SetReqInfo(ctx, reqInfo), fmt.Errorf("Unable to verify global heal status: %w", err))
         return HealthResult{
@@ -2094,11 +2096,10 @@ func (z *erasureZones) Health(ctx context.Context, opts HealthOptions) HealthRes
         logger.LogIf(logger.SetReqInfo(ctx, reqInfo), fmt.Errorf("Total drives to be healed %d", len(aggHealStateResult.HealDisks)))
     }
-    healthy := len(aggHealStateResult.HealDisks) == 0
     return HealthResult{
-        Healthy:       healthy,
+        Healthy:       len(aggHealStateResult.HealDisks) == 0,
         HealingDrives: len(aggHealStateResult.HealDisks),
         WriteQuorum:   writeQuorum,
     }
 }
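
Because the maintenance check now reports unhealthy while local drives are still healing, an operator can gate a rolling restart on it. A hypothetical wait loop; the host, endpoint path, and poll interval are placeholders:

    package main

    import (
        "fmt"
        "net/http"
        "time"
    )

    func main() {
        url := "http://minio.example.com:9000/minio/health/cluster?maintenance=true"
        for {
            resp, err := http.Get(url)
            if err == nil && resp.StatusCode == http.StatusOK {
                resp.Body.Close()
                fmt.Println("no local drives healing, safe to take this node down")
                return
            }
            if err == nil {
                fmt.Println("still unhealthy, healing drives:", resp.Header.Get("X-Minio-Healing-Drives"))
                resp.Body.Close()
            }
            time.Sleep(10 * time.Second)
        }
    }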

@@ -266,10 +266,11 @@ func (er erasureObjects) crawlAndGetDataUsage(ctx context.Context, buckets []Buc
     }
     // Collect any disk healing.
-    healing, err := getAggregatedBackgroundHealState(ctx, true)
+    healing, err := getAggregatedBackgroundHealState(ctx)
     if err != nil {
         return err
     }
     healDisks := make(map[string]struct{}, len(healing.HealDisks))
     for _, disk := range healing.HealDisks {
         healDisks[disk] = struct{}{}

@@ -38,6 +38,9 @@ func ClusterCheckHandler(w http.ResponseWriter, r *http.Request) {
     opts := HealthOptions{Maintenance: r.URL.Query().Get("maintenance") == "true"}
     result := objLayer.Health(ctx, opts)
+    if result.WriteQuorum > 0 {
+        w.Header().Set("X-Minio-Write-Quorum", strconv.Itoa(result.WriteQuorum))
+    }
     if !result.Healthy {
         // return how many drives are being healed if any
         w.Header().Set("X-Minio-Healing-Drives", strconv.Itoa(result.HealingDrives))
@@ -51,7 +54,6 @@ func ClusterCheckHandler(w http.ResponseWriter, r *http.Request) {
         }
         return
     }
     writeResponse(w, http.StatusOK, nil, mimeNone)
 }

@@ -272,6 +272,7 @@ type Disk struct {
     Endpoint  string `json:"endpoint,omitempty"`
     RootDisk  bool   `json:"rootDisk,omitempty"`
     DrivePath string `json:"path,omitempty"`
+    Healing   bool   `json:"healing,omitempty"`
     State     string `json:"state,omitempty"`
     UUID      string `json:"uuid,omitempty"`
     Model     string `json:"model,omitempty"`
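
With this field, admin API consumers can tell a healing drive apart from a failed one: Healing is true while State remains "ok". A sketch of reading it through the madmin client, assuming the ServerInfo call of this vintage returns an InfoMessage whose Servers carry these Disk entries; the endpoint and credentials are placeholders:

    package main

    import (
        "context"
        "fmt"
        "log"

        "github.com/minio/minio/pkg/madmin"
    )

    func main() {
        // Endpoint and credentials are placeholders.
        client, err := madmin.New("minio.example.com:9000", "ACCESS_KEY", "SECRET_KEY", false)
        if err != nil {
            log.Fatal(err)
        }

        info, err := client.ServerInfo(context.Background())
        if err != nil {
            log.Fatal(err)
        }

        // A healing drive still reports State "ok"; the new Healing flag is
        // what distinguishes it from a drive that is simply online.
        for _, srv := range info.Servers {
            for _, disk := range srv.Disks {
                if disk.Healing {
                    fmt.Printf("%s is healing (state %q)\n", disk.Endpoint, disk.State)
                }
            }
        }
    }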