minio/cmd/prepare-storage.go
Harshavardhana d1144c2c7e
reference format obtained doesn't need further validation (#8964)
We don't need to run validateFormats again once we have obtained
the reference format, because at this stage another server may be
performing a disk heal during startup; once in a while, due to
delays, we would get false positives and our server wouldn't start.

The format in quorum can be assumed valid as the reference format,
and we proceed further until HealFormat re-initializes the disks
after a successful heal.

Also use a separate port for the healing tests to avoid any
conflicts with regular build testing.

Fixes #8884
2020-02-13 14:01:41 -08:00


/*
 * MinIO Cloud Storage, (C) 2016 MinIO, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cmd

import (
	"context"
	"crypto/tls"
	"fmt"
	"net/http"
	"net/url"
	"os"
	"path"
	"sync"
	"time"

	xhttp "github.com/minio/minio/cmd/http"
	"github.com/minio/minio/cmd/logger"
	"github.com/minio/minio/cmd/rest"
	"github.com/minio/minio/pkg/sync/errgroup"
)
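
// printEndpointError logs a given error once per unique
// (endpoint, error message) pair, so repeated connection failures
// during startup do not flood the log.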
var printEndpointError = func() func(Endpoint, error) {
	var mutex sync.Mutex
	printOnce := make(map[Endpoint]map[string]bool)

	return func(endpoint Endpoint, err error) {
		reqInfo := (&logger.ReqInfo{}).AppendTags("endpoint", endpoint.String())
		ctx := logger.SetReqInfo(context.Background(), reqInfo)
		mutex.Lock()
		defer mutex.Unlock()
		m, ok := printOnce[endpoint]
		if !ok {
			m = make(map[string]bool)
			m[err.Error()] = true
			printOnce[endpoint] = m
			logger.LogAlwaysIf(ctx, err)
			return
		}
		if m[err.Error()] {
			return
		}
		m[err.Error()] = true
		logger.LogAlwaysIf(ctx, err)
	}
}()

// Migrates backend format of local disks.
func formatXLMigrateLocalEndpoints(endpoints Endpoints) error {
	g := errgroup.WithNErrs(len(endpoints))
	for index, endpoint := range endpoints {
		if !endpoint.IsLocal {
			continue
		}
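		// Capture the loop variable; the closure below runs in a
		// separate goroutine and must not observe a later iteration.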
		index := index
		g.Go(func() error {
			epPath := endpoints[index].Path
			formatPath := pathJoin(epPath, minioMetaBucket, formatConfigFile)
			if _, err := os.Stat(formatPath); err != nil {
				if os.IsNotExist(err) {
					return nil
				}
				return fmt.Errorf("unable to access (%s) %w", formatPath, err)
			}
			return formatXLMigrate(epPath)
		}, index)
	}
	for _, err := range g.Wait() {
		if err != nil {
			return err
		}
	}
	return nil
}

// Cleans up tmp directory of local disks.
func formatXLCleanupTmpLocalEndpoints(endpoints Endpoints) error {
	g := errgroup.WithNErrs(len(endpoints))
	for index, endpoint := range endpoints {
		if !endpoint.IsLocal {
			continue
		}
		index := index
		g.Go(func() error {
			epPath := endpoints[index].Path
			// If disk is not formatted there is nothing to be cleaned up.
			formatPath := pathJoin(epPath, minioMetaBucket, formatConfigFile)
			if _, err := os.Stat(formatPath); err != nil {
				if os.IsNotExist(err) {
					return nil
				}
				return fmt.Errorf("unable to access (%s) %w", formatPath, err)
			}
			if _, err := os.Stat(pathJoin(epPath, minioMetaTmpBucket+"-old")); err != nil {
				if !os.IsNotExist(err) {
					return fmt.Errorf("unable to access (%s) %w",
						pathJoin(epPath, minioMetaTmpBucket+"-old"),
						err)
				}
			}

			// Need to move temporary objects left behind from previous run of minio
			// server to a unique directory under `minioMetaTmpBucket-old` to clean
			// up `minioMetaTmpBucket` for the current run.
			//
			//   /disk1/.minio.sys/tmp-old/
			//    |__ 33a58b40-aecc-4c9f-a22f-ff17bfa33b62
			//    |__ e870a2c1-d09c-450c-a69c-6eaa54a89b3e
			//
			// In this example, `33a58b40-aecc-4c9f-a22f-ff17bfa33b62` directory contains
			// temporary objects from one of the previous runs of minio server.
			tmpOld := pathJoin(epPath, minioMetaTmpBucket+"-old", mustGetUUID())
			if err := renameAll(pathJoin(epPath, minioMetaTmpBucket),
				tmpOld); err != nil && err != errFileNotFound {
				return fmt.Errorf("unable to rename (%s -> %s) %w",
					pathJoin(epPath, minioMetaTmpBucket),
					tmpOld,
					err)
			}

			// Removal of tmp-old folder is backgrounded completely.
			go removeAll(pathJoin(epPath, minioMetaTmpBucket+"-old"))

			if err := mkdirAll(pathJoin(epPath, minioMetaTmpBucket), 0777); err != nil {
				return fmt.Errorf("unable to create (%s) %w",
					pathJoin(epPath, minioMetaTmpBucket),
					err)
			}
			return nil
		}, index)
	}
	for _, err := range g.Wait() {
		if err != nil {
			return err
		}
	}
	return nil
}

// validate reference format against list of XL formats.
func validateXLFormats(format *formatXLV3, formats []*formatXLV3, endpoints Endpoints, setCount, drivesPerSet int) error {
	for i := range formats {
		if formats[i] == nil {
			continue
		}
		if err := formatXLV3Check(format, formats[i]); err != nil {
			return fmt.Errorf("%s format error: %w", endpoints[i], err)
		}
	}
	if len(format.XL.Sets) != setCount {
		return fmt.Errorf("Current backend format is inconsistent with input args (%s), Expected set count %d, got %d", endpoints, len(format.XL.Sets), setCount)
	}
	if len(format.XL.Sets[0]) != drivesPerSet {
		return fmt.Errorf("Current backend format is inconsistent with input args (%s), Expected drive count per set %d, got %d", endpoints, len(format.XL.Sets[0]), drivesPerSet)
	}
	return nil
}

// The following error message is added to handle a regression introduced
// in release RELEASE.2018-03-16T22-52-12Z after migrating v1 to v2 to v3.
// That migration failed to properly capture the '.This' field, which
// indicates the disk UUID association. The error below is returned when we
// see this situation in format.json; for more info refer to
// https://github.com/minio/minio/issues/5667
var errXLV3ThisEmpty = fmt.Errorf("XL format version 3 has This field empty")

// IsServerResolvable - checks if the endpoint is resolvable
// by sending a bare HTTP request to its liveness check path.
func IsServerResolvable(endpoint Endpoint) error {
	serverURL := &url.URL{
		Scheme: endpoint.Scheme,
		Host:   endpoint.Host,
		Path:   path.Join(healthCheckPathPrefix, healthCheckLivenessPath),
	}

	var tlsConfig *tls.Config
	if globalIsSSL {
		tlsConfig = &tls.Config{
			ServerName: endpoint.Hostname(),
			RootCAs:    globalRootCAs,
			NextProtos: []string{"http/1.1"}, // Force http1.1
		}
	}

	req, err := http.NewRequest(http.MethodGet, serverURL.String(), nil)
	if err != nil {
		return err
	}

	httpClient := &http.Client{
		Transport: newCustomHTTPTransport(tlsConfig, rest.DefaultRESTTimeout, rest.DefaultRESTTimeout)(),
	}
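	// Perform the liveness probe; any transport-level failure means
	// the endpoint is not reachable yet.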
	resp, err := httpClient.Do(req)
	if err != nil {
		httpClient.CloseIdleConnections()
		return err
	}
	defer xhttp.DrainBody(resp.Body)

	if resp.StatusCode != http.StatusOK {
		return StorageErr(resp.Status)
	}
	return nil
}

// connectLoadInitFormats connects to the list of endpoints and loads all XL
// disk formats, validating that the formats are correct and in quorum. If no
// formats are found, it attempts to initialize all of them for the first
// time. It additionally makes sure to close all the disks used in this attempt.
func connectLoadInitFormats(retryCount int, firstDisk bool, endpoints Endpoints, zoneCount, setCount, drivesPerSet int, deploymentID string) (*formatXLV3, error) {
	// Initialize all storage disks
	storageDisks, errs := initStorageDisksWithErrors(endpoints)
	defer closeStorageDisks(storageDisks)

	for i, err := range errs {
		if err != nil {
			if err != errDiskNotFound {
				return nil, fmt.Errorf("Disk %s: %w", endpoints[i], err)
			}
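			// After several failed attempts, also report whether the
			// endpoint is resolvable at all to aid debugging.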
			if retryCount >= 5 {
				logger.Info("Unable to connect to %s: %v\n", endpoints[i], IsServerResolvable(endpoints[i]))
			}
		}
	}

	// Attempt to load all `format.json` from all disks.
	formatConfigs, sErrs := loadFormatXLAll(storageDisks)

	// Check if we have any critical errors while loading `format.json`.
	for i, sErr := range sErrs {
		if _, ok := formatCriticalErrors[sErr]; ok {
			return nil, fmt.Errorf("Disk %s: %w", endpoints[i], sErr)
		}
	}

	// Pre-emptively check if one of the formatted disks
	// is invalid. This function returns success for the
	// most part unless one of the formats is not consistent
	// with expected XL format. For example if a user is
	// trying to pool FS backend into an XL set.
	if err := checkFormatXLValues(formatConfigs); err != nil {
		return nil, err
	}

	// If all disks report unformatted, we should initialize everyone.
	if shouldInitXLDisks(sErrs) && firstDisk {
		logger.Info("Formatting %v zone, %v set(s), %v drives per set.",
			zoneCount, setCount, drivesPerSet)

		// Initialize erasure code format on disks
		format, err := initFormatXL(context.Background(), storageDisks, setCount, drivesPerSet, deploymentID)
		if err != nil {
			return nil, err
		}

		// Assign globalDeploymentID on first run for the
		// minio server managing the first disk
		globalDeploymentID = format.ID
		return format, nil
	}

	// Return an error when a quorum of disks is unformatted, indicating we
	// are waiting for the first server to be online.
	if quorumUnformattedDisks(sErrs) && !firstDisk {
		return nil, errNotFirstDisk
	}

	// Return an error when a quorum of disks is unformatted but we are
	// waiting for the rest of the servers to be online.
	if quorumUnformattedDisks(sErrs) && firstDisk {
		return nil, errFirstDiskWait
	}

	// The following function call fixes a regression introduced in release
	// RELEASE.2018-03-16T22-52-12Z after migrating v1 to v2 to v3. That
	// migration failed to properly capture the '.This' field, which indicates
	// the disk UUID association. fixFormatXLV3 is called to handle and fix
	// this regression; for more info refer to
	// https://github.com/minio/minio/issues/5667
	if err := fixFormatXLV3(storageDisks, endpoints, formatConfigs); err != nil {
		return nil, err
	}

	// If any of the '.This' fields is still empty, we return an error.
	if formatXLV3ThisEmpty(formatConfigs) {
		return nil, errXLV3ThisEmpty
	}

	format, err := getFormatXLInQuorum(formatConfigs)
	if err != nil {
		return nil, err
	}
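
	// An empty format ID means no deployment ID has been assigned yet;
	// only the first server is allowed to fix it.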
	if format.ID == "" {
		// Not a first disk, wait until first disk fixes deploymentID
		if !firstDisk {
			return nil, errNotFirstDisk
		}
		if err = formatXLFixDeploymentID(endpoints, storageDisks, format); err != nil {
			return nil, err
		}
	}
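
	// Adopt the deployment ID from the quorum format and propagate it
	// to any local disks that are still missing it.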
	globalDeploymentID = format.ID

	if err = formatXLFixLocalDeploymentID(endpoints, storageDisks, format); err != nil {
		return nil, err
	}

	// This will always recreate some directories inside .minio.sys of
	// the local disk such as tmp, multipart and background-ops
	initXLMetaVolumesInLocalDisks(storageDisks, formatConfigs)

	return format, nil
}

// Format disks before initialization of object layer.
func waitForFormatXL(firstDisk bool, endpoints Endpoints, zoneCount, setCount, drivesPerSet int, deploymentID string) (format *formatXLV3, err error) {
	if len(endpoints) == 0 || setCount == 0 || drivesPerSet == 0 {
		return nil, errInvalidArgument
	}

	if err = formatXLMigrateLocalEndpoints(endpoints); err != nil {
		return nil, err
	}

	if err = formatXLCleanupTmpLocalEndpoints(endpoints); err != nil {
		return nil, err
	}

	// Prepare getElapsedTime() to calculate the elapsed time since we started
	// trying to format the disks. All times are rounded to the nearest second
	// to avoid showing milli-, micro- and nanoseconds.
	formatStartTime := time.Now().Round(time.Second)
	getElapsedTime := func() string {
		return time.Now().Round(time.Second).Sub(formatStartTime).String()
	}

	// Wait on each try for an update.
	ticker := time.NewTicker(500 * time.Millisecond)
	defer ticker.Stop()

	var tries int
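	// Keep retrying on every tick until the reference format is obtained,
	// the disks are formatted, or we are signalled to shut down.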
	for {
		select {
		case <-ticker.C:
			format, err := connectLoadInitFormats(tries, firstDisk, endpoints, zoneCount, setCount, drivesPerSet, deploymentID)
			if err != nil {
				tries++
				switch err {
				case errNotFirstDisk:
					// Fresh setup, wait for first server to be up.
					logger.Info("Waiting for the first server to format the disks.")
					continue
				case errFirstDiskWait:
					// Fresh setup, wait for other servers to come up.
					logger.Info("Waiting for all other servers to be online to format the disks.")
					continue
				case errXLReadQuorum:
					// No quorum available, continue to wait for the minimum number of servers.
					logger.Info("Waiting for a minimum of %d disks to come online (elapsed %s)\n", len(endpoints)/2, getElapsedTime())
					continue
				case errXLV3ThisEmpty:
					// Need to wait for this error to be healed, so continue.
					continue
				default:
					// For all other unhandled errors we exit and fail.
					return nil, err
				}
			}
			return format, nil
		case <-globalOSSignalCh:
			return nil, fmt.Errorf("Initializing data volumes gracefully stopped")
		}
	}
}