[feat] use rename instead of recursive deletes (#11641)

most of the delete calls today spend time in
a blocking operation where multiple calls need
to be recursively sent to delete the objects,
instead we can use rename operation to atomically
move the objects from the namespace to `tmp/.trash`

we can schedule deletion of objects at this
location once in 15, 30mins and we can also add
wait times between each delete operation.

this allows us to make delete's faster as well
less chattier on the drives, each server runs locally
a groutine which would clean this up regularly.
This commit is contained in:
Harshavardhana 2021-02-26 09:52:27 -08:00 committed by GitHub
parent 1f659204a2
commit 6386b45c08
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 179 additions and 87 deletions

View file

@ -20,6 +20,7 @@ import (
"context" "context"
"fmt" "fmt"
"io" "io"
"os"
"path" "path"
"sort" "sort"
"strconv" "strconv"
@ -138,39 +139,39 @@ func (er erasureObjects) deleteAll(ctx context.Context, bucket, prefix string) {
// Remove the old multipart uploads on the given disk. // Remove the old multipart uploads on the given disk.
func (er erasureObjects) cleanupStaleUploadsOnDisk(ctx context.Context, disk StorageAPI, expiry time.Duration) { func (er erasureObjects) cleanupStaleUploadsOnDisk(ctx context.Context, disk StorageAPI, expiry time.Duration) {
now := time.Now() now := time.Now()
shaDirs, err := disk.ListDir(ctx, minioMetaMultipartBucket, "", -1) diskPath := disk.Endpoint().Path
if err != nil {
return readDirFn(pathJoin(diskPath, minioMetaMultipartBucket), func(shaDir string, typ os.FileMode) error {
} return readDirFn(pathJoin(diskPath, minioMetaMultipartBucket, shaDir), func(uploadIDDir string, typ os.FileMode) error {
for _, shaDir := range shaDirs {
uploadIDDirs, err := disk.ListDir(ctx, minioMetaMultipartBucket, shaDir, -1)
if err != nil {
continue
}
for _, uploadIDDir := range uploadIDDirs {
uploadIDPath := pathJoin(shaDir, uploadIDDir) uploadIDPath := pathJoin(shaDir, uploadIDDir)
fi, err := disk.ReadVersion(ctx, minioMetaMultipartBucket, uploadIDPath, "", false) fi, err := disk.ReadVersion(ctx, minioMetaMultipartBucket, uploadIDPath, "", false)
if err != nil { if err != nil {
continue return nil
} }
wait := er.deletedCleanupSleeper.Timer(ctx)
if now.Sub(fi.ModTime) > expiry { if now.Sub(fi.ModTime) > expiry {
er.renameAll(ctx, minioMetaMultipartBucket, uploadIDPath) er.renameAll(ctx, minioMetaMultipartBucket, uploadIDPath)
} }
wait()
return nil
})
})
readDirFn(pathJoin(diskPath, minioMetaTmpBucket), func(tmpDir string, typ os.FileMode) error {
if tmpDir == ".trash/" { // do not remove .trash/ here, it has its own routines
return nil
} }
}
tmpDirs, err := disk.ListDir(ctx, minioMetaTmpBucket, "", -1)
if err != nil {
return
}
for _, tmpDir := range tmpDirs {
vi, err := disk.StatVol(ctx, pathJoin(minioMetaTmpBucket, tmpDir)) vi, err := disk.StatVol(ctx, pathJoin(minioMetaTmpBucket, tmpDir))
if err != nil { if err != nil {
continue return nil
} }
wait := er.deletedCleanupSleeper.Timer(ctx)
if now.Sub(vi.Created) > expiry { if now.Sub(vi.Created) > expiry {
er.deleteAll(ctx, minioMetaTmpBucket, tmpDir) er.deleteAll(ctx, minioMetaTmpBucket, tmpDir)
} }
} wait()
return nil
})
} }
// ListMultipartUploads - lists all the pending multipart // ListMultipartUploads - lists all the pending multipart

View file

@ -403,21 +403,28 @@ func newErasureSets(ctx context.Context, endpoints Endpoints, storageDisks []Sto
// Initialize erasure objects for a given set. // Initialize erasure objects for a given set.
s.sets[i] = &erasureObjects{ s.sets[i] = &erasureObjects{
setNumber: i, setNumber: i,
setDriveCount: setDriveCount, setDriveCount: setDriveCount,
defaultParityCount: defaultParityCount, defaultParityCount: defaultParityCount,
getDisks: s.GetDisks(i), getDisks: s.GetDisks(i),
getLockers: s.GetLockers(i), getLockers: s.GetLockers(i),
getEndpoints: s.GetEndpoints(i), getEndpoints: s.GetEndpoints(i),
nsMutex: mutex, deletedCleanupSleeper: newDynamicSleeper(10, 10*time.Second),
bp: bp, nsMutex: mutex,
mrfOpCh: make(chan partialOperation, 10000), bp: bp,
mrfOpCh: make(chan partialOperation, 10000),
} }
} }
// cleanup ".trash/" folder every 30 minutes with sufficient sleep cycles.
const deletedObjectsCleanupInterval = 30 * time.Minute
// start cleanup stale uploads go-routine. // start cleanup stale uploads go-routine.
go s.cleanupStaleUploads(ctx, GlobalStaleUploadsCleanupInterval, GlobalStaleUploadsExpiry) go s.cleanupStaleUploads(ctx, GlobalStaleUploadsCleanupInterval, GlobalStaleUploadsExpiry)
// start cleanup of deleted objects.
go s.cleanupDeletedObjects(ctx, deletedObjectsCleanupInterval)
// Start the disk monitoring and connect routine. // Start the disk monitoring and connect routine.
go s.monitorAndConnectEndpoints(ctx, defaultMonitorConnectEndpointInterval) go s.monitorAndConnectEndpoints(ctx, defaultMonitorConnectEndpointInterval)
go s.maintainMRFList() go s.maintainMRFList()
@ -426,6 +433,25 @@ func newErasureSets(ctx context.Context, endpoints Endpoints, storageDisks []Sto
return s, nil return s, nil
} }
func (s *erasureSets) cleanupDeletedObjects(ctx context.Context, cleanupInterval time.Duration) {
timer := time.NewTimer(cleanupInterval)
defer timer.Stop()
for {
select {
case <-ctx.Done():
return
case <-timer.C:
// Reset for the next interval
timer.Reset(cleanupInterval)
for _, set := range s.sets {
set.cleanupDeletedObjects(ctx)
}
}
}
}
func (s *erasureSets) cleanupStaleUploads(ctx context.Context, cleanupInterval, expiry time.Duration) { func (s *erasureSets) cleanupStaleUploads(ctx context.Context, cleanupInterval, expiry time.Duration) {
timer := time.NewTimer(cleanupInterval) timer := time.NewTimer(cleanupInterval)
defer timer.Stop() defer timer.Stop()

View file

@ -21,6 +21,7 @@ import (
"errors" "errors"
"fmt" "fmt"
"math/rand" "math/rand"
"os"
"sort" "sort"
"sync" "sync"
"time" "time"
@ -71,6 +72,8 @@ type erasureObjects struct {
bp *bpool.BytePoolCap bp *bpool.BytePoolCap
mrfOpCh chan partialOperation mrfOpCh chan partialOperation
deletedCleanupSleeper *dynamicSleeper
} }
// NewNSLock - initialize a new namespace RWLocker instance. // NewNSLock - initialize a new namespace RWLocker instance.
@ -273,6 +276,28 @@ func (er erasureObjects) getOnlineDisksWithHealing() (newDisks []StorageAPI, hea
return newDisks, healing return newDisks, healing
} }
// Clean-up previously deleted objects. from .minio.sys/tmp/.trash/
func (er erasureObjects) cleanupDeletedObjects(ctx context.Context) {
// run multiple cleanup's local to this server.
var wg sync.WaitGroup
for _, disk := range er.getLoadBalancedLocalDisks() {
if disk != nil {
wg.Add(1)
go func(disk StorageAPI) {
defer wg.Done()
diskPath := disk.Endpoint().Path
readDirFn(pathJoin(diskPath, minioMetaTmpDeletedBucket), func(ddir string, typ os.FileMode) error {
wait := er.deletedCleanupSleeper.Timer(ctx)
removeAll(pathJoin(diskPath, minioMetaTmpDeletedBucket, ddir))
wait()
return nil
})
}(disk)
}
}
wg.Wait()
}
// CrawlAndGetDataUsage will start crawling buckets and send updated totals as they are traversed. // CrawlAndGetDataUsage will start crawling buckets and send updated totals as they are traversed.
// Updates are sent on a regular basis and the caller *must* consume them. // Updates are sent on a regular basis and the caller *must* consume them.
func (er erasureObjects) crawlAndGetDataUsage(ctx context.Context, buckets []BucketInfo, bf *bloomFilter, updates chan<- dataUsageCache) error { func (er erasureObjects) crawlAndGetDataUsage(ctx context.Context, buckets []BucketInfo, bf *bloomFilter, updates chan<- dataUsageCache) error {

View file

@ -22,6 +22,7 @@ import (
"os" "os"
pathutil "path" pathutil "path"
"runtime" "runtime"
"strings"
"github.com/minio/minio/cmd/logger" "github.com/minio/minio/cmd/logger"
"github.com/minio/minio/pkg/lock" "github.com/minio/minio/pkg/lock"
@ -392,6 +393,55 @@ func fsRenameFile(ctx context.Context, sourcePath, destPath string) error {
return nil return nil
} }
func deleteFile(basePath, deletePath string, recursive bool) error {
if basePath == "" || deletePath == "" {
return nil
}
isObjectDir := HasSuffix(deletePath, SlashSeparator)
basePath = pathutil.Clean(basePath)
deletePath = pathutil.Clean(deletePath)
if !strings.HasPrefix(deletePath, basePath) || deletePath == basePath {
return nil
}
var err error
if recursive {
os.RemoveAll(deletePath)
} else {
err = os.Remove(deletePath)
}
if err != nil {
switch {
case isSysErrNotEmpty(err):
// if object is a directory, but if its not empty
// return FileNotFound to indicate its an empty prefix.
if isObjectDir {
return errFileNotFound
}
// Ignore errors if the directory is not empty. The server relies on
// this functionality, and sometimes uses recursion that should not
// error on parent directories.
return nil
case osIsNotExist(err):
return errFileNotFound
case osIsPermission(err):
return errFileAccessDenied
case isSysErrIO(err):
return errFaultyDisk
default:
return err
}
}
deletePath = pathutil.Dir(deletePath)
// Delete parent directory obviously not recursively. Errors for
// parent directories shouldn't trickle down.
deleteFile(basePath, deletePath, false)
return nil
}
// fsDeleteFile is a wrapper for deleteFile(), after checking the path length. // fsDeleteFile is a wrapper for deleteFile(), after checking the path length.
func fsDeleteFile(ctx context.Context, basePath, deletePath string) error { func fsDeleteFile(ctx context.Context, basePath, deletePath string) error {
if err := checkPathLength(basePath); err != nil { if err := checkPathLength(basePath); err != nil {

View file

@ -87,8 +87,9 @@ const (
// GlobalStaleUploadsExpiry - Expiry duration after which the uploads in multipart, tmp directory are deemed stale. // GlobalStaleUploadsExpiry - Expiry duration after which the uploads in multipart, tmp directory are deemed stale.
GlobalStaleUploadsExpiry = time.Hour * 24 // 24 hrs. GlobalStaleUploadsExpiry = time.Hour * 24 // 24 hrs.
// GlobalStaleUploadsCleanupInterval - Cleanup interval when the stale uploads cleanup is initiated. // GlobalStaleUploadsCleanupInterval - Cleanup interval when the stale uploads cleanup is initiated.
GlobalStaleUploadsCleanupInterval = time.Hour * 24 // 24 hrs. GlobalStaleUploadsCleanupInterval = time.Hour * 12 // 12 hrs.
// GlobalServiceExecutionInterval - Executes the Lifecycle events. // GlobalServiceExecutionInterval - Executes the Lifecycle events.
GlobalServiceExecutionInterval = time.Hour * 24 // 24 hrs. GlobalServiceExecutionInterval = time.Hour * 24 // 24 hrs.
@ -96,7 +97,7 @@ const (
// Refresh interval to update in-memory iam config cache. // Refresh interval to update in-memory iam config cache.
globalRefreshIAMInterval = 5 * time.Minute globalRefreshIAMInterval = 5 * time.Minute
// Limit of location constraint XML for unauthenticted PUT bucket operations. // Limit of location constraint XML for unauthenticated PUT bucket operations.
maxLocationConstraintSize = 3 * humanize.MiByte maxLocationConstraintSize = 3 * humanize.MiByte
// Maximum size of default bucket encryption configuration allowed // Maximum size of default bucket encryption configuration allowed

View file

@ -23,6 +23,7 @@ import (
"io" "io"
"os" "os"
"path" "path"
pathutil "path"
"strings" "strings"
"sync" "sync"
"time" "time"
@ -35,7 +36,7 @@ func renameAllBucketMetacache(epPath string) error {
// to `.minio.sys/tmp/` for deletion. // to `.minio.sys/tmp/` for deletion.
return readDirFn(pathJoin(epPath, minioMetaBucket, bucketMetaPrefix), func(name string, typ os.FileMode) error { return readDirFn(pathJoin(epPath, minioMetaBucket, bucketMetaPrefix), func(name string, typ os.FileMode) error {
if typ == os.ModeDir { if typ == os.ModeDir {
tmpMetacacheOld := pathJoin(epPath, minioMetaTmpBucket+"-old", mustGetUUID()) tmpMetacacheOld := pathutil.Join(epPath, minioMetaTmpDeletedBucket, mustGetUUID())
if err := renameAll(pathJoin(epPath, minioMetaBucket, metacachePrefixForID(name, slashSeparator)), if err := renameAll(pathJoin(epPath, minioMetaBucket, metacachePrefixForID(name, slashSeparator)),
tmpMetacacheOld); err != nil && err != errFileNotFound { tmpMetacacheOld); err != nil && err != errFileNotFound {
return fmt.Errorf("unable to rename (%s -> %s) %w", return fmt.Errorf("unable to rename (%s -> %s) %w",

View file

@ -340,8 +340,19 @@ func testObjectAPIPutObjectStaleFiles(obj ObjectLayer, instanceType string, disk
for _, disk := range disks { for _, disk := range disks {
tmpMetaDir := path.Join(disk, minioMetaTmpBucket) tmpMetaDir := path.Join(disk, minioMetaTmpBucket)
if !isDirEmpty(tmpMetaDir) { files, err := ioutil.ReadDir(tmpMetaDir)
t.Fatalf("%s: expected: empty, got: non-empty", minioMetaTmpBucket) if err != nil {
t.Fatal(err)
}
var found bool
for _, fi := range files {
if fi.Name() == ".trash" {
continue
}
found = true
}
if found {
t.Fatalf("%s: expected: empty, got: non-empty %#v", minioMetaTmpBucket, files)
} }
} }
} }
@ -418,8 +429,17 @@ func testObjectAPIMultipartPutObjectStaleFiles(obj ObjectLayer, instanceType str
t.Errorf("%s", err) t.Errorf("%s", err)
} }
if len(files) != 0 { var found bool
t.Fatalf("%s: expected: empty, got: non-empty. content: %s", tmpMetaDir, files) for _, fi := range files {
if fi.Name() == ".trash" {
continue
}
found = true
break
}
if found {
t.Fatalf("%s: expected: empty, got: non-empty. content: %#v", tmpMetaDir, files)
} }
} }
} }

View file

@ -58,8 +58,11 @@ const (
mpartMetaPrefix = "multipart" mpartMetaPrefix = "multipart"
// MinIO Multipart meta prefix. // MinIO Multipart meta prefix.
minioMetaMultipartBucket = minioMetaBucket + SlashSeparator + mpartMetaPrefix minioMetaMultipartBucket = minioMetaBucket + SlashSeparator + mpartMetaPrefix
// MinIO Tmp meta prefix. // MinIO tmp meta prefix.
minioMetaTmpBucket = minioMetaBucket + "/tmp" minioMetaTmpBucket = minioMetaBucket + "/tmp"
// MinIO tmp meta prefix for deleted objects.
minioMetaTmpDeletedBucket = minioMetaTmpBucket + "/.trash"
// DNS separator (period), used for bucket name validation. // DNS separator (period), used for bucket name validation.
dnsDelimiter = "." dnsDelimiter = "."
// On compressed files bigger than this; // On compressed files bigger than this;

View file

@ -940,7 +940,7 @@ func (s *xlStorage) DeleteVersion(ctx context.Context, volume, path string, fi F
if !isXL2V1Format(buf) { if !isXL2V1Format(buf) {
// Delete the meta file, if there are no more versions the // Delete the meta file, if there are no more versions the
// top level parent is automatically removed. // top level parent is automatically removed.
return deleteFile(volumeDir, pathJoin(volumeDir, path), true) return s.deleteFile(volumeDir, pathJoin(volumeDir, path), true)
} }
var xlMeta xlMetaV2 var xlMeta xlMetaV2
@ -967,7 +967,8 @@ func (s *xlStorage) DeleteVersion(ctx context.Context, volume, path string, fi F
return err return err
} }
if err = removeAll(filePath); err != nil { tmpuuid := mustGetUUID()
if err = renameAll(filePath, pathutil.Join(s.diskPath, minioMetaTmpDeletedBucket, tmpuuid)); err != nil {
return err return err
} }
} }
@ -985,7 +986,7 @@ func (s *xlStorage) DeleteVersion(ctx context.Context, volume, path string, fi F
return err return err
} }
return deleteFile(volumeDir, filePath, false) return s.deleteFile(volumeDir, filePath, false)
} }
// WriteMetadata - writes FileInfo metadata for path at `xl.meta` // WriteMetadata - writes FileInfo metadata for path at `xl.meta`
@ -1828,7 +1829,7 @@ func (s *xlStorage) CheckFile(ctx context.Context, volume string, path string) e
// move up the tree, deleting empty parent directories until it finds one // move up the tree, deleting empty parent directories until it finds one
// with files in it. Returns nil for a non-empty directory even when // with files in it. Returns nil for a non-empty directory even when
// recursive is set to false. // recursive is set to false.
func deleteFile(basePath, deletePath string, recursive bool) error { func (s *xlStorage) deleteFile(basePath, deletePath string, recursive bool) error {
if basePath == "" || deletePath == "" { if basePath == "" || deletePath == "" {
return nil return nil
} }
@ -1841,7 +1842,8 @@ func deleteFile(basePath, deletePath string, recursive bool) error {
var err error var err error
if recursive { if recursive {
err = os.RemoveAll(deletePath) tmpuuid := mustGetUUID()
err = renameAll(deletePath, pathutil.Join(s.diskPath, minioMetaTmpDeletedBucket, tmpuuid))
} else { } else {
err = os.Remove(deletePath) err = os.Remove(deletePath)
} }
@ -1872,7 +1874,7 @@ func deleteFile(basePath, deletePath string, recursive bool) error {
// Delete parent directory obviously not recursively. Errors for // Delete parent directory obviously not recursively. Errors for
// parent directories shouldn't trickle down. // parent directories shouldn't trickle down.
deleteFile(basePath, deletePath, false) s.deleteFile(basePath, deletePath, false)
return nil return nil
} }
@ -1910,46 +1912,7 @@ func (s *xlStorage) Delete(ctx context.Context, volume string, path string, recu
} }
// Delete file and delete parent directory as well if it's empty. // Delete file and delete parent directory as well if it's empty.
return deleteFile(volumeDir, filePath, recursive) return s.deleteFile(volumeDir, filePath, recursive)
}
func (s *xlStorage) DeleteFileBulk(volume string, paths []string) (errs []error, err error) {
atomic.AddInt32(&s.activeIOCount, 1)
defer func() {
atomic.AddInt32(&s.activeIOCount, -1)
}()
volumeDir, err := s.getVolDir(volume)
if err != nil {
return nil, err
}
// Stat a volume entry.
_, err = os.Lstat(volumeDir)
if err != nil {
if osIsNotExist(err) {
return nil, errVolumeNotFound
} else if osIsPermission(err) {
return nil, errVolumeAccessDenied
} else if isSysErrIO(err) {
return nil, errFaultyDisk
}
return nil, err
}
errs = make([]error, len(paths))
// Following code is needed so that we retain SlashSeparator
// suffix if any in path argument.
for idx, path := range paths {
filePath := pathJoin(volumeDir, path)
errs[idx] = checkPathLength(filePath)
if errs[idx] != nil {
continue
}
// Delete file and delete parent directory as well if its empty.
errs[idx] = deleteFile(volumeDir, filePath, false)
}
return
} }
// RenameData - rename source path to destination path atomically, metadata and data directory. // RenameData - rename source path to destination path atomically, metadata and data directory.
@ -2180,8 +2143,10 @@ func (s *xlStorage) RenameData(ctx context.Context, srcVolume, srcPath, dataDir,
// Commit data // Commit data
if srcDataPath != "" { if srcDataPath != "" {
removeAll(oldDstDataPath) tmpuuid := mustGetUUID()
removeAll(dstDataPath) renameAll(oldDstDataPath, pathutil.Join(s.diskPath, minioMetaTmpDeletedBucket, tmpuuid))
tmpuuid = mustGetUUID()
renameAll(dstDataPath, pathutil.Join(s.diskPath, minioMetaTmpDeletedBucket, tmpuuid))
if err = renameAll(srcDataPath, dstDataPath); err != nil { if err = renameAll(srcDataPath, dstDataPath); err != nil {
return osErrToFileErr(err) return osErrToFileErr(err)
} }
@ -2194,12 +2159,12 @@ func (s *xlStorage) RenameData(ctx context.Context, srcVolume, srcPath, dataDir,
// Remove parent dir of the source file if empty // Remove parent dir of the source file if empty
if parentDir := pathutil.Dir(srcFilePath); isDirEmpty(parentDir) { if parentDir := pathutil.Dir(srcFilePath); isDirEmpty(parentDir) {
deleteFile(srcVolumeDir, parentDir, false) s.deleteFile(srcVolumeDir, parentDir, false)
} }
if srcDataPath != "" { if srcDataPath != "" {
if parentDir := pathutil.Dir(srcDataPath); isDirEmpty(parentDir) { if parentDir := pathutil.Dir(srcDataPath); isDirEmpty(parentDir) {
deleteFile(srcVolumeDir, parentDir, false) s.deleteFile(srcVolumeDir, parentDir, false)
} }
} }
@ -2286,7 +2251,7 @@ func (s *xlStorage) RenameFile(ctx context.Context, srcVolume, srcPath, dstVolum
// Remove parent dir of the source file if empty // Remove parent dir of the source file if empty
if parentDir := pathutil.Dir(srcFilePath); isDirEmpty(parentDir) { if parentDir := pathutil.Dir(srcFilePath); isDirEmpty(parentDir) {
deleteFile(srcVolumeDir, parentDir, false) s.deleteFile(srcVolumeDir, parentDir, false)
} }
return nil return nil