erasure: simplify XL backend operations (#4649) (#4758)

This change provides new implementations of the XL backend operations:
 - create file
 - read   file
 - heal   file
Furthermore, this change adds table-based tests for all three operations.

This also affects the bitrot algorithm integration. Algorithms are now
integrated in an idiomatic way (like crypto.Hash).
Fixes #4696
Fixes #4649
Fixes #4359
Andreas Auernhammer 2017-08-14 18:08:42 -07:00 committed by Dee Koder
parent 617f2394fb
commit 85fcee1919
33 changed files with 1238 additions and 1863 deletions
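
For readers unfamiliar with the crypto.Hash pattern the commit message refers to, here is a minimal sketch of what such an integration looks like in Go. BitrotAlgorithm, Available, New, SHA256 and BLAKE2b512 are names that appear in the diff below; the registry map and its contents are illustrative assumptions, not this commit's actual implementation.

package cmd

import "hash"

// A crypto.Hash-style integration sketch. The registry map below is an
// assumption for illustration only.
type BitrotAlgorithm uint

var bitrotAlgorithms = map[BitrotAlgorithm]func() hash.Hash{
	// e.g. SHA256: sha256.New, BLAKE2b512: a BLAKE2b-512 constructor
}

// Available reports whether the algorithm is registered, like crypto.Hash.Available.
func (a BitrotAlgorithm) Available() bool {
	_, ok := bitrotAlgorithms[a]
	return ok
}

// New returns a fresh hash.Hash for the algorithm, like crypto.Hash.New.
func (a BitrotAlgorithm) New() hash.Hash {
	if f, ok := bitrotAlgorithms[a]; ok {
		return f()
	}
	panic("bitrot algorithm is not available")
}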


@@ -17,128 +17,79 @@
package cmd
import (
"encoding/hex"
"hash"
"io"
"sync"
"github.com/klauspost/reedsolomon"
)
// erasureCreateFile - writes an entire stream by erasure coding to
// all the disks and also calculates each block's checksum
// for future bit-rot protection.
func erasureCreateFile(disks []StorageAPI, volume, path string, reader io.Reader, allowEmpty bool, blockSize int64,
dataBlocks, parityBlocks int, algo HashAlgo, writeQuorum int) (newDisks []StorageAPI, bytesWritten int64, checkSums []string, err error) {
// CreateFile creates a new bitrot encoded file spread over all available disks. CreateFile will create
// the file at the given volume and path. It will read from src until an io.EOF occurs. The given algorithm will
// be used to protect the erasure encoded file.
func (s *ErasureStorage) CreateFile(src io.Reader, volume, path string, buffer []byte, algorithm BitrotAlgorithm, writeQuorum int) (f ErasureFileInfo, err error) {
if !algorithm.Available() {
return f, traceError(errBitrotHashAlgoInvalid)
}
f.Checksums = make([][]byte, len(s.disks))
hashers := make([]hash.Hash, len(s.disks))
for i := range hashers {
hashers[i] = algorithm.New()
}
errChans, errors := make([]chan error, len(s.disks)), make([]error, len(s.disks))
for i := range errChans {
errChans[i] = make(chan error, 1) // create buffered channel to let finished go-routines die early
}
// Allocated blockSized buffer for reading from incoming stream.
buf := make([]byte, blockSize)
hashWriters := newHashWriters(len(disks), algo)
// Read until io.EOF, erasure codes data and writes to all disks.
for {
var blocks [][]byte
n, rErr := io.ReadFull(reader, buf)
// FIXME: this is a bug in Golang, n == 0 and err ==
// io.ErrUnexpectedEOF for io.ReadFull function.
if n == 0 && rErr == io.ErrUnexpectedEOF {
return nil, 0, nil, traceError(rErr)
}
if rErr == io.EOF {
// We have reached EOF on the first byte read, the io.Reader
// must be 0 bytes, we don't need to erasure code
// data. Will create a 0-byte file instead.
if bytesWritten == 0 && allowEmpty {
blocks = make([][]byte, len(disks))
newDisks, rErr = appendFile(disks, volume, path, blocks, hashWriters, writeQuorum)
if rErr != nil {
return nil, 0, nil, rErr
}
} // else we have reached EOF after few reads, no need to
// add an additional 0bytes at the end.
break
}
if rErr != nil && rErr != io.ErrUnexpectedEOF {
return nil, 0, nil, traceError(rErr)
}
if n > 0 {
// Returns encoded blocks.
var enErr error
blocks, enErr = encodeData(buf[0:n], dataBlocks, parityBlocks)
if enErr != nil {
return nil, 0, nil, enErr
blocks, n := [][]byte{}, len(buffer)
for n == len(buffer) {
n, err = io.ReadFull(src, buffer)
if n == 0 && err == io.EOF {
if f.Size != 0 { // don't write empty block if we have written to the disks
break
}
// Write to all disks.
if newDisks, err = appendFile(disks, volume, path, blocks, hashWriters, writeQuorum); err != nil {
return nil, 0, nil, err
blocks = make([][]byte, len(s.disks)) // write empty block
} else if err == nil || (n > 0 && err == io.ErrUnexpectedEOF) {
blocks, err = s.ErasureEncode(buffer[:n])
if err != nil {
return f, err
}
bytesWritten += int64(n)
} else {
return f, traceError(err)
}
for i := range errChans { // spawn workers
go erasureAppendFile(s.disks[i], volume, path, hashers[i], blocks[i], errChans[i])
}
for i := range errChans { // wait until all workers are finished
errors[i] = <-errChans[i]
}
if err = reduceWriteQuorumErrs(errors, objectOpIgnoredErrs, writeQuorum); err != nil {
return f, err
}
s.disks = evalDisks(s.disks, errors)
f.Size += int64(n)
}
checkSums = make([]string, len(disks))
for i := range checkSums {
checkSums[i] = hex.EncodeToString(hashWriters[i].Sum(nil))
}
return newDisks, bytesWritten, checkSums, nil
}
// encodeData - encodes incoming data buffer into
// dataBlocks+parityBlocks returns a 2 dimensional byte array.
func encodeData(dataBuffer []byte, dataBlocks, parityBlocks int) ([][]byte, error) {
rs, err := reedsolomon.New(dataBlocks, parityBlocks)
if err != nil {
return nil, traceError(err)
}
// Split the input buffer into data and parity blocks.
var blocks [][]byte
blocks, err = rs.Split(dataBuffer)
if err != nil {
return nil, traceError(err)
}
// Encode parity blocks using data blocks.
err = rs.Encode(blocks)
if err != nil {
return nil, traceError(err)
}
// Return encoded blocks.
return blocks, nil
}
// appendFile - append data buffer at path.
func appendFile(disks []StorageAPI, volume, path string, enBlocks [][]byte, hashWriters []hash.Hash, writeQuorum int) ([]StorageAPI, error) {
var wg = &sync.WaitGroup{}
var wErrs = make([]error, len(disks))
// Write encoded data to quorum disks in parallel.
for index, disk := range disks {
if disk == nil {
wErrs[index] = traceError(errDiskNotFound)
f.Algorithm = algorithm
for i, disk := range s.disks {
if disk == OfflineDisk {
continue
}
wg.Add(1)
// Write encoded data in routine.
go func(index int, disk StorageAPI) {
defer wg.Done()
wErr := disk.AppendFile(volume, path, enBlocks[index])
if wErr != nil {
wErrs[index] = traceError(wErr)
return
}
// Calculate hash for each blocks.
hashWriters[index].Write(enBlocks[index])
// Successfully wrote.
wErrs[index] = nil
}(index, disk)
f.Checksums[i] = hashers[i].Sum(nil)
}
// Wait for all the appends to finish.
wg.Wait()
return evalDisks(disks, wErrs), reduceWriteQuorumErrs(wErrs, objectOpIgnoredErrs, writeQuorum)
return f, nil
}
// erasureAppendFile appends the content of buf to the file on the given disk and computes
// the hash of the written data. It sends the write error (or nil) over the error channel.
func erasureAppendFile(disk StorageAPI, volume, path string, hash hash.Hash, buf []byte, errChan chan<- error) {
if disk == OfflineDisk {
errChan <- traceError(errDiskNotFound)
return
}
err := disk.AppendFile(volume, path, buf)
if err != nil {
errChan <- err
return
}
hash.Write(buf)
errChan <- err
}
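
To make the new write path concrete, the following is a hypothetical caller of the new API (a sketch, not part of the commit), following the call pattern used by the tests below; the helper name writeErasureObject and the bucket/object names are made up for illustration.

// writeErasureObject is a hypothetical helper showing how a caller might use the new API.
func writeErasureObject(disks []StorageAPI, src io.Reader, dataBlocks, parityBlocks int) (ErasureFileInfo, error) {
	// NewErasureStorage and CreateFile are the APIs introduced by this change.
	storage, err := NewErasureStorage(disks, dataBlocks, parityBlocks)
	if err != nil {
		return ErasureFileInfo{}, err
	}
	// CreateFile stages one erasure block at a time in this buffer.
	buffer := make([]byte, blockSizeV1)
	// dataBlocks+1 is the write quorum used by the new tests below.
	return storage.CreateFile(src, "testbucket", "testobject", buffer, DefaultBitrotAlgorithm, dataBlocks+1)
}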


@@ -19,186 +19,104 @@ package cmd
import (
"bytes"
"crypto/rand"
"io"
"testing"
humanize "github.com/dustin/go-humanize"
"github.com/klauspost/reedsolomon"
)
// Simulates a faulty disk for AppendFile()
type AppendDiskDown struct {
*posix
}
type badDisk struct{ StorageAPI }
func (a AppendDiskDown) AppendFile(volume string, path string, buf []byte) error {
func (a badDisk) AppendFile(volume string, path string, buf []byte) error {
return errFaultyDisk
}
// Test erasureCreateFile()
func TestErasureCreateFile(t *testing.T) {
// Initialize environment needed for the test.
dataBlocks := 7
parityBlocks := 7
blockSize := int64(blockSizeV1)
setup, err := newErasureTestSetup(dataBlocks, parityBlocks, blockSize)
if err != nil {
t.Error(err)
return
}
defer setup.Remove()
const oneMiByte = 1 * humanize.MiByte
disks := setup.disks
// Prepare a slice of 1MiB with random data.
data := make([]byte, 1*humanize.MiByte)
_, err = rand.Read(data)
if err != nil {
t.Fatal(err)
}
// Test when all disks are up.
_, size, _, err := erasureCreateFile(disks, "testbucket", "testobject1", bytes.NewReader(data), true, blockSize, dataBlocks, parityBlocks, bitRotAlgo, dataBlocks+1)
if err != nil {
t.Fatal(err)
}
if size != int64(len(data)) {
t.Errorf("erasureCreateFile returned %d, expected %d", size, len(data))
}
// 2 disks down.
disks[4] = AppendDiskDown{disks[4].(*posix)}
disks[5] = AppendDiskDown{disks[5].(*posix)}
// Test when two disks are down.
_, size, _, err = erasureCreateFile(disks, "testbucket", "testobject2", bytes.NewReader(data), true, blockSize, dataBlocks, parityBlocks, bitRotAlgo, dataBlocks+1)
if err != nil {
t.Fatal(err)
}
if size != int64(len(data)) {
t.Errorf("erasureCreateFile returned %d, expected %d", size, len(data))
}
// 4 more disks down. 6 disks down in total.
disks[6] = AppendDiskDown{disks[6].(*posix)}
disks[7] = AppendDiskDown{disks[7].(*posix)}
disks[8] = AppendDiskDown{disks[8].(*posix)}
disks[9] = AppendDiskDown{disks[9].(*posix)}
_, size, _, err = erasureCreateFile(disks, "testbucket", "testobject3", bytes.NewReader(data), true, blockSize, dataBlocks, parityBlocks, bitRotAlgo, dataBlocks+1)
if err != nil {
t.Fatal(err)
}
if size != int64(len(data)) {
t.Errorf("erasureCreateFile returned %d, expected %d", size, len(data))
}
// 1 more disk down. 7 disks down in total. Should return quorum error.
disks[10] = AppendDiskDown{disks[10].(*posix)}
_, _, _, err = erasureCreateFile(disks, "testbucket", "testobject4", bytes.NewReader(data), true, blockSize, dataBlocks, parityBlocks, bitRotAlgo, dataBlocks+1)
if errorCause(err) != errXLWriteQuorum {
t.Errorf("erasureCreateFile return value: expected errXLWriteQuorum, got %s", err)
}
var erasureCreateFileTests = []struct {
dataBlocks int
onDisks, offDisks int
blocksize, data int64
offset int
algorithm BitrotAlgorithm
shouldFail, shouldFailQuorum bool
}{
{dataBlocks: 2, onDisks: 4, offDisks: 0, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 0, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, // 0
{dataBlocks: 3, onDisks: 6, offDisks: 0, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 1, algorithm: SHA256, shouldFail: false, shouldFailQuorum: false}, // 1
{dataBlocks: 4, onDisks: 8, offDisks: 2, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 2, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 2
{dataBlocks: 5, onDisks: 10, offDisks: 3, blocksize: int64(blockSizeV1), data: oneMiByte, offset: oneMiByte, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, // 3
{dataBlocks: 6, onDisks: 12, offDisks: 4, blocksize: int64(blockSizeV1), data: oneMiByte, offset: oneMiByte, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, // 4
{dataBlocks: 7, onDisks: 14, offDisks: 5, blocksize: int64(blockSizeV1), data: 0, offset: 0, shouldFail: false, algorithm: SHA256, shouldFailQuorum: false}, // 5
{dataBlocks: 8, onDisks: 16, offDisks: 7, blocksize: int64(blockSizeV1), data: 0, offset: 0, shouldFail: false, algorithm: DefaultBitrotAlgorithm, shouldFailQuorum: false}, // 6
{dataBlocks: 2, onDisks: 4, offDisks: 2, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 0, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: true}, // 7
{dataBlocks: 4, onDisks: 8, offDisks: 4, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 0, algorithm: SHA256, shouldFail: false, shouldFailQuorum: true}, // 8
{dataBlocks: 7, onDisks: 14, offDisks: 7, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 0, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true}, // 9
{dataBlocks: 8, onDisks: 16, offDisks: 8, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 0, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true}, // 10
{dataBlocks: 5, onDisks: 10, offDisks: 3, blocksize: int64(oneMiByte), data: oneMiByte, offset: 0, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 11
{dataBlocks: 6, onDisks: 12, offDisks: 5, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 102, algorithm: 0, shouldFail: true, shouldFailQuorum: false}, // 12
{dataBlocks: 3, onDisks: 6, offDisks: 1, blocksize: int64(blockSizeV1), data: oneMiByte, offset: oneMiByte / 2, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 13
{dataBlocks: 2, onDisks: 4, offDisks: 0, blocksize: int64(oneMiByte / 2), data: oneMiByte, offset: oneMiByte/2 + 1, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 14
{dataBlocks: 4, onDisks: 8, offDisks: 0, blocksize: int64(oneMiByte - 1), data: oneMiByte, offset: oneMiByte - 1, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, // 15
{dataBlocks: 8, onDisks: 12, offDisks: 2, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 2, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 16
{dataBlocks: 8, onDisks: 10, offDisks: 1, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 0, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 17
{dataBlocks: 10, onDisks: 14, offDisks: 0, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 17, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 18
{dataBlocks: 2, onDisks: 6, offDisks: 2, blocksize: int64(oneMiByte), data: oneMiByte, offset: oneMiByte / 2, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 19
{dataBlocks: 10, onDisks: 16, offDisks: 8, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 0, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true}, // 20
}
// TestErasureEncode checks for encoding for different data sets.
func TestErasureEncode(t *testing.T) {
// Collection of cases for encode cases.
testEncodeCases := []struct {
inputData []byte
inputDataBlocks int
inputParityBlocks int
shouldPass bool
expectedErr error
}{
// TestCase - 1.
// Regular data encoded.
{
[]byte("Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum."),
8,
8,
true,
nil,
},
// TestCase - 2.
// Empty data errors out.
{
[]byte(""),
8,
8,
false,
reedsolomon.ErrShortData,
},
// TestCase - 3.
// Single byte encoded.
{
[]byte("1"),
4,
4,
true,
nil,
},
// TestCase - 4.
// test case with negative data block.
{
[]byte("1"),
-1,
8,
false,
reedsolomon.ErrInvShardNum,
},
// TestCase - 5.
// test case with negative parity block.
{
[]byte("1"),
8,
-1,
false,
reedsolomon.ErrInvShardNum,
},
// TestCase - 6.
// test case with zero data block.
{
[]byte("1"),
0,
8,
false,
reedsolomon.ErrInvShardNum,
},
// TestCase - 7.
// test case with zero parity block.
{
[]byte("1"),
8,
0,
false,
reedsolomon.ErrInvShardNum,
},
// TestCase - 8.
// test case with data + parity blocks > 256.
// expected to fail with Error Max Shard number.
{
[]byte("1"),
129,
128,
false,
reedsolomon.ErrMaxShardNum,
},
}
// Test encode cases.
for i, testCase := range testEncodeCases {
_, actualErr := encodeData(testCase.inputData, testCase.inputDataBlocks, testCase.inputParityBlocks)
if actualErr != nil && testCase.shouldPass {
t.Errorf("Test %d: Expected to pass but failed instead with \"%s\"", i+1, actualErr)
func TestErasureCreateFile(t *testing.T) {
for i, test := range erasureCreateFileTests {
setup, err := newErasureTestSetup(test.dataBlocks, test.onDisks-test.dataBlocks, test.blocksize)
if err != nil {
t.Fatalf("Test %d: failed to create test setup: %v", i, err)
}
if actualErr == nil && !testCase.shouldPass {
t.Errorf("Test %d: Expected to fail with error <Error> \"%v\", but instead passed", i+1, testCase.expectedErr)
storage, err := NewErasureStorage(setup.disks, test.dataBlocks, test.onDisks-test.dataBlocks)
if err != nil {
setup.Remove()
t.Fatalf("Test %d: failed to create ErasureStorage: %v", i, err)
}
// Failed as expected, but does it fail for the expected reason.
if actualErr != nil && !testCase.shouldPass {
if errorCause(actualErr) != testCase.expectedErr {
t.Errorf("Test %d: Expected Error to be \"%v\", but instead found \"%v\" ", i+1, testCase.expectedErr, actualErr)
buffer := make([]byte, test.blocksize, 2*test.blocksize)
data := make([]byte, test.data)
if _, err = io.ReadFull(rand.Reader, data); err != nil {
setup.Remove()
t.Fatalf("Test %d: failed to generate random test data: %v", i, err)
}
algorithm := test.algorithm
if !algorithm.Available() {
algorithm = DefaultBitrotAlgorithm
}
file, err := storage.CreateFile(bytes.NewReader(data[test.offset:]), "testbucket", "object", buffer, test.algorithm, test.dataBlocks+1)
if err != nil && !test.shouldFail {
t.Errorf("Test %d: should pass but failed with: %v", i, err)
}
if err == nil && test.shouldFail {
t.Errorf("Test %d: should fail but it passed", i)
}
if err == nil {
if length := int64(len(data[test.offset:])); file.Size != length {
t.Errorf("Test %d: invalid number of bytes written: got: #%d want #%d", i, file.Size, length)
}
for j := range storage.disks[:test.offDisks] {
storage.disks[j] = badDisk{nil}
}
if test.offDisks > 0 {
storage.disks[0] = OfflineDisk
}
file, err = storage.CreateFile(bytes.NewReader(data[test.offset:]), "testbucket", "object2", buffer, test.algorithm, test.dataBlocks+1)
if err != nil && !test.shouldFailQuorum {
t.Errorf("Test %d: should pass but failed with: %v", i, err)
}
if err == nil && test.shouldFailQuorum {
t.Errorf("Test %d: should fail but it passed", i)
}
if err == nil {
if length := int64(len(data[test.offset:])); file.Size != length {
t.Errorf("Test %d: invalid number of bytes written: got: #%d want #%d", i, file.Size, length)
}
}
}
setup.Remove()
}
}


@@ -16,71 +16,75 @@
package cmd
import "encoding/hex"
import "hash"
// Heals the erasure coded file. reedsolomon.Reconstruct() is used to reconstruct the missing parts.
func erasureHealFile(latestDisks []StorageAPI, outDatedDisks []StorageAPI, volume, path, healBucket, healPath string,
size, blockSize int64, dataBlocks, parityBlocks int, algo HashAlgo) (checkSums []string, err error) {
var offset int64
remainingSize := size
// Hash for bitrot protection.
hashWriters := newHashWriters(len(outDatedDisks), bitRotAlgo)
for remainingSize > 0 {
curBlockSize := blockSize
if remainingSize < curBlockSize {
curBlockSize = remainingSize
}
// Calculate the block size that needs to be read from each disk.
curEncBlockSize := getChunkSize(curBlockSize, dataBlocks)
// Memory for reading data from disks and reconstructing missing data using erasure coding.
enBlocks := make([][]byte, len(latestDisks))
// Read data from the latest disks.
// FIXME: no need to read from all the disks. dataBlocks+1 is enough.
for index, disk := range latestDisks {
if disk == nil {
continue
}
enBlocks[index] = make([]byte, curEncBlockSize)
_, err := disk.ReadFile(volume, path, offset, enBlocks[index])
if err != nil {
enBlocks[index] = nil
}
}
// Reconstruct any missing data and parity blocks.
err := decodeDataAndParity(enBlocks, dataBlocks, parityBlocks)
if err != nil {
return nil, err
}
// Write to the healPath file.
for index, disk := range outDatedDisks {
if disk == nil {
continue
}
err := disk.AppendFile(healBucket, healPath, enBlocks[index])
if err != nil {
return nil, traceError(err)
}
hashWriters[index].Write(enBlocks[index])
}
remainingSize -= curBlockSize
offset += curEncBlockSize
// HealFile tries to reconstruct a bitrot encoded file spread over all available disks. HealFile will read the valid parts of the file,
// reconstruct the missing data and write the reconstructed parts back to the disks.
// It reads the valid parts of the file under the given volume and path and reconstructs the file under the given
// healVolume and healPath. The given algorithm is used to verify the valid parts and to protect the reconstructed file.
func (s ErasureStorage) HealFile(offlineDisks []StorageAPI, volume, path string, blocksize int64, healVolume, healPath string, size int64, algorithm BitrotAlgorithm, checksums [][]byte) (f ErasureFileInfo, err error) {
if !algorithm.Available() {
return f, traceError(errBitrotHashAlgoInvalid)
}
// Checksums for the bit rot.
checkSums = make([]string, len(outDatedDisks))
for index, disk := range outDatedDisks {
if disk == nil {
f.Checksums = make([][]byte, len(s.disks))
hashers, verifiers := make([]hash.Hash, len(s.disks)), make([]*BitrotVerifier, len(s.disks))
for i, disk := range s.disks {
if disk == OfflineDisk {
hashers[i] = algorithm.New()
} else {
verifiers[i] = NewBitrotVerifier(algorithm, checksums[i])
f.Checksums[i] = checksums[i]
}
}
blocks := make([][]byte, len(s.disks))
chunksize := getChunkSize(blocksize, s.dataBlocks)
for offset := int64(0); offset < size; offset += blocksize {
if size < blocksize {
blocksize = size
chunksize = getChunkSize(blocksize, s.dataBlocks)
}
numReads := 0
for i, disk := range s.disks {
if disk != OfflineDisk {
if blocks[i] == nil {
blocks[i] = make([]byte, chunksize)
}
blocks[i] = blocks[i][:chunksize]
if !verifiers[i].IsVerified() {
_, err = disk.ReadFileWithVerify(volume, path, offset, blocks[i], verifiers[i])
} else {
_, err = disk.ReadFile(volume, path, offset, blocks[i])
}
if err != nil {
blocks[i] = nil
} else {
numReads++
}
if numReads == s.dataBlocks { // we have enough data to reconstruct
break
}
}
}
if err = s.ErasureDecodeDataAndParityBlocks(blocks); err != nil {
return f, err
}
for i, disk := range s.disks {
if disk != OfflineDisk {
continue
}
if err = offlineDisks[i].AppendFile(healVolume, healPath, blocks[i]); err != nil {
return f, traceError(err)
}
hashers[i].Write(blocks[i])
}
}
f.Size = size
f.Algorithm = algorithm
for i, disk := range s.disks {
if disk != OfflineDisk {
continue
}
checkSums[index] = hex.EncodeToString(hashWriters[index].Sum(nil))
f.Checksums[i] = hashers[i].Sum(nil)
}
return checkSums, nil
return f, nil
}
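
As a rough usage sketch (hypothetical helper, not in this commit), healing follows the same pattern as the tests further down: positions of s.disks that are OfflineDisk mark the parts to reconstruct, and the corresponding entries of the offline slice hold the disks the healed parts are written to.

// healErasureObject is a hypothetical helper showing how a caller might invoke HealFile.
func healErasureObject(storage *ErasureStorage, offline []StorageAPI, size, blocksize int64, checksums [][]byte) (ErasureFileInfo, error) {
	// Reconstructs "testobject" and writes the recovered parts under "healedobject",
	// verifying the surviving parts against the given bitrot checksums.
	return storage.HealFile(offline, "testbucket", "testobject", blocksize, "testbucket", "healedobject", size, DefaultBitrotAlgorithm, checksums)
}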


@@ -19,111 +19,124 @@ package cmd
import (
"bytes"
"crypto/rand"
"os"
"path"
"io"
"reflect"
"testing"
humanize "github.com/dustin/go-humanize"
)
// Test erasureHealFile()
var erasureHealFileTests = []struct {
dataBlocks int
disks, offDisks, badDisks, badOffDisks int
blocksize, size int64
algorithm BitrotAlgorithm
shouldFail bool
shouldFailQuorum bool
}{
{dataBlocks: 2, disks: 4, offDisks: 1, badDisks: 0, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: SHA256, shouldFail: false, shouldFailQuorum: false}, // 0
{dataBlocks: 3, disks: 6, offDisks: 2, badDisks: 0, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, // 1
{dataBlocks: 4, disks: 8, offDisks: 2, badDisks: 1, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, // 2
{dataBlocks: 5, disks: 10, offDisks: 3, badDisks: 1, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 3
{dataBlocks: 6, disks: 12, offDisks: 2, badDisks: 3, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: SHA256, shouldFail: false, shouldFailQuorum: false}, // 4
{dataBlocks: 7, disks: 14, offDisks: 4, badDisks: 1, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 5
{dataBlocks: 8, disks: 16, offDisks: 6, badDisks: 1, badOffDisks: 1, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true}, // 6
{dataBlocks: 7, disks: 14, offDisks: 2, badDisks: 3, badOffDisks: 0, blocksize: int64(oneMiByte / 2), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: true, shouldFailQuorum: false}, // 7
{dataBlocks: 6, disks: 12, offDisks: 1, badDisks: 0, badOffDisks: 1, blocksize: int64(oneMiByte - 1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true, shouldFailQuorum: false}, // 8
{dataBlocks: 5, disks: 10, offDisks: 3, badDisks: 0, badOffDisks: 3, blocksize: int64(oneMiByte / 2), size: oneMiByte, algorithm: SHA256, shouldFail: true, shouldFailQuorum: false}, // 9
{dataBlocks: 4, disks: 8, offDisks: 1, badDisks: 1, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 10
{dataBlocks: 2, disks: 4, offDisks: 1, badDisks: 0, badOffDisks: 1, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true}, // 11
{dataBlocks: 6, disks: 12, offDisks: 8, badDisks: 3, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true}, // 12
{dataBlocks: 7, disks: 14, offDisks: 3, badDisks: 4, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, // 13
{dataBlocks: 7, disks: 14, offDisks: 6, badDisks: 1, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 14
{dataBlocks: 8, disks: 16, offDisks: 4, badDisks: 5, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true}, // 15
{dataBlocks: 2, disks: 4, offDisks: 0, badDisks: 0, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 16
{dataBlocks: 2, disks: 4, offDisks: 0, badDisks: 0, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: 0, shouldFail: true, shouldFailQuorum: false}, // 17
{dataBlocks: 12, disks: 16, offDisks: 2, badDisks: 1, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 18
{dataBlocks: 6, disks: 8, offDisks: 1, badDisks: 0, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, // 19
{dataBlocks: 7, disks: 10, offDisks: 1, badDisks: 0, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: 0, shouldFail: true, shouldFailQuorum: false}, // 20
}
func TestErasureHealFile(t *testing.T) {
// Initialize environment needed for the test.
dataBlocks := 7
parityBlocks := 7
blockSize := int64(blockSizeV1)
setup, err := newErasureTestSetup(dataBlocks, parityBlocks, blockSize)
if err != nil {
t.Error(err)
return
}
defer setup.Remove()
disks := setup.disks
// Prepare a slice of 1MiB with random data.
data := make([]byte, 1*humanize.MiByte)
_, err = rand.Read(data)
if err != nil {
t.Fatal(err)
}
// Create a test file.
_, size, checkSums, err := erasureCreateFile(disks, "testbucket", "testobject1", bytes.NewReader(data), true, blockSize, dataBlocks, parityBlocks, bitRotAlgo, dataBlocks+1)
if err != nil {
t.Fatal(err)
}
if size != int64(len(data)) {
t.Errorf("erasureCreateFile returned %d, expected %d", size, len(data))
}
latest := make([]StorageAPI, len(disks)) // Slice of latest disks
outDated := make([]StorageAPI, len(disks)) // Slice of outdated disks
// Test case when one part needs to be healed.
dataPath := path.Join(setup.diskPaths[0], "testbucket", "testobject1")
err = os.Remove(dataPath)
if err != nil {
t.Fatal(err)
}
copy(latest, disks)
latest[0] = nil
outDated[0] = disks[0]
healCheckSums, err := erasureHealFile(latest, outDated, "testbucket", "testobject1", "testbucket", "testobject1", 1*humanize.MiByte, blockSize, dataBlocks, parityBlocks, bitRotAlgo)
if err != nil {
t.Fatal(err)
}
// Checksum of the healed file should match.
if checkSums[0] != healCheckSums[0] {
t.Error("Healing failed, data does not match.")
}
// Test case when parityBlocks number of disks need to be healed.
// Should succeed.
copy(latest, disks)
for index := 0; index < parityBlocks; index++ {
dataPath := path.Join(setup.diskPaths[index], "testbucket", "testobject1")
err = os.Remove(dataPath)
for i, test := range erasureHealFileTests {
setup, err := newErasureTestSetup(test.dataBlocks, test.disks-test.dataBlocks, test.blocksize)
if err != nil {
t.Fatal(err)
t.Fatalf("Test %d: failed to setup XL environment: %v", i, err)
}
latest[index] = nil
outDated[index] = disks[index]
}
healCheckSums, err = erasureHealFile(latest, outDated, "testbucket", "testobject1", "testbucket", "testobject1", 1*humanize.MiByte, blockSize, dataBlocks, parityBlocks, bitRotAlgo)
if err != nil {
t.Fatal(err)
}
// Checksums of the healed files should match.
for index := 0; index < parityBlocks; index++ {
if checkSums[index] != healCheckSums[index] {
t.Error("Healing failed, data does not match.")
}
}
for index := dataBlocks; index < len(disks); index++ {
if healCheckSums[index] != "" {
t.Errorf("expected healCheckSums[%d] to be empty", index)
}
}
// Test case when parityBlocks+1 number of disks need to be healed.
// Should fail.
copy(latest, disks)
for index := 0; index < parityBlocks+1; index++ {
dataPath := path.Join(setup.diskPaths[index], "testbucket", "testobject1")
err = os.Remove(dataPath)
storage, err := NewErasureStorage(setup.disks, test.dataBlocks, test.disks-test.dataBlocks)
if err != nil {
t.Fatal(err)
setup.Remove()
t.Fatalf("Test %d: failed to create ErasureStorage: %v", i, err)
}
offline := make([]StorageAPI, len(storage.disks))
copy(offline, storage.disks)
data := make([]byte, test.size)
if _, err = io.ReadFull(rand.Reader, data); err != nil {
setup.Remove()
t.Fatalf("Test %d: failed to create random test data: %v", i, err)
}
latest[index] = nil
outDated[index] = disks[index]
}
_, err = erasureHealFile(latest, outDated, "testbucket", "testobject1", "testbucket", "testobject1", 1*humanize.MiByte, blockSize, dataBlocks, parityBlocks, bitRotAlgo)
if err == nil {
t.Error("Expected erasureHealFile() to fail when the number of available disks <= parityBlocks")
algorithm := test.algorithm
if !algorithm.Available() {
algorithm = DefaultBitrotAlgorithm
}
buffer := make([]byte, test.blocksize, 2*test.blocksize)
file, err := storage.CreateFile(bytes.NewReader(data), "testbucket", "testobject", buffer, algorithm, test.dataBlocks+1)
if err != nil {
setup.Remove()
t.Fatalf("Test %d: failed to create random test data: %v", i, err)
}
info, err := storage.HealFile(offline, "testbucket", "testobject", test.blocksize, "testbucket", "healedobject", test.size, test.algorithm, file.Checksums)
if err != nil && !test.shouldFail {
t.Errorf("Test %d: should pass but it failed with: %v", i, err)
}
if err == nil && test.shouldFail {
t.Errorf("Test %d: should fail but it passed", i)
}
if err == nil {
if info.Size != test.size {
t.Errorf("Test %d: healed wrong number of bytes: got: #%d want: #%d", i, info.Size, test.size)
}
if info.Algorithm != test.algorithm {
t.Errorf("Test %d: healed with wrong algorithm: got: %v want: %v", i, info.Algorithm, test.algorithm)
}
if !reflect.DeepEqual(info.Checksums, file.Checksums) {
t.Errorf("Test %d: heal returned different bitrot keys", i)
}
}
if err == nil && !test.shouldFail {
for j := 0; j < len(storage.disks); j++ {
if j < test.offDisks {
storage.disks[j] = OfflineDisk
} else {
offline[j] = OfflineDisk
}
}
for j := 0; j < test.badDisks; j++ {
storage.disks[test.offDisks+j] = badDisk{nil}
}
for j := 0; j < test.badOffDisks; j++ {
offline[j] = badDisk{nil}
}
info, err := storage.HealFile(offline, "testbucket", "testobject", test.blocksize, "testbucket", "healedobject", test.size, test.algorithm, file.Checksums)
if err != nil && !test.shouldFailQuorum {
t.Errorf("Test %d: should pass but it failed with: %v", i, err)
}
if err == nil && test.shouldFailQuorum {
t.Errorf("Test %d: should fail but it passed", i)
}
if err == nil {
if info.Size != test.size {
t.Errorf("Test %d: healed wrong number of bytes: got: #%d want: #%d", i, info.Size, test.size)
}
if info.Algorithm != test.algorithm {
t.Errorf("Test %d: healed with wrong algorithm: got: %v want: %v", i, info.Algorithm, test.algorithm)
}
if !reflect.DeepEqual(info.Checksums, file.Checksums) {
t.Errorf("Test %d: heal returned different bitrot checksums", i)
}
}
}
setup.Remove()
}
}


@@ -18,321 +18,150 @@ package cmd
import (
"io"
"sync"
"github.com/klauspost/reedsolomon"
"github.com/minio/minio/pkg/bpool"
)
// isSuccessDecodeBlocks - do we have all the blocks to be
// successfully decoded?. Input encoded blocks ordered matrix.
func isSuccessDecodeBlocks(enBlocks [][]byte, dataBlocks int) bool {
// Count number of data and parity blocks that were read.
var successDataBlocksCount = 0
var successParityBlocksCount = 0
for index := range enBlocks {
if enBlocks[index] == nil {
continue
}
// block index less than dataBlocks, update data block count.
if index < dataBlocks {
successDataBlocksCount++
continue
} // else { // update parity block count.
successParityBlocksCount++
}
// Returns true if we have at least dataBlocks blocks in total (data + parity).
return successDataBlocksCount == dataBlocks || successDataBlocksCount+successParityBlocksCount >= dataBlocks
}
// isSuccessDataBlocks - do we have all the data blocks?
// Input encoded blocks ordered matrix.
func isSuccessDataBlocks(enBlocks [][]byte, dataBlocks int) bool {
// Count number of data blocks that were read.
var successDataBlocksCount = 0
for index := range enBlocks[:dataBlocks] {
if enBlocks[index] == nil {
continue
}
// block index less than dataBlocks, update data block count.
if index < dataBlocks {
successDataBlocksCount++
}
}
// Returns true if we have at least dataBlocks data blocks.
return successDataBlocksCount >= dataBlocks
}
// Return a slice of readable disks from which we can read in parallel.
func getReadDisks(orderedDisks []StorageAPI, index int, dataBlocks int) (readDisks []StorageAPI, nextIndex int, err error) {
readDisks = make([]StorageAPI, len(orderedDisks))
dataDisks := 0
parityDisks := 0
// Count already read data and parity chunks.
for i := 0; i < index; i++ {
if orderedDisks[i] == nil {
continue
}
if i < dataBlocks {
dataDisks++
} else {
parityDisks++
}
}
// Sanity checks - we should never have this situation.
if dataDisks == dataBlocks {
return nil, 0, traceError(errUnexpected)
}
if dataDisks+parityDisks >= dataBlocks {
return nil, 0, traceError(errUnexpected)
}
// Find the disks from which next set of parallel reads should happen.
for i := index; i < len(orderedDisks); i++ {
if orderedDisks[i] == nil {
continue
}
if i < dataBlocks {
dataDisks++
} else {
parityDisks++
}
readDisks[i] = orderedDisks[i]
if dataDisks == dataBlocks {
return readDisks, i + 1, nil
} else if dataDisks+parityDisks == dataBlocks {
return readDisks, i + 1, nil
}
}
return nil, 0, traceError(errXLReadQuorum)
}
// parallelRead - reads chunks in parallel from the disks specified in []readDisks.
func parallelRead(volume, path string, readDisks, orderedDisks []StorageAPI, enBlocks [][]byte,
blockOffset, curChunkSize int64, brVerifiers []bitRotVerifier, pool *bpool.BytePool) {
// WaitGroup to synchronise the read go-routines.
wg := &sync.WaitGroup{}
// Read disks in parallel.
for index := range readDisks {
if readDisks[index] == nil {
continue
}
wg.Add(1)
// Reads chunk from readDisks[index] in a goroutine.
go func(index int) {
defer wg.Done()
// evaluate if we need to perform bit-rot checking
needBitRotVerification := true
if brVerifiers[index].isVerified {
needBitRotVerification = false
// if file has bit-rot, do not reuse disk
if brVerifiers[index].hasBitRot {
orderedDisks[index] = nil
return
}
}
buf, err := pool.Get()
if err != nil {
errorIf(err, "unable to get buffer from byte pool")
orderedDisks[index] = nil
return
}
buf = buf[:curChunkSize]
if needBitRotVerification {
_, err = readDisks[index].ReadFileWithVerify(
volume, path, blockOffset, buf,
brVerifiers[index].algo,
brVerifiers[index].checkSum)
} else {
_, err = readDisks[index].ReadFile(volume, path,
blockOffset, buf)
}
// if bit-rot verification was done, store the
// result of verification so we can skip
// re-doing it next time
if needBitRotVerification {
brVerifiers[index].isVerified = true
_, ok := err.(hashMismatchError)
brVerifiers[index].hasBitRot = ok
}
if err != nil {
orderedDisks[index] = nil
return
}
enBlocks[index] = buf
}(index)
}
// Wait for all read routines to finish.
wg.Wait()
}
// erasureReadFile - read bytes from erasure coded files and writes to
// given writer. Erasure coded files are read block by block as per
// given erasureInfo and data chunks are decoded into a data
// block. Data block is trimmed for given offset and length, then
// written to given writer. This function also supports bit-rot
// detection by verifying checksum of individual block's checksum.
func erasureReadFile(writer io.Writer, disks []StorageAPI, volume, path string,
offset, length, totalLength, blockSize int64, dataBlocks, parityBlocks int,
checkSums []string, algo HashAlgo, pool *bpool.BytePool) (int64, error) {
// Offset and length cannot be negative.
// ReadFile reads as much data as requested from the file under the given volume and path and writes the data to the provided writer.
// The algorithm and the keys/checksums are used to verify the integrity of the given file. ReadFile will read data from the given offset
// up to the given length. If parts of the file are corrupted ReadFile tries to reconstruct the data.
func (s ErasureStorage) ReadFile(writer io.Writer, volume, path string, offset, length int64, totalLength int64, checksums [][]byte, algorithm BitrotAlgorithm, blocksize int64, pool *bpool.BytePool) (f ErasureFileInfo, err error) {
if offset < 0 || length < 0 {
return 0, traceError(errUnexpected)
return f, traceError(errUnexpected)
}
// Can't request more data than what is available.
if offset+length > totalLength {
return 0, traceError(errUnexpected)
return f, traceError(errUnexpected)
}
if !algorithm.Available() {
return f, traceError(errBitrotHashAlgoInvalid)
}
// chunkSize is the amount of data that needs to be read from
// each disk at a time.
chunkSize := getChunkSize(blockSize, dataBlocks)
brVerifiers := make([]bitRotVerifier, len(disks))
for i := range brVerifiers {
brVerifiers[i].algo = algo
brVerifiers[i].checkSum = checkSums[i]
f.Checksums = make([][]byte, len(s.disks))
verifiers := make([]*BitrotVerifier, len(s.disks))
for i, disk := range s.disks {
if disk == OfflineDisk {
continue
}
verifiers[i] = NewBitrotVerifier(algorithm, checksums[i])
}
errChans := make([]chan error, len(s.disks))
for i := range errChans {
errChans[i] = make(chan error, 1)
}
lastBlock := totalLength / blocksize
startOffset := offset % blocksize
chunksize := getChunkSize(blocksize, s.dataBlocks)
// Total bytes written to writer
var bytesWritten int64
startBlock := offset / blockSize
endBlock := (offset + length) / blockSize
// curChunkSize = chunk size for the current block in the for loop below.
// curBlockSize = block size for the current block in the for loop below.
// curChunkSize and curBlockSize can change for the last block if totalLength%blockSize != 0
curChunkSize := chunkSize
curBlockSize := blockSize
// For each block, read a chunk from each disk. If we are able to read all the data disks then we don't
// need to read parity disks. If one of the data disks is missing we need to read dataBlocks+1 number
// of disks. Once read, we Reconstruct() missing data if needed and write it to the given writer.
for block := startBlock; block <= endBlock; block++ {
// Mark all buffers as unused at the start of the loop so that the buffers
// can be reused.
blocks := make([][]byte, len(s.disks))
for off := offset / blocksize; length > 0; off++ {
blockOffset := off * chunksize
pool.Reset()
// Each element of enBlocks holds curChunkSize'd amount of data read from its corresponding disk.
enBlocks := make([][]byte, len(disks))
if ((offset + bytesWritten) / blockSize) == (totalLength / blockSize) {
// This is the last block for which curBlockSize and curChunkSize can change.
// For ex. if totalLength is 15M and blockSize is 10MB, curBlockSize for
// the last block should be 5MB.
curBlockSize = totalLength % blockSize
curChunkSize = getChunkSize(curBlockSize, dataBlocks)
if currentBlock := (offset + f.Size) / blocksize; currentBlock == lastBlock {
blocksize = totalLength % blocksize
chunksize = getChunkSize(blocksize, s.dataBlocks)
}
// NOTE: For the offset calculation we have to use chunkSize and
// not curChunkSize. If we use curChunkSize for the offset calculation
// it can result in a wrong offset for the last block.
blockOffset := block * chunkSize
// nextIndex - index from which next set of parallel reads
// should happen.
nextIndex := 0
for {
// readDisks - disks from which we need to read in parallel.
var readDisks []StorageAPI
var err error
// get a slice of readable disks from which we can read in parallel.
readDisks, nextIndex, err = getReadDisks(disks, nextIndex, dataBlocks)
if err != nil {
return bytesWritten, err
}
// Issue a parallel read across the disks specified in readDisks.
parallelRead(volume, path, readDisks, disks, enBlocks, blockOffset, curChunkSize, brVerifiers, pool)
if isSuccessDecodeBlocks(enBlocks, dataBlocks) {
// If enough blocks are available to do rs.Reconstruct()
break
}
if nextIndex == len(disks) {
// No more disks to read from.
return bytesWritten, traceError(errXLReadQuorum)
}
// We do not have enough data blocks to reconstruct the data,
// hence continue the for-loop till we have enough data blocks.
}
// If we have all the data blocks no need to decode, continue to write.
if !isSuccessDataBlocks(enBlocks, dataBlocks) {
// Reconstruct the missing data blocks.
if err := decodeMissingData(enBlocks, dataBlocks, parityBlocks); err != nil {
return bytesWritten, err
}
}
// Offset in enBlocks from where data should be read from.
var enBlocksOffset int64
// Total data to be read from enBlocks.
enBlocksLength := curBlockSize
// If this is the start block then enBlocksOffset might not be 0.
if block == startBlock {
enBlocksOffset = offset % blockSize
enBlocksLength -= enBlocksOffset
}
remaining := length - bytesWritten
if remaining < enBlocksLength {
// We should not send more data than what was requested.
enBlocksLength = remaining
}
// Write data blocks.
n, err := writeDataBlocks(writer, enBlocks, dataBlocks, enBlocksOffset, enBlocksLength)
err = s.readConcurrent(volume, path, blockOffset, chunksize, blocks, verifiers, errChans, pool)
if err != nil {
return bytesWritten, err
return f, traceError(errXLReadQuorum)
}
// Update total bytes written.
bytesWritten += n
writeLength := blocksize - startOffset
if length < writeLength {
writeLength = length
}
n, err := writeDataBlocks(writer, blocks, s.dataBlocks, startOffset, writeLength)
if err != nil {
return f, err
}
startOffset = 0
f.Size += int64(n)
length -= int64(n)
}
if bytesWritten == length {
// Done writing all the requested data.
break
f.Algorithm = algorithm
for i, disk := range s.disks {
if disk == OfflineDisk {
continue
}
f.Checksums[i] = verifiers[i].Sum(nil)
}
return f, nil
}
func erasureCountMissingBlocks(blocks [][]byte, limit int) int {
missing := 0
for i := range blocks[:limit] {
if blocks[i] == nil {
missing++
}
}
// Success.
return bytesWritten, nil
return missing
}
// decodeMissingData - decode any missing data blocks.
func decodeMissingData(enBlocks [][]byte, dataBlocks, parityBlocks int) error {
// Initialize reedsolomon.
rs, err := reedsolomon.New(dataBlocks, parityBlocks)
if err != nil {
return traceError(err)
// readConcurrent reads all requested data concurrently from the disks into blocks. It returns an error if
// too many disks failed while reading.
func (s *ErasureStorage) readConcurrent(volume, path string, offset int64, length int64, blocks [][]byte, verifiers []*BitrotVerifier, errChans []chan error, pool *bpool.BytePool) (err error) {
errs := make([]error, len(s.disks))
for i := range blocks {
blocks[i], err = pool.Get()
if err != nil {
return traceErrorf("failed to get new buffer from pool: %v", err)
}
blocks[i] = blocks[i][:length]
}
// Reconstruct any missing data blocks.
return rs.ReconstructData(enBlocks)
}
// decodeDataAndParity - decode all encoded data and parity blocks.
func decodeDataAndParity(enBlocks [][]byte, dataBlocks, parityBlocks int) error {
// Initialize reedsolomon.
rs, err := reedsolomon.New(dataBlocks, parityBlocks)
if err != nil {
return traceError(err)
erasureReadBlocksConcurrent(s.disks[:s.dataBlocks], volume, path, offset, blocks[:s.dataBlocks], verifiers[:s.dataBlocks], errs[:s.dataBlocks], errChans[:s.dataBlocks])
missingDataBlocks := erasureCountMissingBlocks(blocks, s.dataBlocks)
mustReconstruct := missingDataBlocks > 0
if mustReconstruct {
requiredReads := s.dataBlocks + missingDataBlocks
if requiredReads > s.dataBlocks+s.parityBlocks {
return errXLReadQuorum
}
erasureReadBlocksConcurrent(s.disks[s.dataBlocks:requiredReads], volume, path, offset, blocks[s.dataBlocks:requiredReads], verifiers[s.dataBlocks:requiredReads], errs[s.dataBlocks:requiredReads], errChans[s.dataBlocks:requiredReads])
if erasureCountMissingBlocks(blocks, requiredReads) > 0 {
erasureReadBlocksConcurrent(s.disks[requiredReads:], volume, path, offset, blocks[requiredReads:], verifiers[requiredReads:], errs[requiredReads:], errChans[requiredReads:])
}
}
// Reconstruct encoded blocks.
return rs.Reconstruct(enBlocks)
if err = reduceReadQuorumErrs(errs, []error{}, s.dataBlocks); err != nil {
return err
}
if mustReconstruct {
if err = s.ErasureDecodeDataBlocks(blocks); err != nil {
return err
}
}
return nil
}
// erasureReadBlocksConcurrent reads all data from each disk to each data block in parallel.
// Therefore disks, blocks, verifiers, errors and errChans must have the same length.
func erasureReadBlocksConcurrent(disks []StorageAPI, volume, path string, offset int64, blocks [][]byte, verifiers []*BitrotVerifier, errors []error, errChans []chan error) {
for i := range errChans {
go erasureReadFromFile(disks[i], volume, path, offset, blocks[i], verifiers[i], errChans[i])
}
for i := range errChans {
errors[i] = <-errChans[i] // blocks until the go routine 'i' is done - no data race
if errors[i] != nil {
disks[i] = OfflineDisk
blocks[i] = nil
}
}
}
// erasureReadFromFile reads data from the disk to buffer in parallel.
// It sends the returned error through the error channel.
func erasureReadFromFile(disk StorageAPI, volume, path string, offset int64, buffer []byte, verifier *BitrotVerifier, errChan chan<- error) {
if disk == OfflineDisk {
errChan <- traceError(errDiskNotFound)
return
}
var err error
if !verifier.IsVerified() {
_, err = disk.ReadFileWithVerify(volume, path, offset, buffer, verifier)
} else {
_, err = disk.ReadFile(volume, path, offset, buffer)
}
errChan <- err
}
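
Again as a hedged sketch (hypothetical helper, same package, not part of the commit), a caller of the new ReadFile API would supply a byte pool sized to one chunk per disk, mirroring the pool setup in the original erasureReadFile test.

// readErasureRange is a hypothetical helper showing how a caller might invoke ReadFile.
func readErasureRange(storage ErasureStorage, w io.Writer, dataBlocks, numDisks int, offset, length, totalLength int64, checksums [][]byte) (ErasureFileInfo, error) {
	// The byte pool hands out chunk-sized buffers that ReadFile fills from each disk.
	chunkSize := getChunkSize(int64(blockSizeV1), dataBlocks)
	pool := bpool.NewBytePool(chunkSize, numDisks)
	return storage.ReadFile(w, "testbucket", "testobject", offset, length, totalLength, checksums, DefaultBitrotAlgorithm, int64(blockSizeV1), pool)
}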


@@ -18,358 +18,148 @@ package cmd
import (
"bytes"
crand "crypto/rand"
"io"
"math/rand"
"testing"
"reflect"
humanize "github.com/dustin/go-humanize"
"github.com/minio/minio/pkg/bpool"
)
// Tests getReadDisks which returns a slice of readable disks from which we can
// read in parallel.
func testGetReadDisks(t *testing.T, xl *xlObjects) {
d := xl.storageDisks
testCases := []struct {
index int // index argument for getReadDisks
argDisks []StorageAPI // disks argument for getReadDisks
retDisks []StorageAPI // disks return value from getReadDisks
nextIndex int // return value from getReadDisks
err error // error return value from getReadDisks
}{
// Test case - 1.
// When all disks are available, should return data disks.
{
0,
[]StorageAPI{d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15]},
[]StorageAPI{d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], nil, nil, nil, nil, nil, nil, nil, nil},
8,
nil,
},
// Test case - 2.
// If a parity disk is down, should return all data disks.
{
0,
[]StorageAPI{d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], nil, d[10], d[11], d[12], d[13], d[14], d[15]},
[]StorageAPI{d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], nil, nil, nil, nil, nil, nil, nil, nil},
8,
nil,
},
// Test case - 3.
// If a data disk is down, should return 7 data and 1 parity.
{
0,
[]StorageAPI{nil, d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15]},
[]StorageAPI{nil, d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], nil, nil, nil, nil, nil, nil, nil},
9,
nil,
},
// Test case - 4.
// If 7 data disks are down, should return 1 data and 7 parity.
{
0,
[]StorageAPI{nil, nil, nil, nil, nil, nil, nil, d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15]},
[]StorageAPI{nil, nil, nil, nil, nil, nil, nil, d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], nil},
15,
nil,
},
// Test case - 5.
// When 2 disks fail during parallelRead, next call to getReadDisks should return 3 disks
{
8,
[]StorageAPI{nil, nil, d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15]},
[]StorageAPI{nil, nil, nil, nil, nil, nil, nil, nil, d[8], d[9], nil, nil, nil, nil, nil, nil},
10,
nil,
},
// Test case - 6.
// If 2 disks again fail from the 3 disks returned previously, return next 2 disks
{
11,
[]StorageAPI{nil, nil, d[2], d[3], d[4], d[5], d[6], d[7], nil, nil, d[10], d[11], d[12], d[13], d[14], d[15]},
[]StorageAPI{nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, d[11], nil, nil, nil, nil},
12,
nil,
},
// Test case - 7.
// No more disks are available for read, return error
{
13,
[]StorageAPI{nil, nil, d[2], d[3], d[4], d[5], d[6], d[7], nil, nil, d[10], nil, nil, nil, nil, nil},
nil,
0,
errXLReadQuorum,
},
}
for i, test := range testCases {
disks, nextIndex, err := getReadDisks(test.argDisks, test.index, xl.dataBlocks)
if errorCause(err) != test.err {
t.Errorf("test-case %d - expected error : %s, got : %s", i+1, test.err, err)
continue
}
if test.nextIndex != nextIndex {
t.Errorf("test-case %d - expected nextIndex: %d, got : %d", i+1, test.nextIndex, nextIndex)
continue
}
if !reflect.DeepEqual(test.retDisks, disks) {
t.Errorf("test-case %d : incorrect disks returned. expected %+v, got %+v", i+1, test.retDisks, disks)
continue
}
}
}
// Test for isSuccessDataBlocks and isSuccessDecodeBlocks.
func TestIsSuccessBlocks(t *testing.T) {
dataBlocks := 8
testCases := []struct {
enBlocks [][]byte // data and parity blocks.
successData bool // expected return value of isSuccessDataBlocks()
successDecode bool // expected return value of isSuccessDecodeBlocks()
}{
{
// When all data and parity blocks are available.
[][]byte{
{'a'}, {'a'}, {'a'}, {'a'}, {'a'}, {'a'}, {'a'}, {'a'},
{'a'}, {'a'}, {'a'}, {'a'}, {'a'}, {'a'}, {'a'}, {'a'},
},
true,
true,
},
{
// When one data block is not available.
[][]byte{
nil, {'a'}, {'a'}, {'a'}, {'a'}, {'a'}, {'a'}, {'a'},
{'a'}, {'a'}, {'a'}, {'a'}, {'a'}, {'a'}, {'a'}, {'a'},
},
false,
true,
},
{
// When one data and all parity are available, enough for reedsolomon.Reconstruct()
[][]byte{
nil, nil, nil, nil, nil, nil, nil, {'a'},
{'a'}, {'a'}, {'a'}, {'a'}, {'a'}, {'a'}, {'a'}, {'a'},
},
false,
true,
},
{
// When all data disks are not available, enough for reedsolomon.Reconstruct()
[][]byte{
nil, nil, nil, nil, nil, nil, nil, nil,
{'a'}, {'a'}, {'a'}, {'a'}, {'a'}, {'a'}, {'a'}, {'a'},
},
false,
true,
},
{
// Not enough disks for reedsolomon.Reconstruct()
[][]byte{
nil, nil, nil, nil, nil, nil, nil, nil,
nil, {'a'}, {'a'}, {'a'}, {'a'}, {'a'}, {'a'}, {'a'},
},
false,
false,
},
}
for i, test := range testCases {
got := isSuccessDataBlocks(test.enBlocks, dataBlocks)
if test.successData != got {
t.Errorf("test-case %d : expected %v got %v", i+1, test.successData, got)
}
got = isSuccessDecodeBlocks(test.enBlocks, dataBlocks)
if test.successDecode != got {
t.Errorf("test-case %d : expected %v got %v", i+1, test.successDecode, got)
}
}
}
// Wrapper function for testGetReadDisks, testShuffleDisks.
func TestErasureReadUtils(t *testing.T) {
nDisks := 16
disks, err := getRandomDisks(nDisks)
if err != nil {
t.Fatal(err)
}
objLayer, _, err := initObjectLayer(mustGetNewEndpointList(disks...))
if err != nil {
removeRoots(disks)
t.Fatal(err)
}
defer removeRoots(disks)
xl := objLayer.(*xlObjects)
testGetReadDisks(t, xl)
}
// Simulates a faulty disk for ReadFile()
type ReadDiskDown struct {
*posix
}
func (r ReadDiskDown) ReadFile(volume string, path string, offset int64, buf []byte) (n int64, err error) {
func (d badDisk) ReadFile(volume string, path string, offset int64, buf []byte) (n int64, err error) {
return 0, errFaultyDisk
}
func (r ReadDiskDown) ReadFileWithVerify(volume string, path string, offset int64, buf []byte,
algo HashAlgo, expectedHash string) (n int64, err error) {
func (d badDisk) ReadFileWithVerify(volume string, path string, offset int64, buf []byte, verifier *BitrotVerifier) (n int64, err error) {
return 0, errFaultyDisk
}
func TestErasureReadFileDiskFail(t *testing.T) {
// Initialize environment needed for the test.
dataBlocks := 7
parityBlocks := 7
blockSize := int64(blockSizeV1)
setup, err := newErasureTestSetup(dataBlocks, parityBlocks, blockSize)
if err != nil {
t.Error(err)
return
}
defer setup.Remove()
disks := setup.disks
// Prepare a slice of 1 MiB with random data.
data := make([]byte, 1*humanize.MiByte)
length := int64(len(data))
_, err = rand.Read(data)
if err != nil {
t.Fatal(err)
}
// Create a test file to read from.
_, size, checkSums, err := erasureCreateFile(disks, "testbucket", "testobject", bytes.NewReader(data), true, blockSize, dataBlocks, parityBlocks, bitRotAlgo, dataBlocks+1)
if err != nil {
t.Fatal(err)
}
if size != length {
t.Errorf("erasureCreateFile returned %d, expected %d", size, length)
}
// create byte pool which will be used by erasureReadFile for
// reading from disks and erasure decoding.
chunkSize := getChunkSize(blockSize, dataBlocks)
pool := bpool.NewBytePool(chunkSize, len(disks))
buf := &bytes.Buffer{}
_, err = erasureReadFile(buf, disks, "testbucket", "testobject", 0, length, length, blockSize, dataBlocks, parityBlocks, checkSums, bitRotAlgo, pool)
if err != nil {
t.Error(err)
}
if !bytes.Equal(buf.Bytes(), data) {
t.Error("Contents of the erasure coded file differs")
}
// 2 disks down. Read should succeed.
disks[4] = ReadDiskDown{disks[4].(*posix)}
disks[5] = ReadDiskDown{disks[5].(*posix)}
buf.Reset()
_, err = erasureReadFile(buf, disks, "testbucket", "testobject", 0, length, length, blockSize, dataBlocks, parityBlocks, checkSums, bitRotAlgo, pool)
if err != nil {
t.Error(err)
}
if !bytes.Equal(buf.Bytes(), data) {
t.Error("Contents of the erasure coded file differs")
}
// 4 more disks down. 6 disks down in total. Read should succeed.
disks[6] = ReadDiskDown{disks[6].(*posix)}
disks[8] = ReadDiskDown{disks[8].(*posix)}
disks[9] = ReadDiskDown{disks[9].(*posix)}
disks[11] = ReadDiskDown{disks[11].(*posix)}
buf.Reset()
_, err = erasureReadFile(buf, disks, "testbucket", "testobject", 0, length, length, blockSize, dataBlocks, parityBlocks, checkSums, bitRotAlgo, pool)
if err != nil {
t.Error(err)
}
if !bytes.Equal(buf.Bytes(), data) {
t.Error("Contents of the erasure coded file differs")
}
// 2 more disks down. 8 disks down in total. Read should fail.
disks[12] = ReadDiskDown{disks[12].(*posix)}
disks[13] = ReadDiskDown{disks[13].(*posix)}
buf.Reset()
_, err = erasureReadFile(buf, disks, "testbucket", "testobject", 0, length, length, blockSize, dataBlocks, parityBlocks, checkSums, bitRotAlgo, pool)
if errorCause(err) != errXLReadQuorum {
t.Fatal("expected errXLReadQuorum error")
}
var erasureReadFileTests = []struct {
dataBlocks int
onDisks, offDisks int
blocksize, data int64
offset int64
length int64
algorithm BitrotAlgorithm
shouldFail, shouldFailQuorum bool
}{
{dataBlocks: 2, onDisks: 4, offDisks: 0, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, // 0
{dataBlocks: 3, onDisks: 6, offDisks: 0, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: SHA256, shouldFail: false, shouldFailQuorum: false}, // 1
{dataBlocks: 4, onDisks: 8, offDisks: 0, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 2
{dataBlocks: 5, onDisks: 10, offDisks: 0, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 1, length: oneMiByte - 1, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, // 3
{dataBlocks: 6, onDisks: 12, offDisks: 0, blocksize: int64(oneMiByte), data: oneMiByte, offset: oneMiByte, length: 0, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, // 4
{dataBlocks: 7, onDisks: 14, offDisks: 0, blocksize: int64(oneMiByte), data: oneMiByte, offset: 3, length: 1024, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 5
{dataBlocks: 8, onDisks: 16, offDisks: 0, blocksize: int64(oneMiByte), data: oneMiByte, offset: 4, length: 8 * 1024, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 6
{dataBlocks: 7, onDisks: 14, offDisks: 7, blocksize: int64(blockSizeV1), data: oneMiByte, offset: oneMiByte, length: 1, algorithm: DefaultBitrotAlgorithm, shouldFail: true, shouldFailQuorum: false}, // 7
{dataBlocks: 6, onDisks: 12, offDisks: 6, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 8
{dataBlocks: 5, onDisks: 10, offDisks: 5, blocksize: int64(oneMiByte), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, // 9
{dataBlocks: 4, onDisks: 8, offDisks: 4, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: SHA256, shouldFail: false, shouldFailQuorum: false}, // 10
{dataBlocks: 3, onDisks: 6, offDisks: 3, blocksize: int64(oneMiByte), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 11
{dataBlocks: 2, onDisks: 4, offDisks: 2, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 12
{dataBlocks: 2, onDisks: 4, offDisks: 1, blocksize: int64(oneMiByte), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 13
{dataBlocks: 3, onDisks: 6, offDisks: 2, blocksize: int64(oneMiByte), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 14
{dataBlocks: 4, onDisks: 8, offDisks: 3, blocksize: int64(2 * oneMiByte), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 15
{dataBlocks: 5, onDisks: 10, offDisks: 6, blocksize: int64(oneMiByte), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true}, // 16
{dataBlocks: 5, onDisks: 10, offDisks: 2, blocksize: int64(blockSizeV1), data: 2 * oneMiByte, offset: oneMiByte, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 17
{dataBlocks: 5, onDisks: 10, offDisks: 1, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, // 18
{dataBlocks: 6, onDisks: 12, offDisks: 3, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: SHA256, shouldFail: false, shouldFailQuorum: false}, // 19
{dataBlocks: 6, onDisks: 12, offDisks: 7, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true}, // 20
{dataBlocks: 8, onDisks: 16, offDisks: 8, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 21
{dataBlocks: 8, onDisks: 16, offDisks: 9, blocksize: int64(oneMiByte), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true}, // 22
{dataBlocks: 8, onDisks: 16, offDisks: 7, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 23
{dataBlocks: 2, onDisks: 4, offDisks: 1, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 24
{dataBlocks: 2, onDisks: 4, offDisks: 0, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 25
{dataBlocks: 2, onDisks: 4, offDisks: 0, blocksize: int64(blockSizeV1), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: 0, shouldFail: true, shouldFailQuorum: false}, // 26
{dataBlocks: 2, onDisks: 4, offDisks: 0, blocksize: int64(blockSizeV1), data: int64(blockSizeV1) + 1, offset: 0, length: int64(blockSizeV1) + 1, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, // 27
{dataBlocks: 2, onDisks: 4, offDisks: 0, blocksize: int64(blockSizeV1), data: int64(2 * blockSizeV1), offset: 12, length: int64(blockSizeV1) + 17, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, // 28
{dataBlocks: 3, onDisks: 6, offDisks: 0, blocksize: int64(blockSizeV1), data: int64(2 * blockSizeV1), offset: 1023, length: int64(blockSizeV1) + 1024, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 29
{dataBlocks: 4, onDisks: 8, offDisks: 0, blocksize: int64(blockSizeV1), data: int64(2 * blockSizeV1), offset: 11, length: int64(blockSizeV1) + 2*1024, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 30
{dataBlocks: 6, onDisks: 12, offDisks: 0, blocksize: int64(blockSizeV1), data: int64(2 * blockSizeV1), offset: 512, length: int64(blockSizeV1) + 8*1024, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 31
{dataBlocks: 8, onDisks: 16, offDisks: 0, blocksize: int64(blockSizeV1), data: int64(2 * blockSizeV1), offset: int64(blockSizeV1), length: int64(blockSizeV1) - 1, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 32
{dataBlocks: 2, onDisks: 4, offDisks: 0, blocksize: int64(blockSizeV1), data: int64(oneMiByte), offset: -1, length: 3, algorithm: DefaultBitrotAlgorithm, shouldFail: true, shouldFailQuorum: false}, // 33
{dataBlocks: 2, onDisks: 4, offDisks: 0, blocksize: int64(blockSizeV1), data: int64(oneMiByte), offset: 1024, length: -1, algorithm: DefaultBitrotAlgorithm, shouldFail: true, shouldFailQuorum: false}, // 34
{dataBlocks: 4, onDisks: 6, offDisks: 0, blocksize: int64(blockSizeV1), data: int64(blockSizeV1), offset: 0, length: int64(blockSizeV1), algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, // 35
{dataBlocks: 4, onDisks: 6, offDisks: 1, blocksize: int64(blockSizeV1), data: int64(2 * blockSizeV1), offset: 12, length: int64(blockSizeV1) + 17, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, // 36
{dataBlocks: 4, onDisks: 6, offDisks: 3, blocksize: int64(blockSizeV1), data: int64(2 * blockSizeV1), offset: 1023, length: int64(blockSizeV1) + 1024, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true}, // 37
{dataBlocks: 8, onDisks: 12, offDisks: 4, blocksize: int64(blockSizeV1), data: int64(2 * blockSizeV1), offset: 11, length: int64(blockSizeV1) + 2*1024, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 38
}
func TestErasureReadFileOffsetLength(t *testing.T) {
// Initialize environment needed for the test.
dataBlocks := 7
parityBlocks := 7
blockSize := int64(1 * humanize.MiByte)
setup, err := newErasureTestSetup(dataBlocks, parityBlocks, blockSize)
if err != nil {
t.Error(err)
return
}
defer setup.Remove()
disks := setup.disks
// Prepare a slice of 5*humanize.MiByte with random data.
data := make([]byte, 5*humanize.MiByte)
length := int64(len(data))
_, err = rand.Read(data)
if err != nil {
t.Fatal(err)
}
// Create a test file to read from.
_, size, checkSums, err := erasureCreateFile(disks, "testbucket", "testobject", bytes.NewReader(data), true, blockSize, dataBlocks, parityBlocks, bitRotAlgo, dataBlocks+1)
if err != nil {
t.Fatal(err)
}
if size != length {
t.Errorf("erasureCreateFile returned %d, expected %d", size, length)
}
testCases := []struct {
offset, length int64
}{
// Full file.
{0, length},
// Read nothing.
{length, 0},
// 2nd block.
{blockSize, blockSize},
// Test cases for random offsets and lengths.
{blockSize - 1, 2},
{blockSize - 1, blockSize + 1},
{blockSize + 1, blockSize - 1},
{blockSize + 1, blockSize},
{blockSize + 1, blockSize + 1},
{blockSize*2 - 1, blockSize + 1},
{length - 1, 1},
{length - blockSize, blockSize},
{length - blockSize - 1, blockSize},
{length - blockSize - 1, blockSize + 1},
}
chunkSize := getChunkSize(blockSize, dataBlocks)
pool := bpool.NewBytePool(chunkSize, len(disks))
// Compare the data read from file with "data" byte array.
for i, testCase := range testCases {
expected := data[testCase.offset:(testCase.offset + testCase.length)]
buf := &bytes.Buffer{}
_, err = erasureReadFile(buf, disks, "testbucket", "testobject", testCase.offset, testCase.length, length, blockSize, dataBlocks, parityBlocks, checkSums, bitRotAlgo, pool)
func TestErasureReadFile(t *testing.T) {
for i, test := range erasureReadFileTests {
setup, err := newErasureTestSetup(test.dataBlocks, test.onDisks-test.dataBlocks, test.blocksize)
if err != nil {
t.Error(err)
continue
t.Fatalf("Test %d: failed to create test setup: %v", i, err)
}
got := buf.Bytes()
if !bytes.Equal(expected, got) {
t.Errorf("Test %d : read data is different from what was expected", i+1)
storage, err := NewErasureStorage(setup.disks, test.dataBlocks, test.onDisks-test.dataBlocks)
if err != nil {
setup.Remove()
t.Fatalf("Test %d: failed to create ErasureStorage: %v", i, err)
}
data := make([]byte, test.data)
if _, err = io.ReadFull(crand.Reader, data); err != nil {
setup.Remove()
t.Fatalf("Test %d: failed to generate random test data: %v", i, err)
}
writeAlgorithm := test.algorithm
if !test.algorithm.Available() {
writeAlgorithm = DefaultBitrotAlgorithm
}
buffer := make([]byte, test.blocksize, 2*test.blocksize)
file, err := storage.CreateFile(bytes.NewReader(data[:]), "testbucket", "object", buffer, writeAlgorithm, test.dataBlocks+1)
if err != nil {
setup.Remove()
t.Fatalf("Test %d: failed to create erasure test file: %v", i, err)
}
pool := bpool.NewBytePool(getChunkSize(test.blocksize, test.dataBlocks), len(storage.disks))
writer := bytes.NewBuffer(nil)
readInfo, err := storage.ReadFile(writer, "testbucket", "object", test.offset, test.length, test.data, file.Checksums, test.algorithm, test.blocksize, pool)
if err != nil && !test.shouldFail {
t.Errorf("Test %d: should pass but failed with: %v", i, err)
}
if err == nil && test.shouldFail {
t.Errorf("Test %d: should fail but it passed", i)
}
if err == nil {
if readInfo.Size != test.length {
t.Errorf("Test %d: read returns wrong number of bytes: got: #%d want: #%d", i, readInfo.Size, test.length)
}
if readInfo.Algorithm != test.algorithm {
t.Errorf("Test %d: read returns wrong algorithm: got: %v want: %v", i, readInfo.Algorithm, test.algorithm)
}
if content := writer.Bytes(); !bytes.Equal(content, data[test.offset:test.offset+test.length]) {
t.Errorf("Test %d: read retruns wrong file content", i)
}
}
if err == nil && !test.shouldFail {
writer.Reset()
for j := range storage.disks[:test.offDisks] {
storage.disks[j] = badDisk{nil}
}
if test.offDisks > 0 {
storage.disks[0] = OfflineDisk
}
readInfo, err = storage.ReadFile(writer, "testbucket", "object", test.offset, test.length, test.data, file.Checksums, test.algorithm, test.blocksize, pool)
if err != nil && !test.shouldFailQuorum {
t.Errorf("Test %d: should pass but failed with: %v", i, err)
}
if err == nil && test.shouldFailQuorum {
t.Errorf("Test %d: should fail but it passed", i)
}
if !test.shouldFailQuorum {
if readInfo.Size != test.length {
t.Errorf("Test %d: read returns wrong number of bytes: got: #%d want: #%d", i, readInfo.Size, test.length)
}
if readInfo.Algorithm != test.algorithm {
t.Errorf("Test %d: read returns wrong algorithm: got: %v want: %v", i, readInfo.Algorithm, test.algorithm)
}
if content := writer.Bytes(); !bytes.Equal(content, data[test.offset:test.offset+test.length]) {
t.Errorf("Test %d: read retruns wrong file content", i)
}
}
}
setup.Remove()
}
}
@ -390,8 +180,10 @@ func TestErasureReadFileRandomOffsetLength(t *testing.T) {
}
defer setup.Remove()
disks := setup.disks
storage, err := NewErasureStorage(setup.disks, dataBlocks, parityBlocks)
if err != nil {
t.Fatalf("failed to create ErasureStorage: %v", err)
}
// Prepare a slice of 5MiB with random data.
data := make([]byte, 5*humanize.MiByte)
length := int64(len(data))
@ -404,12 +196,13 @@ func TestErasureReadFileRandomOffsetLength(t *testing.T) {
iterations := 10000
// Create a test file to read from.
_, size, checkSums, err := erasureCreateFile(disks, "testbucket", "testobject", bytes.NewReader(data), true, blockSize, dataBlocks, parityBlocks, bitRotAlgo, dataBlocks+1)
buffer := make([]byte, blockSize, 2*blockSize)
file, err := storage.CreateFile(bytes.NewReader(data), "testbucket", "testobject", buffer, DefaultBitrotAlgorithm, dataBlocks+1)
if err != nil {
t.Fatal(err)
}
if size != length {
t.Errorf("erasureCreateFile returned %d, expected %d", size, length)
if file.Size != length {
t.Errorf("erasureCreateFile returned %d, expected %d", file.Size, length)
}
// To generate random offset/length.
@ -418,7 +211,7 @@ func TestErasureReadFileRandomOffsetLength(t *testing.T) {
// create pool buffer which will be used by erasureReadFile for
// reading from disks and erasure decoding.
chunkSize := getChunkSize(blockSize, dataBlocks)
pool := bpool.NewBytePool(chunkSize, len(disks))
pool := bpool.NewBytePool(chunkSize, len(storage.disks))
buf := &bytes.Buffer{}
@ -429,7 +222,7 @@ func TestErasureReadFileRandomOffsetLength(t *testing.T) {
expected := data[offset : offset+readLen]
_, err = erasureReadFile(buf, disks, "testbucket", "testobject", offset, readLen, length, blockSize, dataBlocks, parityBlocks, checkSums, bitRotAlgo, pool)
_, err = storage.ReadFile(buf, "testbucket", "testobject", offset, readLen, length, file.Checksums, DefaultBitrotAlgorithm, blockSize, pool)
if err != nil {
t.Fatal(err, offset, readLen)
}

View file

@ -18,72 +18,11 @@ package cmd
import (
"bytes"
"errors"
"hash"
"io"
"sync"
"github.com/klauspost/reedsolomon"
"github.com/minio/sha256-simd"
"golang.org/x/crypto/blake2b"
)
// newHashWriters - initialize a slice of hashes for the disk count.
func newHashWriters(diskCount int, algo HashAlgo) []hash.Hash {
hashWriters := make([]hash.Hash, diskCount)
for index := range hashWriters {
hashWriters[index] = newHash(algo)
}
return hashWriters
}
// newHash - gives you a newly allocated hash depending on the input algorithm.
func newHash(algo HashAlgo) (h hash.Hash) {
switch algo {
case HashSha256:
// sha256 checksum specially on ARM64 platforms or whenever
// requested as dictated by `xl.json` entry.
h = sha256.New()
case HashBlake2b:
// ignore the error, because New512 without a key never fails
// New512 only returns a non-nil error, if the length of the passed
// key > 64 bytes - but we use blake2b as hash function (no key)
h, _ = blake2b.New512(nil)
// Add new hashes here.
default:
// Default to blake2b.
// ignore the error, because New512 without a key never fails
// New512 only returns a non-nil error, if the length of the passed
// key > 64 bytes - but we use blake2b as hash function (no key)
h, _ = blake2b.New512(nil)
}
return h
}
// Hash buffer pool is a pool of reusable
// buffers used while checksumming a stream.
var hashBufferPool = sync.Pool{
New: func() interface{} {
b := make([]byte, readSizeV1)
return &b
},
}
// hashSum calculates the hash of the entire path and returns.
func hashSum(disk StorageAPI, volume, path string, writer hash.Hash) ([]byte, error) {
// Fetch a new staging buffer from the pool.
bufp := hashBufferPool.Get().(*[]byte)
defer hashBufferPool.Put(bufp)
// Copy entire buffer to writer.
if err := copyBuffer(writer, disk, volume, path, *bufp); err != nil {
return nil, err
}
// Return the final hash sum.
return writer.Sum(nil), nil
}
// getDataBlockLen - get length of data blocks from encoded blocks.
func getDataBlockLen(enBlocks [][]byte, dataBlocks int) int {
size := 0
@ -166,57 +105,3 @@ func writeDataBlocks(dst io.Writer, enBlocks [][]byte, dataBlocks int, offset in
func getChunkSize(blockSize int64, dataBlocks int) int64 {
return (blockSize + int64(dataBlocks) - 1) / int64(dataBlocks)
}
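A quick worked example (illustrative only, not part of this change): getChunkSize is a ceiling division, so each disk's shard is just large enough for the data blocks to cover a full block.
// Sketch: with blockSize = 10 bytes and dataBlocks = 3, each disk stores
// ceil(10/3) = 4 bytes, and three 4-byte chunks cover the 10-byte block.
func exampleChunkSize() int64 {
	return getChunkSize(10, 3) // == 4
}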
// copyBuffer - copies from disk, volume, path to input writer until either EOF
// is reached at volume, path or an error occurs. A success copyBuffer returns
// err == nil, not err == EOF, because copyBuffer is defined to read from path
// until EOF. It does not treat an EOF from ReadFile as an error to be reported.
// Additionally, copyBuffer stages through the provided buffer; if the buffer
// has zero length, an error is returned.
func copyBuffer(writer io.Writer, disk StorageAPI, volume string, path string, buf []byte) error {
// Error condition of zero length buffer.
if buf != nil && len(buf) == 0 {
return errors.New("empty buffer in readBuffer")
}
// Starting offset for Reading the file.
var startOffset int64
// Read until io.EOF.
for {
n, err := disk.ReadFile(volume, path, startOffset, buf)
if n > 0 {
m, wErr := writer.Write(buf[:n])
if wErr != nil {
return wErr
}
if int64(m) != n {
return io.ErrShortWrite
}
}
if err != nil {
if err == io.EOF || err == io.ErrUnexpectedEOF {
break
}
return err
}
// Progress the offset.
startOffset += n
}
// Success.
return nil
}
// bitRotVerifier - type representing bit-rot verification process for
// a single underlying object (currently whole files)
type bitRotVerifier struct {
// has the bit-rot verification been done?
isVerified bool
// is the data free of bit-rot?
hasBitRot bool
// hashing algorithm
algo HashAlgo
// hex-encoded expected raw-hash value
checkSum string
}

View file

@ -16,23 +16,7 @@
package cmd
import (
"bytes"
"errors"
"io"
"os"
"testing"
)
// Test validates the number of hash writers returned.
func TestNewHashWriters(t *testing.T) {
diskNum := 8
hashWriters := newHashWriters(diskNum, bitRotAlgo)
if len(hashWriters) != diskNum {
t.Errorf("Expected %d hashWriters, but instead got %d", diskNum, len(hashWriters))
}
}
import "testing"
// Tests validate the output of getChunkSize.
func TestGetChunkSize(t *testing.T) {
@ -67,95 +51,3 @@ func TestGetChunkSize(t *testing.T) {
}
}
}
// TestCopyBuffer - Tests validate the result and errors produced when `copyBuffer` is called with sample inputs.
func TestCopyBuffer(t *testing.T) {
// create posix test setup
disk, diskPath, err := newPosixTestSetup()
if err != nil {
t.Fatalf("Unable to create posix test setup, %s", err)
}
defer os.RemoveAll(diskPath)
volume := "success-vol"
// Setup test environment.
if err = disk.MakeVol(volume); err != nil {
t.Fatalf("Unable to create volume, %s", err)
}
// creating io.Writer for the input to copyBuffer.
buffers := []*bytes.Buffer{
new(bytes.Buffer),
new(bytes.Buffer),
new(bytes.Buffer),
}
testFile := "testFile"
testContent := []byte("hello, world")
err = disk.AppendFile(volume, testFile, testContent)
if err != nil {
t.Fatalf("AppendFile failed: <ERROR> %s", err)
}
testCases := []struct {
writer io.Writer
disk StorageAPI
volume string
path string
buf []byte
expectedResult []byte
// flag to indicate whether test case should pass.
shouldPass bool
expectedErr error
}{
// Test case - 1.
// case with empty buffer.
{nil, nil, "", "", []byte{}, nil, false, errors.New("empty buffer in readBuffer")},
// Test case - 2.
// Test case with empty volume.
{buffers[0], disk, "", "", make([]byte, 5), nil, false, errInvalidArgument},
// Test case - 3.
// Test case with non existent volume.
{buffers[0], disk, "abc", "", make([]byte, 5), nil, false, errVolumeNotFound},
// Test case - 4.
// Test case with empty filename/path.
{buffers[0], disk, volume, "", make([]byte, 5), nil, false, errIsNotRegular},
// Test case - 5.
// Test case with non existent file name.
{buffers[0], disk, volume, "abcd", make([]byte, 5), nil, false, errFileNotFound},
// Test case - 6.
// Test case where the writer returns EOF.
{NewEOFWriter(buffers[0], 3), disk, volume, testFile, make([]byte, 5), nil, false, io.EOF},
// Test case - 7.
// Test case to produce io.ErrShortWrite, the TruncateWriter returns after writing 3 bytes.
{TruncateWriter(buffers[1], 3), disk, volume, testFile, make([]byte, 5), nil, false, io.ErrShortWrite},
// Test case - 8.
// Valid case, expected to read till EOF and write the contents into the writer.
{buffers[2], disk, volume, testFile, make([]byte, 5), testContent, true, nil},
}
// Iterate over the test cases and call copyBuffer with data.
for i, testCase := range testCases {
actualErr := copyBuffer(testCase.writer, testCase.disk, testCase.volume, testCase.path, testCase.buf)
if actualErr != nil && testCase.shouldPass {
t.Errorf("Test %d: Expected to pass but failed instead with \"%s\"", i+1, actualErr)
}
if actualErr == nil && !testCase.shouldPass {
t.Errorf("Test %d: Expected to fail with error <Error> \"%v\", but instead passed", i+1, testCase.expectedErr)
}
// Failed as expected, but does it fail for the expected reason.
if actualErr != nil && !testCase.shouldPass {
if testCase.expectedErr.Error() != actualErr.Error() {
t.Errorf("Test %d: Expected Error to be \"%v\", but instead found \"%v\" ", i+1, testCase.expectedErr, actualErr)
}
}
// test passed as expected, asserting the result.
if actualErr == nil && testCase.shouldPass {
if !bytes.Equal(testCase.expectedResult, buffers[2].Bytes()) {
t.Errorf("Test %d: copied buffer differs from the expected one.", i+1)
}
}
}
}

115
cmd/erasure.go Normal file
View file

@ -0,0 +1,115 @@
/*
* Minio Cloud Storage, (C) 2017 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cmd
import (
"crypto/subtle"
"hash"
"github.com/klauspost/reedsolomon"
)
// OfflineDisk represents an unavailable disk.
var OfflineDisk StorageAPI // zero value is nil
// ErasureFileInfo contains information about an erasure file operation (create, read, heal).
type ErasureFileInfo struct {
Size int64
Algorithm BitrotAlgorithm
Checksums [][]byte
}
// ErasureStorage represents an array of disks.
// The disks contain erasure coded and bitrot-protected data.
type ErasureStorage struct {
disks []StorageAPI
erasure reedsolomon.Encoder
dataBlocks, parityBlocks int
}
// NewErasureStorage creates a new ErasureStorage. The returned storage erasure-codes and
// bitrot-protects all data written to the disks.
func NewErasureStorage(disks []StorageAPI, dataBlocks, parityBlocks int) (s ErasureStorage, err error) {
erasure, err := reedsolomon.New(dataBlocks, parityBlocks)
if err != nil {
return s, traceErrorf("failed to create erasure coding: %v", err)
}
s = ErasureStorage{
disks: make([]StorageAPI, len(disks)),
erasure: erasure,
dataBlocks: dataBlocks,
parityBlocks: parityBlocks,
}
copy(s.disks, disks)
return
}
// ErasureEncode encodes the given data and returns the erasure-coded data.
// It returns an error if the erasure coding failed.
func (s *ErasureStorage) ErasureEncode(data []byte) ([][]byte, error) {
encoded, err := s.erasure.Split(data)
if err != nil {
return nil, traceErrorf("failed to split data: %v", err)
}
if err = s.erasure.Encode(encoded); err != nil {
return nil, traceErrorf("failed to encode data: %v", err)
}
return encoded, nil
}
// ErasureDecodeDataBlocks decodes the given erasure-coded data.
// It only decodes the data blocks but does not verify them.
// It returns an error if the decoding failed.
func (s *ErasureStorage) ErasureDecodeDataBlocks(data [][]byte) error {
if err := s.erasure.ReconstructData(data); err != nil {
return traceErrorf("failed to reconstruct data: %v", err)
}
return nil
}
// ErasureDecodeDataAndParityBlocks decodes the given erasure-coded data and verifies it.
// It returns an error if the decoding failed.
func (s *ErasureStorage) ErasureDecodeDataAndParityBlocks(data [][]byte) error {
if err := s.erasure.Reconstruct(data); err != nil {
return traceErrorf("failed to reconstruct data: %v", err)
}
return nil
}
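A rough round-trip sketch (assumptions: only identifiers from this file; passing nil disks mirrors the decode test further down, since pure encoding/decoding never touches the disks):
// exampleErasureRoundTrip encodes a buffer into 4 data + 2 parity shards,
// drops one data and one parity shard, and reconstructs both again.
func exampleErasureRoundTrip(data []byte) error {
	storage, err := NewErasureStorage(make([]StorageAPI, 6), 4, 2)
	if err != nil {
		return err
	}
	shards, err := storage.ErasureEncode(data)
	if err != nil {
		return err
	}
	shards[0], shards[5] = nil, nil // lose one data and one parity shard
	return storage.ErasureDecodeDataAndParityBlocks(shards)
}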
// NewBitrotVerifier returns a new BitrotVerifier implementing the given algorithm.
func NewBitrotVerifier(algorithm BitrotAlgorithm, checksum []byte) *BitrotVerifier {
return &BitrotVerifier{algorithm.New(), algorithm, checksum, false}
}
// BitrotVerifier can be used to verify protected data.
type BitrotVerifier struct {
hash.Hash
algorithm BitrotAlgorithm
sum []byte
verified bool
}
// Verify returns true iff the computed checksum of the verifier matches the checksum provided when the verifier
// was created.
func (v *BitrotVerifier) Verify() bool {
v.verified = true
return subtle.ConstantTimeCompare(v.Sum(nil), v.sum) == 1
}
// IsVerified returns true iff Verify was called at least once.
func (v *BitrotVerifier) IsVerified() bool { return v.verified }
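A minimal usage sketch (not part of this change): a verifier is a hash.Hash plus the expected sum, so a caller streams data into it and compares with a single call.
// exampleVerify re-hashes data with BLAKE2b-512 and checks it against the
// checksum recorded when the data was written.
func exampleVerify(data, expectedSum []byte) bool {
	v := NewBitrotVerifier(BLAKE2b512, expectedSum)
	v.Write(data) // writing to an in-memory hash never fails
	return v.Verify()
}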

View file

@ -18,161 +18,91 @@ package cmd
import (
"bytes"
"crypto/rand"
"io"
"os"
"testing"
)
// mustEncodeData - encodes data slice and provides encoded 2 dimensional slice.
func mustEncodeData(data []byte, dataBlocks, parityBlocks int) [][]byte {
encodedData, err := encodeData(data, dataBlocks, parityBlocks)
if err != nil {
// Upon failure panic this function.
panic(err)
}
return encodedData
var erasureDecodeTests = []struct {
dataBlocks, parityBlocks int
missingData, missingParity int
reconstructParity bool
shouldFail bool
}{
{dataBlocks: 2, parityBlocks: 2, missingData: 0, missingParity: 0, reconstructParity: true, shouldFail: false},
{dataBlocks: 3, parityBlocks: 3, missingData: 1, missingParity: 0, reconstructParity: true, shouldFail: false},
{dataBlocks: 4, parityBlocks: 4, missingData: 2, missingParity: 0, reconstructParity: false, shouldFail: false},
{dataBlocks: 5, parityBlocks: 5, missingData: 0, missingParity: 1, reconstructParity: true, shouldFail: false},
{dataBlocks: 6, parityBlocks: 6, missingData: 0, missingParity: 2, reconstructParity: true, shouldFail: false},
{dataBlocks: 7, parityBlocks: 7, missingData: 1, missingParity: 1, reconstructParity: false, shouldFail: false},
{dataBlocks: 8, parityBlocks: 8, missingData: 3, missingParity: 2, reconstructParity: false, shouldFail: false},
{dataBlocks: 2, parityBlocks: 2, missingData: 2, missingParity: 1, reconstructParity: true, shouldFail: true},
{dataBlocks: 4, parityBlocks: 2, missingData: 2, missingParity: 2, reconstructParity: false, shouldFail: true},
{dataBlocks: 8, parityBlocks: 4, missingData: 2, missingParity: 2, reconstructParity: false, shouldFail: false},
}
// Generates good encoded data with one parity block and data block missing.
func getGoodEncodedData(data []byte, dataBlocks, parityBlocks int) [][]byte {
encodedData := mustEncodeData(data, dataBlocks, parityBlocks)
encodedData[3] = nil
encodedData[1] = nil
return encodedData
}
// Generates bad encoded data with one parity block and data block with garbage data.
func getBadEncodedData(data []byte, dataBlocks, parityBlocks int) [][]byte {
encodedData := mustEncodeData(data, dataBlocks, parityBlocks)
encodedData[3] = []byte("garbage")
encodedData[1] = []byte("garbage")
return encodedData
}
// Generates encoded data with all data blocks missing.
func getMissingData(data []byte, dataBlocks, parityBlocks int) [][]byte {
encodedData := mustEncodeData(data, dataBlocks, parityBlocks)
for i := 0; i < dataBlocks+1; i++ {
encodedData[i] = nil
}
return encodedData
}
// Generates encoded data with less number of blocks than expected data blocks.
func getInsufficientData(data []byte, dataBlocks, parityBlocks int) [][]byte {
encodedData := mustEncodeData(data, dataBlocks, parityBlocks)
// Return half of the data.
return encodedData[:dataBlocks/2]
}
// Represents erasure encoding matrix dataBlocks and parityBlocks.
type encodingMatrix struct {
dataBlocks int
parityBlocks int
}
// List of encoding matrices the tests will run on.
var encodingMatrices = []encodingMatrix{
{3, 3}, // 3 data, 3 parity blocks.
{4, 4}, // 4 data, 4 parity blocks.
{5, 5}, // 5 data, 5 parity blocks.
{6, 6}, // 6 data, 6 parity blocks.
{7, 7}, // 7 data, 7 parity blocks.
{8, 8}, // 8 data, 8 parity blocks.
}
// Tests erasure decoding functionality for various types of inputs.
func TestErasureDecode(t *testing.T) {
data := []byte("Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.")
// List of decoding cases
// - validates bad encoded data.
// - validates good encoded data.
// - validates insufficient data.
testDecodeCases := []struct {
enFn func([]byte, int, int) [][]byte
shouldPass bool
}{
// Generates bad encoded data.
{
enFn: getBadEncodedData,
shouldPass: false,
},
// Generates good encoded data.
{
enFn: getGoodEncodedData,
shouldPass: true,
},
// Generates missing data.
{
enFn: getMissingData,
shouldPass: false,
},
// Generates short data.
{
enFn: getInsufficientData,
shouldPass: false,
},
data := make([]byte, 256)
if _, err := io.ReadFull(rand.Reader, data); err != nil {
t.Fatalf("Failed to read random data: %v", err)
}
for i, test := range erasureDecodeTests {
buffer := make([]byte, len(data), 2*len(data))
copy(buffer, data)
// Validates all decode tests.
for i, testCase := range testDecodeCases {
for _, encodingMatrix := range encodingMatrices {
disks := make([]StorageAPI, test.dataBlocks+test.parityBlocks)
storage, err := NewErasureStorage(disks, test.dataBlocks, test.parityBlocks)
if err != nil {
t.Fatalf("Test %d: failed to create erasure storage: %v", i, err)
}
encoded, err := storage.ErasureEncode(buffer)
if err != nil {
t.Fatalf("Test %d: failed to encode data: %v", i, err)
}
// Encoding matrix.
dataBlocks := encodingMatrix.dataBlocks
parityBlocks := encodingMatrix.parityBlocks
for j := range encoded[:test.missingData] {
encoded[j] = nil
}
for j := test.dataBlocks; j < test.dataBlocks+test.missingParity; j++ {
encoded[j] = nil
}
// Data block size.
blockSize := len(data)
if test.reconstructParity {
err = storage.ErasureDecodeDataAndParityBlocks(encoded)
} else {
err = storage.ErasureDecodeDataBlocks(encoded)
}
// Test decoder for just the missing data blocks
{
// Generates encoded data based on type of testCase function.
encodedData := testCase.enFn(data, dataBlocks, parityBlocks)
if err == nil && test.shouldFail {
t.Errorf("Test %d: test should fail but it passed", i)
}
if err != nil && !test.shouldFail {
t.Errorf("Test %d: test should pass but it failed: %v", i, err)
}
// Decodes the data.
err := decodeMissingData(encodedData, dataBlocks, parityBlocks)
if err != nil && testCase.shouldPass {
t.Errorf("Test %d: Expected to pass by failed instead with %s", i+1, err)
decoded := encoded
if !test.shouldFail {
if test.reconstructParity {
for j := range decoded {
if decoded[j] == nil {
t.Errorf("Test %d: failed to reconstruct shard %d", i, j)
}
}
// Proceed to extract the data blocks.
decodedDataWriter := new(bytes.Buffer)
_, err = writeDataBlocks(decodedDataWriter, encodedData, dataBlocks, 0, int64(blockSize))
if err != nil && testCase.shouldPass {
t.Errorf("Test %d: Expected to pass by failed instead with %s", i+1, err)
}
// Validate if decoded data is what we expected.
if bytes.Equal(decodedDataWriter.Bytes(), data) != testCase.shouldPass {
err := errUnexpected
t.Errorf("Test %d: Expected to pass by failed instead %s", i+1, err)
} else {
for j := range decoded[:test.dataBlocks] {
if decoded[j] == nil {
t.Errorf("Test %d: failed to reconstruct data shard %d", i, j)
}
}
}
// Test decoder for all missing data and parity blocks
{
// Generates encoded data based on type of testCase function.
encodedData := testCase.enFn(data, dataBlocks, parityBlocks)
// Decodes the data.
err := decodeDataAndParity(encodedData, dataBlocks, parityBlocks)
if err != nil && testCase.shouldPass {
t.Errorf("Test %d: Expected to pass by failed instead with %s", i+1, err)
}
// Proceed to extract the data blocks.
decodedDataWriter := new(bytes.Buffer)
_, err = writeDataBlocks(decodedDataWriter, encodedData, dataBlocks, 0, int64(blockSize))
if err != nil && testCase.shouldPass {
t.Errorf("Test %d: Expected to pass by failed instead with %s", i+1, err)
}
// Validate if decoded data is what we expected.
if bytes.Equal(decodedDataWriter.Bytes(), data) != testCase.shouldPass {
err := errUnexpected
t.Errorf("Test %d: Expected to pass by failed instead %s", i+1, err)
}
decodedData := new(bytes.Buffer)
if _, err = writeDataBlocks(decodedData, decoded, test.dataBlocks, 0, int64(len(data))); err != nil {
t.Errorf("Test %d: failed to write data blocks: %v", i, err)
}
if !bytes.Equal(decodedData.Bytes(), data) {
t.Errorf("Test %d: Decoded data does not match original data: got: %v want: %v", i, decodedData.Bytes(), data)
}
}
}

View file

@ -151,3 +151,8 @@ func isErr(err error, errs ...error) bool {
}
return false
}
// traceErrorf behaves like fmt.Errorf but also traces the returned error.
func traceErrorf(format string, args ...interface{}) error {
return traceError(fmt.Errorf(format, args...))
}

View file

@ -122,13 +122,11 @@ func (d *naughtyDisk) ReadFile(volume string, path string, offset int64, buf []b
return d.disk.ReadFile(volume, path, offset, buf)
}
func (d *naughtyDisk) ReadFileWithVerify(volume, path string, offset int64,
buf []byte, algo HashAlgo, expectedHash string) (n int64, err error) {
func (d *naughtyDisk) ReadFileWithVerify(volume, path string, offset int64, buf []byte, verifier *BitrotVerifier) (n int64, err error) {
if err := d.calcError(); err != nil {
return 0, err
}
return d.disk.ReadFileWithVerify(volume, path, offset, buf, algo, expectedHash)
return d.disk.ReadFileWithVerify(volume, path, offset, buf, verifier)
}
func (d *naughtyDisk) PrepareFile(volume, path string, length int64) error {

View file

@ -17,9 +17,7 @@
package cmd
import (
"bytes"
"encoding/hex"
"hash"
"io"
"io/ioutil"
"os"
@ -539,8 +537,7 @@ func (s *posix) ReadAll(volume, path string) (buf []byte, err error) {
// Additionally, ReadFile starts reading from an offset. ReadFile
// semantics are the same as io.ReadFull.
func (s *posix) ReadFile(volume, path string, offset int64, buf []byte) (n int64, err error) {
return s.ReadFileWithVerify(volume, path, offset, buf, "", "")
return s.ReadFileWithVerify(volume, path, offset, buf, &BitrotVerifier{verified: true})
}
// ReadFileWithVerify is the same as ReadFile but with hashsum
@ -553,9 +550,7 @@ func (s *posix) ReadFile(volume, path string, offset int64, buf []byte) (n int64
//
// The function takes care to minimize the number of disk read
// operations.
func (s *posix) ReadFileWithVerify(volume, path string, offset int64, buf []byte,
algo HashAlgo, expectedHash string) (n int64, err error) {
func (s *posix) ReadFileWithVerify(volume, path string, offset int64, buffer []byte, verifier *BitrotVerifier) (n int64, err error) {
defer func() {
if err == syscall.EIO {
atomic.AddInt32(&s.ioErrCount, 1)
@ -616,62 +611,34 @@ func (s *posix) ReadFileWithVerify(volume, path string, offset int64, buf []byte
return 0, errIsNotRegular
}
// If expected hash string is empty hash verification is
// skipped.
needToHash := expectedHash != ""
var hasher hash.Hash
if !verifier.IsVerified() {
bufp := s.pool.Get().(*[]byte)
defer s.pool.Put(bufp)
if needToHash {
// If the hashing algo is invalid, return an error.
if !isValidHashAlgo(algo) {
return 0, errBitrotHashAlgoInvalid
}
// Compute hash of object from start to the byte at
// (offset - 1), and as a result of this read, seek to
// `offset`.
hasher = newHash(algo)
if offset > 0 {
_, err = io.CopyN(hasher, file, offset)
if err != nil {
if offset != 0 {
if _, err = io.CopyBuffer(verifier, io.LimitReader(file, offset), *bufp); err != nil {
return 0, err
}
}
} else {
// Seek to requested offset.
_, err = file.Seek(offset, os.SEEK_SET)
if err != nil {
if _, err = file.Read(buffer); err != nil {
return 0, err
}
}
// Read until buffer is full.
m, err := io.ReadFull(file, buf)
if err == io.EOF {
return 0, err
}
if needToHash {
// Continue computing hash with buf.
_, err = hasher.Write(buf)
if err != nil {
if _, err = verifier.Write(buffer); err != nil {
return 0, err
}
// Continue computing hash until end of file.
_, err = io.Copy(hasher, file)
if err != nil {
if _, err = io.CopyBuffer(verifier, file, *bufp); err != nil {
return 0, err
}
// Verify the computed hash.
computedHash := hex.EncodeToString(hasher.Sum(nil))
if computedHash != expectedHash {
return 0, hashMismatchError{expectedHash, computedHash}
if !verifier.Verify() {
return 0, hashMismatchError{hex.EncodeToString(verifier.sum), hex.EncodeToString(verifier.Sum(nil))}
}
return int64(len(buffer)), err
}
// Success.
m, err := file.ReadAt(buffer, offset)
if m > 0 && m < len(buffer) {
err = io.ErrUnexpectedEOF
}
return int64(m), err
}
@ -811,17 +778,8 @@ func (s *posix) AppendFile(volume, path string, buf []byte) (err error) {
if err != nil {
return err
}
// Close upon return.
defer w.Close()
bufp := s.pool.Get().(*[]byte)
// Reuse buffer.
defer s.pool.Put(bufp)
// Return io.Copy
_, err = io.CopyBuffer(w, bytes.NewReader(buf), *bufp)
_, err = w.Write(buf)
w.Close()
return err
}

View file

@ -18,7 +18,7 @@ package cmd
import (
"bytes"
"crypto/sha256"
"crypto/rand"
"encoding/hex"
"io"
"io/ioutil"
@ -30,8 +30,6 @@ import (
"testing"
"github.com/minio/minio/pkg/disk"
"golang.org/x/crypto/blake2b"
)
// creates a temp dir and sets up posix layer.
@ -945,14 +943,14 @@ func TestPosixReadFile(t *testing.T) {
func() error {
if runtime.GOOS == globalWindowsOSName {
return &os.PathError{
Op: "seek",
Path: (slashpath.Join(path, "success-vol", "myobject")),
Err: syscall.Errno(0x83), // ERROR_NEGATIVE_SEEK
Op: "read",
Path: slashpath.Join(path, "success-vol", "myobject"),
Err: syscall.Errno(0x57), // invalid parameter
}
}
return &os.PathError{
Op: "seek",
Path: (slashpath.Join(path, "success-vol", "myobject")),
Op: "read",
Path: slashpath.Join(path, "success-vol", "myobject"),
Err: os.ErrInvalid,
}
}(),
@ -1079,111 +1077,77 @@ func TestPosixReadFile(t *testing.T) {
}
}
var posixReadFileWithVerifyTests = []struct {
file string
offset int
length int
algorithm BitrotAlgorithm
expError error
}{
{file: "myobject", offset: 0, length: 100, algorithm: SHA256, expError: nil}, // 0
{file: "myobject", offset: 25, length: 74, algorithm: SHA256, expError: nil}, // 1
{file: "myobject", offset: 29, length: 70, algorithm: SHA256, expError: nil}, // 2
{file: "myobject", offset: 100, length: 0, algorithm: SHA256, expError: nil}, // 3
{file: "myobject", offset: 1, length: 120, algorithm: SHA256, expError: hashMismatchError{}}, // 4
{file: "myobject", offset: 3, length: 1100, algorithm: SHA256, expError: nil}, // 5
{file: "myobject", offset: 2, length: 100, algorithm: SHA256, expError: hashMismatchError{}}, // 6
{file: "myobject", offset: 1000, length: 1001, algorithm: SHA256, expError: nil}, // 7
{file: "myobject", offset: 0, length: 100, algorithm: BLAKE2b512, expError: hashMismatchError{}}, // 8
{file: "myobject", offset: 25, length: 74, algorithm: BLAKE2b512, expError: nil}, // 9
{file: "myobject", offset: 29, length: 70, algorithm: BLAKE2b512, expError: hashMismatchError{}}, // 10
{file: "myobject", offset: 100, length: 0, algorithm: BLAKE2b512, expError: nil}, // 11
{file: "myobject", offset: 1, length: 120, algorithm: BLAKE2b512, expError: nil}, // 12
{file: "myobject", offset: 3, length: 1100, algorithm: BLAKE2b512, expError: nil}, // 13
{file: "myobject", offset: 2, length: 100, algorithm: BLAKE2b512, expError: nil}, // 14
{file: "myobject", offset: 1000, length: 1001, algorithm: BLAKE2b512, expError: nil}, // 15
}
// TestPosixReadFileWithVerify - tests the posix level
// ReadFileWithVerify API. Only tests hashing related
// functionality. Other functionality is tested with
// TestPosixReadFile.
func TestPosixReadFileWithVerify(t *testing.T) {
// create posix test setup
volume, object := "test-vol", "myobject"
posixStorage, path, err := newPosixTestSetup()
if err != nil {
os.RemoveAll(path)
t.Fatalf("Unable to create posix test setup, %s", err)
}
defer os.RemoveAll(path)
volume := "success-vol"
// Setup test environment.
if err = posixStorage.MakeVol(volume); err != nil {
t.Fatalf("Unable to create volume, %s", err)
os.RemoveAll(path)
t.Fatalf("Unable to create volume %s: %v", volume, err)
}
data := make([]byte, 8*1024)
if _, err = io.ReadFull(rand.Reader, data); err != nil {
os.RemoveAll(path)
t.Fatalf("Unable to create generate random data: %v", err)
}
if err = posixStorage.AppendFile(volume, object, data); err != nil {
os.RemoveAll(path)
t.Fatalf("Unable to create object: %v", err)
}
blakeHash := func(s string) string {
k := blake2b.Sum512([]byte(s))
return hex.EncodeToString(k[:])
}
for i, test := range posixReadFileWithVerifyTests {
h := test.algorithm.New()
h.Write(data)
if test.expError != nil {
expected := h.Sum(nil)
h.Write([]byte{0})
test.expError = hashMismatchError{hex.EncodeToString(h.Sum(nil)), hex.EncodeToString(expected)}
}
sha256Hash := func(s string) string {
k := sha256.Sum256([]byte(s))
return hex.EncodeToString(k[:])
}
testCases := []struct {
fileName string
offset int64
bufSize int
algo HashAlgo
expectedHash string
expectedBuf []byte
expectedErr error
}{
// Hash verification is skipped with empty expected
// hash - 1
{
"myobject", 0, 5, HashBlake2b, "",
[]byte("Hello"), nil,
},
// Hash verification failure case - 2
{
"myobject", 0, 5, HashBlake2b, "a",
[]byte(""),
hashMismatchError{"a", blakeHash("Hello, world!")},
},
// Hash verification success with full content requested - 3
{
"myobject", 0, 13, HashBlake2b, blakeHash("Hello, world!"),
[]byte("Hello, world!"), nil,
},
// Hash verification success with full content and Sha256 - 4
{
"myobject", 0, 13, HashSha256, sha256Hash("Hello, world!"),
[]byte("Hello, world!"), nil,
},
// Hash verification success with partial content requested - 5
{
"myobject", 7, 4, HashBlake2b, blakeHash("Hello, world!"),
[]byte("worl"), nil,
},
// Hash verification success with partial content and Sha256 - 6
{
"myobject", 7, 4, HashSha256, sha256Hash("Hello, world!"),
[]byte("worl"), nil,
},
// Empty hash-algo returns error - 7
{
"myobject", 7, 4, "", blakeHash("Hello, world!"),
[]byte("worl"), errBitrotHashAlgoInvalid,
},
// Empty content hash verification with empty
// hash-algo returns error - 8
{
"myobject", 7, 0, "", blakeHash("Hello, world!"),
[]byte(""), errBitrotHashAlgoInvalid,
},
}
// Create file used in testcases
err = posixStorage.AppendFile(volume, "myobject", []byte("Hello, world!"))
if err != nil {
t.Fatalf("Failure in test setup: %v\n", err)
}
// Validate each test case.
for i, testCase := range testCases {
var n int64
// Common read buffer.
var buf = make([]byte, testCase.bufSize)
n, err = posixStorage.ReadFileWithVerify(volume, testCase.fileName, testCase.offset, buf, testCase.algo, testCase.expectedHash)
buffer := make([]byte, test.length)
n, err := posixStorage.ReadFileWithVerify(volume, test.file, int64(test.offset), buffer, NewBitrotVerifier(test.algorithm, h.Sum(nil)))
switch {
case err == nil && testCase.expectedErr != nil:
t.Errorf("Test %d: Expected error %v but got none.", i+1, testCase.expectedErr)
case err == nil && n != int64(testCase.bufSize):
t.Errorf("Test %d: %d bytes were expected, but %d were written", i+1, testCase.bufSize, n)
case err == nil && !bytes.Equal(testCase.expectedBuf, buf):
t.Errorf("Test %d: Expected bytes: %v, but got: %v", i+1, testCase.expectedBuf, buf)
case err != nil && err != testCase.expectedErr:
t.Errorf("Test %d: Expected error: %v, but got: %v", i+1, testCase.expectedErr, err)
case err == nil && test.expError != nil:
t.Errorf("Test %d: Expected error %v but got none.", i, test.expError)
case err == nil && n != int64(test.length):
t.Errorf("Test %d: %d bytes were expected, but %d were written", i, test.length, n)
case err == nil && !bytes.Equal(data[test.offset:test.offset+test.length], buffer):
t.Errorf("Test %d: Expected bytes: %v, but got: %v", i, data[test.offset:test.offset+test.length], buffer)
case err != nil && err != test.expError:
t.Errorf("Test %d: Expected error: %v, but got: %v", i, test.expError, err)
}
}
}

View file

@ -221,18 +221,11 @@ func (f *retryStorage) ReadFile(volume, path string, offset int64, buffer []byte
// ReadFileWithVerify - a retryable implementation of reading at
// offset from a file with verification.
func (f *retryStorage) ReadFileWithVerify(volume, path string, offset int64, buffer []byte,
algo HashAlgo, expectedHash string) (m int64, err error) {
if f.IsOffline() {
return m, errDiskNotFound
}
func (f retryStorage) ReadFileWithVerify(volume, path string, offset int64, buffer []byte, verifier *BitrotVerifier) (m int64, err error) {
m, err = f.remoteStorage.ReadFileWithVerify(volume, path, offset, buffer,
algo, expectedHash)
m, err = f.remoteStorage.ReadFileWithVerify(volume, path, offset, buffer, verifier)
if f.reInitUponDiskNotFound(err) {
m, err = f.remoteStorage.ReadFileWithVerify(volume, path,
offset, buffer, algo, expectedHash)
return m, retryToStorageErr(err)
m, err = f.remoteStorage.ReadFileWithVerify(volume, path, offset, buffer, verifier)
}
return m, retryToStorageErr(err)
}

View file

@ -19,7 +19,6 @@ package cmd
import (
"bytes"
"crypto/sha256"
"encoding/hex"
"errors"
"os"
"reflect"
@ -305,15 +304,15 @@ func TestRetryStorage(t *testing.T) {
}
}
sha256Hash := func(s string) string {
k := sha256.Sum256([]byte(s))
return hex.EncodeToString(k[:])
sha256Hash := func(b []byte) []byte {
k := sha256.Sum256(b)
return k[:]
}
for _, disk := range storageDisks {
var buf2 = make([]byte, 5)
verifier := NewBitrotVerifier(SHA256, sha256Hash([]byte("Hello, World")))
var n int64
if n, err = disk.ReadFileWithVerify("existent", "path", 7, buf2,
HashSha256, sha256Hash("Hello, World")); err != nil {
if n, err = disk.ReadFileWithVerify("existent", "path", 7, buf2, verifier); err != nil {
t.Fatal(err)
}
if err != nil {

View file

@ -16,7 +16,11 @@
package cmd
import "github.com/minio/minio/pkg/disk"
import (
"io"
"github.com/minio/minio/pkg/disk"
)
// StorageAPI interface.
type StorageAPI interface {
@ -37,8 +41,7 @@ type StorageAPI interface {
// File operations.
ListDir(volume, dirPath string) ([]string, error)
ReadFile(volume string, path string, offset int64, buf []byte) (n int64, err error)
ReadFileWithVerify(volume string, path string, offset int64, buf []byte,
algo HashAlgo, expectedHash string) (n int64, err error)
ReadFileWithVerify(volume string, path string, offset int64, buf []byte, verifier *BitrotVerifier) (n int64, err error)
PrepareFile(volume string, path string, len int64) (err error)
AppendFile(volume string, path string, buf []byte) (err error)
RenameFile(srcVolume, srcPath, dstVolume, dstPath string) error
@ -48,3 +51,47 @@ type StorageAPI interface {
// Read all.
ReadAll(volume string, path string) (buf []byte, err error)
}
// storageReader is an io.Reader view of a disk
type storageReader struct {
storage StorageAPI
volume, path string
offset int64
}
func (r *storageReader) Read(p []byte) (n int, err error) {
nn, err := r.storage.ReadFile(r.volume, r.path, r.offset, p)
r.offset += nn
n = int(nn)
if err == io.ErrUnexpectedEOF && nn > 0 {
err = io.EOF
}
return
}
// storageWriter is an io.Writer view of a disk.
type storageWriter struct {
storage StorageAPI
volume, path string
}
func (w *storageWriter) Write(p []byte) (n int, err error) {
err = w.storage.AppendFile(w.volume, w.path, p)
if err == nil {
n = len(p)
}
return
}
// StorageWriter returns a new io.Writer which appends data to the file
// at the given disk, volume and path.
func StorageWriter(storage StorageAPI, volume, path string) io.Writer {
return &storageWriter{storage, volume, path}
}
// StorageReader returns a new io.Reader which reads data from the file
// at the given disk, volume, path and offset.
func StorageReader(storage StorageAPI, volume, path string, offset int64) io.Reader {
return &storageReader{storage, volume, path, offset}
}
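A small sketch of why these adapters are handy (DefaultBitrotAlgorithm.New() is the hash constructor introduced elsewhere in this change): wrapping a disk as an io.Reader lets ordinary io plumbing drive checksumming, much like the healing code below does with io.CopyBuffer.
// exampleHashPart streams a stored part through a bitrot hash via io.Copy.
func exampleHashPart(disk StorageAPI, volume, path string) ([]byte, error) {
	h := DefaultBitrotAlgorithm.New()
	if _, err := io.Copy(h, StorageReader(disk, volume, path, 0)); err != nil {
		return nil, err
	}
	return h.Sum(nil), nil
}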

View file

@ -262,8 +262,7 @@ func (n *networkStorage) ReadFile(volume string, path string, offset int64, buff
}
// ReadFileWithVerify - reads a file at remote path and fills the buffer.
func (n *networkStorage) ReadFileWithVerify(volume string, path string, offset int64,
buffer []byte, algo HashAlgo, expectedHash string) (m int64, err error) {
func (n *networkStorage) ReadFileWithVerify(volume string, path string, offset int64, buffer []byte, verifier *BitrotVerifier) (m int64, err error) {
defer func() {
if r := recover(); r != nil {
@ -279,8 +278,8 @@ func (n *networkStorage) ReadFileWithVerify(volume string, path string, offset i
Path: path,
Offset: offset,
Buffer: buffer,
Algo: algo,
ExpectedHash: expectedHash,
Algo: verifier.algorithm,
ExpectedHash: verifier.sum,
}, &result)
// Copy results to buffer.

View file

@ -18,7 +18,6 @@ package cmd
import (
"bytes"
"encoding/hex"
"errors"
"fmt"
"io"
@ -391,13 +390,13 @@ func (s *TestRPCStorageSuite) testRPCStorageFileOps(t *testing.T) {
t.Errorf("Expected %s, got %s", string(buf[4:9]), string(buf1))
}
blakeHash := func(s string) string {
k := blake2b.Sum512([]byte(s))
return hex.EncodeToString(k[:])
blakeHash := func(b []byte) []byte {
k := blake2b.Sum512(b)
return k[:]
}
verifier := NewBitrotVerifier(BLAKE2b512, blakeHash(buf))
buf2 := make([]byte, 2)
n, err = storageDisk.ReadFileWithVerify("myvol", "file1", 1,
buf2, HashBlake2b, blakeHash(string(buf)))
n, err = storageDisk.ReadFileWithVerify("myvol", "file1", 1, buf2, verifier)
if err != nil {
t.Error("Error in ReadFileWithVerify", err)
}

View file

@ -79,11 +79,10 @@ type ReadFileWithVerifyArgs struct {
Buffer []byte
// Algorithm used in bit-rot hash computation.
Algo HashAlgo
Algo BitrotAlgorithm
// Stored hash value (hex-encoded) used to compare with
// computed value.
ExpectedHash string
// Stored hash value used to compare with computed value.
ExpectedHash []byte
}
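For orientation, a hypothetical helper (not part of this change) showing how the wire arguments map back to a verifier on the storage server, mirroring ReadFileWithVerifyHandler below:
// verifierFromArgs rebuilds the BitrotVerifier from the RPC arguments.
func verifierFromArgs(args *ReadFileWithVerifyArgs) *BitrotVerifier {
	return NewBitrotVerifier(args.Algo, args.ExpectedHash)
}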
// PrepareFileArgs represents append file RPC arguments.

View file

@ -164,10 +164,7 @@ func (s *storageServer) ReadFileWithVerifyHandler(args *ReadFileWithVerifyArgs,
if err = args.IsAuthenticated(); err != nil {
return err
}
var n int64
n, err = s.storage.ReadFileWithVerify(args.Vol, args.Path, args.Offset, args.Buffer,
args.Algo, args.ExpectedHash)
n, err := s.storage.ReadFileWithVerify(args.Vol, args.Path, args.Offset, args.Buffer, NewBitrotVerifier(args.Algo, args.ExpectedHash))
// Sending an error over the rpc layer, would cause unmarshalling to fail. In situations
// when we have short read i.e `io.ErrUnexpectedEOF` treat it as good condition and copy
// the buffer properly.

View file

@ -17,11 +17,23 @@
package cmd
import (
"encoding/hex"
"crypto/subtle"
"path/filepath"
"sync"
"time"
"io"
)
// healBufferPool is a pool of reusable buffers used to verify a stream
// while healing.
var healBufferPool = sync.Pool{
New: func() interface{} {
b := make([]byte, readSizeV1)
return &b
},
}
// commonTime returns a maximally occurring time from a list of times.
func commonTime(modTimes []time.Time) (modTime time.Time, count int) {
var maxima int // Counter for remembering max occurrence of elements.
@ -252,36 +264,32 @@ func xlHealStat(xl xlObjects, partsMetadata []xlMetaV1, errs []error) HealObject
// calculating blake2b checksum.
func disksWithAllParts(onlineDisks []StorageAPI, partsMetadata []xlMetaV1, errs []error, bucket, object string) ([]StorageAPI, []error, error) {
availableDisks := make([]StorageAPI, len(onlineDisks))
buffer := healBufferPool.Get().(*[]byte)
defer healBufferPool.Put(buffer)
for diskIndex, onlineDisk := range onlineDisks {
if onlineDisk == nil {
if onlineDisk == OfflineDisk {
continue
}
// disk has a valid xl.json but may not have all the
// parts. This is considered an outdated disk, since
// it needs healing too.
for _, part := range partsMetadata[diskIndex].Parts {
// compute the checksum of the part using the algorithm from its checksum info.
partPath := filepath.Join(object, part.Name)
checkSumInfo := partsMetadata[diskIndex].Erasure.GetCheckSumInfo(part.Name)
hash := newHash(checkSumInfo.Algorithm)
blakeBytes, hErr := hashSum(onlineDisk, bucket, partPath, hash)
checkSumInfo := partsMetadata[diskIndex].Erasure.GetChecksumInfo(part.Name)
hash := checkSumInfo.Algorithm.New()
_, hErr := io.CopyBuffer(hash, StorageReader(onlineDisk, bucket, partPath, 0), *buffer)
if hErr == errFileNotFound {
errs[diskIndex] = errFileNotFound
availableDisks[diskIndex] = nil
availableDisks[diskIndex] = OfflineDisk
break
}
if hErr != nil && hErr != errFileNotFound {
return nil, nil, traceError(hErr)
}
blakeSum := hex.EncodeToString(blakeBytes)
// if blake2b sum doesn't match for a part
// then this disk is outdated and needs
// healing.
if blakeSum != checkSumInfo.Hash {
if subtle.ConstantTimeCompare(hash.Sum(nil), checkSumInfo.Hash) != 1 {
errs[diskIndex] = errFileNotFound
availableDisks[diskIndex] = nil
availableDisks[diskIndex] = OfflineDisk
break
}
availableDisks[diskIndex] = onlineDisk

View file

@ -87,12 +87,12 @@ func TestCommonTime(t *testing.T) {
// partsMetaFromModTimes - returns a slice of xlMetaV1 populated with the given
// modTimes and part checksums.
func partsMetaFromModTimes(modTimes []time.Time, checksums []checkSumInfo) []xlMetaV1 {
func partsMetaFromModTimes(modTimes []time.Time, algorithm BitrotAlgorithm, checksums []ChecksumInfo) []xlMetaV1 {
var partsMetadata []xlMetaV1
for _, modTime := range modTimes {
partsMetadata = append(partsMetadata, xlMetaV1{
Erasure: erasureInfo{
Checksum: checksums,
Erasure: ErasureInfo{
Checksums: checksums,
},
Stat: statInfo{
ModTime: modTime,
@ -270,7 +270,7 @@ func TestListOnlineDisks(t *testing.T) {
}
partsMetadata := partsMetaFromModTimes(test.modTimes, xlMeta.Erasure.Checksum)
partsMetadata := partsMetaFromModTimes(test.modTimes, DefaultBitrotAlgorithm, xlMeta.Erasure.Checksums)
onlineDisks, modTime := listOnlineDisks(xlDisks, partsMetadata, test.errs)
availableDisks, newErrs, _ := disksWithAllParts(onlineDisks, partsMetadata, test.errs, bucket, object)
@ -357,8 +357,7 @@ func TestDisksWithAllParts(t *testing.T) {
t.Fatalf("Failed to make a bucket %v", err)
}
_, err = obj.PutObject(bucket, object, int64(len(data)),
bytes.NewReader(data), nil, "")
_, err = obj.PutObject(bucket, object, int64(len(data)), bytes.NewReader(data), nil, "")
if err != nil {
t.Fatalf("Failed to putObject %v", err)
}
@ -368,8 +367,6 @@ func TestDisksWithAllParts(t *testing.T) {
t.Fatalf("Failed to read xl meta data %v", err)
}
// Replace the default blake2b erasure algorithm to HashSha256 and test that
// disks are excluded
diskFailures := make(map[int]string)
// key = disk index, value = part name with hash mismatch
diskFailures[0] = "part.3"
@ -377,16 +374,15 @@ func TestDisksWithAllParts(t *testing.T) {
diskFailures[15] = "part.2"
for diskIndex, partName := range diskFailures {
for index, info := range partsMetadata[diskIndex].Erasure.Checksum {
for index, info := range partsMetadata[diskIndex].Erasure.Checksums {
if info.Name == partName {
partsMetadata[diskIndex].Erasure.Checksum[index].Algorithm = HashSha256
partsMetadata[diskIndex].Erasure.Checksums[index].Hash[0]++
}
}
}
errs = make([]error, len(xlDisks))
filteredDisks, errs, err :=
disksWithAllParts(xlDisks, partsMetadata, errs, bucket, object)
filteredDisks, errs, err := disksWithAllParts(xlDisks, partsMetadata, errs, bucket, object)
if err != nil {
t.Errorf("Unexpected error: %s", err)
}
@ -396,7 +392,6 @@ func TestDisksWithAllParts(t *testing.T) {
}
for diskIndex, disk := range filteredDisks {
if _, ok := diskFailures[diskIndex]; ok {
if disk != nil {
t.Errorf("Disk not filtered as expected, disk: %d", diskIndex)
@ -422,8 +417,7 @@ func TestDisksWithAllParts(t *testing.T) {
t.Fatalf("Failed to read xl meta data %v", err)
}
filteredDisks, errs, err =
disksWithAllParts(xlDisks, partsMetadata, errs, bucket, object)
filteredDisks, errs, err = disksWithAllParts(xlDisks, partsMetadata, errs, bucket, object)
if err != nil {
t.Errorf("Unexpected error: %s", err)
}

View file

@ -446,30 +446,35 @@ func healObject(storageDisks []StorageAPI, bucket string, object string, quorum
// Checksum of the part files. checkSumInfos[index] will contain checksums
// of all the part files in the outDatedDisks[index]
checkSumInfos := make([][]checkSumInfo, len(outDatedDisks))
checksumInfos := make([][]ChecksumInfo, len(outDatedDisks))
// Heal each part. erasureHealFile() will write the healed part to
// .minio/tmp/uuid/ which needs to be renamed later to the final location.
storage, err := NewErasureStorage(latestDisks, latestMeta.Erasure.DataBlocks, latestMeta.Erasure.ParityBlocks)
if err != nil {
return 0, 0, toObjectErr(err, bucket, object)
}
checksums := make([][]byte, len(latestDisks))
for partIndex := 0; partIndex < len(latestMeta.Parts); partIndex++ {
partName := latestMeta.Parts[partIndex].Name
partSize := latestMeta.Parts[partIndex].Size
erasure := latestMeta.Erasure
sumInfo := latestMeta.Erasure.GetCheckSumInfo(partName)
var algorithm BitrotAlgorithm
for i, disk := range storage.disks {
if disk != OfflineDisk {
info := partsMetadata[i].Erasure.GetChecksumInfo(partName)
algorithm = info.Algorithm
checksums[i] = info.Hash
}
}
// Heal the part file.
checkSums, hErr := erasureHealFile(latestDisks, outDatedDisks,
bucket, pathJoin(object, partName),
minioMetaTmpBucket, pathJoin(tmpID, partName),
partSize, erasure.BlockSize, erasure.DataBlocks, erasure.ParityBlocks, sumInfo.Algorithm)
file, hErr := storage.HealFile(outDatedDisks, bucket, pathJoin(object, partName), erasure.BlockSize, minioMetaTmpBucket, pathJoin(tmpID, partName), partSize, algorithm, checksums)
if hErr != nil {
return 0, 0, toObjectErr(hErr, bucket, object)
}
for index, sum := range checkSums {
if outDatedDisks[index] != nil {
checkSumInfos[index] = append(checkSumInfos[index], checkSumInfo{
Name: partName,
Algorithm: sumInfo.Algorithm,
Hash: sum,
})
for i := range outDatedDisks {
if outDatedDisks[i] != OfflineDisk {
checksumInfos[i] = append(checksumInfos[i], ChecksumInfo{partName, file.Algorithm, file.Checksums[i]})
}
}
}
@ -480,7 +485,7 @@ func healObject(storageDisks []StorageAPI, bucket string, object string, quorum
continue
}
partsMetadata[index] = latestMeta
partsMetadata[index].Erasure.Checksum = checkSumInfos[index]
partsMetadata[index].Erasure.Checksums = checksumInfos[index]
}
// Generate and write `xl.json` generated from other disks.
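The heal path above now gathers the per-disk checksums of a part once, calls HealFile, and records fresh ChecksumInfo entries only for the disks that were actually healed. A condensed sketch of that per-part flow, assuming the cmd package from this commit; the helper name and parameter list below are illustrative, not part of the change:

```Go
// healPartSketch is an illustrative helper, not part of this commit.
func healPartSketch(storage ErasureStorage, partsMetadata []xlMetaV1, outDatedDisks []StorageAPI,
	bucket, object, partName, tmpID string, partSize, blockSize int64) ([]ChecksumInfo, error) {
	// Collect the expected bitrot checksums of this part from every online disk.
	checksums := make([][]byte, len(storage.disks))
	var algorithm BitrotAlgorithm
	for i, disk := range storage.disks {
		if disk != OfflineDisk {
			info := partsMetadata[i].Erasure.GetChecksumInfo(partName)
			algorithm, checksums[i] = info.Algorithm, info.Hash
		}
	}
	// Re-create the part on the outdated disks under the temporary bucket.
	file, err := storage.HealFile(outDatedDisks, bucket, pathJoin(object, partName),
		blockSize, minioMetaTmpBucket, pathJoin(tmpID, partName), partSize, algorithm, checksums)
	if err != nil {
		return nil, err
	}
	// Record the freshly computed checksums for the healed disks only.
	healed := make([]ChecksumInfo, 0, len(outDatedDisks))
	for i := range outDatedDisks {
		if outDatedDisks[i] != OfflineDisk {
			healed = append(healed, ChecksumInfo{partName, file.Algorithm, file.Checksums[i]})
		}
	}
	return healed, nil
}
```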


@ -491,8 +491,7 @@ func TestHealObjectXL(t *testing.T) {
var uploadedParts []completePart
for _, partID := range []int{2, 1} {
pInfo, err1 := obj.PutObjectPart(bucket, object, uploadID, partID,
int64(len(data)), bytes.NewReader(data), "", "")
pInfo, err1 := obj.PutObjectPart(bucket, object, uploadID, partID, int64(len(data)), bytes.NewReader(data), "", "")
if err1 != nil {
t.Fatalf("Failed to upload a part - %v", err1)
}


@ -17,20 +17,97 @@
package cmd
import (
"crypto"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"errors"
"fmt"
"hash"
"path"
"runtime"
"sort"
"sync"
"time"
"golang.org/x/crypto/blake2b"
)
const erasureAlgorithmKlauspost = "klauspost/reedsolomon/vandermonde"
// DefaultBitrotAlgorithm is the default algorithm used for bitrot protection.
var DefaultBitrotAlgorithm = BLAKE2b512
func init() {
newBLAKE2b := func() hash.Hash {
b2, _ := blake2b.New512(nil) // New512 never returns an error if the key is nil
return b2
}
crypto.RegisterHash(crypto.Hash(SHA256), sha256.New)
crypto.RegisterHash(crypto.Hash(BLAKE2b512), newBLAKE2b)
crypto.RegisterHash(crypto.Hash(HighwayHash256), nil) // TODO(aead): currently not supported, waiting for Google to finish the algorithm spec.
if runtime.GOARCH == "arm64" { // use SHA256 hardware implementation of arm64
DefaultBitrotAlgorithm = SHA256
}
}
// BitrotAlgorithm specifies an algorithm used for bitrot protection.
type BitrotAlgorithm crypto.Hash
const (
// Erasure related constants.
erasureAlgorithmKlauspost = "klauspost/reedsolomon/vandermonde"
// SHA256 represents the SHA-256 hash function
SHA256 = BitrotAlgorithm(crypto.SHA256)
// HighwayHash256 represents the HighwayHash-256 hash function
HighwayHash256 = BitrotAlgorithm(crypto.SHA3_256) // we must define that HighwayHash-256 is SHA3-256 because there is no HighwayHash constant in golang/crypto yet.
// BLAKE2b512 represents the BLAKE2b-512 hash function
BLAKE2b512 = BitrotAlgorithm(crypto.SHA3_512) // we must define that BLAKE2b-512 is SHA3-512 because there is no BLAKE2b-512 constant in golang/crypto yet - FIXME: Go1.9 has BLAKE2 constants
)
var bitrotAlgorithms = map[BitrotAlgorithm]string{
SHA256: "sha256",
BLAKE2b512: "blake2b",
HighwayHash256: "highwayhash256",
}
// New returns a new hash.Hash calculating the given bitrot algorithm. New panics
// if the algorithm is not supported or not linked into the binary.
func (a BitrotAlgorithm) New() hash.Hash {
if _, ok := bitrotAlgorithms[a]; !ok {
panic(fmt.Sprintf("bitrot algorithm #%d is not supported", a))
}
return crypto.Hash(a).New()
}
// Available reports whether the given algorithm is supported and linked into the binary.
func (a BitrotAlgorithm) Available() bool {
_, ok := bitrotAlgorithms[a]
return ok && crypto.Hash(a).Available()
}
// String returns the string identifier for a given bitrot algorithm.
// If the algorithm is not supported String panics.
func (a BitrotAlgorithm) String() string {
if name, ok := bitrotAlgorithms[a]; ok {
return name
}
panic(fmt.Sprintf("bitrot algorithm #%d is not supported", a))
}
// BitrotAlgorithmFromString returns a bitrot algorithm from the given string representation.
// It returns 0 if the string representation does not match any supported algorithm.
// The zero value of a bitrot algorithm is never supported.
func BitrotAlgorithmFromString(s string) (a BitrotAlgorithm) {
for alg, name := range bitrotAlgorithms {
if name == s {
return alg
}
}
return
}
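BitrotAlgorithm now mirrors the crypto.Hash pattern: algorithms are registered in init, constructed via New, checked via Available, and mapped to and from their xl.json names. A minimal sketch of that round trip, assuming the cmd package from this commit; the helper name is illustrative:

```Go
// bitrotSumSketch is an illustrative helper, not part of this commit.
func bitrotSumSketch(block []byte) ([]byte, string) {
	algorithm := DefaultBitrotAlgorithm // BLAKE2b512, or SHA256 on arm64
	if !algorithm.Available() {
		algorithm = SHA256 // fall back to an algorithm linked into the binary
	}
	h := algorithm.New() // panics for unsupported algorithms
	h.Write(block)
	sum := h.Sum(nil)

	// The string form is what gets persisted in xl.json and parsed back.
	name := algorithm.String()
	if BitrotAlgorithmFromString(name) != algorithm {
		panic("algorithm name does not round-trip")
	}
	return sum, name
}
```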
// objectPartInfo Info of each part kept in the multipart metadata
// file after CompleteMultipartUpload() is called.
type objectPartInfo struct {
@ -47,91 +124,92 @@ func (t byObjectPartNumber) Len() int { return len(t) }
func (t byObjectPartNumber) Swap(i, j int) { t[i], t[j] = t[j], t[i] }
func (t byObjectPartNumber) Less(i, j int) bool { return t[i].Number < t[j].Number }
// checkSumInfo - carries checksums of individual scattered parts per disk.
type checkSumInfo struct {
Name string `json:"name"`
Algorithm HashAlgo `json:"algorithm"`
Hash string `json:"hash"`
// ChecksumInfo - carries checksums of individual scattered parts per disk.
type ChecksumInfo struct {
Name string
Algorithm BitrotAlgorithm
Hash []byte
}
// HashAlgo - represents a supported hashing algorithm for bitrot
// verification.
type HashAlgo string
const (
// HashBlake2b represents the Blake 2b hashing algorithm
HashBlake2b HashAlgo = "blake2b"
// HashSha256 represents the SHA256 hashing algorithm
HashSha256 HashAlgo = "sha256"
)
// isValidHashAlgo - function that checks if the hash algorithm is
// valid (known and used).
func isValidHashAlgo(algo HashAlgo) bool {
switch algo {
case HashSha256, HashBlake2b:
return true
default:
return false
// MarshalJSON marshals the ChecksumInfo struct
func (c ChecksumInfo) MarshalJSON() ([]byte, error) {
type checksuminfo struct {
Name string `json:"name"`
Algorithm string `json:"algorithm"`
Hash string `json:"hash"`
}
}
// Constant indicates current bit-rot algo used when creating objects.
// Depending on the architecture we are choosing a different checksum.
var bitRotAlgo = getDefaultBitRotAlgo()
// Get the default bit-rot algo depending on the architecture.
// Currently this function defaults to "blake2b" as the preferred
// checksum algorithm on all architectures except ARM64. On ARM64
// we use sha256 (optimized using sha2 instructions of ARM NEON chip).
func getDefaultBitRotAlgo() HashAlgo {
switch runtime.GOARCH {
case "arm64":
// As a special case for ARM64 we use an optimized
// version of hash i.e sha256. This is done so that
// blake2b is sub-optimal and slower on ARM64.
// This would also allows erasure coded writes
// on ARM64 servers to be on-par with their
// counter-part X86_64 servers.
return HashSha256
default:
// Default for all other architectures we use blake2b.
return HashBlake2b
info := checksuminfo{
Name: c.Name,
Algorithm: c.Algorithm.String(),
Hash: hex.EncodeToString(c.Hash),
}
return json.Marshal(info)
}
// erasureInfo - carries erasure coding related information, block
// distribution and checksums.
type erasureInfo struct {
Algorithm HashAlgo `json:"algorithm"`
DataBlocks int `json:"data"`
ParityBlocks int `json:"parity"`
BlockSize int64 `json:"blockSize"`
Index int `json:"index"`
Distribution []int `json:"distribution"`
Checksum []checkSumInfo `json:"checksum,omitempty"`
// UnmarshalJSON unmarshals the given data into the ChecksumInfo struct
func (c *ChecksumInfo) UnmarshalJSON(data []byte) error {
type checksuminfo struct {
Name string `json:"name"`
Algorithm string `json:"algorithm"`
Hash string `json:"hash"`
}
var info checksuminfo
err := json.Unmarshal(data, &info)
if err != nil {
return err
}
c.Algorithm = BitrotAlgorithmFromString(info.Algorithm)
if !c.Algorithm.Available() {
return errBitrotHashAlgoInvalid
}
c.Hash, err = hex.DecodeString(info.Hash)
if err != nil {
return err
}
c.Name = info.Name
return nil
}
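ChecksumInfo now keeps the raw hash bytes in memory and (un)marshals them as a hex string alongside the algorithm name, so existing xl.json files keep their shape. A small round-trip sketch, assuming the cmd package from this commit; the helper and its values are made up:

```Go
// checksumInfoJSONSketch is an illustrative helper, not part of this commit.
func checksumInfoJSONSketch() error {
	c := ChecksumInfo{Name: "part.1", Algorithm: DefaultBitrotAlgorithm, Hash: []byte{0xde, 0xad, 0xbe, 0xef}}
	data, err := json.Marshal(c)
	if err != nil {
		return err
	}
	// data is e.g. {"name":"part.1","algorithm":"blake2b","hash":"deadbeef"}
	var decoded ChecksumInfo
	// Unknown algorithm names fail with errBitrotHashAlgoInvalid.
	return json.Unmarshal(data, &decoded)
}
```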
// AddCheckSum - add checksum of a part.
func (e *erasureInfo) AddCheckSumInfo(ckSumInfo checkSumInfo) {
for i, sum := range e.Checksum {
// ErasureInfo holds erasure coding and bitrot related information.
type ErasureInfo struct {
// Algorithm is the string representation of the erasure-coding algorithm
Algorithm string `json:"algorithm"`
// DataBlocks is the number of data blocks for erasure-coding
DataBlocks int `json:"data"`
// ParityBlocks is the number of parity blocks for erasure-coding
ParityBlocks int `json:"parity"`
// BlockSize is the size of one erasure-coded block
BlockSize int64 `json:"blockSize"`
// Index is the index of the current disk
Index int `json:"index"`
// Distribution is the distribution of the data and parity blocks
Distribution []int `json:"distribution"`
// Checksums holds all bitrot checksums of all erasure encoded blocks
Checksums []ChecksumInfo `json:"checksum,omitempty"`
}
// AddChecksumInfo adds a checksum of a part.
func (e *ErasureInfo) AddChecksumInfo(ckSumInfo ChecksumInfo) {
for i, sum := range e.Checksums {
if sum.Name == ckSumInfo.Name {
e.Checksum[i] = ckSumInfo
e.Checksums[i] = ckSumInfo
return
}
}
e.Checksum = append(e.Checksum, ckSumInfo)
e.Checksums = append(e.Checksums, ckSumInfo)
}
// GetCheckSumInfo - get checksum of a part.
func (e erasureInfo) GetCheckSumInfo(partName string) (ckSum checkSumInfo) {
// GetChecksumInfo - get checksum of a part.
func (e ErasureInfo) GetChecksumInfo(partName string) (ckSum ChecksumInfo) {
// Return the checksum.
for _, sum := range e.Checksum {
for _, sum := range e.Checksums {
if sum.Name == partName {
return sum
}
}
return checkSumInfo{Algorithm: bitRotAlgo}
return ChecksumInfo{}
}
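AddChecksumInfo replaces an existing entry for the same part name instead of appending a duplicate, and GetChecksumInfo now returns the zero ChecksumInfo (whose algorithm is never available) when the part is unknown. A brief sketch, assuming the cmd package from this commit; the helper name is illustrative:

```Go
// erasureChecksumSketch is an illustrative helper, not part of this commit.
func erasureChecksumSketch() {
	var e ErasureInfo
	e.AddChecksumInfo(ChecksumInfo{Name: "part.1", Algorithm: SHA256, Hash: []byte{0x01}})
	e.AddChecksumInfo(ChecksumInfo{Name: "part.1", Algorithm: BLAKE2b512, Hash: []byte{0x02}}) // replaces the SHA256 entry
	// len(e.Checksums) == 1 at this point.

	if sum := e.GetChecksumInfo("part.2"); !sum.Algorithm.Available() {
		// Unknown part: zero value, so callers can detect the miss via Available().
	}
}
```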
// statInfo - carries stat information of the object.
@ -146,7 +224,7 @@ type xlMetaV1 struct {
Format string `json:"format"` // Format of the current `xl.json`.
Stat statInfo `json:"stat"` // Stat of the current object `xl.json`.
// Erasure coded info for the current object `xl.json`.
Erasure erasureInfo `json:"erasure"`
Erasure ErasureInfo `json:"erasure"`
// Minio release tag for current object `xl.json`.
Minio struct {
Release string `json:"release"`
@ -177,7 +255,7 @@ func newXLMetaV1(object string, dataBlocks, parityBlocks int) (xlMeta xlMetaV1)
xlMeta.Version = xlMetaVersion
xlMeta.Format = xlMetaFormat
xlMeta.Minio.Release = ReleaseTag
xlMeta.Erasure = erasureInfo{
xlMeta.Erasure = ErasureInfo{
Algorithm: erasureAlgorithmKlauspost,
DataBlocks: dataBlocks,
ParityBlocks: parityBlocks,


@ -488,6 +488,7 @@ func (xl xlObjects) newMultipartUpload(bucket string, object string, meta map[st
uploadID := mustGetUUID()
uploadIDPath := path.Join(bucket, object, uploadID)
tempUploadIDPath := uploadID
// Write updated `xl.json` to all disks.
disks, err := writeSameXLMetadata(xl.storageDisks, minioMetaTmpBucket, tempUploadIDPath, xlMeta, xl.writeQuorum, xl.readQuorum)
if err != nil {
@ -627,23 +628,16 @@ func (xl xlObjects) PutObjectPart(bucket, object, uploadID string, partID int, s
mw := io.MultiWriter(writers...)
var lreader io.Reader
var lreader = data
// Limit the reader to its provided size > 0.
if size > 0 {
// This is done so that we can avoid erroneous clients sending
// more data than the set content size.
lreader = io.LimitReader(data, size)
} else {
// else we read till EOF.
lreader = data
}
// Construct a tee reader for md5sum.
teeReader := io.TeeReader(lreader, mw)
// Delete the temporary object part. If PutObjectPart succeeds there would be nothing to delete.
defer xl.deleteObject(minioMetaTmpBucket, tmpPart)
if size > 0 {
if pErr := xl.prepareFile(minioMetaTmpBucket, tmpPartPath, size, onlineDisks, xlMeta.Erasure.BlockSize, xlMeta.Erasure.DataBlocks); err != nil {
return pi, toObjectErr(pErr, bucket, object)
@ -651,25 +645,26 @@ func (xl xlObjects) PutObjectPart(bucket, object, uploadID string, partID int, s
}
}
// We always allow empty part.
allowEmpty := true
// Erasure code data and write across all disks.
onlineDisks, sizeWritten, checkSums, err := erasureCreateFile(onlineDisks, minioMetaTmpBucket, tmpPartPath, teeReader, allowEmpty, xlMeta.Erasure.BlockSize, xl.dataBlocks, xl.parityBlocks, bitRotAlgo, xl.writeQuorum)
storage, err := NewErasureStorage(onlineDisks, xlMeta.Erasure.DataBlocks, xlMeta.Erasure.ParityBlocks)
if err != nil {
return pi, toObjectErr(err, bucket, object)
}
buffer := make([]byte, xlMeta.Erasure.BlockSize, 2*xlMeta.Erasure.BlockSize) // alloc additional space for parity blocks created while erasure coding
file, err := storage.CreateFile(io.TeeReader(lreader, mw), minioMetaTmpBucket, tmpPartPath, buffer, DefaultBitrotAlgorithm, xl.writeQuorum)
if err != nil {
return pi, toObjectErr(err, bucket, object)
}
// Should return IncompleteBody{} error when reader has fewer bytes
// than specified in request header.
if sizeWritten < size {
if file.Size < size {
return pi, traceError(IncompleteBody{})
}
// For size == -1, perhaps client is sending in chunked encoding
// set the size as size that was actually written.
if size == -1 {
size = sizeWritten
size = file.Size
}
// Calculate new md5sum.
@ -727,16 +722,12 @@ func (xl xlObjects) PutObjectPart(bucket, object, uploadID string, partID int, s
// Add the current part.
xlMeta.AddObjectPart(partID, partSuffix, newMD5Hex, size)
for index, disk := range onlineDisks {
if disk == nil {
for i, disk := range onlineDisks {
if disk == OfflineDisk {
continue
}
partsMetadata[index].Parts = xlMeta.Parts
partsMetadata[index].Erasure.AddCheckSumInfo(checkSumInfo{
Name: partSuffix,
Hash: checkSums[index],
Algorithm: bitRotAlgo,
})
partsMetadata[i].Parts = xlMeta.Parts
partsMetadata[i].Erasure.AddChecksumInfo(ChecksumInfo{partSuffix, file.Algorithm, file.Checksums[i]})
}
// Write all the checksum metadata.


@ -249,7 +249,11 @@ func (xl xlObjects) GetObject(bucket, object string, startOffset int64, length i
chunkSize := getChunkSize(xlMeta.Erasure.BlockSize, xlMeta.Erasure.DataBlocks)
pool := bpool.NewBytePool(chunkSize, len(onlineDisks))
// Read from all parts.
storage, err := NewErasureStorage(onlineDisks, xlMeta.Erasure.DataBlocks, xlMeta.Erasure.ParityBlocks)
if err != nil {
return toObjectErr(err, bucket, object)
}
checksums := make([][]byte, len(storage.disks))
for ; partIndex <= lastPartIndex; partIndex++ {
if length == totalBytesRead {
break
@ -265,33 +269,24 @@ func (xl xlObjects) GetObject(bucket, object string, startOffset int64, length i
}
// Get the checksums of the current part.
checkSums := make([]string, len(onlineDisks))
var ckSumAlgo HashAlgo
for index, disk := range onlineDisks {
// Disk is not found skip the checksum.
if disk == nil {
checkSums[index] = ""
var algorithm BitrotAlgorithm
for index, disk := range storage.disks {
if disk == OfflineDisk {
continue
}
ckSumInfo := metaArr[index].Erasure.GetCheckSumInfo(partName)
checkSums[index] = ckSumInfo.Hash
// Set checksum algo only once, while it is possible to have
// different algos per block because of our `xl.json`.
// It is not a requirement, set this only once for all the disks.
if ckSumAlgo == "" {
ckSumAlgo = ckSumInfo.Algorithm
}
checksumInfo := metaArr[index].Erasure.GetChecksumInfo(partName)
algorithm = checksumInfo.Algorithm
checksums[index] = checksumInfo.Hash
}
// Start erasure decoding and writing to the client.
n, err := erasureReadFile(mw, onlineDisks, bucket, pathJoin(object, partName), partOffset, readSize, partSize, xlMeta.Erasure.BlockSize, xlMeta.Erasure.DataBlocks, xlMeta.Erasure.ParityBlocks, checkSums, ckSumAlgo, pool)
file, err := storage.ReadFile(mw, bucket, pathJoin(object, partName), partOffset, readSize, partSize, checksums, algorithm, xlMeta.Erasure.BlockSize, pool)
if err != nil {
errorIf(err, "Unable to read %s of the object `%s/%s`.", partName, bucket, object)
return toObjectErr(err, bucket, object)
}
// Track total bytes read from disk and written to the client.
totalBytesRead += n
totalBytesRead += file.Size
// partOffset will be valid only for the first part, hence reset it to 0 for
// the remaining parts.
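On the read path the per-part checksums and the algorithm come from each online disk's xl.json and are passed to ReadFile, which verifies every block it decodes before writing to the client. A condensed sketch, assuming the cmd package from this commit and its bpool/getChunkSize helpers; the helper name and parameters are illustrative:

```Go
// readPartSketch is an illustrative helper, not part of this commit.
func readPartSketch(storage ErasureStorage, metaArr []xlMetaV1, writer io.Writer,
	bucket, object, partName string, partOffset, readSize, partSize, blockSize int64,
	dataBlocks int) (int64, error) {
	pool := bpool.NewBytePool(getChunkSize(blockSize, dataBlocks), len(storage.disks))

	// Collect the stored checksum of this part from every online disk.
	checksums := make([][]byte, len(storage.disks))
	var algorithm BitrotAlgorithm
	for i, disk := range storage.disks {
		if disk == OfflineDisk {
			continue
		}
		info := metaArr[i].Erasure.GetChecksumInfo(partName)
		algorithm, checksums[i] = info.Algorithm, info.Hash
	}
	file, err := storage.ReadFile(writer, bucket, pathJoin(object, partName),
		partOffset, readSize, partSize, checksums, algorithm, blockSize, pool)
	if err != nil {
		return 0, err
	}
	return file.Size, nil // bytes decoded and written to the client
}
```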
@ -540,6 +535,11 @@ func (xl xlObjects) PutObject(bucket string, object string, size int64, data io.
// Total size of the written object
var sizeWritten int64
storage, err := NewErasureStorage(onlineDisks, xlMeta.Erasure.DataBlocks, xlMeta.Erasure.ParityBlocks)
if err != nil {
return ObjectInfo{}, toObjectErr(err, bucket, object)
}
buffer := make([]byte, partsMetadata[0].Erasure.BlockSize, 2*partsMetadata[0].Erasure.BlockSize) // alloc additional space for parity blocks created while erasure coding
// Read data and split into parts - similar to multipart mechanism
for partIdx := 1; ; partIdx++ {
// Compute part name
@ -558,55 +558,39 @@ func (xl xlObjects) PutObject(bucket string, object string, size int64, data io.
// Hint the filesystem to pre-allocate one continuous large block.
// This is only an optimization.
if curPartSize > 0 {
pErr := xl.prepareFile(minioMetaTmpBucket, tempErasureObj, curPartSize, onlineDisks, xlMeta.Erasure.BlockSize, xlMeta.Erasure.DataBlocks)
pErr := xl.prepareFile(minioMetaTmpBucket, tempErasureObj, curPartSize, storage.disks, xlMeta.Erasure.BlockSize, xlMeta.Erasure.DataBlocks)
if pErr != nil {
return ObjectInfo{}, toObjectErr(pErr, bucket, object)
}
}
// partReader streams at most maximum part size
partReader := io.LimitReader(teeReader, globalPutPartSize)
// Allow creating empty earsure file only when this is the first part. This flag is useful
// when size == -1 because in this case, we are not able to predict how many parts we will have.
allowEmptyPart := partIdx == 1
var partSizeWritten int64
var checkSums []string
var erasureErr error
// Erasure code data and write across all disks.
onlineDisks, partSizeWritten, checkSums, erasureErr = erasureCreateFile(onlineDisks, minioMetaTmpBucket, tempErasureObj, partReader, allowEmptyPart, partsMetadata[0].Erasure.BlockSize, partsMetadata[0].Erasure.DataBlocks, partsMetadata[0].Erasure.ParityBlocks, bitRotAlgo, xl.writeQuorum)
file, erasureErr := storage.CreateFile(io.LimitReader(teeReader, globalPutPartSize), minioMetaTmpBucket, tempErasureObj, buffer, DefaultBitrotAlgorithm, xl.writeQuorum)
if erasureErr != nil {
return ObjectInfo{}, toObjectErr(erasureErr, minioMetaTmpBucket, tempErasureObj)
}
// Should return IncompleteBody{} error when reader has fewer bytes
// than specified in request header.
if partSizeWritten < int64(curPartSize) {
if file.Size < int64(curPartSize) {
return ObjectInfo{}, traceError(IncompleteBody{})
}
// Update the total written size
sizeWritten += partSizeWritten
sizeWritten += file.Size
// If erasure stored some data in the loop or created an empty file
if partSizeWritten > 0 || allowEmptyPart {
for index := range partsMetadata {
// Add the part to xl.json.
partsMetadata[index].AddObjectPart(partIdx, partName, "", partSizeWritten)
// Add part checksum info to xl.json.
partsMetadata[index].Erasure.AddCheckSumInfo(checkSumInfo{
Name: partName,
Hash: checkSums[index],
Algorithm: bitRotAlgo,
})
// allowEmpty allows creating an empty erasure-coded file only when this is the first part. This flag is useful
// when size == -1 because in this case we are not able to predict how many parts we will have.
allowEmpty := partIdx == 1
if file.Size > 0 || allowEmpty {
for i := range partsMetadata {
partsMetadata[i].AddObjectPart(partIdx, partName, "", file.Size)
partsMetadata[i].Erasure.AddChecksumInfo(ChecksumInfo{partName, file.Algorithm, file.Checksums[i]})
}
}
// If we didn't write anything or we know that the next part doesn't have any
// data to write, we should quit this loop immediately
if partSizeWritten == 0 {
if file.Size == 0 {
break
}


@ -17,6 +17,7 @@
package cmd
import (
"encoding/hex"
"errors"
"hash/crc32"
"path"
@ -149,8 +150,8 @@ func parseXLRelease(xlMetaBuf []byte) string {
return gjson.GetBytes(xlMetaBuf, "minio.release").String()
}
func parseXLErasureInfo(xlMetaBuf []byte) erasureInfo {
erasure := erasureInfo{}
func parseXLErasureInfo(xlMetaBuf []byte) (ErasureInfo, error) {
erasure := ErasureInfo{}
erasureResult := gjson.GetBytes(xlMetaBuf, "erasure")
// parse the xlV1Meta.Erasure.Distribution.
disResult := erasureResult.Get("distribution").Array()
@ -161,24 +162,28 @@ func parseXLErasureInfo(xlMetaBuf []byte) erasureInfo {
}
erasure.Distribution = distribution
erasure.Algorithm = HashAlgo(erasureResult.Get("algorithm").String())
erasure.Algorithm = erasureResult.Get("algorithm").String()
erasure.DataBlocks = int(erasureResult.Get("data").Int())
erasure.ParityBlocks = int(erasureResult.Get("parity").Int())
erasure.BlockSize = erasureResult.Get("blockSize").Int()
erasure.Index = int(erasureResult.Get("index").Int())
// Pare xlMetaV1.Erasure.Checksum array.
checkSumsResult := erasureResult.Get("checksum").Array()
checkSums := make([]checkSumInfo, len(checkSumsResult))
for i, checkSumResult := range checkSumsResult {
checkSum := checkSumInfo{}
checkSum.Name = checkSumResult.Get("name").String()
checkSum.Algorithm = HashAlgo(checkSumResult.Get("algorithm").String())
checkSum.Hash = checkSumResult.Get("hash").String()
checkSums[i] = checkSum
}
erasure.Checksum = checkSums
return erasure
checkSumsResult := erasureResult.Get("checksum").Array()
// Parse xlMetaV1.Erasure.Checksum array.
checkSums := make([]ChecksumInfo, len(checkSumsResult))
for i, v := range checkSumsResult {
algorithm := BitrotAlgorithmFromString(v.Get("algorithm").String())
if !algorithm.Available() {
return erasure, traceError(errBitrotHashAlgoInvalid)
}
hash, err := hex.DecodeString(v.Get("hash").String())
if err != nil {
return erasure, traceError(err)
}
checkSums[i] = ChecksumInfo{Name: v.Get("name").String(), Algorithm: algorithm, Hash: hash}
}
erasure.Checksums = checkSums
return erasure, nil
}
func parseXLParts(xlMetaBuf []byte) []objectPartInfo {
@ -207,8 +212,7 @@ func parseXLMetaMap(xlMetaBuf []byte) map[string]string {
}
// Constructs XLMetaV1 using `gjson` lib to retrieve each field.
func xlMetaV1UnmarshalJSON(xlMetaBuf []byte) (xmv xlMetaV1, e error) {
xlMeta := xlMetaV1{}
func xlMetaV1UnmarshalJSON(xlMetaBuf []byte) (xlMeta xlMetaV1, e error) {
// obtain version.
xlMeta.Version = parseXLVersion(xlMetaBuf)
// obtain format.
@ -216,12 +220,15 @@ func xlMetaV1UnmarshalJSON(xlMetaBuf []byte) (xmv xlMetaV1, e error) {
// Parse xlMetaV1.Stat .
stat, err := parseXLStat(xlMetaBuf)
if err != nil {
return xmv, err
return xlMeta, err
}
xlMeta.Stat = stat
// parse the xlV1Meta.Erasure fields.
xlMeta.Erasure = parseXLErasureInfo(xlMetaBuf)
xlMeta.Erasure, err = parseXLErasureInfo(xlMetaBuf)
if err != nil {
return xlMeta, err
}
// Parse the XL Parts.
xlMeta.Parts = parseXLParts(xlMetaBuf)


@ -17,6 +17,8 @@
package cmd
import (
"bytes"
"encoding/hex"
"encoding/json"
"errors"
"reflect"
@ -139,7 +141,7 @@ func newTestXLMetaV1() xlMetaV1 {
xlMeta.Version = xlMetaVersion
xlMeta.Format = xlMetaFormat
xlMeta.Minio.Release = "test"
xlMeta.Erasure = erasureInfo{
xlMeta.Erasure = ErasureInfo{
Algorithm: "klauspost/reedsolomon/vandermonde",
DataBlocks: 5,
ParityBlocks: 5,
@ -158,13 +160,12 @@ func newTestXLMetaV1() xlMetaV1 {
return xlMeta
}
func (m *xlMetaV1) AddTestObjectCheckSum(checkSumNum int, name string, hash string, algo HashAlgo) {
checkSum := checkSumInfo{
Name: name,
Algorithm: algo,
Hash: hash,
func (m *xlMetaV1) AddTestObjectCheckSum(checkSumNum int, name string, algorithm BitrotAlgorithm, hash string) {
checksum, err := hex.DecodeString(hash)
if err != nil {
panic(err)
}
m.Erasure.Checksum[checkSumNum] = checkSum
m.Erasure.Checksums[checkSumNum] = ChecksumInfo{name, algorithm, checksum}
}
// AddTestObjectPart - add a new object part in order.
@ -194,14 +195,14 @@ func getXLMetaBytes(totalParts int) []byte {
func getSampleXLMeta(totalParts int) xlMetaV1 {
xlMeta := newTestXLMetaV1()
// Number of checksum info == total parts.
xlMeta.Erasure.Checksum = make([]checkSumInfo, totalParts)
xlMeta.Erasure.Checksums = make([]ChecksumInfo, totalParts)
// total number of parts.
xlMeta.Parts = make([]objectPartInfo, totalParts)
for i := 0; i < totalParts; i++ {
partName := "part." + strconv.Itoa(i+1)
// hard coding hash and algo value for the checksum, Since we are benchmarking the parsing of xl.json the magnitude doesn't affect the test,
// The magnitude doesn't make a difference, only the size does.
xlMeta.AddTestObjectCheckSum(i, partName, "a23f5eff248c4372badd9f3b2455a285cd4ca86c3d9a570b091d3fc5cd7ca6d9484bbea3f8c5d8d4f84daae96874419eda578fd736455334afbac2c924b3915a", "blake2b")
xlMeta.AddTestObjectCheckSum(i, partName, BLAKE2b512, "a23f5eff248c4372badd9f3b2455a285cd4ca86c3d9a570b091d3fc5cd7ca6d9484bbea3f8c5d8d4f84daae96874419eda578fd736455334afbac2c924b3915a")
xlMeta.AddTestObjectPart(i, partName, "d3fdd79cc3efd5fe5c068d7be397934b", 67108864)
}
return xlMeta
@ -248,18 +249,18 @@ func compareXLMetaV1(t *testing.T, unMarshalXLMeta, gjsonXLMeta xlMetaV1) {
}
}
if len(unMarshalXLMeta.Erasure.Checksum) != len(gjsonXLMeta.Erasure.Checksum) {
t.Errorf("Expected the size of Erasure Checksum to be %d, but got %d.", len(unMarshalXLMeta.Erasure.Checksum), len(gjsonXLMeta.Erasure.Checksum))
if len(unMarshalXLMeta.Erasure.Checksums) != len(gjsonXLMeta.Erasure.Checksums) {
t.Errorf("Expected the size of Erasure Checksums to be %d, but got %d.", len(unMarshalXLMeta.Erasure.Checksums), len(gjsonXLMeta.Erasure.Checksums))
} else {
for i := 0; i < len(unMarshalXLMeta.Erasure.Checksum); i++ {
if unMarshalXLMeta.Erasure.Checksum[i].Name != gjsonXLMeta.Erasure.Checksum[i].Name {
t.Errorf("Expected the Erasure Checksum Name to be \"%s\", got \"%s\".", unMarshalXLMeta.Erasure.Checksum[i].Name, gjsonXLMeta.Erasure.Checksum[i].Name)
for i := 0; i < len(unMarshalXLMeta.Erasure.Checksums); i++ {
if unMarshalXLMeta.Erasure.Checksums[i].Name != gjsonXLMeta.Erasure.Checksums[i].Name {
t.Errorf("Expected the Erasure Checksum Name to be \"%s\", got \"%s\".", unMarshalXLMeta.Erasure.Checksums[i].Name, gjsonXLMeta.Erasure.Checksums[i].Name)
}
if unMarshalXLMeta.Erasure.Checksum[i].Algorithm != gjsonXLMeta.Erasure.Checksum[i].Algorithm {
t.Errorf("Expected the Erasure Checksum Algorithm to be \"%s\", got \"%s.\"", unMarshalXLMeta.Erasure.Checksum[i].Algorithm, gjsonXLMeta.Erasure.Checksum[i].Algorithm)
if unMarshalXLMeta.Erasure.Checksums[i].Algorithm != gjsonXLMeta.Erasure.Checksums[i].Algorithm {
t.Errorf("Expected the Erasure Checksum Algorithm to be \"%s\", got \"%s\".", unMarshalXLMeta.Erasure.Checksums[i].Algorithm, gjsonXLMeta.Erasure.Checksums[i].Algorithm)
}
if unMarshalXLMeta.Erasure.Checksum[i] != gjsonXLMeta.Erasure.Checksum[i] {
t.Errorf("Expected the Erasure Checksum Hash to be \"%s\", got \"%s\".", unMarshalXLMeta.Erasure.Checksum[i].Hash, gjsonXLMeta.Erasure.Checksum[i].Hash)
if !bytes.Equal(unMarshalXLMeta.Erasure.Checksums[i].Hash, gjsonXLMeta.Erasure.Checksums[i].Hash) {
t.Errorf("Expected the Erasure Checksum Hash to be \"%s\", got \"%s\".", unMarshalXLMeta.Erasure.Checksums[i].Hash, gjsonXLMeta.Erasure.Checksums[i].Hash)
}
}
}
@ -304,12 +305,12 @@ func TestGetXLMetaV1GJson1(t *testing.T) {
var unMarshalXLMeta xlMetaV1
if err := json.Unmarshal(xlMetaJSON, &unMarshalXLMeta); err != nil {
t.Errorf("Unmarshalling failed")
t.Errorf("Unmarshalling failed: %v", err)
}
gjsonXLMeta, err := xlMetaV1UnmarshalJSON(xlMetaJSON)
if err != nil {
t.Errorf("gjson parsing of XLMeta failed")
t.Errorf("gjson parsing of XLMeta failed: %v", err)
}
compareXLMetaV1(t, unMarshalXLMeta, gjsonXLMeta)
}
@ -322,11 +323,11 @@ func TestGetXLMetaV1GJson10(t *testing.T) {
var unMarshalXLMeta xlMetaV1
if err := json.Unmarshal(xlMetaJSON, &unMarshalXLMeta); err != nil {
t.Errorf("Unmarshalling failed")
t.Errorf("Unmarshalling failed: %v", err)
}
gjsonXLMeta, err := xlMetaV1UnmarshalJSON(xlMetaJSON)
if err != nil {
t.Errorf("gjson parsing of XLMeta failed")
t.Errorf("gjson parsing of XLMeta failed: %v", err)
}
compareXLMetaV1(t, unMarshalXLMeta, gjsonXLMeta)
}


@ -22,6 +22,19 @@ To get the package use the standard:
go get github.com/klauspost/reedsolomon
```
# Changes
## July 20, 2017
`ReconstructData` added to [`Encoder`](https://godoc.org/github.com/klauspost/reedsolomon#Encoder) interface. This can cause compatibility issues if you implement your own Encoder. A simple workaround can be added:
```Go
func (e *YourEnc) ReconstructData(shards [][]byte) error {
return ReconstructData(shards)
}
```
You can of course also do your own implementation. The [`StreamEncoder`](https://godoc.org/github.com/klauspost/reedsolomon#StreamEncoder) handles this without modifying the interface. This is a good lesson on why returning interfaces is not a good design.
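As a quick illustration of the difference (not part of the upstream README), ReconstructData rebuilds only missing data shards, while Reconstruct also regenerates lost parity; shard counts below are arbitrary:

```Go
package main

import "github.com/klauspost/reedsolomon"

func main() {
	enc, err := reedsolomon.New(4, 2) // 4 data + 2 parity shards
	if err != nil {
		panic(err)
	}
	shards, err := enc.Split(make([]byte, 1<<20))
	if err != nil {
		panic(err)
	}
	if err = enc.Encode(shards); err != nil {
		panic(err)
	}
	shards[0], shards[5] = nil, nil // lose one data and one parity shard

	// Restores shards[0] only; shards[5] stays nil. Use Reconstruct to rebuild both.
	if err = enc.ReconstructData(shards); err != nil {
		panic(err)
	}
}
```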
# Usage
This section assumes you know the basics of Reed-Solomon encoding. A good start is this [Backblaze blog post](https://www.backblaze.com/blog/reed-solomon/).


@ -616,7 +616,7 @@ func (r reedSolomon) reconstruct(shards [][]byte, dataOnly bool) error {
var ErrShortData = errors.New("not enough data to fill the number of requested shards")
// Split a data slice into the number of shards given to the encoder,
// and create empty parity shards.
// and create empty parity shards if necessary.
//
// The data will be split into equally sized shards.
// If the data size isn't divisible by the number of shards,
@ -631,12 +631,19 @@ func (r reedSolomon) Split(data []byte) ([][]byte, error) {
if len(data) == 0 {
return nil, ErrShortData
}
// Calculate number of bytes per shard.
// Calculate number of bytes per data shard.
perShard := (len(data) + r.DataShards - 1) / r.DataShards
// Pad data to r.Shards*perShard.
padding := make([]byte, (r.Shards*perShard)-len(data))
data = append(data, padding...)
if cap(data) > len(data) {
data = data[:cap(data)]
}
// Only allocate memory if necessary
if len(data) < (r.Shards * perShard) {
// Pad data to r.Shards*perShard.
padding := make([]byte, (r.Shards*perShard)-len(data))
data = append(data, padding...)
}
// Split into equal-length shards.
dst := make([][]byte, r.Shards)
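As a worked example of the padding math above (my numbers, not from the commit): with 4 data and 2 parity shards and 10 bytes of input, perShard is (10+3)/4 = 3, so r.Shards*perShard = 18 bytes are needed; at most 8 zero bytes of padding are appended, and with this change they are only allocated when cap(data) does not already cover them.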

vendor/vendor.json vendored

@ -243,10 +243,10 @@
"revisionTime": "2016-10-16T15:41:25Z"
},
{
"checksumSHA1": "DnS7x0Gqc93p4hQ88FgE35vfIRw=",
"checksumSHA1": "gYAsuckCW3o4veePKZzEHvCcJro=",
"path": "github.com/klauspost/reedsolomon",
"revision": "a9202d772777d8d2264c3e0c6159be5047697380",
"revisionTime": "2017-07-19T04:51:23Z"
"revision": "48a4fd05f1730dd3ef9c3f9e943f6091d063f2c4",
"revisionTime": "2017-07-22T14:16:58Z"
},
{
"checksumSHA1": "dNYxHiBLalTqluak2/Z8c3RsSEM=",