Update reedsolomon/highwayhash to start using ppc64le support (#7003)

Thanks to @fwessels for the upstream work on reedsolomon and highwayhash which has resulted in 10x performance improvement on ppc64 architecture.
2018-12-20 09:47:05 -08:00 · 2018-12-20 09:47:05 -08:00 · def04f01cf
parent bc67410548
commit def04f01cf
17 changed files with 460 additions and 14 deletions
--- a/vendor/github.com/klauspost/reedsolomon/README.md
+++ b/vendor/github.com/klauspost/reedsolomon/README.md
@ -24,6 +24,10 @@ go get -u github.com/klauspost/reedsolomon

 # Changes

+## December 18, 2018
+
+Assembly code for ppc64le has been contributed; this boosts performance by about 10x on this platform.
+
 ## November 18, 2017

 Added [WithAutoGoroutines](https://godoc.org/github.com/klauspost/reedsolomon#WithAutoGoroutines) which will attempt to calculate the optimal number of goroutines to use based on your expected shard size and detected CPU.
@ -259,6 +263,18 @@ By exploiting NEON instructions the performance for ARM has been accelerated. Be
 | 10   | 2      | 20%    |           188 |            1738 |       925% |
 | 10   | 4      | 40%    |            96 |             839 |       877% |

+# Performance on ppc64le
+
+The performance for ppc64le has been accelerated. This gives roughly a 10x performance improvement on this architecture, as can be seen below:
+
+```
+benchmark                      old MB/s     new MB/s     speedup
+BenchmarkGalois128K-160        948.87       8878.85      9.36x
+BenchmarkGalois1M-160          968.85       9041.92      9.33x
+BenchmarkGaloisXor128K-160     862.02       7905.00      9.17x
+BenchmarkGaloisXor1M-160       784.60       6296.65      8.03x
+```
+
 # asm2plan9s

 [asm2plan9s](https://github.com/fwessels/asm2plan9s) is used for assembling the AVX2 instructions into their BYTE/WORD/LONG equivalents.
@ -266,8 +282,10 @@ By exploiting NEON instructions the performance for ARM has been accelerated. Be
 # Links
 * [Backblaze Open Sources Reed-Solomon Erasure Coding Source Code](https://www.backblaze.com/blog/reed-solomon/).
 * [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon). Compatible java library by Backblaze.
+* [ocaml-reed-solomon-erasure](https://gitlab.com/darrenldl/ocaml-reed-solomon-erasure). Compatible OCaml implementation.
 * [reedsolomon-c](https://github.com/jannson/reedsolomon-c). C version, compatible with output from this package.
 * [Reed-Solomon Erasure Coding in Haskell](https://github.com/NicolasT/reedsolomon). Haskell port of the package with similar performance.
+* [reed-solomon-erasure](https://github.com/darrenldl/reed-solomon-erasure). Compatible Rust implementation.
 * [go-erasure](https://github.com/somethingnew2-0/go-erasure). A similar library using cgo, slower in my tests.
 * [rsraid](https://github.com/goayame/rsraid). A similar library written in Go. Slower, but supports more shards.
 * [Screaming Fast Galois Field Arithmetic](http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf). Basis for SSE3 optimizations.
--- a/vendor/github.com/klauspost/reedsolomon/galois_amd64.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois_amd64.go
@ -1,5 +1,6 @@
 //+build !noasm
 //+build !appengine
+//+build !gccgo

 // Copyright 2015, Klaus Post, see LICENSE for details.

--- a/vendor/github.com/klauspost/reedsolomon/galois_amd64.s
+++ b/vendor/github.com/klauspost/reedsolomon/galois_amd64.s
@ -1,4 +1,4 @@
-//+build !noasm !appengine
+//+build !noasm !appengine !gccgo

 // Copyright 2015, Klaus Post, see LICENSE for details.

--- a/vendor/github.com/klauspost/reedsolomon/galois_arm64.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois_arm64.go
@ -1,5 +1,6 @@
 //+build !noasm
 //+build !appengine
+//+build !gccgo

 // Copyright 2015, Klaus Post, see LICENSE for details.
 // Copyright 2017, Minio, Inc.
--- a/vendor/github.com/klauspost/reedsolomon/galois_arm64.s
+++ b/vendor/github.com/klauspost/reedsolomon/galois_arm64.s
@ -1,4 +1,4 @@
-//+build !noasm !appengine
+//+build !noasm !appengine !gccgo

 // Copyright 2015, Klaus Post, see LICENSE for details.
 // Copyright 2017, Minio, Inc.
--- a/vendor/github.com/klauspost/reedsolomon/galois_noasm.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois_noasm.go
@ -1,5 +1,6 @@
-//+build !amd64 noasm appengine
-//+build !arm64 noasm appengine
+//+build !amd64 noasm appengine gccgo
+//+build !arm64 noasm appengine gccgo
+//+build !ppc64le noasm appengine gccgo

 // Copyright 2015, Klaus Post, see LICENSE for details.

--- a/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.go
@ -0,0 +1,67 @@
+//+build !noasm
+//+build !appengine
+//+build !gccgo
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+// Copyright 2018, Minio, Inc.
+
+package reedsolomon
+
+//go:noescape
+func galMulPpc(low, high, in, out []byte) // body is the ·galMulPpc TEXT symbol in galois_ppc64le.s; overwrites out 16 bytes at a time, driven by len(in)
+
+//go:noescape
+func galMulPpcXor(low, high, in, out []byte) // body is the ·galMulPpcXor TEXT symbol in galois_ppc64le.s; XORs its result into out 16 bytes at a time, driven by len(in)
+
+// This is what the assembler routines do in blocks of 16 bytes:
+/*
+func galMulPpc(low, high, in, out []byte) {
+	for n, input := range in {
+		l := input & 0xf
+		h := input >> 4
+		out[n] = low[l] ^ high[h]
+	}
+}
+func galMulPpcXor(low, high, in, out []byte) {
+	for n, input := range in {
+		l := input & 0xf
+		h := input >> 4
+		out[n] ^= low[l] ^ high[h]
+	}
+}
+*/
+
+func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) { // fills out[:len(in)] from in using the lookup tables selected by c; ssse3 and avx2 are not read in this body
+	done := (len(in) >> 4) << 4 // len(in) rounded down to a multiple of 16
+	if done > 0 { // the assembly loop tests its condition only after the first 16-byte pass, so call it only when a full block exists
+		galMulPpc(mulTableLow[c][:], mulTableHigh[c][:], in[:done], out) // out is passed unsliced; the routine iterates over len(in[:done]) only and does not check len(out)
+	}
+	remain := len(in) - done // number of trailing bytes that do not fill a 16-byte block
+	if remain > 0 {
+		mt := mulTable[c] // table for c, indexed directly by the input byte
+		for i := done; i < len(in); i++ {
+			out[i] = mt[in[i]] // overwrite the output byte
+		}
+	}
+}
+
+func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) { // XORs the table result for each byte of in into out[:len(in)]; ssse3 and avx2 are not read in this body
+	done := (len(in) >> 4) << 4 // len(in) rounded down to a multiple of 16
+	if done > 0 { // the assembly loop tests its condition only after the first 16-byte pass, so call it only when a full block exists
+		galMulPpcXor(mulTableLow[c][:], mulTableHigh[c][:], in[:done], out) // out is passed unsliced; the routine iterates over len(in[:done]) only and does not check len(out)
+	}
+	remain := len(in) - done // number of trailing bytes that do not fill a 16-byte block
+	if remain > 0 {
+		mt := mulTable[c] // table for c, indexed directly by the input byte
+		for i := done; i < len(in); i++ {
+			out[i] ^= mt[in[i]] // accumulate into the existing output byte
+		}
+	}
+}
+
+// sliceXor performs the slice galois add: each byte of in is XORed into the byte of out at the same index. sse2 is not read in this body.
+func sliceXor(in, out []byte, sse2 bool) {
+	for n, input := range in { // iterates over len(in); out must be at least that long or the index below panics
+		out[n] ^= input
+	}
+}
--- a/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.s
+++ b/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.s
@ -0,0 +1,126 @@
+//+build !noasm !appengine !gccgo
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+// Copyright 2018, Minio, Inc.
+
+#include "textflag.h"
+
+#define LOW       R3
+#define HIGH      R4
+#define IN        R5
+#define LEN       R6
+#define OUT       R7
+#define CONSTANTS R8
+#define OFFSET    R9
+#define OFFSET1   R10
+#define OFFSET2   R11
+
+#define X6        VS34
+#define X6_       V2
+#define X7        VS35
+#define X7_       V3
+#define MSG       VS36
+#define MSG_      V4
+#define MSG_HI    VS37
+#define MSG_HI_   V5
+#define RESULT    VS38
+#define RESULT_   V6
+#define ROTATE    VS39
+#define ROTATE_   V7
+#define MASK      VS40
+#define MASK_     V8
+#define FLIP      VS41
+#define FLIP_     V9
+
+
+// func galMulPpc(low, high, in, out []byte)
+TEXT ·galMulPpc(SB), NOFRAME|NOSPLIT, $0-96 // no local frame; 96 argument bytes = four 24-byte slice headers
+    MOVD    low+0(FP), LOW // base pointer of low
+    MOVD    high+24(FP), HIGH // base pointer of high
+    MOVD    in+48(FP), IN // base pointer of in
+    MOVD    in_len+56(FP), LEN // len(in); the only length this routine consults
+    MOVD    out+72(FP), OUT // base pointer of out
+    // Byte offsets of the second and third 16-byte entries of the constants table.
+    MOVD    $16, OFFSET1
+    MOVD    $32, OFFSET2
+    // Load the three constants: per-byte shift count 4 (ROTATE), per-byte 0x0f (MASK) and the FLIP permute control.
+    MOVD    $·constants(SB), CONSTANTS
+    LXVD2X  (CONSTANTS)(R0), ROTATE
+    LXVD2X  (CONSTANTS)(OFFSET1), MASK
+    LXVD2X  (CONSTANTS)(OFFSET2), FLIP
+    // Load the first 16 bytes of each table and permute them through FLIP (presumably to fix the byte order left by LXVD2X on little-endian - confirm).
+    LXVD2X  (LOW)(R0), X6
+    LXVD2X  (HIGH)(R0), X7
+    VPERM   X6_, V31, FLIP_, X6_ // NOTE(review): V31 is never initialised here; every selector byte used is 0-15, which should pick from the first source only - confirm
+    VPERM   X7_, V31, FLIP_, X7_
+    // OFFSET is the running byte index shared by IN and OUT.
+    MOVD    $0, OFFSET
+    // One iteration handles 16 bytes; the condition is tested at the bottom, so the body runs at least once.
+loop:
+    LXVD2X  (IN)(OFFSET), MSG
+    // Split every byte into nibbles and use them as table indices: MSG_HI = byte >> 4, MSG = byte & 0x0f.
+    VSRB    MSG_, ROTATE_, MSG_HI_
+    VAND    MSG_, MASK_, MSG_
+    VPERM   X6_, V31, MSG_, MSG_ // low-table lookup by low nibble
+    VPERM   X7_, V31, MSG_HI_, MSG_HI_ // high-table lookup by high nibble
+    // Combine the two lookups: low[l] ^ high[h].
+    VXOR    MSG_, MSG_HI_, MSG_
+    // Overwrite the 16 output bytes at the same offset.
+    STXVD2X MSG, (OUT)(OFFSET)
+    // Advance by one block and repeat while LEN > OFFSET.
+    ADD     $16, OFFSET, OFFSET
+    CMP     LEN, OFFSET
+    BGT     loop
+    RET
+
+
+// func galMulPpcXor(low, high, in, out []byte)
+TEXT ·galMulPpcXor(SB), NOFRAME|NOSPLIT, $0-96 // no local frame; 96 argument bytes = four 24-byte slice headers
+    MOVD    low+0(FP), LOW // base pointer of low
+    MOVD    high+24(FP), HIGH // base pointer of high
+    MOVD    in+48(FP), IN // base pointer of in
+    MOVD    in_len+56(FP), LEN // len(in); the only length this routine consults
+    MOVD    out+72(FP), OUT // base pointer of out
+    // Byte offsets of the second and third 16-byte entries of the constants table.
+    MOVD    $16, OFFSET1
+    MOVD    $32, OFFSET2
+    // Load the three constants: per-byte shift count 4 (ROTATE), per-byte 0x0f (MASK) and the FLIP permute control.
+    MOVD    $·constants(SB), CONSTANTS
+    LXVD2X  (CONSTANTS)(R0), ROTATE
+    LXVD2X  (CONSTANTS)(OFFSET1), MASK
+    LXVD2X  (CONSTANTS)(OFFSET2), FLIP
+    // Load the first 16 bytes of each table and permute them through FLIP (presumably to fix the byte order left by LXVD2X on little-endian - confirm).
+    LXVD2X  (LOW)(R0), X6
+    LXVD2X  (HIGH)(R0), X7
+    VPERM   X6_, V31, FLIP_, X6_ // NOTE(review): V31 is never initialised here; every selector byte used is 0-15, which should pick from the first source only - confirm
+    VPERM   X7_, V31, FLIP_, X7_
+    // OFFSET is the running byte index shared by IN and OUT.
+    MOVD    $0, OFFSET
+    // One iteration handles 16 bytes; the condition is tested at the bottom, so the body runs at least once.
+loopXor:
+    LXVD2X  (IN)(OFFSET), MSG
+    LXVD2X  (OUT)(OFFSET), RESULT // current output bytes, to be XORed with the table result
+    // Split every byte into nibbles and use them as table indices: MSG_HI = byte >> 4, MSG = byte & 0x0f.
+    VSRB    MSG_, ROTATE_, MSG_HI_
+    VAND    MSG_, MASK_, MSG_
+    VPERM   X6_, V31, MSG_, MSG_ // low-table lookup by low nibble
+    VPERM   X7_, V31, MSG_HI_, MSG_HI_ // high-table lookup by high nibble
+    // Combine the two lookups, then fold them into the existing output: out ^= low[l] ^ high[h].
+    VXOR    MSG_, MSG_HI_, MSG_
+    VXOR    MSG_, RESULT_, RESULT_
+    // Write the accumulated 16 bytes back to the same offset.
+    STXVD2X RESULT, (OUT)(OFFSET)
+    // Advance by one block and repeat while LEN > OFFSET.
+    ADD     $16, OFFSET, OFFSET
+    CMP     LEN, OFFSET
+    BGT     loopXor
+    RET
+
+DATA ·constants+0x0(SB)/8, $0x0404040404040404 // offset 0: loaded as ROTATE, the per-byte shift count (4) for VSRB
+DATA ·constants+0x8(SB)/8, $0x0404040404040404
+DATA ·constants+0x10(SB)/8, $0x0f0f0f0f0f0f0f0f // offset 16: loaded as MASK, keeps the low nibble of every byte
+DATA ·constants+0x18(SB)/8, $0x0f0f0f0f0f0f0f0f
+DATA ·constants+0x20(SB)/8, $0x0706050403020100 // offset 32: loaded as FLIP, the VPERM control applied to both tables
+DATA ·constants+0x28(SB)/8, $0x0f0e0d0c0b0a0908
+// Three 16-byte entries, 48 bytes in total; flag 8 is RODATA in textflag.h.
+GLOBL ·constants(SB), 8, $48
--- a/vendor/github.com/klauspost/reedsolomon/reedsolomon.go
+++ b/vendor/github.com/klauspost/reedsolomon/reedsolomon.go
@ -471,12 +471,12 @@ func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outpu
 		wg.Add(1)
 		go func(start, stop int) {
 			for c := 0; c < r.DataShards; c++ {
-				in := inputs[c]
+				in := inputs[c][start:stop]
 				for iRow := 0; iRow < outputCount; iRow++ {
 					if c == 0 {
-						galMulSlice(matrixRows[iRow][c], in[start:stop], outputs[iRow][start:stop], r.o.useSSSE3, r.o.useAVX2)
+						galMulSlice(matrixRows[iRow][c], in, outputs[iRow][start:stop], r.o.useSSSE3, r.o.useAVX2)
 					} else {
-						galMulSliceXor(matrixRows[iRow][c], in[start:stop], outputs[iRow][start:stop], r.o.useSSSE3, r.o.useAVX2)
+						galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][start:stop], r.o.useSSSE3, r.o.useAVX2)
 					}
 				}
 			}
--- a/vendor/github.com/minio/highwayhash/README.md
+++ b/vendor/github.com/minio/highwayhash/README.md
@ -7,7 +7,7 @@

 It can be used to prevent hash-flooding attacks or authenticate short-lived messages. Additionally it can be used as a fingerprinting function. HighwayHash is not a general purpose cryptographic hash function (such as Blake2b, SHA-3 or SHA-2) and should not be used if strong collision resistance is required. 

-This repository contains a native Go version and optimized assembly implementations on both Intel and ARM platforms.  
+This repository contains a native Go version and optimized assembly implementations for Intel, ARM and ppc64le architectures.

 ### High performance

@ -50,6 +50,17 @@ ARM64 NEON        | 384 MB/s         | 955 MB/s          | 1053 MB/s

 *Note: For now just the (main) update loop is implemented in assembly, so for small messages there is still considerable overhead due to initialization and finalization.*

+### ppc64le Performance
+
+The ppc64le accelerated version is roughly 10x faster compared to the non-optimized version:
+
+```
+benchmark              old MB/s     new MB/s     speedup
+BenchmarkWrite_8K      531.19       5566.41      10.48x
+BenchmarkSum64_8K      518.86       4971.88      9.58x
+BenchmarkSum256_8K     502.45       4474.20      8.90x
+```
+
 ### Performance compared to other hashing techniques

 On a Skylake CPU (3.0 GHz Xeon Platinum 8124M) the table below shows how HighwayHash compares to other hashing techniques for 5 MB messages (single core performance, all Golang implementations, see [benchmark](https://github.com/fwessels/HashCompare/blob/master/benchmarks_test.go)).
--- a/vendor/github.com/minio/highwayhash/highwayhashAVX2_amd64.go
+++ b/vendor/github.com/minio/highwayhash/highwayhashAVX2_amd64.go
@ -13,6 +13,7 @@ var (
 	useSSE4 = cpu.X86.HasSSE41
 	useAVX2 = cpu.X86.HasAVX2
 	useNEON = false
+	useVMX  = false
 )

 //go:noescape
--- a/vendor/github.com/minio/highwayhash/highwayhash_amd64.go
+++ b/vendor/github.com/minio/highwayhash/highwayhash_amd64.go
@ -13,6 +13,7 @@ var (
 	useSSE4 = cpu.X86.HasSSE41
 	useAVX2 = false
 	useNEON = false
+	useVMX  = false
 )

 //go:noescape
--- a/vendor/github.com/minio/highwayhash/highwayhash_arm64.go
+++ b/vendor/github.com/minio/highwayhash/highwayhash_arm64.go
@ -10,6 +10,7 @@ var (
 	useSSE4 = false
 	useAVX2 = false
 	useNEON = true
+	useVMX  = false
 )

 //go:noescape
--- a/vendor/github.com/minio/highwayhash/highwayhash_ppc64le.go
+++ b/vendor/github.com/minio/highwayhash/highwayhash_ppc64le.go
@ -0,0 +1,33 @@
+//+build !noasm
+
+// Copyright (c) 2017 Minio Inc. All rights reserved.
+// Use of this source code is governed by a license that can be
+// found in the LICENSE file.
+
+package highwayhash
+
+var (
+	useSSE4 = false // not read in this file
+	useAVX2 = false // not read in this file
+	useNEON = false // not read in this file
+	useVMX  = true  // read by update to choose updatePpc64Le over updateGeneric
+)
+
+//go:noescape
+func updatePpc64Le(state *[16]uint64, msg []byte) // body is the ·updatePpc64Le TEXT symbol in highwayhash_ppc64le.s; consumes msg in 32-byte blocks and returns without touching state when len(msg) < 32
+
+func initialize(state *[16]uint64, key []byte) { // no ppc64le-specific path here: always delegates to initializeGeneric
+	initializeGeneric(state, key)
+}
+
+func update(state *[16]uint64, msg []byte) { // dispatches to the assembly or the generic update depending on useVMX
+	if useVMX { // initialised to true in this file's var block
+		updatePpc64Le(state, msg)
+	} else {
+		updateGeneric(state, msg)
+	}
+}
+
+func finalize(out []byte, state *[16]uint64) { // no ppc64le-specific path here: always delegates to finalizeGeneric
+	finalizeGeneric(out, state)
+}
--- a/vendor/github.com/minio/highwayhash/highwayhash_ppc64le.s
+++ b/vendor/github.com/minio/highwayhash/highwayhash_ppc64le.s
@ -0,0 +1,183 @@
+//+build !noasm !appengine
+
+//
+// Minio Cloud Storage, (C) 2018 Minio, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "textflag.h"
+
+// Definition of registers
+#define V0_LO    VS32
+#define V0_LO_   V0
+#define V0_HI    VS33
+#define V0_HI_   V1
+#define V1_LO    VS34
+#define V1_LO_   V2
+#define V1_HI    VS35
+#define V1_HI_   V3
+#define MUL0_LO  VS36
+#define MUL0_LO_ V4
+#define MUL0_HI  VS37
+#define MUL0_HI_ V5
+#define MUL1_LO  VS38
+#define MUL1_LO_ V6
+#define MUL1_HI  VS39
+#define MUL1_HI_ V7
+
+// Message
+#define MSG_LO   VS40
+#define MSG_LO_  V8
+#define MSG_HI   VS41
+
+// Constants
+#define ROTATE   VS42
+#define ROTATE_  V10
+#define MASK     VS43
+#define MASK_    V11
+
+// Temps
+#define TEMP1    VS44
+#define TEMP1_   V12
+#define TEMP2    VS45
+#define TEMP2_   V13
+#define TEMP3    VS46
+#define TEMP3_   V14
+#define TEMP4_   V15
+#define TEMP5_   V16
+#define TEMP6_   V17
+#define TEMP7_   V18
+
+// Regular registers
+#define STATE     R3
+#define MSG_BASE  R4
+#define MSG_LEN   R5
+#define CONSTANTS R6
+#define P1        R7
+#define P2        R8
+#define P3        R9
+#define P4        R10
+#define P5        R11
+#define P6        R12
+#define P7        R14 // avoid using R13
+
+TEXT ·updatePpc64Le(SB), NOFRAME|NOSPLIT, $0-32 // no local frame; 32 argument bytes = state pointer (8) + msg slice header (24)
+	MOVD state+0(FP), STATE
+	MOVD msg_base+8(FP), MSG_BASE
+	MOVD msg_len+16(FP), MSG_LEN  // length of message
+	// The loop below always consumes a full 32-byte block, so a shorter message returns with state untouched.
+	// Sanity check for length
+	CMPU MSG_LEN, $31
+	BLE  complete
+	// P1..P7 are the byte offsets of the second through eighth 16-byte chunk of the 128-byte state.
+	// Setup offsets
+	MOVD     $16, P1
+	MOVD     $32, P2
+	MOVD     $48, P3
+	MOVD     $64, P4
+	MOVD     $80, P5
+	MOVD     $96, P6
+	MOVD     $112, P7
+	// The state is read as eight 16-byte vectors, in the order v0, v1, mul0, mul1 (LO then HI of each).
+	// Load state
+	LXVD2X   (STATE)(R0), V0_LO
+	LXVD2X   (STATE)(P1), V0_HI
+	LXVD2X   (STATE)(P2), V1_LO
+	LXVD2X   (STATE)(P3), V1_HI
+	LXVD2X   (STATE)(P4), MUL0_LO
+	LXVD2X   (STATE)(P5), MUL0_HI
+	LXVD2X   (STATE)(P6), MUL1_LO
+	LXVD2X   (STATE)(P7), MUL1_HI
+	XXPERMDI V0_LO,   V0_LO,   $2, V0_LO // $2 with identical sources swaps the two doublewords of the register
+	XXPERMDI V0_HI,   V0_HI,   $2, V0_HI
+	XXPERMDI V1_LO,   V1_LO,   $2, V1_LO
+	XXPERMDI V1_HI,   V1_HI,   $2, V1_HI
+	XXPERMDI MUL0_LO, MUL0_LO, $2, MUL0_LO
+	XXPERMDI MUL0_HI, MUL0_HI, $2, MUL0_HI
+	XXPERMDI MUL1_LO, MUL1_LO, $2, MUL1_LO
+	XXPERMDI MUL1_HI, MUL1_HI, $2, MUL1_HI
+	// ROTATE and MASK stay loaded for the whole loop.
+	// Load constants table pointer
+	MOVD     $·constants(SB), CONSTANTS
+	LXVD2X   (CONSTANTS)(R0), ROTATE // 32 in each 64-bit lane; used as the VRLD/VSRD amount
+	LXVD2X   (CONSTANTS)(P1), MASK // the zipper merge byte selector
+	XXLNAND  MASK, MASK, MASK // NAND of a value with itself = bitwise NOT; presumably adapts the selector to VPERM's byte numbering - confirm
+	// Each iteration consumes 32 message bytes; the instructions of the update step are interleaved, so their order matters.
+loop:
+	// Main highwayhash update loop
+	LXVD2X   (MSG_BASE)(R0), MSG_LO // first 16 message bytes
+	VADDUDM  V0_LO_,   MUL1_LO_, TEMP1_ // TEMP1 = V0_LO + MUL1_LO per 64-bit lane
+	VRLD     V0_LO_,   ROTATE_,  TEMP2_ // TEMP2 = V0_LO rotated left by 32 per lane
+	VADDUDM  MUL1_HI_, V0_HI_,   TEMP3_ // TEMP3 = MUL1_HI + V0_HI
+	LXVD2X   (MSG_BASE)(P1), MSG_HI // second 16 message bytes
+	ADD      $32,      MSG_BASE, MSG_BASE // advance the message pointer past this block
+	XXPERMDI MSG_LO,   MSG_LO,   $2, MSG_LO // same doubleword swap as applied to the state
+	XXPERMDI MSG_HI,   MSG_HI,   $2, V0_LO // swapped MSG_HI lands in V0_LO, whose old value is already captured in TEMP1/TEMP2
+	VADDUDM  MSG_LO_,  MUL0_LO_, MSG_LO_
+	VADDUDM  V0_LO_,   MUL0_HI_, V0_LO_
+	VADDUDM  MSG_LO_,  V1_LO_,   V1_LO_
+	VSRD     V0_HI_,   ROTATE_,  MSG_LO_ // MSG_LO reused as scratch: V0_HI shifted right by 32 per lane
+	VADDUDM  V0_LO_,   V1_HI_,   V1_HI_
+	VPERM    V1_LO_,   V1_LO_,   MASK_, V0_LO_ // byte shuffle of V1_LO selected by MASK
+	VMULOUW  V1_LO_,   TEMP2_,   TEMP2_
+	VPERM    V1_HI_,   V1_HI_,   MASK_, TEMP7_ // byte shuffle of V1_HI selected by MASK
+	VADDUDM  V0_LO_,   TEMP1_,   V0_LO_ // new V0_LO = shuffled V1_LO + TEMP1
+	VMULOUW  V1_HI_,   MSG_LO_,  MSG_LO_
+	VADDUDM  TEMP7_,   TEMP3_,   V0_HI_ // new V0_HI = shuffled V1_HI + TEMP3
+	VPERM    V0_LO_,   V0_LO_,   MASK_, TEMP6_
+	VRLD     V1_LO_,   ROTATE_,  TEMP4_
+	VSRD     V1_HI_,   ROTATE_,  TEMP5_
+	VPERM    V0_HI_,   V0_HI_,   MASK_, TEMP7_
+	XXLXOR   MUL0_LO,  TEMP2,    MUL0_LO // MUL0_LO ^= product held in TEMP2
+	VMULOUW  TEMP1_,   TEMP4_,   TEMP1_
+	VMULOUW  TEMP3_,   TEMP5_,   TEMP3_
+	XXLXOR   MUL0_HI,  MSG_LO,   MUL0_HI // MUL0_HI ^= product held in MSG_LO
+	XXLXOR   MUL1_LO,  TEMP1,    MUL1_LO
+	XXLXOR   MUL1_HI,  TEMP3,    MUL1_HI
+	VADDUDM  TEMP6_,   V1_LO_,   V1_LO_ // V1_LO += shuffled V0_LO
+	VADDUDM  TEMP7_,   V1_HI_,   V1_HI_ // V1_HI += shuffled V0_HI
+	// Account for the consumed block and loop while at least 32 bytes remain.
+	SUB  $32, MSG_LEN, MSG_LEN
+	CMPU MSG_LEN, $32
+	BGE  loop
+	// Undo the doubleword swap applied at load time, then write all eight vectors back.
+	// Save state
+	XXPERMDI V0_LO,   V0_LO,   $2, V0_LO
+	XXPERMDI V0_HI,   V0_HI,   $2, V0_HI
+	XXPERMDI V1_LO,   V1_LO,   $2, V1_LO
+	XXPERMDI V1_HI,   V1_HI,   $2, V1_HI
+	XXPERMDI MUL0_LO, MUL0_LO, $2, MUL0_LO
+	XXPERMDI MUL0_HI, MUL0_HI, $2, MUL0_HI
+	XXPERMDI MUL1_LO, MUL1_LO, $2, MUL1_LO
+	XXPERMDI MUL1_HI, MUL1_HI, $2, MUL1_HI
+	STXVD2X  V0_LO,   (STATE)(R0)
+	STXVD2X  V0_HI,   (STATE)(P1)
+	STXVD2X  V1_LO,   (STATE)(P2)
+	STXVD2X  V1_HI,   (STATE)(P3)
+	STXVD2X  MUL0_LO, (STATE)(P4)
+	STXVD2X  MUL0_HI, (STATE)(P5)
+	STXVD2X  MUL1_LO, (STATE)(P6)
+	STXVD2X  MUL1_HI, (STATE)(P7)
+	// Any trailing msg_len % 32 bytes are not processed by this routine.
+complete:
+	RET
+
+
+// Constants table: two 16-byte entries, loaded by updatePpc64Le as ROTATE (offset 0) and MASK (offset 16).
+DATA ·constants+0x0(SB)/8, $0x0000000000000020 // 32 in each 64-bit lane: the rotate/shift amount
+DATA ·constants+0x8(SB)/8, $0x0000000000000020
+DATA ·constants+0x10(SB)/8, $0x070806090d0a040b  // zipper merge constant
+DATA ·constants+0x18(SB)/8, $0x000f010e05020c03  // zipper merge constant
+// 32 bytes in total; flag 8 is RODATA in textflag.h.
+GLOBL ·constants(SB), 8, $32
--- a/vendor/github.com/minio/highwayhash/highwayhash_ref.go
+++ b/vendor/github.com/minio/highwayhash/highwayhash_ref.go
@ -4,6 +4,7 @@

 // +build !amd64
 // +build !arm64
+// +build !ppc64le

 package highwayhash

@ -11,6 +12,7 @@ var (
 	useSSE4 = false
 	useAVX2 = false
 	useNEON = false
+	useVMX  = false
 )

 func initialize(state *[16]uint64, k []byte) {
--- a/vendor/vendor.json
+++ b/vendor/vendor.json
@ -563,10 +563,10 @@
 			"revisionTime": "2017-10-07T12:43:06Z"
 		},
 		{
-			"checksumSHA1": "ehsrWipiGIWqa4To8TmelIx06vI=",
+			"checksumSHA1": "KiQa3vguztElzJkoqeIGHlfLFJA=",
 			"path": "github.com/klauspost/reedsolomon",
-			"revision": "0b30fa71cc8e4e9010c9aba6d0320e2e5b163b29",
-			"revisionTime": "2017-12-19T13:34:37Z"
+			"revision": "8885f3a1c73882e6f11b766242c69a1eb8f44b28",
+			"revisionTime": "2018-12-18T19:39:59Z"
 		},
 		{
 			"checksumSHA1": "xxLSo5tKtXc7jGrR70yoEfza8Cw=",
@ -634,10 +634,10 @@
 			"revisionTime": "2018-01-23T12:12:34Z"
 		},
 		{
-			"checksumSHA1": "2Fu1GmLwDo6FFdahjnlWnPkwJTE=",
+			"checksumSHA1": "CD2MtlgA8h0z6hYJHURS5eOmZ1k=",
 			"path": "github.com/minio/highwayhash",
-			"revision": "85fc8a2dacad36a6beb2865793cd81363a496696",
-			"revisionTime": "2018-05-01T08:09:13Z"
+			"revision": "93ed73d641695483ab4438817457b6586ee5765c",
+			"revisionTime": "2018-12-20T01:13:08Z"
 		},
 		{
 			"checksumSHA1": "7/Hdd23/j4/yt4BXa+h0kqz1yjw=",