diff --git a/include/ircd/simd/simd.h b/include/ircd/simd/simd.h index c0c9aa550..67ea8b575 100644 --- a/include/ircd/simd/simd.h +++ b/include/ircd/simd/simd.h @@ -19,7 +19,11 @@ namespace ircd::simd { template T popmask(const T) noexcept; - template size_t popcount(const T) noexcept; + template T boolmask(const T) noexcept; + + template uint popcount(const T) noexcept; + template uint clz(const T) noexcept; + template uint ctz(const T) noexcept; // xmmx shifter workarounds template T shl(const T &a) noexcept; @@ -37,32 +41,6 @@ namespace ircd using simd::lane_cast; } -/// Convenience template. Vector compare instructions yield 0xff on equal; -/// sometimes one might need an actual value of 1 for accumulators or maybe -/// some bool-type reason... -template -inline size_t -ircd::simd::popcount(const T a) -noexcept -{ - size_t i(0), ret(0); - while(i < lanes(a)) - ret += __builtin_popcountll(a[i++]); - - return ret; -} - -/// Convenience template. Vector compare instructions yield 0xff on equal; -/// sometimes one might need an actual value of 1 for accumulators or maybe -/// some bool-type reason... -template -inline T -ircd::simd::popmask(const T a) -noexcept -{ - return a & 1; -} - #ifdef HAVE_X86INTRIN_H template [[using gnu: always_inline, gnu_inline, artificial]] @@ -126,3 +104,94 @@ noexcept return _mm256_slli_si256(a, b / 8); } #endif + +/// Convenience template. Unfortunately this drops to scalar until specific +/// targets and specializations are created. +template +inline uint +ircd::simd::clz(const T a) +noexcept +{ + uint ret(0), i(0); do + { + const auto mask + { + boolmask(uint(ret == sizeof_lane() * 8 * i)) + }; + + if constexpr(sizeof_lane() <= sizeof(u16)) + ret += __lzcnt16(__builtin_bswap16(a[i++])) & mask; + else if constexpr(sizeof_lane() <= sizeof(u32)) + ret += __lzcnt32(__builtin_bswap32(a[i++])) & mask; + else + ret += __lzcnt64(__builtin_bswap64(a[i++])) & mask; + } + while(i < lanes(a)); + + return ret; +} + +/// Convenience template. Unfortunately this drops to scalar until specific +/// targets and specializations are created. +template +inline uint +ircd::simd::ctz(const T a) +noexcept +{ + uint ret(0), i(lanes(a)), mask(-1U); do + { + if constexpr(sizeof_lane() <= sizeof(u16)) + ret += __lzcnt16(a[--i]) & mask; + else if constexpr(sizeof_lane() <= sizeof(u32)) + ret += __lzcnt32(a[--i]) & mask; + else + ret += __lzcnt64(a[--i]) & mask; + + static const auto lane_bits(sizeof_lane() * 8); + mask &= boolmask(uint(ret % lane_bits == 0)); + mask &= boolmask(uint(ret != 0)); + } + while(i); + + return ret; +} + +/// Convenience template. Unfortunately this drops to scalar until specific +/// targets and specializations are created. +template +inline uint +ircd::simd::popcount(const T a) +noexcept +{ + uint ret(0), i(0); + for(; i < lanes(); ++i) + if constexpr(sizeof_lane() <= sizeof(int)) + ret += __builtin_popcount(a[i]); + else if constexpr(sizeof_lane() <= sizeof(long)) + ret += __builtin_popcountl(a[i]); + else + ret += __builtin_popcountll(a[i]); + + return ret; +} + +/// Convenience template. Extends a bool value where the lsb is 1 or 0 into a +/// mask value like the result of vector comparisons. +template +inline T +ircd::simd::boolmask(const T a) +noexcept +{ + return ~(popmask(a) - 1); +} + +/// Convenience template. Vector compare instructions yield 0xff on equal; +/// sometimes one might need an actual value of 1 for accumulators or maybe +/// some bool-type reason... +template +inline T +ircd::simd::popmask(const T a) +noexcept +{ + return a & 1; +}