diff --git a/include/ircd/simd/lzcnt.h b/include/ircd/simd/lzcnt.h index 132d6f637..ba6b70233 100644 --- a/include/ircd/simd/lzcnt.h +++ b/include/ircd/simd/lzcnt.h @@ -17,50 +17,41 @@ namespace ircd::simd } /// Convenience template. Unfortunately this drops to scalar until specific -/// targets and specializations are created. +/// targets and specializations are created. The behavior can differ among +/// platforms; we make use of lzcnt if available otherwise we account for bsr. template inline uint -__attribute__((target("lzcnt"))) ircd::simd::lzcnt(const T a) noexcept { - // The behavior of lzcnt can differ among platforms; when true we expect - // lzcnt to fall back to bsr-like behavior. - constexpr auto bitscan - { - #ifdef __LZCNT__ - false - #else - true - #endif - }; - - uint ret(0), i(0); do + uint ret(0); + for(uint i(0); i < lanes(); ++i) { const auto mask { boolmask(uint(ret == sizeof_lane() * 8 * i)) }; - if constexpr(bitscan && sizeof_lane() <= sizeof(u16)) - ret += (15 - __lzcnt16(__builtin_bswap16(a[i++]))) & mask; + if constexpr(sizeof_lane() <= sizeof(u8)) + ret += mask & __builtin_clz((uint(a[i]) << 24) | 0x00ffffff); else if constexpr(sizeof_lane() <= sizeof(u16)) - ret += __lzcnt16(__builtin_bswap16(a[i++])) & mask; - - else if constexpr(bitscan && sizeof_lane() <= sizeof(u32)) - ret += (31 - __lzcnt32(__builtin_bswap32(a[i++]))) & mask; + ret += mask & __builtin_clz((uint(__builtin_bswap16(a[i])) << 16) | 0x0000ffffU); else if constexpr(sizeof_lane() <= sizeof(u32)) - ret += __lzcnt32(__builtin_bswap32(a[i++])) & mask; + ret += mask & + ( + (boolmask(uint(a[i] != 0)) & __builtin_clz(__builtin_bswap32(a[i]))) + | (boolmask(uint(a[i] == 0)) & 32U) + ); - else if constexpr(bitscan) - ret += (63 - __lzcnt64(__builtin_bswap64(a[i++]))) & mask; - - else - ret += __lzcnt64(__builtin_bswap64(a[i++])) & mask; + else if constexpr(sizeof_lane() <= sizeof(u64)) + ret += mask & + ( + (boolmask(uint(a[i] != 0)) & __builtin_clzll(__builtin_bswap64(a[i]))) + 
| (boolmask(uint(a[i] == 0)) & 64U) + ); } - while(i < lanes()); return ret; } diff --git a/include/ircd/simd/tzcnt.h b/include/ircd/simd/tzcnt.h index 8c10da802..ebd376a0c 100644 --- a/include/ircd/simd/tzcnt.h +++ b/include/ircd/simd/tzcnt.h @@ -17,49 +17,40 @@ namespace ircd::simd } /// Convenience template. Unfortunately this drops to scalar until specific -/// targets and specializations are created. +/// targets and specializations are created. The behavior can differ among +/// platforms; we use lzcnt when available, otherwise we account for bsr. template inline uint -__attribute__((target("lzcnt"))) ircd::simd::tzcnt(const T a) noexcept { - // The behavior of lzcnt/tzcnt can differ among platforms; when false we - // lzcnt/tzcnt to fall back to bsr/bsf-like behavior. - constexpr auto bitscan + uint ret(0), i(lanes() - 1), mask(-1U); do { - #ifdef __LZCNT__ - false - #else - true - #endif - }; - - uint ret(0), i(lanes()), mask(-1U); do - { - if constexpr(bitscan && sizeof_lane() <= sizeof(u16)) - ret += (15 - __lzcnt16(a[--i])) & mask; + if constexpr(sizeof_lane() <= sizeof(u8)) + ret += mask & __builtin_ctz(a[i] | 0xffffff00U); else if constexpr(sizeof_lane() <= sizeof(u16)) - ret += __lzcnt16(a[--i]) & mask; - - else if constexpr(bitscan && sizeof_lane() <= sizeof(u32)) - ret += (31 - __lzcnt32(a[--i])) & mask; + ret += mask & __builtin_ctz(__builtin_bswap16(a[i]) | 0xffff0000U); else if constexpr(sizeof_lane() <= sizeof(u32)) - ret += __lzcnt32(a[--i]) & mask; + ret += mask & + ( + (boolmask(uint(a[i] != 0)) & __builtin_ctz(__builtin_bswap32(a[i]))) + | (boolmask(uint(a[i] == 0)) & 32U) + ); - else if constexpr(bitscan) - ret += (63 - __lzcnt64(a[--i])) & mask; - - else - ret += __lzcnt64(a[--i]) & mask; + else if constexpr(sizeof_lane() <= sizeof(u64)) + ret += mask & + ( + (boolmask(uint(a[i] != 0)) & __builtin_ctzll(__builtin_bswap64(a[i]))) + | (boolmask(uint(a[i] == 0)) & 64U) + ); static const auto lane_bits(sizeof_lane() * 8); mask &= 
boolmask(uint(ret % lane_bits == 0)); mask &= boolmask(uint(ret != 0)); } - while(i); + while(i--); return ret; }