ircd::simd: Improve x-platform generation of lzcnt/tzcnt.

2024-11-25 08:12:37 +01:00 · 2020-08-22 20:01:11 -07:00 · 2020-08-22 20:01:11 -07:00 · d2a2b7aed1
commit d2a2b7aed1
parent 146a08dabd
2 changed files with 36 additions and 54 deletions
--- a/include/ircd/simd/lzcnt.h
+++ b/include/ircd/simd/lzcnt.h
@@ -17,50 +17,41 @@ namespace ircd::simd
 }

 /// Convenience template. Unfortunately this drops to scalar until specific
-/// targets and specializations are created.
+/// targets and specializations are created. The behavior can differ among
+/// platforms; we make use of lzcnt if available otherwise we account for bsr.
 template<class T>
 inline uint
-__attribute__((target("lzcnt")))
 ircd::simd::lzcnt(const T a)
 noexcept
 {
-	// The behavior of lzcnt can differ among platforms; when true we expect
-	// lzcnt to fall back to bsr-like behavior.
-	constexpr auto bitscan
-	{
-		#ifdef __LZCNT__
-			false
-		#else
-			true
-		#endif
-	};
-
-	uint ret(0), i(0); do
+	uint ret(0);
+	for(uint i(0); i < lanes<T>(); ++i)
 	{
 		const auto mask
 		{
 			boolmask(uint(ret == sizeof_lane<T>() * 8 * i))
 		};

-		if constexpr(bitscan && sizeof_lane<T>() <= sizeof(u16))
-			ret += (15 - __lzcnt16(__builtin_bswap16(a[i++]))) & mask;
+		if constexpr(sizeof_lane<T>() <= sizeof(u8))
+			ret += mask & __builtin_clz((uint(a[i]) << 24) | 0x00ffffff);

 		else if constexpr(sizeof_lane<T>() <= sizeof(u16))
-			ret += __lzcnt16(__builtin_bswap16(a[i++])) & mask;
-
-		else if constexpr(bitscan && sizeof_lane<T>() <= sizeof(u32))
-			ret += (31 - __lzcnt32(__builtin_bswap32(a[i++]))) & mask;
+			ret += mask & __builtin_clz((uint(__builtin_bswap16(a[i])) << 16) | 0x0000ffffU);

 		else if constexpr(sizeof_lane<T>() <= sizeof(u32))
-			ret += __lzcnt32(__builtin_bswap32(a[i++])) & mask;
+			ret += mask &
+			(
+				(boolmask(uint(a[i] != 0)) & __builtin_clz(__builtin_bswap32(a[i])))
+				| (boolmask(uint(a[i] == 0)) & 32U)
+			);

-		else if constexpr(bitscan)
-			ret += (63 - __lzcnt64(__builtin_bswap64(a[i++]))) & mask;
-
-		else
-			ret += __lzcnt64(__builtin_bswap64(a[i++])) & mask;
+		else if constexpr(sizeof_lane<T>() <= sizeof(u64))
+			ret += mask &
+			(
+				(boolmask(uint(a[i] != 0)) & __builtin_clzl(__builtin_bswap64(a[i])))
+				| (boolmask(uint(a[i] == 0)) & 64U)
+			);
 	}
-	while(i < lanes<T>());

 	return ret;
 }
--- a/include/ircd/simd/tzcnt.h
+++ b/include/ircd/simd/tzcnt.h
@@ -17,49 +17,40 @@ namespace ircd::simd
 }

 /// Convenience template. Unfortunately this drops to scalar until specific
-/// targets and specializations are created.
+/// targets and specializations are created. The behavior can differ among
+/// platforms; we use lzcnt when available, otherwise we account for bsr.
 template<class T>
 inline uint
-__attribute__((target("lzcnt")))
 ircd::simd::tzcnt(const T a)
 noexcept
 {
-    // The behavior of lzcnt/tzcnt can differ among platforms; when false we
-    // lzcnt/tzcnt to fall back to bsr/bsf-like behavior.
-	constexpr auto bitscan
+	uint ret(0), i(lanes<T>() - 1), mask(-1U); do
 	{
-		#ifdef __LZCNT__
-			false
-		#else
-			true
-		#endif
-	};
-
-	uint ret(0), i(lanes<T>()), mask(-1U); do
-	{
-		if constexpr(bitscan && sizeof_lane<T>() <= sizeof(u16))
-			ret += (15 - __lzcnt16(a[--i])) & mask;
+		if constexpr(sizeof_lane<T>() <= sizeof(u8))
+			ret += mask & __builtin_ctz(a[i] | 0xffffff00U);

 		else if constexpr(sizeof_lane<T>() <= sizeof(u16))
-			ret += __lzcnt16(a[--i]) & mask;
-
-		else if constexpr(bitscan && sizeof_lane<T>() <= sizeof(u32))
-			ret += (31 - __lzcnt32(a[--i])) & mask;
+			ret += mask & __builtin_ctz(__builtin_bswap16(a[i]) | 0xffff0000U);

 		else if constexpr(sizeof_lane<T>() <= sizeof(u32))
-			ret += __lzcnt32(a[--i]) & mask;
+			ret += mask &
+			(
+				(boolmask(uint(a[i] != 0)) & __builtin_ctz(__builtin_bswap32(a[i])))
+				| (boolmask(uint(a[i] == 0)) & 32U)
+			);

-		else if constexpr(bitscan)
-			ret += (63 - __lzcnt64(a[--i])) & mask;
-
-		else
-			ret += __lzcnt64(a[--i]) & mask;
+		else if constexpr(sizeof_lane<T>() <= sizeof(u64))
+			ret += mask &
+			(
+				(boolmask(uint(a[i] != 0)) & __builtin_ctzl(__builtin_bswap64(a[i])))
+				| (boolmask(uint(a[i] == 0)) & 64U)
+			);

 		static const auto lane_bits(sizeof_lane<T>() * 8);
 		mask &= boolmask(uint(ret % lane_bits == 0));
 		mask &= boolmask(uint(ret != 0));
 	}
-	while(i);
+	while(i--);

 	return ret;
 }