// Matrix Construct // // Copyright (C) Matrix Construct Developers, Authors & Contributors // Copyright (C) 2016-2021 Jason Volk <jason@zemos.net> // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice is present in all copies. The // full license for this software is available in the LICENSE file. #pragma once #define HAVE_IRCD_MATH_FMMA_H namespace ircd::math { template<class T> void fmma(T *, const T *, const T *, const struct fmma_opts &); } /// Options for the template. struct ircd::math::fmma_opts { size_t cols { 0 }; size_t rows { 0 }; size_t tiles { 1 }; }; /// Fused Matrix-Multiply & Accumulate /// clang11 FMA vfmadd213ps/vfmadd231ps /// clang11 FMA4 vfmaddps template<class T> inline void ircd::math::fmma(T *const __restrict__ out, const T *const __restrict__ in, const T *const __restrict__ weight, const struct fmma_opts &opts) { const auto &cols{opts.cols}, &rows{opts.rows}, &tiles{opts.tiles}, &lanes{simd::lanes<T>()}; const auto width { cols / lanes / tiles }; const auto height { rows / lanes }; for(uint i(0); i < width; i++) for(uint j(0); j < height; j++) for(uint t(0); t < tiles; ++t) for(uint l(0); l < lanes; ++l) { const auto x { i * tiles + t }; const auto y { x * lanes + l }; const T mul { in[x][l] * weight[y * height + j] }; out[j] += mul; } }