fix:make zerocolor edge
This commit is contained in:
@@ -64,7 +64,7 @@
|
||||
namespace {
|
||||
inline unsigned int trailingZeros32(unsigned int value) {
|
||||
#if defined(_MSC_VER)
|
||||
#if (_MSC_VER < 1700) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC)
|
||||
#if (_MSC_VER < 1700) || defined(_M_ARM) || defined(_M_ARM64)
|
||||
unsigned long index = 0;
|
||||
_BitScanForward(&index, value);
|
||||
return (unsigned int)index;
|
||||
@@ -191,19 +191,6 @@ CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(double, int64, uint64, double, void, double)
|
||||
#endif // CV_CPU_OPTIMIZATION_HAL_NAMESPACE
|
||||
|
||||
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
|
||||
|
||||
template <typename _VecTp> inline _VecTp v_setzero_();
|
||||
template <typename _VecTp> inline _VecTp v_setall_(uchar);
|
||||
template <typename _VecTp> inline _VecTp v_setall_(schar);
|
||||
template <typename _VecTp> inline _VecTp v_setall_(ushort);
|
||||
template <typename _VecTp> inline _VecTp v_setall_(short);
|
||||
template <typename _VecTp> inline _VecTp v_setall_(unsigned);
|
||||
template <typename _VecTp> inline _VecTp v_setall_(int);
|
||||
template <typename _VecTp> inline _VecTp v_setall_(uint64);
|
||||
template <typename _VecTp> inline _VecTp v_setall_(int64);
|
||||
template <typename _VecTp> inline _VecTp v_setall_(float);
|
||||
template <typename _VecTp> inline _VecTp v_setall_(double);
|
||||
|
||||
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
|
||||
using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
|
||||
#endif
|
||||
@@ -249,7 +236,11 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
|
||||
#include "opencv2/core/hal/intrin_wasm.hpp"
|
||||
|
||||
#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP)
|
||||
#if defined(CV_RVV_SCALABLE)
|
||||
#include "opencv2/core/hal/intrin_rvv_scalable.hpp"
|
||||
#else
|
||||
#include "opencv2/core/hal/intrin_rvv.hpp"
|
||||
#endif
|
||||
|
||||
#elif CV_LSX && !defined(CV_FORCE_SIMD128_CPP)
|
||||
|
||||
@@ -730,15 +721,314 @@ namespace CV__SIMD_NAMESPACE {
|
||||
/** @brief SIMD processing state cleanup call */
|
||||
inline void vx_cleanup() { VXPREFIX(_cleanup)(); }
|
||||
|
||||
#if !CV_SIMD_SCALABLE
|
||||
#if !CV_SIMD_SCALABLE && !(CV_NEON && !defined(CV_FORCE_SIMD128_CPP))
|
||||
// Compatibility layer
|
||||
#if !(CV_NEON && !defined(CV_FORCE_SIMD128_CPP))
|
||||
|
||||
template<typename T> struct VTraits {
|
||||
static inline int vlanes() { return T::nlanes; }
|
||||
enum { nlanes = T::nlanes, max_nlanes = T::nlanes };
|
||||
using lane_type = typename T::lane_type;
|
||||
};
|
||||
|
||||
#define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \
|
||||
inline _Tpvec v_add(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
return a + b; \
|
||||
} \
|
||||
inline _Tpvec v_sub(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
return a - b; \
|
||||
} \
|
||||
template<typename... Args> \
|
||||
inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
|
||||
return v_add(f1 + f2, vf...); \
|
||||
}
|
||||
#define OPENCV_HAL_WRAP_SHIFT_OP(_Tpvec) \
|
||||
inline _Tpvec v_shr(const _Tpvec& a, int n) \
|
||||
{ \
|
||||
return a >> n; \
|
||||
} \
|
||||
inline _Tpvec v_shl(const _Tpvec& a, int n) \
|
||||
{ \
|
||||
return a << n; \
|
||||
}
|
||||
|
||||
OPENCV_HAL_WRAP_SHIFT_OP(v_uint16)
|
||||
OPENCV_HAL_WRAP_SHIFT_OP(v_uint32)
|
||||
OPENCV_HAL_WRAP_SHIFT_OP(v_uint64)
|
||||
OPENCV_HAL_WRAP_SHIFT_OP(v_int16)
|
||||
OPENCV_HAL_WRAP_SHIFT_OP(v_int32)
|
||||
OPENCV_HAL_WRAP_SHIFT_OP(v_int64)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
|
||||
// when we use CV_SIMD128 with 256/512 bit SIMD (e.g. AVX2 or AVX512)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x4)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x2)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4)
|
||||
OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x8)
|
||||
OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x4)
|
||||
OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x2)
|
||||
OPENCV_HAL_WRAP_SHIFT_OP(v_int16x8)
|
||||
OPENCV_HAL_WRAP_SHIFT_OP(v_int32x4)
|
||||
OPENCV_HAL_WRAP_SHIFT_OP(v_int64x2)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2)
|
||||
#endif
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
|
||||
// when we use CV_SIMD256 with 512 bit SIMD (e.g. AVX512)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x4)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8)
|
||||
OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x16)
|
||||
OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x8)
|
||||
OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x4)
|
||||
OPENCV_HAL_WRAP_SHIFT_OP(v_int16x16)
|
||||
OPENCV_HAL_WRAP_SHIFT_OP(v_int32x8)
|
||||
OPENCV_HAL_WRAP_SHIFT_OP(v_int64x4)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_WRAP_BIN_OP_LOGIC(_Tpvec) \
|
||||
inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
return a & b; \
|
||||
} \
|
||||
inline _Tpvec v_or(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
return a | b; \
|
||||
} \
|
||||
inline _Tpvec v_xor(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
return a ^ b; \
|
||||
}
|
||||
|
||||
#define OPENCV_HAL_WRAP_NOT_OP(_Tpvec) \
|
||||
inline _Tpvec v_not(const _Tpvec& a) \
|
||||
{ \
|
||||
return ~a; \
|
||||
}
|
||||
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32)
|
||||
OPENCV_HAL_WRAP_NOT_OP(v_uint8)
|
||||
OPENCV_HAL_WRAP_NOT_OP(v_uint16)
|
||||
OPENCV_HAL_WRAP_NOT_OP(v_uint32)
|
||||
OPENCV_HAL_WRAP_NOT_OP(v_uint64)
|
||||
OPENCV_HAL_WRAP_NOT_OP(v_int8)
|
||||
OPENCV_HAL_WRAP_NOT_OP(v_int16)
|
||||
OPENCV_HAL_WRAP_NOT_OP(v_int32)
|
||||
OPENCV_HAL_WRAP_NOT_OP(v_int64)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64)
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x4)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x2)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x4)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x2)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x4)
|
||||
OPENCV_HAL_WRAP_NOT_OP(v_uint8x16)
|
||||
OPENCV_HAL_WRAP_NOT_OP(v_uint16x8)
|
||||
OPENCV_HAL_WRAP_NOT_OP(v_uint32x4)
|
||||
OPENCV_HAL_WRAP_NOT_OP(v_uint64x2)
|
||||
OPENCV_HAL_WRAP_NOT_OP(v_int8x16)
|
||||
OPENCV_HAL_WRAP_NOT_OP(v_int16x8)
|
||||
OPENCV_HAL_WRAP_NOT_OP(v_int32x4)
|
||||
OPENCV_HAL_WRAP_NOT_OP(v_int64x2)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x2)
|
||||
#endif
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x4)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x4)
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x8)
|
||||
OPENCV_HAL_WRAP_NOT_OP(v_uint8x32)
|
||||
OPENCV_HAL_WRAP_NOT_OP(v_uint16x16)
|
||||
OPENCV_HAL_WRAP_NOT_OP(v_uint32x8)
|
||||
OPENCV_HAL_WRAP_NOT_OP(v_uint64x4)
|
||||
OPENCV_HAL_WRAP_NOT_OP(v_int8x32)
|
||||
OPENCV_HAL_WRAP_NOT_OP(v_int16x16)
|
||||
OPENCV_HAL_WRAP_NOT_OP(v_int32x8)
|
||||
OPENCV_HAL_WRAP_NOT_OP(v_int64x4)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x4)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
|
||||
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
return a * b; \
|
||||
} \
|
||||
template<typename... Args> \
|
||||
inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
|
||||
return v_mul(f1 * f2, vf...); \
|
||||
}
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x4)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x4)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x4)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x2)
|
||||
#endif
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x8)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x4)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_WRAP_BIN_OP_DIV(_Tpvec) \
|
||||
inline _Tpvec v_div(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
return a / b; \
|
||||
}
|
||||
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64)
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
|
||||
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x4)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x2)
|
||||
#endif
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
|
||||
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x8)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x4)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_WRAP_CMP_OP(_Tpvec, intrin, op) \
|
||||
inline _Tpvec v_##intrin(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
return a op b; \
|
||||
}
|
||||
#define OPENCV_HAL_WRAP_EQ_OP(_Tpvec) \
|
||||
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
return a == b; \
|
||||
} \
|
||||
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
return a != b; \
|
||||
}
|
||||
|
||||
#define OPENCV_HAL_WRAP_CMP(_Tpvec) \
|
||||
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, eq, ==) \
|
||||
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ne, !=) \
|
||||
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, lt, <) \
|
||||
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, gt, >) \
|
||||
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, le, <=) \
|
||||
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ge, >=)
|
||||
|
||||
OPENCV_HAL_WRAP_CMP(v_uint8)
|
||||
OPENCV_HAL_WRAP_CMP(v_uint16)
|
||||
OPENCV_HAL_WRAP_CMP(v_uint32)
|
||||
OPENCV_HAL_WRAP_EQ_OP(v_uint64)
|
||||
OPENCV_HAL_WRAP_CMP(v_int8)
|
||||
OPENCV_HAL_WRAP_CMP(v_int16)
|
||||
OPENCV_HAL_WRAP_CMP(v_int32)
|
||||
OPENCV_HAL_WRAP_EQ_OP(v_int64)
|
||||
OPENCV_HAL_WRAP_CMP(v_float32)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_CMP(v_float64)
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
|
||||
OPENCV_HAL_WRAP_CMP(v_uint8x16)
|
||||
OPENCV_HAL_WRAP_CMP(v_uint16x8)
|
||||
OPENCV_HAL_WRAP_CMP(v_uint32x4)
|
||||
OPENCV_HAL_WRAP_EQ_OP(v_uint64x2)
|
||||
OPENCV_HAL_WRAP_CMP(v_int8x16)
|
||||
OPENCV_HAL_WRAP_CMP(v_int16x8)
|
||||
OPENCV_HAL_WRAP_CMP(v_int32x4)
|
||||
OPENCV_HAL_WRAP_EQ_OP(v_int64x2)
|
||||
OPENCV_HAL_WRAP_CMP(v_float32x4)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_CMP(v_float64x2)
|
||||
#endif
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
|
||||
OPENCV_HAL_WRAP_CMP(v_uint8x32)
|
||||
OPENCV_HAL_WRAP_CMP(v_uint16x16)
|
||||
OPENCV_HAL_WRAP_CMP(v_uint32x8)
|
||||
OPENCV_HAL_WRAP_EQ_OP(v_uint64x4)
|
||||
OPENCV_HAL_WRAP_CMP(v_int8x32)
|
||||
OPENCV_HAL_WRAP_CMP(v_int16x16)
|
||||
OPENCV_HAL_WRAP_CMP(v_int32x8)
|
||||
OPENCV_HAL_WRAP_EQ_OP(v_int64x4)
|
||||
OPENCV_HAL_WRAP_CMP(v_float32x8)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_CMP(v_float64x4)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//////////// get0 ////////////
|
||||
#define OPENCV_HAL_WRAP_GRT0(_Tpvec) \
|
||||
inline typename VTraits<_Tpvec>::lane_type v_get0(const _Tpvec& v) \
|
||||
@@ -786,96 +1076,6 @@ namespace CV__SIMD_NAMESPACE {
|
||||
OPENCV_HAL_WRAP_GRT0(v_float64x4)
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \
|
||||
template<typename... Args> \
|
||||
inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const _Tpvec& f3, const Args&... vf) { \
|
||||
return v_add(v_add(f1, f2), f3, vf...); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
|
||||
// when we use CV_SIMD128 with 256/512 bit SIMD (e.g. AVX2 or AVX512)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x4)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x2)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2)
|
||||
#endif
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
|
||||
// when we use CV_SIMD256 with 512 bit SIMD (e.g. AVX512)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x4)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
|
||||
template<typename... Args> \
|
||||
inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const _Tpvec& f3, const Args&... vf) { \
|
||||
return v_mul(v_mul(f1, f2), f3, vf...); \
|
||||
}
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x4)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x4)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x4)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x2)
|
||||
#endif
|
||||
#endif
|
||||
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x8)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x4)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \
|
||||
inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \
|
||||
@@ -946,6 +1146,74 @@ namespace CV__SIMD_NAMESPACE {
|
||||
|
||||
#endif //!CV_SIMD_SCALABLE
|
||||
|
||||
#if (CV_NEON /* || CV_others */) && !defined(CV_FORCE_SIMD128_CPP)
|
||||
// Compatibility layer for the backend that cleaned up.
|
||||
#define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \
|
||||
template<typename... Args> \
|
||||
inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
|
||||
return v_add(v_add(f1, f2), vf...); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64)
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
|
||||
template<typename... Args> \
|
||||
inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
|
||||
return v_mul(v_mul(f1, f2), vf...); \
|
||||
}
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32)
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \
|
||||
inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \
|
||||
{ \
|
||||
return v_extract_n<VTraits<_Tpvec>::nlanes-1>(v); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_uint8)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_int8)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_uint16)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_int16)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_uint32)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_int32)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_uint64)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_int64)
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_float32)
|
||||
#if CV_SIMD_64F
|
||||
OPENCV_HAL_WRAP_EXTRACT(v_float64)
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \
|
||||
inline _Tpvec v_broadcast_highest(const _Tpvec& v) \
|
||||
{ \
|
||||
return v_broadcast_element<VTraits<_Tpvec>::nlanes-1>(v); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_WRAP_BROADCAST(v_uint32)
|
||||
OPENCV_HAL_WRAP_BROADCAST(v_int32)
|
||||
OPENCV_HAL_WRAP_BROADCAST(v_float32)
|
||||
|
||||
#endif //CV_NEON
|
||||
|
||||
//! @cond IGNORED
|
||||
|
||||
// backward compatibility
|
||||
|
||||
@@ -447,10 +447,6 @@ OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float64x4, double, pd, __m128d)
|
||||
{ return _Tpvec(_mm256_setzero_si256()); } \
|
||||
inline _Tpvec v256_setall_##suffix(_Tp v) \
|
||||
{ return _Tpvec(_mm256_set1_##ssuffix((ctype_s)v)); } \
|
||||
template <> inline _Tpvec v_setzero_() \
|
||||
{ return v256_setzero_##suffix(); } \
|
||||
template <> inline _Tpvec v_setall_(_Tp v) \
|
||||
{ return v256_setall_##suffix(v); } \
|
||||
OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, OPENCV_HAL_NOP) \
|
||||
OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, OPENCV_HAL_NOP) \
|
||||
OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \
|
||||
@@ -476,10 +472,6 @@ OPENCV_HAL_IMPL_AVX_INIT(v_int64x4, int64, s64, epi64x, int64)
|
||||
{ return _Tpvec(_mm256_setzero_##zsuffix()); } \
|
||||
inline _Tpvec v256_setall_##suffix(_Tp v) \
|
||||
{ return _Tpvec(_mm256_set1_##zsuffix(v)); } \
|
||||
template <> inline _Tpvec v_setzero_() \
|
||||
{ return v256_setzero_##suffix(); } \
|
||||
template <> inline _Tpvec v_setall_(_Tp v) \
|
||||
{ return v256_setall_##suffix(v); } \
|
||||
OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, cast) \
|
||||
OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, cast) \
|
||||
OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, cast) \
|
||||
@@ -681,51 +673,53 @@ OPENCV_HAL_IMPL_AVX_ZIP(v_float64x4)
|
||||
|
||||
/** Arithmetics **/
|
||||
#define OPENCV_HAL_IMPL_AVX_BIN_OP(bin_op, _Tpvec, intrin) \
|
||||
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(intrin(a.val, b.val)); }
|
||||
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(intrin(a.val, b.val)); } \
|
||||
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
|
||||
{ a.val = intrin(a.val, b.val); return a; }
|
||||
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint8x32, _mm256_adds_epu8)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint8x32, _mm256_subs_epu8)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int8x32, _mm256_adds_epi8)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int8x32, _mm256_subs_epi8)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint16x16, _mm256_adds_epu16)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint16x16, _mm256_subs_epu16)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int16x16, _mm256_adds_epi16)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int16x16, _mm256_subs_epi16)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint32x8, _mm256_add_epi32)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint32x8, _mm256_sub_epi32)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_uint32x8, _mm256_mullo_epi32)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int32x8, _mm256_add_epi32)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int32x8, _mm256_sub_epi32)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_int32x8, _mm256_mullo_epi32)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint64x4, _mm256_add_epi64)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint64x4, _mm256_sub_epi64)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int64x4, _mm256_add_epi64)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int64x4, _mm256_sub_epi64)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint8x32, _mm256_adds_epu8)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint8x32, _mm256_subs_epu8)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32, _mm256_adds_epi8)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32, _mm256_subs_epi8)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16, _mm256_adds_epi16)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16, _mm256_subs_epi16)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8, _mm256_add_epi32)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8, _mm256_sub_epi32)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8, _mm256_mullo_epi32)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int32x8, _mm256_add_epi32)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int32x8, _mm256_sub_epi32)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int32x8, _mm256_mullo_epi32)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint64x4, _mm256_add_epi64)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint64x4, _mm256_sub_epi64)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int64x4, _mm256_add_epi64)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int64x4, _mm256_sub_epi64)
|
||||
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_float32x8, _mm256_add_ps)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_float32x8, _mm256_sub_ps)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_float32x8, _mm256_mul_ps)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_div, v_float32x8, _mm256_div_ps)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_float64x4, _mm256_add_pd)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_float64x4, _mm256_sub_pd)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_float64x4, _mm256_mul_pd)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_div, v_float64x4, _mm256_div_pd)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float32x8, _mm256_add_ps)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float32x8, _mm256_sub_ps)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float32x8, _mm256_mul_ps)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float32x8, _mm256_div_ps)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float64x4, _mm256_add_pd)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd)
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd)
|
||||
|
||||
// saturating multiply 8-bit, 16-bit
|
||||
inline v_uint8x32 v_mul(const v_uint8x32& a, const v_uint8x32& b)
|
||||
inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
|
||||
{
|
||||
v_uint16x16 c, d;
|
||||
v_mul_expand(a, b, c, d);
|
||||
return v_pack(c, d);
|
||||
}
|
||||
inline v_int8x32 v_mul(const v_int8x32& a, const v_int8x32& b)
|
||||
inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
|
||||
{
|
||||
v_int16x16 c, d;
|
||||
v_mul_expand(a, b, c, d);
|
||||
return v_pack(c, d);
|
||||
}
|
||||
inline v_uint16x16 v_mul(const v_uint16x16& a, const v_uint16x16& b)
|
||||
inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
|
||||
{
|
||||
__m256i pl = _mm256_mullo_epi16(a.val, b.val);
|
||||
__m256i ph = _mm256_mulhi_epu16(a.val, b.val);
|
||||
@@ -733,7 +727,7 @@ inline v_uint16x16 v_mul(const v_uint16x16& a, const v_uint16x16& b)
|
||||
__m256i p1 = _mm256_unpackhi_epi16(pl, ph);
|
||||
return v_uint16x16(_v256_packs_epu32(p0, p1));
|
||||
}
|
||||
inline v_int16x16 v_mul(const v_int16x16& a, const v_int16x16& b)
|
||||
inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
|
||||
{
|
||||
__m256i pl = _mm256_mullo_epi16(a.val, b.val);
|
||||
__m256i ph = _mm256_mulhi_epi16(a.val, b.val);
|
||||
@@ -741,6 +735,14 @@ inline v_int16x16 v_mul(const v_int16x16& a, const v_int16x16& b)
|
||||
__m256i p1 = _mm256_unpackhi_epi16(pl, ph);
|
||||
return v_int16x16(_mm256_packs_epi32(p0, p1));
|
||||
}
|
||||
inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
|
||||
{ a = a * b; return a; }
|
||||
inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
|
||||
{ a = a * b; return a; }
|
||||
inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
|
||||
{ a = a * b; return a; }
|
||||
inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
|
||||
{ a = a * b; return a; }
|
||||
|
||||
/** Non-saturating arithmetics **/
|
||||
#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
|
||||
@@ -831,13 +833,13 @@ inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return
|
||||
|
||||
/** Bitwise shifts **/
|
||||
#define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
|
||||
inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \
|
||||
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
|
||||
{ return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \
|
||||
inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \
|
||||
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
|
||||
{ return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \
|
||||
inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \
|
||||
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
|
||||
{ return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \
|
||||
inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \
|
||||
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
|
||||
{ return _Tpsvec(srai(a.val, imm)); } \
|
||||
template<int imm> \
|
||||
inline _Tpuvec v_shl(const _Tpuvec& a) \
|
||||
@@ -865,11 +867,11 @@ OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint64x4, v_int64x4, epi64, _mm256_srai_epi64xx
|
||||
|
||||
|
||||
/** Bitwise logic **/
|
||||
#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_and, _Tpvec, _mm256_and_##suffix) \
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_or, _Tpvec, _mm256_or_##suffix) \
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(v_xor, _Tpvec, _mm256_xor_##suffix) \
|
||||
inline _Tpvec v_not(const _Tpvec& a) \
|
||||
#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(&, _Tpvec, _mm256_and_##suffix) \
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(|, _Tpvec, _mm256_or_##suffix) \
|
||||
OPENCV_HAL_IMPL_AVX_BIN_OP(^, _Tpvec, _mm256_xor_##suffix) \
|
||||
inline _Tpvec operator ~ (const _Tpvec& a) \
|
||||
{ return _Tpvec(_mm256_xor_##suffix(a.val, not_const)); }
|
||||
|
||||
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint8x32, si256, _mm256_set1_epi32(-1))
|
||||
@@ -898,29 +900,29 @@ OPENCV_HAL_IMPL_AVX_SELECT(v_float32x8, ps)
|
||||
OPENCV_HAL_IMPL_AVX_SELECT(v_float64x4, pd)
|
||||
|
||||
/** Comparison **/
|
||||
#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \
|
||||
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return v_not(v_eq(a, b)); } \
|
||||
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return v_gt(b, a); } \
|
||||
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return v_not(v_lt(a, b)); } \
|
||||
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return v_ge(b, a); }
|
||||
#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \
|
||||
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return ~(a == b); } \
|
||||
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return b > a; } \
|
||||
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return ~(a < b); } \
|
||||
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return b >= a; }
|
||||
|
||||
#define OPENCV_HAL_IMPL_AVX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, sbit) \
|
||||
inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \
|
||||
inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
|
||||
{ return _Tpuvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \
|
||||
inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \
|
||||
inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
|
||||
{ \
|
||||
__m256i smask = _mm256_set1_##suffix(sbit); \
|
||||
return _Tpuvec(_mm256_cmpgt_##suffix( \
|
||||
_mm256_xor_si256(a.val, smask), \
|
||||
_mm256_xor_si256(b.val, smask))); \
|
||||
} \
|
||||
inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \
|
||||
inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
|
||||
{ return _Tpsvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \
|
||||
inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \
|
||||
inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
|
||||
{ return _Tpsvec(_mm256_cmpgt_##suffix(a.val, b.val)); } \
|
||||
OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpuvec) \
|
||||
OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpsvec)
|
||||
@@ -930,25 +932,25 @@ OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint16x16, v_int16x16, epi16, (short)-32768)
|
||||
OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint32x8, v_int32x8, epi32, (int)0x80000000)
|
||||
|
||||
#define OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(_Tpvec) \
|
||||
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(_mm256_cmpeq_epi64(a.val, b.val)); } \
|
||||
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return v_not(v_eq(a, b)); }
|
||||
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return ~(a == b); }
|
||||
|
||||
OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_uint64x4)
|
||||
OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4)
|
||||
|
||||
#define OPENCV_HAL_IMPL_AVX_CMP_FLT(bin_op, imm8, _Tpvec, suffix) \
|
||||
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(_mm256_cmp_##suffix(a.val, b.val, imm8)); }
|
||||
|
||||
#define OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(_Tpvec, suffix) \
|
||||
OPENCV_HAL_IMPL_AVX_CMP_FLT(v_eq, _CMP_EQ_OQ, _Tpvec, suffix) \
|
||||
OPENCV_HAL_IMPL_AVX_CMP_FLT(v_ne, _CMP_NEQ_OQ, _Tpvec, suffix) \
|
||||
OPENCV_HAL_IMPL_AVX_CMP_FLT(v_lt, _CMP_LT_OQ, _Tpvec, suffix) \
|
||||
OPENCV_HAL_IMPL_AVX_CMP_FLT(v_gt, _CMP_GT_OQ, _Tpvec, suffix) \
|
||||
OPENCV_HAL_IMPL_AVX_CMP_FLT(v_le, _CMP_LE_OQ, _Tpvec, suffix) \
|
||||
OPENCV_HAL_IMPL_AVX_CMP_FLT(v_ge, _CMP_GE_OQ, _Tpvec, suffix)
|
||||
OPENCV_HAL_IMPL_AVX_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, suffix) \
|
||||
OPENCV_HAL_IMPL_AVX_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, suffix) \
|
||||
OPENCV_HAL_IMPL_AVX_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, suffix) \
|
||||
OPENCV_HAL_IMPL_AVX_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, suffix) \
|
||||
OPENCV_HAL_IMPL_AVX_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, suffix) \
|
||||
OPENCV_HAL_IMPL_AVX_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, suffix)
|
||||
|
||||
OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps)
|
||||
OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd)
|
||||
@@ -1214,9 +1216,9 @@ inline unsigned v_reduce_sum(const v_uint32x8& a)
|
||||
{ return v_reduce_sum(v_reinterpret_as_s32(a)); }
|
||||
|
||||
inline int v_reduce_sum(const v_int16x16& a)
|
||||
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
|
||||
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
|
||||
inline unsigned v_reduce_sum(const v_uint16x16& a)
|
||||
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
|
||||
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
|
||||
|
||||
inline float v_reduce_sum(const v_float32x8& a)
|
||||
{
|
||||
@@ -1271,27 +1273,27 @@ inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
|
||||
inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
|
||||
{
|
||||
v_uint32x8 l, h;
|
||||
v_expand(v_add_wrap(v_sub(a, b), v_sub(b, a)), l, h);
|
||||
return v_reduce_sum(v_add(l, h));
|
||||
v_expand(v_add_wrap(a - b, b - a), l, h);
|
||||
return v_reduce_sum(l + h);
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
|
||||
{
|
||||
v_uint32x8 l, h;
|
||||
v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h);
|
||||
return v_reduce_sum(v_add(l, h));
|
||||
return v_reduce_sum(l + h);
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
|
||||
{
|
||||
return v_reduce_sum(v_sub(v_max(a, b), v_min(a, b)));
|
||||
return v_reduce_sum(v_max(a, b) - v_min(a, b));
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
|
||||
{
|
||||
v_int32x8 m = v_lt(a, b);
|
||||
return v_reduce_sum(v_reinterpret_as_u32(v_sub(v_xor(v_sub(a, b), m), m)));
|
||||
v_int32x8 m = a < b;
|
||||
return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m));
|
||||
}
|
||||
inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
|
||||
{
|
||||
return v_reduce_sum(v_and(v_sub(a, b), v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)))));
|
||||
return v_reduce_sum((a - b) & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
|
||||
}
|
||||
|
||||
/** Popcount **/
|
||||
@@ -1306,15 +1308,15 @@ inline v_uint8x32 v_popcount(const v_uint8x32& a)
|
||||
inline v_uint16x16 v_popcount(const v_uint16x16& a)
|
||||
{
|
||||
v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
|
||||
p = v_add(p, v_rotate_right<1>(p));
|
||||
return v_and(v_reinterpret_as_u16(p), v256_setall_u16(0x00ff));
|
||||
p += v_rotate_right<1>(p);
|
||||
return v_reinterpret_as_u16(p) & v256_setall_u16(0x00ff);
|
||||
}
|
||||
inline v_uint32x8 v_popcount(const v_uint32x8& a)
|
||||
{
|
||||
v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
|
||||
p = v_add(p, v_rotate_right<1>(p));
|
||||
p = v_add(p, v_rotate_right<2>(p));
|
||||
return v_and(v_reinterpret_as_u32(p), v256_setall_u32(0x000000ff));
|
||||
p += v_rotate_right<1>(p);
|
||||
p += v_rotate_right<2>(p);
|
||||
return v_reinterpret_as_u32(p) & v256_setall_u32(0x000000ff);
|
||||
}
|
||||
inline v_uint64x4 v_popcount(const v_uint64x4& a)
|
||||
{
|
||||
@@ -1406,9 +1408,9 @@ OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_int16x16)
|
||||
inline _Tpvec v_sqrt(const _Tpvec& x) \
|
||||
{ return _Tpvec(_mm256_sqrt_##suffix(x.val)); } \
|
||||
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return v_fma(a, a, v_mul(b, b)); } \
|
||||
{ return v_fma(a, a, b * b); } \
|
||||
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return v_sqrt(v_fma(a, a, v_mul(b, b))); }
|
||||
{ return v_sqrt(v_fma(a, a, b*b)); }
|
||||
|
||||
OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps)
|
||||
OPENCV_HAL_IMPL_AVX_MULADD(v_float64x4, pd)
|
||||
@@ -1417,7 +1419,7 @@ OPENCV_HAL_IMPL_AVX_MISC(v_float64x4, pd)
|
||||
|
||||
inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
|
||||
{
|
||||
return v_add(v_mul(a, b), c);
|
||||
return a * b + c;
|
||||
}
|
||||
|
||||
inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
|
||||
@@ -1427,16 +1429,16 @@ inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x
|
||||
|
||||
inline v_float32x8 v_invsqrt(const v_float32x8& x)
|
||||
{
|
||||
v_float32x8 half = v_mul(x, v256_setall_f32(0.5));
|
||||
v_float32x8 half = x * v256_setall_f32(0.5);
|
||||
v_float32x8 t = v_float32x8(_mm256_rsqrt_ps(x.val));
|
||||
// todo: _mm256_fnmsub_ps
|
||||
t = v_mul(t, v_sub(v256_setall_f32(1.5), v_mul(v_mul(t, t), half)));
|
||||
t *= v256_setall_f32(1.5) - ((t * t) * half);
|
||||
return t;
|
||||
}
|
||||
|
||||
inline v_float64x4 v_invsqrt(const v_float64x4& x)
|
||||
{
|
||||
return v_div(v256_setall_f64(1.), v_sqrt(x));
|
||||
return v256_setall_f64(1.) / v_sqrt(x);
|
||||
}
|
||||
|
||||
/** Absolute values **/
|
||||
@@ -1449,23 +1451,23 @@ OPENCV_HAL_IMPL_AVX_ABS(int16x16, epi16)
|
||||
OPENCV_HAL_IMPL_AVX_ABS(int32x8, epi32)
|
||||
|
||||
inline v_float32x8 v_abs(const v_float32x8& x)
|
||||
{ return v_and(x, v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)))); }
|
||||
{ return x & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); }
|
||||
inline v_float64x4 v_abs(const v_float64x4& x)
|
||||
{ return v_and(x, v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1)))); }
|
||||
{ return x & v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1))); }
|
||||
|
||||
/** Absolute difference **/
|
||||
inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b)
|
||||
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
|
||||
{ return v_add_wrap(a - b, b - a); }
|
||||
inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b)
|
||||
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
|
||||
{ return v_add_wrap(a - b, b - a); }
|
||||
inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b)
|
||||
{ return v_sub(v_max(a, b), v_min(a, b)); }
|
||||
{ return v_max(a, b) - v_min(a, b); }
|
||||
|
||||
inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b)
|
||||
{
|
||||
v_int8x32 d = v_sub_wrap(a, b);
|
||||
v_int8x32 m = v_lt(a, b);
|
||||
return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m));
|
||||
v_int8x32 m = a < b;
|
||||
return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
|
||||
}
|
||||
|
||||
inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
|
||||
@@ -1473,26 +1475,26 @@ inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
|
||||
|
||||
inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
|
||||
{
|
||||
v_int32x8 d = v_sub(a, b);
|
||||
v_int32x8 m = v_lt(a, b);
|
||||
return v_reinterpret_as_u32(v_sub(v_xor(d, m), m));
|
||||
v_int32x8 d = a - b;
|
||||
v_int32x8 m = a < b;
|
||||
return v_reinterpret_as_u32((d ^ m) - m);
|
||||
}
|
||||
|
||||
inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
|
||||
{ return v_abs(v_sub(a, b)); }
|
||||
{ return v_abs(a - b); }
|
||||
|
||||
inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
|
||||
{ return v_abs(v_sub(a, b)); }
|
||||
{ return v_abs(a - b); }
|
||||
|
||||
/** Saturating absolute difference **/
|
||||
inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b)
|
||||
{
|
||||
v_int8x32 d = v_sub(a, b);
|
||||
v_int8x32 m = v_lt(a, b);
|
||||
return v_sub(v_xor(d, m), m);
|
||||
v_int8x32 d = a - b;
|
||||
v_int8x32 m = a < b;
|
||||
return (d ^ m) - m;
|
||||
}
|
||||
inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b)
|
||||
{ return v_sub(v_max(a, b), v_min(a, b)); }
|
||||
{ return v_max(a, b) - v_min(a, b); }
|
||||
|
||||
////////// Conversions /////////
|
||||
|
||||
@@ -1787,7 +1789,7 @@ inline v_float32x8 v_pack_triplets(const v_float32x8& vec)
|
||||
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
|
||||
{ return v_int32x8(_mm256_madd_epi16(a.val, b.val)); }
|
||||
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
|
||||
{ return v_add(v_dotprod(a, b), c); }
|
||||
{ return v_dotprod(a, b) + c; }
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
|
||||
@@ -1797,7 +1799,7 @@ inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
|
||||
return v_int64x4(_mm256_add_epi64(even, odd));
|
||||
}
|
||||
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
|
||||
{ return v_add(v_dotprod(a, b), c); }
|
||||
{ return v_dotprod(a, b) + c; }
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
|
||||
@@ -1814,7 +1816,7 @@ inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
|
||||
return v_uint32x8(_mm256_add_epi32(prod0, prod1));
|
||||
}
|
||||
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
|
||||
{
|
||||
@@ -1829,7 +1831,7 @@ inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
|
||||
return v_int32x8(_mm256_add_epi32(prod0, prod1));
|
||||
}
|
||||
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
|
||||
@@ -1853,7 +1855,7 @@ inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
|
||||
));
|
||||
}
|
||||
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
|
||||
{
|
||||
@@ -1869,13 +1871,13 @@ inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
|
||||
));
|
||||
}
|
||||
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b)
|
||||
{ return v_cvt_f64(v_dotprod(a, b)); }
|
||||
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
//////// Fast Dot Product ////////
|
||||
|
||||
@@ -1921,7 +1923,7 @@ inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16&
|
||||
return v_uint64x4(_mm256_add_epi64(p15_, p9d_));
|
||||
}
|
||||
inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
|
||||
{ return v_add(v_dotprod_expand_fast(a, b), c); }
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
|
||||
{
|
||||
@@ -1932,7 +1934,7 @@ inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
|
||||
return v_int64x4(_mm256_add_epi64(lo, hi));
|
||||
}
|
||||
inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
|
||||
{ return v_add(v_dotprod_expand_fast(a, b), c); }
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b)
|
||||
@@ -1951,7 +1953,7 @@ inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0,
|
||||
v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
|
||||
v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
|
||||
v_float32x8 v37 = OPENCV_HAL_AVX_SPLAT2_PS(v, 3);
|
||||
return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v_mul(v37, m3))));
|
||||
return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
|
||||
}
|
||||
|
||||
inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0,
|
||||
@@ -2056,43 +2058,43 @@ v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b)
|
||||
{
|
||||
// we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
|
||||
v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
|
||||
return v_pack_u(v_reinterpret_as_s16(v_shr(v_add(a, delta), n)),
|
||||
v_reinterpret_as_s16(v_shr(v_add(b, delta), n)));
|
||||
return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
|
||||
v_reinterpret_as_s16((b + delta) >> n));
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
|
||||
{
|
||||
v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
|
||||
v_pack_u_store(ptr, v_reinterpret_as_s16(v_shr(v_add(a, delta), n)));
|
||||
v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b)
|
||||
{
|
||||
v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
|
||||
return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
|
||||
return v_pack_u((a + delta) >> n, (b + delta) >> n);
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
|
||||
{
|
||||
v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
|
||||
v_pack_u_store(ptr, v_shr(v_add(a, delta), n));
|
||||
v_pack_u_store(ptr, (a + delta) >> n);
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b)
|
||||
{
|
||||
v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
|
||||
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
|
||||
return v_pack((a + delta) >> n, (b + delta) >> n);
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
|
||||
{
|
||||
v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
|
||||
v_pack_store(ptr, v_shr(v_add(a, delta), n));
|
||||
v_pack_store(ptr, (a + delta) >> n);
|
||||
}
|
||||
|
||||
// 32
|
||||
@@ -2125,43 +2127,43 @@ v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b)
|
||||
{
|
||||
// we assume that n > 0, and so the shifted 32-bit values can be treated as signed numbers.
|
||||
v_uint32x8 delta = v256_setall_u32(1 << (n-1));
|
||||
return v_pack_u(v_reinterpret_as_s32(v_shr(v_add(a, delta), n)),
|
||||
v_reinterpret_as_s32(v_shr(v_add(b, delta), n)));
|
||||
return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
|
||||
v_reinterpret_as_s32((b + delta) >> n));
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
|
||||
{
|
||||
v_uint32x8 delta = v256_setall_u32(1 << (n-1));
|
||||
v_pack_u_store(ptr, v_reinterpret_as_s32(v_shr(v_add(a, delta), n)));
|
||||
v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b)
|
||||
{
|
||||
v_int32x8 delta = v256_setall_s32(1 << (n-1));
|
||||
return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
|
||||
return v_pack_u((a + delta) >> n, (b + delta) >> n);
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
|
||||
{
|
||||
v_int32x8 delta = v256_setall_s32(1 << (n-1));
|
||||
v_pack_u_store(ptr, v_shr(v_add(a, delta), n));
|
||||
v_pack_u_store(ptr, (a + delta) >> n);
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b)
|
||||
{
|
||||
v_int32x8 delta = v256_setall_s32(1 << (n-1));
|
||||
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
|
||||
return v_pack((a + delta) >> n, (b + delta) >> n);
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
void v_rshr_pack_store(short* ptr, const v_int32x8& a)
|
||||
{
|
||||
v_int32x8 delta = v256_setall_s32(1 << (n-1));
|
||||
v_pack_store(ptr, v_shr(v_add(a, delta), n));
|
||||
v_pack_store(ptr, (a + delta) >> n);
|
||||
}
|
||||
|
||||
// 64
|
||||
@@ -2190,28 +2192,28 @@ template<int n> inline
|
||||
v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)
|
||||
{
|
||||
v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
|
||||
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
|
||||
return v_pack((a + delta) >> n, (b + delta) >> n);
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
|
||||
{
|
||||
v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
|
||||
v_pack_store(ptr, v_shr(v_add(a, delta), n));
|
||||
v_pack_store(ptr, (a + delta) >> n);
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
|
||||
{
|
||||
v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
|
||||
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
|
||||
return v_pack((a + delta) >> n, (b + delta) >> n);
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
void v_rshr_pack_store(int* ptr, const v_int64x4& a)
|
||||
{
|
||||
v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
|
||||
v_pack_store(ptr, v_shr(v_add(a, delta), n));
|
||||
v_pack_store(ptr, (a + delta) >> n);
|
||||
}
|
||||
|
||||
// pack boolean
|
||||
@@ -3166,20 +3168,6 @@ inline void v_pack_store(hfloat* ptr, const v_float32x8& a)
|
||||
|
||||
inline void v256_cleanup() { _mm256_zeroall(); }
|
||||
|
||||
#include "intrin_math.hpp"
|
||||
inline v_float32x8 v_exp(const v_float32x8& x) { return v_exp_default_32f<v_float32x8, v_int32x8>(x); }
|
||||
inline v_float32x8 v_log(const v_float32x8& x) { return v_log_default_32f<v_float32x8, v_int32x8>(x); }
|
||||
inline void v_sincos(const v_float32x8& x, v_float32x8& s, v_float32x8& c) { v_sincos_default_32f<v_float32x8, v_int32x8>(x, s, c); }
|
||||
inline v_float32x8 v_sin(const v_float32x8& x) { return v_sin_default_32f<v_float32x8, v_int32x8>(x); }
|
||||
inline v_float32x8 v_cos(const v_float32x8& x) { return v_cos_default_32f<v_float32x8, v_int32x8>(x); }
|
||||
inline v_float32x8 v_erf(const v_float32x8& x) { return v_erf_default_32f<v_float32x8, v_int32x8>(x); }
|
||||
|
||||
inline v_float64x4 v_exp(const v_float64x4& x) { return v_exp_default_64f<v_float64x4, v_int64x4>(x); }
|
||||
inline v_float64x4 v_log(const v_float64x4& x) { return v_log_default_64f<v_float64x4, v_int64x4>(x); }
|
||||
inline void v_sincos(const v_float64x4& x, v_float64x4& s, v_float64x4& c) { v_sincos_default_64f<v_float64x4, v_int64x4>(x, s, c); }
|
||||
inline v_float64x4 v_sin(const v_float64x4& x) { return v_sin_default_64f<v_float64x4, v_int64x4>(x); }
|
||||
inline v_float64x4 v_cos(const v_float64x4& x) { return v_cos_default_64f<v_float64x4, v_int64x4>(x); }
|
||||
|
||||
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
|
||||
|
||||
//! @endcond
|
||||
|
||||
@@ -458,10 +458,6 @@ OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float64x8, double, pd, __m256d)
|
||||
{ return _Tpvec(_mm512_setzero_si512()); } \
|
||||
inline _Tpvec v512_setall_##suffix(_Tp v) \
|
||||
{ return _Tpvec(_mm512_set1_##ssuffix((ctype_s)v)); } \
|
||||
template <> inline _Tpvec v_setzero_() \
|
||||
{ return v512_setzero_##suffix(); } \
|
||||
template <> inline _Tpvec v_setall_(_Tp v) \
|
||||
{ return v512_setall_##suffix(v); } \
|
||||
OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, OPENCV_HAL_NOP) \
|
||||
OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, OPENCV_HAL_NOP) \
|
||||
OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, OPENCV_HAL_NOP) \
|
||||
@@ -487,10 +483,6 @@ OPENCV_HAL_IMPL_AVX512_INIT(v_int64x8, int64, s64, epi64, int64)
|
||||
{ return _Tpvec(_mm512_setzero_##zsuffix()); } \
|
||||
inline _Tpvec v512_setall_##suffix(_Tp v) \
|
||||
{ return _Tpvec(_mm512_set1_##zsuffix(v)); } \
|
||||
template <> inline _Tpvec v_setzero_() \
|
||||
{ return v512_setzero_##suffix(); } \
|
||||
template <> inline _Tpvec v_setall_(_Tp v) \
|
||||
{ return v512_setall_##suffix(v); } \
|
||||
OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, cast) \
|
||||
OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, cast) \
|
||||
OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, cast) \
|
||||
@@ -671,56 +663,58 @@ inline v_int8x64 v_mul_wrap(const v_int8x64& a, const v_int8x64& b)
|
||||
}
|
||||
|
||||
#define OPENCV_HAL_IMPL_AVX512_BIN_OP(bin_op, _Tpvec, intrin) \
|
||||
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(intrin(a.val, b.val)); }
|
||||
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(intrin(a.val, b.val)); } \
|
||||
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
|
||||
{ a.val = intrin(a.val, b.val); return a; }
|
||||
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint32x16, _mm512_add_epi32)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint32x16, _mm512_sub_epi32)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int32x16, _mm512_add_epi32)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int32x16, _mm512_sub_epi32)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint64x8, _mm512_add_epi64)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint64x8, _mm512_sub_epi64)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int64x8, _mm512_add_epi64)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int64x8, _mm512_sub_epi64)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint32x16, _mm512_add_epi32)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint32x16, _mm512_sub_epi32)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int32x16, _mm512_add_epi32)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int32x16, _mm512_sub_epi32)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint64x8, _mm512_add_epi64)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint64x8, _mm512_sub_epi64)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int64x8, _mm512_add_epi64)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int64x8, _mm512_sub_epi64)
|
||||
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_uint32x16, _mm512_mullo_epi32)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_int32x16, _mm512_mullo_epi32)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_uint64x8, _mm512_mullo_epi64)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_int64x8, _mm512_mullo_epi64)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint32x16, _mm512_mullo_epi32)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int32x16, _mm512_mullo_epi32)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint64x8, _mm512_mullo_epi64)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int64x8, _mm512_mullo_epi64)
|
||||
|
||||
/** Saturating arithmetics **/
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint8x64, _mm512_adds_epu8)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint8x64, _mm512_subs_epu8)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int8x64, _mm512_adds_epi8)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int8x64, _mm512_subs_epi8)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint16x32, _mm512_adds_epu16)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint16x32, _mm512_subs_epu16)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int16x32, _mm512_adds_epi16)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int16x32, _mm512_subs_epi16)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint8x64, _mm512_adds_epu8)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint8x64, _mm512_subs_epu8)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int8x64, _mm512_adds_epi8)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int8x64, _mm512_subs_epi8)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint16x32, _mm512_adds_epu16)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint16x32, _mm512_subs_epu16)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int16x32, _mm512_adds_epi16)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int16x32, _mm512_subs_epi16)
|
||||
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_float32x16, _mm512_add_ps)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_float32x16, _mm512_sub_ps)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_float32x16, _mm512_mul_ps)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_div, v_float32x16, _mm512_div_ps)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_float64x8, _mm512_add_pd)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_float64x8, _mm512_sub_pd)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_float64x8, _mm512_mul_pd)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_div, v_float64x8, _mm512_div_pd)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float32x16, _mm512_add_ps)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float32x16, _mm512_sub_ps)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float32x16, _mm512_mul_ps)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float32x16, _mm512_div_ps)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float64x8, _mm512_add_pd)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float64x8, _mm512_sub_pd)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float64x8, _mm512_mul_pd)
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float64x8, _mm512_div_pd)
|
||||
|
||||
// saturating multiply
|
||||
inline v_uint8x64 v_mul(const v_uint8x64& a, const v_uint8x64& b)
|
||||
inline v_uint8x64 operator * (const v_uint8x64& a, const v_uint8x64& b)
|
||||
{
|
||||
v_uint16x32 c, d;
|
||||
v_mul_expand(a, b, c, d);
|
||||
return v_pack(c, d);
|
||||
}
|
||||
inline v_int8x64 v_mul(const v_int8x64& a, const v_int8x64& b)
|
||||
inline v_int8x64 operator * (const v_int8x64& a, const v_int8x64& b)
|
||||
{
|
||||
v_int16x32 c, d;
|
||||
v_mul_expand(a, b, c, d);
|
||||
return v_pack(c, d);
|
||||
}
|
||||
inline v_uint16x32 v_mul(const v_uint16x32& a, const v_uint16x32& b)
|
||||
inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b)
|
||||
{
|
||||
__m512i pl = _mm512_mullo_epi16(a.val, b.val);
|
||||
__m512i ph = _mm512_mulhi_epu16(a.val, b.val);
|
||||
@@ -730,7 +724,7 @@ inline v_uint16x32 v_mul(const v_uint16x32& a, const v_uint16x32& b)
|
||||
const __m512i m = _mm512_set1_epi32(65535);
|
||||
return v_uint16x32(_mm512_packus_epi32(_mm512_min_epu32(p0, m), _mm512_min_epu32(p1, m)));
|
||||
}
|
||||
inline v_int16x32 v_mul(const v_int16x32& a, const v_int16x32& b)
|
||||
inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b)
|
||||
{
|
||||
__m512i pl = _mm512_mullo_epi16(a.val, b.val);
|
||||
__m512i ph = _mm512_mulhi_epi16(a.val, b.val);
|
||||
@@ -739,6 +733,15 @@ inline v_int16x32 v_mul(const v_int16x32& a, const v_int16x32& b)
|
||||
return v_int16x32(_mm512_packs_epi32(p0, p1));
|
||||
}
|
||||
|
||||
inline v_uint8x64& operator *= (v_uint8x64& a, const v_uint8x64& b)
|
||||
{ a = a * b; return a; }
|
||||
inline v_int8x64& operator *= (v_int8x64& a, const v_int8x64& b)
|
||||
{ a = a * b; return a; }
|
||||
inline v_uint16x32& operator *= (v_uint16x32& a, const v_uint16x32& b)
|
||||
{ a = a * b; return a; }
|
||||
inline v_int16x32& operator *= (v_int16x32& a, const v_int16x32& b)
|
||||
{ a = a * b; return a; }
|
||||
|
||||
inline v_int16x32 v_mul_hi(const v_int16x32& a, const v_int16x32& b) { return v_int16x32(_mm512_mulhi_epi16(a.val, b.val)); }
|
||||
inline v_uint16x32 v_mul_hi(const v_uint16x32& a, const v_uint16x32& b) { return v_uint16x32(_mm512_mulhi_epu16(a.val, b.val)); }
|
||||
|
||||
@@ -799,13 +802,13 @@ inline void v_mul_expand(const v_int32x16& a, const v_int32x16& b,
|
||||
|
||||
/** Bitwise shifts **/
|
||||
#define OPENCV_HAL_IMPL_AVX512_SHIFT_OP(_Tpuvec, _Tpsvec, suffix) \
|
||||
inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \
|
||||
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
|
||||
{ return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \
|
||||
inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \
|
||||
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
|
||||
{ return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \
|
||||
inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \
|
||||
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
|
||||
{ return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \
|
||||
inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \
|
||||
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
|
||||
{ return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); } \
|
||||
template<int imm> \
|
||||
inline _Tpuvec v_shl(const _Tpuvec& a) \
|
||||
@@ -827,10 +830,10 @@ OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint64x8, v_int64x8, epi64)
|
||||
|
||||
/** Bitwise logic **/
|
||||
#define OPENCV_HAL_IMPL_AVX512_LOGIC_OP(_Tpvec, suffix, not_const) \
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_and, _Tpvec, _mm512_and_##suffix) \
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_or, _Tpvec, _mm512_or_##suffix) \
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_xor, _Tpvec, _mm512_xor_##suffix) \
|
||||
inline _Tpvec v_not(const _Tpvec& a) \
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(&, _Tpvec, _mm512_and_##suffix) \
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(|, _Tpvec, _mm512_or_##suffix) \
|
||||
OPENCV_HAL_IMPL_AVX512_BIN_OP(^, _Tpvec, _mm512_xor_##suffix) \
|
||||
inline _Tpvec operator ~ (const _Tpvec& a) \
|
||||
{ return _Tpvec(_mm512_xor_##suffix(a.val, not_const)); }
|
||||
|
||||
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint8x64, si512, _mm512_set1_epi32(-1))
|
||||
@@ -862,16 +865,16 @@ OPENCV_HAL_IMPL_AVX512_SELECT(v_float64x8, pd, pd)
|
||||
|
||||
/** Comparison **/
|
||||
#define OPENCV_HAL_IMPL_AVX512_CMP_INT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
|
||||
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval)); }
|
||||
|
||||
#define OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(_Tpvec, sufcmp, sufset, tval) \
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_INT(v_eq, _MM_CMPINT_EQ, _Tpvec, sufcmp, sufset, tval) \
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_INT(v_ne, _MM_CMPINT_NE, _Tpvec, sufcmp, sufset, tval) \
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_INT(v_lt, _MM_CMPINT_LT, _Tpvec, sufcmp, sufset, tval) \
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_INT(v_gt, _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_INT(v_le, _MM_CMPINT_LE, _Tpvec, sufcmp, sufset, tval) \
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_INT(v_ge, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval)
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_INT(==, _MM_CMPINT_EQ, _Tpvec, sufcmp, sufset, tval) \
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_INT(!=, _MM_CMPINT_NE, _Tpvec, sufcmp, sufset, tval) \
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_INT(<, _MM_CMPINT_LT, _Tpvec, sufcmp, sufset, tval) \
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_INT(>, _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_INT(<=, _MM_CMPINT_LE, _Tpvec, sufcmp, sufset, tval) \
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_INT(>=, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval)
|
||||
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint8x64, epu8, epi8, (char)-1)
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int8x64, epi8, epi8, (char)-1)
|
||||
@@ -883,16 +886,16 @@ OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint64x8, epu64, epi64, (int64)-1)
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int64x8, epi64, epi64, (int64)-1)
|
||||
|
||||
#define OPENCV_HAL_IMPL_AVX512_CMP_FLT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
|
||||
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(_mm512_castsi512_##sufcmp(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval))); }
|
||||
|
||||
#define OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(_Tpvec, sufcmp, sufset, tval) \
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_eq, _CMP_EQ_OQ, _Tpvec, sufcmp, sufset, tval) \
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_ne, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_lt, _CMP_LT_OQ, _Tpvec, sufcmp, sufset, tval) \
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_gt, _CMP_GT_OQ, _Tpvec, sufcmp, sufset, tval) \
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_le, _CMP_LE_OQ, _Tpvec, sufcmp, sufset, tval) \
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_ge, _CMP_GE_OQ, _Tpvec, sufcmp, sufset, tval)
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, sufcmp, sufset, tval) \
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, sufcmp, sufset, tval) \
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, sufcmp, sufset, tval) \
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, sufcmp, sufset, tval) \
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, sufcmp, sufset, tval)
|
||||
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float32x16, ps, epi32, (int)-1)
|
||||
OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float64x8, pd, epi64, (int64)-1)
|
||||
@@ -1247,9 +1250,9 @@ OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, min, v_int16x32, min_epi16)
|
||||
OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, max, v_int16x32, max_epi16)
|
||||
|
||||
inline int v_reduce_sum(const v_int16x32& a)
|
||||
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
|
||||
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
|
||||
inline uint v_reduce_sum(const v_uint16x32& a)
|
||||
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
|
||||
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
|
||||
|
||||
#define OPENCV_HAL_IMPL_AVX512_REDUCE_64(sctype, func, _Tpvec, ifunc) \
|
||||
inline sctype v_reduce_##func(const _Tpvec& a) \
|
||||
@@ -1303,17 +1306,17 @@ inline unsigned v_reduce_sad(const v_int8x64& a, const v_int8x64& b)
|
||||
return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_uint16x32& a, const v_uint16x32& b)
|
||||
{ return v_reduce_sum(v_add_wrap(v_sub(a, b), v_sub(b, a))); }
|
||||
{ return v_reduce_sum(v_add_wrap(a - b, b - a)); }
|
||||
inline unsigned v_reduce_sad(const v_int16x32& a, const v_int16x32& b)
|
||||
{ return v_reduce_sum(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)))); }
|
||||
inline unsigned v_reduce_sad(const v_uint32x16& a, const v_uint32x16& b)
|
||||
{ return v_reduce_sum(v_sub(v_max(a, b), v_min(a, b))); }
|
||||
{ return v_reduce_sum(v_max(a, b) - v_min(a, b)); }
|
||||
inline unsigned v_reduce_sad(const v_int32x16& a, const v_int32x16& b)
|
||||
{ return v_reduce_sum(v_reinterpret_as_u32(v_sub(v_max(a, b), v_min(a, b)))); }
|
||||
{ return v_reduce_sum(v_reinterpret_as_u32(v_max(a, b) - v_min(a, b))); }
|
||||
inline float v_reduce_sad(const v_float32x16& a, const v_float32x16& b)
|
||||
{ return v_reduce_sum(v_and(v_sub(a, b), v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff))))); }
|
||||
{ return v_reduce_sum((a - b) & v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)))); }
|
||||
inline double v_reduce_sad(const v_float64x8& a, const v_float64x8& b)
|
||||
{ return v_reduce_sum(v_and(v_sub(a, b), v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff))))); }
|
||||
{ return v_reduce_sum((a - b) & v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff)))); }
|
||||
|
||||
/** Popcount **/
|
||||
inline v_uint8x64 v_popcount(const v_int8x64& a)
|
||||
@@ -1348,8 +1351,8 @@ inline v_uint16x32 v_popcount(const v_int16x32& a)
|
||||
_mm512_popcnt_epi32(_mm512_unpackhi_epi16(a.val, zero))));
|
||||
#else
|
||||
v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a));
|
||||
p = v_add(p, v_rotate_right<1>(p));
|
||||
return v_and(v_reinterpret_as_u16(p), v512_setall_u16(0x00ff));
|
||||
p += v_rotate_right<1>(p);
|
||||
return v_reinterpret_as_u16(p) & v512_setall_u16(0x00ff);
|
||||
#endif
|
||||
}
|
||||
inline v_uint32x16 v_popcount(const v_int32x16& a)
|
||||
@@ -1358,9 +1361,9 @@ inline v_uint32x16 v_popcount(const v_int32x16& a)
|
||||
return v_uint32x16(_mm512_popcnt_epi32(a.val));
|
||||
#else
|
||||
v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a));
|
||||
p = v_add(p, v_rotate_right<1>(p));
|
||||
p = v_add(p, v_rotate_right<2>(p));
|
||||
return v_and(v_reinterpret_as_u32(p), v512_setall_u32(0x000000ff));
|
||||
p += v_rotate_right<1>(p);
|
||||
p += v_rotate_right<2>(p);
|
||||
return v_reinterpret_as_u32(p) & v512_setall_u32(0x000000ff);
|
||||
#endif
|
||||
}
|
||||
inline v_uint64x8 v_popcount(const v_int64x8& a)
|
||||
@@ -1400,9 +1403,9 @@ inline v_uint64x8 v_popcount(const v_uint64x8& a) { return v_popcount(v_reinte
|
||||
inline _Tpvec v_sqrt(const _Tpvec& x) \
|
||||
{ return _Tpvec(_mm512_sqrt_##suffix(x.val)); } \
|
||||
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return v_fma(a, a, v_mul(b, b)); } \
|
||||
{ return v_fma(a, a, b * b); } \
|
||||
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return v_sqrt(v_fma(a, a, v_mul(b, b))); }
|
||||
{ return v_sqrt(v_fma(a, a, b * b)); }
|
||||
|
||||
OPENCV_HAL_IMPL_AVX512_MULADD(v_float32x16, ps)
|
||||
OPENCV_HAL_IMPL_AVX512_MULADD(v_float64x8, pd)
|
||||
@@ -1410,7 +1413,7 @@ OPENCV_HAL_IMPL_AVX512_MISC(v_float32x16, ps)
|
||||
OPENCV_HAL_IMPL_AVX512_MISC(v_float64x8, pd)
|
||||
|
||||
inline v_int32x16 v_fma(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
|
||||
{ return v_add(v_mul(a, b), c); }
|
||||
{ return a * b + c; }
|
||||
inline v_int32x16 v_muladd(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
|
||||
{ return v_fma(a, b, c); }
|
||||
|
||||
@@ -1419,9 +1422,9 @@ inline v_float32x16 v_invsqrt(const v_float32x16& x)
|
||||
#if CV_AVX_512ER
|
||||
return v_float32x16(_mm512_rsqrt28_ps(x.val));
|
||||
#else
|
||||
v_float32x16 half = v_mul(x, v512_setall_f32(0.5));
|
||||
v_float32x16 half = x * v512_setall_f32(0.5);
|
||||
v_float32x16 t = v_float32x16(_mm512_rsqrt14_ps(x.val));
|
||||
t = v_mul(t, v_sub(v512_setall_f32(1.5), v_mul(v_mul(t, t), half)));
|
||||
t *= v512_setall_f32(1.5) - ((t * t) * half);
|
||||
return t;
|
||||
#endif
|
||||
}
|
||||
@@ -1431,7 +1434,7 @@ inline v_float64x8 v_invsqrt(const v_float64x8& x)
|
||||
#if CV_AVX_512ER
|
||||
return v_float64x8(_mm512_rsqrt28_pd(x.val));
|
||||
#else
|
||||
return v_div(v512_setall_f64(1.), v_sqrt(x));
|
||||
return v512_setall_f64(1.) / v_sqrt(x);
|
||||
// v_float64x8 half = x * v512_setall_f64(0.5);
|
||||
// v_float64x8 t = v_float64x8(_mm512_rsqrt14_pd(x.val));
|
||||
// t *= v512_setall_f64(1.5) - ((t * t) * half);
|
||||
@@ -1479,17 +1482,17 @@ inline v_float64x8 v_abs(const v_float64x8& x)
|
||||
|
||||
/** Absolute difference **/
|
||||
inline v_uint8x64 v_absdiff(const v_uint8x64& a, const v_uint8x64& b)
|
||||
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
|
||||
{ return v_add_wrap(a - b, b - a); }
|
||||
inline v_uint16x32 v_absdiff(const v_uint16x32& a, const v_uint16x32& b)
|
||||
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
|
||||
{ return v_add_wrap(a - b, b - a); }
|
||||
inline v_uint32x16 v_absdiff(const v_uint32x16& a, const v_uint32x16& b)
|
||||
{ return v_sub(v_max(a, b), v_min(a, b)); }
|
||||
{ return v_max(a, b) - v_min(a, b); }
|
||||
|
||||
inline v_uint8x64 v_absdiff(const v_int8x64& a, const v_int8x64& b)
|
||||
{
|
||||
v_int8x64 d = v_sub_wrap(a, b);
|
||||
v_int8x64 m = v_lt(a, b);
|
||||
return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m));
|
||||
v_int8x64 m = a < b;
|
||||
return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
|
||||
}
|
||||
|
||||
inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b)
|
||||
@@ -1497,26 +1500,26 @@ inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b)
|
||||
|
||||
inline v_uint32x16 v_absdiff(const v_int32x16& a, const v_int32x16& b)
|
||||
{
|
||||
v_int32x16 d = v_sub(a, b);
|
||||
v_int32x16 m = v_lt(a, b);
|
||||
return v_reinterpret_as_u32(v_sub(v_xor(d, m), m));
|
||||
v_int32x16 d = a - b;
|
||||
v_int32x16 m = a < b;
|
||||
return v_reinterpret_as_u32((d ^ m) - m);
|
||||
}
|
||||
|
||||
inline v_float32x16 v_absdiff(const v_float32x16& a, const v_float32x16& b)
|
||||
{ return v_abs(v_sub(a, b)); }
|
||||
{ return v_abs(a - b); }
|
||||
|
||||
inline v_float64x8 v_absdiff(const v_float64x8& a, const v_float64x8& b)
|
||||
{ return v_abs(v_sub(a, b)); }
|
||||
{ return v_abs(a - b); }
|
||||
|
||||
/** Saturating absolute difference **/
|
||||
inline v_int8x64 v_absdiffs(const v_int8x64& a, const v_int8x64& b)
|
||||
{
|
||||
v_int8x64 d = v_sub(a, b);
|
||||
v_int8x64 m = v_lt(a, b);
|
||||
return v_sub(v_xor(d, m), m);
|
||||
v_int8x64 d = a - b;
|
||||
v_int8x64 m = a < b;
|
||||
return (d ^ m) - m;
|
||||
}
|
||||
inline v_int16x32 v_absdiffs(const v_int16x32& a, const v_int16x32& b)
|
||||
{ return v_sub(v_max(a, b), v_min(a, b)); }
|
||||
{ return v_max(a, b) - v_min(a, b); }
|
||||
|
||||
////////// Conversions /////////
|
||||
|
||||
@@ -1815,7 +1818,7 @@ inline v_float32x16 v_pack_triplets(const v_float32x16& vec)
|
||||
inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b)
|
||||
{ return v_int32x16(_mm512_madd_epi16(a.val, b.val)); }
|
||||
inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c)
|
||||
{ return v_add(v_dotprod(a, b), c); }
|
||||
{ return v_dotprod(a, b) + c; }
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b)
|
||||
@@ -1825,7 +1828,7 @@ inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b)
|
||||
return v_int64x8(_mm512_add_epi64(even, odd));
|
||||
}
|
||||
inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
|
||||
{ return v_add(v_dotprod(a, b), c); }
|
||||
{ return v_dotprod(a, b) + c; }
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b)
|
||||
@@ -1841,7 +1844,7 @@ inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b)
|
||||
return v_uint32x16(_mm512_add_epi32(prod0, prod1));
|
||||
}
|
||||
inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b)
|
||||
{
|
||||
@@ -1856,7 +1859,7 @@ inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b)
|
||||
return v_int32x16(_mm512_add_epi32(prod0, prod1));
|
||||
}
|
||||
inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b)
|
||||
@@ -1880,7 +1883,7 @@ inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b)
|
||||
));
|
||||
}
|
||||
inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b)
|
||||
{
|
||||
@@ -1890,13 +1893,13 @@ inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b)
|
||||
return v_int64x8(_mm512_add_epi64(even, odd));
|
||||
}
|
||||
inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b)
|
||||
{ return v_cvt_f64(v_dotprod(a, b)); }
|
||||
inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
//////// Fast Dot Product ////////
|
||||
|
||||
@@ -1941,7 +1944,7 @@ inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32&
|
||||
return v_uint64x8(_mm512_add_epi64(p15_, p9d_));
|
||||
}
|
||||
inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
|
||||
{ return v_add(v_dotprod_expand_fast(a, b), c); }
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
@@ -1952,7 +1955,7 @@ inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b,
|
||||
inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
|
||||
#define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \
|
||||
@@ -1966,7 +1969,7 @@ inline v_float32x16 v_matmul(const v_float32x16& v,
|
||||
v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1);
|
||||
v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2);
|
||||
v_float32x16 v37 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 3);
|
||||
return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v_mul(v37, m3))));
|
||||
return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
|
||||
}
|
||||
|
||||
inline v_float32x16 v_matmuladd(const v_float32x16& v,
|
||||
@@ -2067,43 +2070,43 @@ v_uint8x64 v_rshr_pack(const v_uint16x32& a, const v_uint16x32& b)
|
||||
{
|
||||
// we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
|
||||
v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
|
||||
return v_pack_u(v_reinterpret_as_s16(v_shr(v_add(a, delta), n)),
|
||||
v_reinterpret_as_s16(v_shr(v_add(b, delta), n)));
|
||||
return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
|
||||
v_reinterpret_as_s16((b + delta) >> n));
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
void v_rshr_pack_store(uchar* ptr, const v_uint16x32& a)
|
||||
{
|
||||
v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
|
||||
v_pack_u_store(ptr, v_reinterpret_as_s16(v_shr(v_add(a, delta), n)));
|
||||
v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
v_uint8x64 v_rshr_pack_u(const v_int16x32& a, const v_int16x32& b)
|
||||
{
|
||||
v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
|
||||
return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
|
||||
return v_pack_u((a + delta) >> n, (b + delta) >> n);
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
void v_rshr_pack_u_store(uchar* ptr, const v_int16x32& a)
|
||||
{
|
||||
v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
|
||||
v_pack_u_store(ptr, v_shr(v_add(a, delta), n));
|
||||
v_pack_u_store(ptr, (a + delta) >> n);
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
v_int8x64 v_rshr_pack(const v_int16x32& a, const v_int16x32& b)
|
||||
{
|
||||
v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
|
||||
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
|
||||
return v_pack((a + delta) >> n, (b + delta) >> n);
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
void v_rshr_pack_store(schar* ptr, const v_int16x32& a)
|
||||
{
|
||||
v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
|
||||
v_pack_store(ptr, v_shr(v_add(a, delta), n));
|
||||
v_pack_store(ptr, (a + delta) >> n);
|
||||
}
|
||||
|
||||
// 32
|
||||
@@ -2136,43 +2139,43 @@ template<int n> inline
|
||||
v_uint16x32 v_rshr_pack(const v_uint32x16& a, const v_uint32x16& b)
|
||||
{
|
||||
v_uint32x16 delta = v512_setall_u32(1 << (n-1));
|
||||
return v_pack_u(v_reinterpret_as_s32(v_shr(v_add(a, delta), n)),
|
||||
v_reinterpret_as_s32(v_shr(v_add(b, delta), n)));
|
||||
return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
|
||||
v_reinterpret_as_s32((b + delta) >> n));
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
void v_rshr_pack_store(ushort* ptr, const v_uint32x16& a)
|
||||
{
|
||||
v_uint32x16 delta = v512_setall_u32(1 << (n-1));
|
||||
v_pack_u_store(ptr, v_reinterpret_as_s32(v_shr(v_add(a, delta), n)));
|
||||
v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
v_uint16x32 v_rshr_pack_u(const v_int32x16& a, const v_int32x16& b)
|
||||
{
|
||||
v_int32x16 delta = v512_setall_s32(1 << (n-1));
|
||||
return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
|
||||
return v_pack_u((a + delta) >> n, (b + delta) >> n);
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
void v_rshr_pack_u_store(ushort* ptr, const v_int32x16& a)
|
||||
{
|
||||
v_int32x16 delta = v512_setall_s32(1 << (n-1));
|
||||
v_pack_u_store(ptr, v_shr(v_add(a, delta), n));
|
||||
v_pack_u_store(ptr, (a + delta) >> n);
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
v_int16x32 v_rshr_pack(const v_int32x16& a, const v_int32x16& b)
|
||||
{
|
||||
v_int32x16 delta = v512_setall_s32(1 << (n-1));
|
||||
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
|
||||
return v_pack((a + delta) >> n, (b + delta) >> n);
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
void v_rshr_pack_store(short* ptr, const v_int32x16& a)
|
||||
{
|
||||
v_int32x16 delta = v512_setall_s32(1 << (n-1));
|
||||
v_pack_store(ptr, v_shr(v_add(a, delta), n));
|
||||
v_pack_store(ptr, (a + delta) >> n);
|
||||
}
|
||||
|
||||
// 64
|
||||
@@ -2193,28 +2196,28 @@ template<int n> inline
|
||||
v_uint32x16 v_rshr_pack(const v_uint64x8& a, const v_uint64x8& b)
|
||||
{
|
||||
v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
|
||||
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
|
||||
return v_pack((a + delta) >> n, (b + delta) >> n);
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
void v_rshr_pack_store(unsigned* ptr, const v_uint64x8& a)
|
||||
{
|
||||
v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
|
||||
v_pack_store(ptr, v_shr(v_add(a, delta), n));
|
||||
v_pack_store(ptr, (a + delta) >> n);
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
v_int32x16 v_rshr_pack(const v_int64x8& a, const v_int64x8& b)
|
||||
{
|
||||
v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
|
||||
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
|
||||
return v_pack((a + delta) >> n, (b + delta) >> n);
|
||||
}
|
||||
|
||||
template<int n> inline
|
||||
void v_rshr_pack_store(int* ptr, const v_int64x8& a)
|
||||
{
|
||||
v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
|
||||
v_pack_store(ptr, v_shr(v_add(a, delta), n));
|
||||
v_pack_store(ptr, (a + delta) >> n);
|
||||
}
|
||||
|
||||
// pack boolean
|
||||
@@ -3078,20 +3081,6 @@ inline int v_scan_forward(const v_float64x8& a) { return trailingZeros32(v_signm
|
||||
|
||||
inline void v512_cleanup() { _mm256_zeroall(); }
|
||||
|
||||
#include "intrin_math.hpp"
|
||||
inline v_float32x16 v_exp(const v_float32x16& x) { return v_exp_default_32f<v_float32x16, v_int32x16>(x); }
|
||||
inline v_float32x16 v_log(const v_float32x16& x) { return v_log_default_32f<v_float32x16, v_int32x16>(x); }
|
||||
inline void v_sincos(const v_float32x16& x, v_float32x16& s, v_float32x16& c) { v_sincos_default_32f<v_float32x16, v_int32x16>(x, s, c); }
|
||||
inline v_float32x16 v_sin(const v_float32x16& x) { return v_sin_default_32f<v_float32x16, v_int32x16>(x); }
|
||||
inline v_float32x16 v_cos(const v_float32x16& x) { return v_cos_default_32f<v_float32x16, v_int32x16>(x); }
|
||||
inline v_float32x16 v_erf(const v_float32x16& x) { return v_erf_default_32f<v_float32x16, v_int32x16>(x); }
|
||||
|
||||
inline v_float64x8 v_exp(const v_float64x8& x) { return v_exp_default_64f<v_float64x8, v_int64x8>(x); }
|
||||
inline v_float64x8 v_log(const v_float64x8& x) { return v_log_default_64f<v_float64x8, v_int64x8>(x); }
|
||||
inline void v_sincos(const v_float64x8& x, v_float64x8& s, v_float64x8& c) { v_sincos_default_64f<v_float64x8, v_int64x8>(x, s, c); }
|
||||
inline v_float64x8 v_sin(const v_float64x8& x) { return v_sin_default_64f<v_float64x8, v_int64x8>(x); }
|
||||
inline v_float64x8 v_cos(const v_float64x8& x) { return v_cos_default_64f<v_float64x8, v_int64x8>(x); }
|
||||
|
||||
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
|
||||
|
||||
//! @endcond
|
||||
|
||||
@@ -81,26 +81,9 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
|
||||
|
||||
"Universal intrinsics" is a types and functions set intended to simplify vectorization of code on
|
||||
different platforms. Currently a few different SIMD extensions on different architectures are supported.
|
||||
|
||||
OpenCV Universal Intrinsics support the following instruction sets:
|
||||
|
||||
- *128 bit* registers of various types support is implemented for a wide range of architectures including
|
||||
- x86(SSE/SSE2/SSE4.2),
|
||||
- ARM(NEON): 64-bit float (64F) requires AArch64,
|
||||
- PowerPC(VSX),
|
||||
- MIPS(MSA),
|
||||
- LoongArch(LSX),
|
||||
- RISC-V(RVV 0.7.1): Fixed-length implementation,
|
||||
- WASM: 64-bit float (64F) is not supported,
|
||||
- *256 bit* registers are supported on
|
||||
- x86(AVX2),
|
||||
- LoongArch (LASX),
|
||||
- *512 bit* registers are supported on
|
||||
- x86(AVX512),
|
||||
- *Vector Length Agnostic (VLA)* registers are supported on
|
||||
- RISC-V(RVV 1.0)
|
||||
- ARM(SVE/SVE2): Powered by Arm KleidiCV integration (OpenCV 4.11+),
|
||||
|
||||
128 bit registers of various types support is implemented for a wide range of architectures
|
||||
including x86(__SSE/SSE2/SSE4.2__), ARM(__NEON__), PowerPC(__VSX__), MIPS(__MSA__).
|
||||
256 bit long registers are supported on x86(__AVX2__) and 512 bit long registers are supported on x86(__AVX512__).
|
||||
In case when there is no SIMD extension available during compilation, fallback C++ implementation of intrinsics
|
||||
will be chosen and code will work as expected although it could be slower.
|
||||
|
||||
@@ -242,30 +225,32 @@ These operations allow to reorder or recombine elements in one or multiple vecto
|
||||
Element-wise binary and unary operations.
|
||||
|
||||
- Arithmetics:
|
||||
@ref v_add,
|
||||
@ref v_sub,
|
||||
@ref v_mul,
|
||||
@ref v_div,
|
||||
@ref operator +(const v_reg &a, const v_reg &b) "+",
|
||||
@ref operator -(const v_reg &a, const v_reg &b) "-",
|
||||
@ref operator *(const v_reg &a, const v_reg &b) "*",
|
||||
@ref operator /(const v_reg &a, const v_reg &b) "/",
|
||||
@ref v_mul_expand
|
||||
|
||||
- Non-saturating arithmetics: @ref v_add_wrap, @ref v_sub_wrap
|
||||
|
||||
- Bitwise shifts:
|
||||
@ref operator <<(const v_reg &a, int s) "<<",
|
||||
@ref operator >>(const v_reg &a, int s) ">>",
|
||||
@ref v_shl, @ref v_shr
|
||||
|
||||
- Bitwise logic:
|
||||
@ref v_and,
|
||||
@ref v_or,
|
||||
@ref v_xor,
|
||||
@ref v_not
|
||||
@ref operator &(const v_reg &a, const v_reg &b) "&",
|
||||
@ref operator |(const v_reg &a, const v_reg &b) "|",
|
||||
@ref operator ^(const v_reg &a, const v_reg &b) "^",
|
||||
@ref operator ~(const v_reg &a) "~"
|
||||
|
||||
- Comparison:
|
||||
@ref v_gt,
|
||||
@ref v_ge,
|
||||
@ref v_lt,
|
||||
@ref v_le,
|
||||
@ref v_eq,
|
||||
@ref v_ne
|
||||
@ref operator >(const v_reg &a, const v_reg &b) ">",
|
||||
@ref operator >=(const v_reg &a, const v_reg &b) ">=",
|
||||
@ref operator <(const v_reg &a, const v_reg &b) "<",
|
||||
@ref operator <=(const v_reg &a, const v_reg &b) "<=",
|
||||
@ref operator ==(const v_reg &a, const v_reg &b) "==",
|
||||
@ref operator !=(const v_reg &a, const v_reg &b) "!="
|
||||
|
||||
- min/max: @ref v_min, @ref v_max
|
||||
|
||||
@@ -278,8 +263,7 @@ Most of these operations return only one value.
|
||||
|
||||
### Other math
|
||||
|
||||
- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude, @ref v_exp, @ref v_log,
|
||||
@ref v_erf, @ref v_sin, @ref v_cos
|
||||
- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude
|
||||
- Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs
|
||||
|
||||
### Conversions
|
||||
@@ -379,9 +363,6 @@ Floating point:
|
||||
|reverse | x | x |
|
||||
|extract_n | x | x |
|
||||
|broadcast_element | x | |
|
||||
|exp | x | x |
|
||||
|log | x | x |
|
||||
|sin, cos | x | x |
|
||||
|
||||
@{ */
|
||||
|
||||
@@ -589,43 +570,50 @@ enum {
|
||||
/** @brief Add values
|
||||
|
||||
For all types. */
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_add(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator+(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator+=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
|
||||
/** @brief Subtract values
|
||||
|
||||
For all types. */
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_sub(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator-(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator-=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
|
||||
/** @brief Multiply values
|
||||
|
||||
For 16- and 32-bit integer types and floating types. */
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_mul(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator*(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator*=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
|
||||
/** @brief Divide values
|
||||
|
||||
For floating types only. */
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_div(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator/(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator/=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
|
||||
|
||||
/** @brief Bitwise AND
|
||||
|
||||
Only for integer types. */
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_and(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator&(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator&=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
|
||||
/** @brief Bitwise OR
|
||||
|
||||
Only for integer types. */
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_or(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
|
||||
/** @brief Bitwise XOR
|
||||
|
||||
Only for integer types.*/
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_xor(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator^(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator^=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
|
||||
|
||||
/** @brief Bitwise NOT
|
||||
|
||||
Only for integer types.*/
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_not(const v_reg<_Tp, n>& a);
|
||||
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator~(const v_reg<_Tp, n>& a);
|
||||
|
||||
|
||||
#ifndef CV_DOXYGEN
|
||||
@@ -648,26 +636,33 @@ __CV_EXPAND(macro_name(double, __VA_ARGS__)) \
|
||||
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, __VA_ARGS__) \
|
||||
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, __VA_ARGS__) \
|
||||
|
||||
#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op, func) \
|
||||
#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op) \
|
||||
template<int n> inline \
|
||||
v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
|
||||
v_reg<_Tp, n> operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
|
||||
{ \
|
||||
v_reg<_Tp, n> c; \
|
||||
for( int i = 0; i < n; i++ ) \
|
||||
c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
|
||||
return c; \
|
||||
} \
|
||||
template<int n> inline \
|
||||
v_reg<_Tp, n>& operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
|
||||
{ \
|
||||
for( int i = 0; i < n; i++ ) \
|
||||
a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
|
||||
return a; \
|
||||
}
|
||||
|
||||
#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op, func) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op, func)
|
||||
#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op)
|
||||
|
||||
CV__HAL_INTRIN_IMPL_BIN_OP(+, v_add)
|
||||
CV__HAL_INTRIN_IMPL_BIN_OP(-, v_sub)
|
||||
CV__HAL_INTRIN_IMPL_BIN_OP(*, v_mul)
|
||||
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /, v_div)
|
||||
CV__HAL_INTRIN_IMPL_BIN_OP(+)
|
||||
CV__HAL_INTRIN_IMPL_BIN_OP(-)
|
||||
CV__HAL_INTRIN_IMPL_BIN_OP(*)
|
||||
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /)
|
||||
|
||||
#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op, func) \
|
||||
#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op) \
|
||||
template<int n> CV_INLINE \
|
||||
v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
|
||||
v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
|
||||
{ \
|
||||
v_reg<_Tp, n> c; \
|
||||
typedef typename V_TypeTraits<_Tp>::int_type itype; \
|
||||
@@ -675,20 +670,29 @@ v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
|
||||
c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
|
||||
V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
|
||||
return c; \
|
||||
} \
|
||||
template<int n> CV_INLINE \
|
||||
v_reg<_Tp, n>& operator bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
|
||||
{ \
|
||||
typedef typename V_TypeTraits<_Tp>::int_type itype; \
|
||||
for( int i = 0; i < n; i++ ) \
|
||||
a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
|
||||
V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
|
||||
return a; \
|
||||
}
|
||||
|
||||
#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op, func) \
|
||||
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op, func) \
|
||||
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op, func) /* TODO: FIXIT remove this after masks refactoring */
|
||||
#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op) \
|
||||
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) \
|
||||
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) /* TODO: FIXIT remove this after masks refactoring */
|
||||
|
||||
|
||||
CV__HAL_INTRIN_IMPL_BIT_OP(&, v_and)
|
||||
CV__HAL_INTRIN_IMPL_BIT_OP(|, v_or)
|
||||
CV__HAL_INTRIN_IMPL_BIT_OP(^, v_xor)
|
||||
CV__HAL_INTRIN_IMPL_BIT_OP(&)
|
||||
CV__HAL_INTRIN_IMPL_BIT_OP(|)
|
||||
CV__HAL_INTRIN_IMPL_BIT_OP(^)
|
||||
|
||||
#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy, dummy2) \
|
||||
#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy) \
|
||||
template<int n> CV_INLINE \
|
||||
v_reg<_Tp, n> v_not(const v_reg<_Tp, n>& a) \
|
||||
v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \
|
||||
{ \
|
||||
v_reg<_Tp, n> c; \
|
||||
for( int i = 0; i < n; i++ ) \
|
||||
@@ -696,7 +700,7 @@ v_reg<_Tp, n> v_not(const v_reg<_Tp, n>& a) \
|
||||
return c; \
|
||||
} \
|
||||
|
||||
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~, v_not)
|
||||
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~)
|
||||
|
||||
#endif // !CV_DOXYGEN
|
||||
|
||||
@@ -717,85 +721,12 @@ template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a)
|
||||
Only for floating point types.*/
|
||||
OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
|
||||
|
||||
/**
|
||||
* @brief Exponential \f$ e^x \f$ of elements
|
||||
*
|
||||
* Only for floating point types. Core implementation steps:
|
||||
* 1. Decompose Input: Convert the input to \f$ 2^{x \cdot \log_2e} \f$ and split its exponential into integer and fractional parts:
|
||||
* \f$ x \cdot \log_2e = n + f \f$, where \f$ n \f$ is the integer part and \f$ f \f$ is the fractional part.
|
||||
* 2. Compute \f$ 2^n \f$: Calculated by shifting the bits.
|
||||
* 3. Adjust Fractional Part: Compute \f$ f \cdot \ln2 \f$ to convert the fractional part to base \f$ e \f$.
|
||||
* \f$ C1 \f$ and \f$ C2 \f$ are used to adjust the fractional part.
|
||||
* 4. Polynomial Approximation for \f$ e^{f \cdot \ln2} \f$: The closer the fractional part is to 0, the more accurate the result.
|
||||
* - For float16 and float32, use a Taylor Series with 6 terms.
|
||||
* - For float64, use Pade Polynomials Approximation with 4 terms.
|
||||
* 5. Combine Results: Multiply the two parts together to get the final result:
|
||||
* \f$ e^x = 2^n \cdot e^{f \cdot \ln2} \f$.
|
||||
*
|
||||
* @note The precision of the calculation depends on the implementation and the data type of the input vector.
|
||||
*/
|
||||
OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
|
||||
#define OPENCV_HAL_MATH_HAVE_EXP 1
|
||||
|
||||
/**
|
||||
* @brief Natural logarithm \f$ \log(x) \f$ of elements
|
||||
*
|
||||
* Only for floating point types. Core implementation steps:
|
||||
* 1. Decompose Input: Use binary representation to decompose the input into mantissa part \f$ m \f$ and exponent part \f$ e \f$. Such that \f$ \log(x) = \log(m \cdot 2^e) = \log(m) + e \cdot \ln(2) \f$.
|
||||
* 2. Adjust Mantissa and Exponent Parts: If the mantissa is less than \f$ \sqrt{0.5} \f$, adjust the exponent and mantissa to ensure the mantissa is in the range \f$ (\sqrt{0.5}, \sqrt{2}) \f$ for better approximation.
|
||||
* 3. Polynomial Approximation for \f$ \log(m) \f$: The closer the \f$ m \f$ is to 1, the more accurate the result.
|
||||
* - For float16 and float32, use a Taylor Series with 9 terms.
|
||||
* - For float64, use Pade Polynomials Approximation with 6 terms.
|
||||
* 4. Combine Results: Add the two parts together to get the final result.
|
||||
*
|
||||
* @note The precision of the calculation depends on the implementation and the data type of the input.
|
||||
*
|
||||
* @note Similar to the behavior of std::log(), \f$ \ln(0) = -\infty \f$.
|
||||
*/
|
||||
OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
|
||||
|
||||
/**
|
||||
* @brief Error function.
|
||||
*
|
||||
* @note Support FP32 precision for now.
|
||||
*/
|
||||
OPENCV_HAL_IMPL_MATH_FUNC(v_erf, std::erf, _Tp)
|
||||
|
||||
/**
|
||||
* @brief Compute sine \f$ sin(x) \f$ and cosine \f$ cos(x) \f$ of elements at the same time
|
||||
*
|
||||
* Only for floating point types. Core implementation steps:
|
||||
* 1. Input Normalization: Scale the periodicity from 2π to 4 and reduce the angle to the range \f$ [0, \frac{\pi}{4}] \f$ using periodicity and trigonometric identities.
|
||||
* 2. Polynomial Approximation for \f$ sin(x) \f$ and \f$ cos(x) \f$:
|
||||
* - For float16 and float32, use a Taylor series with 4 terms for sine and 5 terms for cosine.
|
||||
* - For float64, use a Taylor series with 7 terms for sine and 8 terms for cosine.
|
||||
* 3. Select Results: select and convert the final sine and cosine values for the original input angle.
|
||||
*
|
||||
* @note The precision of the calculation depends on the implementation and the data type of the input vector.
|
||||
*/
|
||||
template<typename _Tp, int n>
|
||||
inline void v_sincos(const v_reg<_Tp, n>& x, v_reg<_Tp, n>& s, v_reg<_Tp, n>& c)
|
||||
{
|
||||
for( int i = 0; i < n; i++ )
|
||||
{
|
||||
s.s[i] = std::sin(x.s[i]);
|
||||
c.s[i] = std::cos(x.s[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Sine \f$ sin(x) \f$ of elements
|
||||
*
|
||||
* Only for floating point types. Core implementation the same as @ref v_sincos.
|
||||
*/
|
||||
//! @cond IGNORED
|
||||
OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
|
||||
|
||||
/**
|
||||
* @brief Cosine \f$ cos(x) \f$ of elements
|
||||
*
|
||||
* Only for floating point types. Core implementation the same as @ref v_sincos.
|
||||
*/
|
||||
OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
|
||||
OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
|
||||
OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
|
||||
//! @endcond
|
||||
|
||||
/** @brief Absolute value of elements
|
||||
|
||||
@@ -918,9 +849,9 @@ inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
|
||||
|
||||
//! @brief Helper macro
|
||||
//! @ingroup core_hal_intrin_impl
|
||||
#define OPENCV_HAL_IMPL_CMP_OP(cmp_op, func) \
|
||||
#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
|
||||
template<typename _Tp, int n> \
|
||||
inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
|
||||
inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
|
||||
{ \
|
||||
typedef typename V_TypeTraits<_Tp>::int_type itype; \
|
||||
v_reg<_Tp, n> c; \
|
||||
@@ -932,28 +863,28 @@ inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
|
||||
/** @brief Less-than comparison
|
||||
|
||||
For all types except 64-bit integer values. */
|
||||
OPENCV_HAL_IMPL_CMP_OP(<, v_lt)
|
||||
OPENCV_HAL_IMPL_CMP_OP(<)
|
||||
|
||||
/** @brief Greater-than comparison
|
||||
|
||||
For all types except 64-bit integer values. */
|
||||
OPENCV_HAL_IMPL_CMP_OP(>, v_gt)
|
||||
OPENCV_HAL_IMPL_CMP_OP(>)
|
||||
|
||||
/** @brief Less-than or equal comparison
|
||||
|
||||
For all types except 64-bit integer values. */
|
||||
OPENCV_HAL_IMPL_CMP_OP(<=, v_le)
|
||||
OPENCV_HAL_IMPL_CMP_OP(<=)
|
||||
|
||||
/** @brief Greater-than or equal comparison
|
||||
|
||||
For all types except 64-bit integer values. */
|
||||
OPENCV_HAL_IMPL_CMP_OP(>=, v_ge)
|
||||
OPENCV_HAL_IMPL_CMP_OP(>=)
|
||||
|
||||
/** @brief Equal comparison */
|
||||
OPENCV_HAL_IMPL_CMP_OP(==, v_eq)
|
||||
OPENCV_HAL_IMPL_CMP_OP(==)
|
||||
|
||||
/** @brief Not equal comparison */
|
||||
OPENCV_HAL_IMPL_CMP_OP(!=, v_ne)
|
||||
OPENCV_HAL_IMPL_CMP_OP(!=)
|
||||
|
||||
template<int n>
|
||||
inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a)
|
||||
@@ -1322,8 +1253,8 @@ template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
|
||||
|
||||
//! @brief Helper macro
|
||||
//! @ingroup core_hal_intrin_impl
|
||||
#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op, func) \
|
||||
template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, int imm) \
|
||||
#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
|
||||
template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
|
||||
{ \
|
||||
v_reg<_Tp, n> c; \
|
||||
for( int i = 0; i < n; i++ ) \
|
||||
@@ -1334,12 +1265,12 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a,
|
||||
/** @brief Bitwise shift left
|
||||
|
||||
For 16-, 32- and 64-bit integer values. */
|
||||
OPENCV_HAL_IMPL_SHIFT_OP(<<, v_shl)
|
||||
OPENCV_HAL_IMPL_SHIFT_OP(<< )
|
||||
|
||||
/** @brief Bitwise shift right
|
||||
|
||||
For 16-, 32- and 64-bit integer values. */
|
||||
OPENCV_HAL_IMPL_SHIFT_OP(>>, v_shr)
|
||||
OPENCV_HAL_IMPL_SHIFT_OP(>> )
|
||||
|
||||
//! @brief Helper macro
|
||||
//! @ingroup core_hal_intrin_impl
|
||||
@@ -2848,8 +2779,7 @@ inline void v_transpose4x4( v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
|
||||
//! @brief Helper macro
|
||||
//! @ingroup core_hal_intrin_impl
|
||||
#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, prefix, suffix) \
|
||||
inline _Tpvec prefix##_setzero_##suffix() { return _Tpvec::zero(); } \
|
||||
template <> inline _Tpvec v_setzero_() { return _Tpvec::zero(); }
|
||||
inline _Tpvec prefix##_setzero_##suffix() { return _Tpvec::zero(); }
|
||||
|
||||
//! @name Init with zero
|
||||
//! @{
|
||||
@@ -2895,8 +2825,7 @@ OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x8, v512, s64)
|
||||
//! @brief Helper macro
|
||||
//! @ingroup core_hal_intrin_impl
|
||||
#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, prefix, suffix) \
|
||||
inline _Tpvec prefix##_setall_##suffix(_Tp val) { return _Tpvec::all(val); } \
|
||||
template <> inline _Tpvec v_setall_(_Tp val) { return _Tpvec::all(val); }
|
||||
inline _Tpvec prefix##_setall_##suffix(_Tp val) { return _Tpvec::all(val); }
|
||||
|
||||
//! @name Init with value
|
||||
//! @{
|
||||
@@ -2965,7 +2894,7 @@ OPENCV_HAL_IMPL_C_REINTERPRET(int64, s64)
|
||||
//! @ingroup core_hal_intrin_impl
|
||||
#define OPENCV_HAL_IMPL_C_SHIFTL(_Tp) \
|
||||
template<int shift, int n> inline v_reg<_Tp, n> v_shl(const v_reg<_Tp, n>& a) \
|
||||
{ return v_shl(a, shift); }
|
||||
{ return a << shift; }
|
||||
|
||||
//! @name Left shift
|
||||
//! @{
|
||||
@@ -2982,7 +2911,7 @@ OPENCV_HAL_IMPL_C_SHIFTL(int64)
|
||||
//! @ingroup core_hal_intrin_impl
|
||||
#define OPENCV_HAL_IMPL_C_SHIFTR(_Tp) \
|
||||
template<int shift, int n> inline v_reg<_Tp, n> v_shr(const v_reg<_Tp, n>& a) \
|
||||
{ return v_shr(a, shift); }
|
||||
{ return a >> shift; }
|
||||
|
||||
//! @name Right shift
|
||||
//! @{
|
||||
@@ -3308,7 +3237,7 @@ inline v_reg<float, n> v_matmuladd(const v_reg<float, n>& v,
|
||||
|
||||
|
||||
template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b)
|
||||
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_mul(v_cvt_f64_high(a), v_cvt_f64_high(b))); }
|
||||
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
|
||||
template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b,
|
||||
const v_reg<double, n/2>& c)
|
||||
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }
|
||||
|
||||
@@ -557,10 +557,6 @@ inline __m256i _lasx_256_castpd_si256(const __m256d& v)
|
||||
{ return _Tpvec(__lasx_xvreplgr2vr_d(0)); } \
|
||||
inline _Tpvec v256_setall_##suffix(_Tp v) \
|
||||
{ return _Tpvec(__lasx_xvreplgr2vr_##ssuffix((ctype_s)v)); } \
|
||||
template <> inline _Tpvec v_setzero_() \
|
||||
{ return v256_setzero_##suffix(); } \
|
||||
template <> inline _Tpvec v_setall_(_Tp v) \
|
||||
{ return v256_setall_##suffix(v); } \
|
||||
OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32, suffix, OPENCV_HAL_NOP) \
|
||||
OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32, suffix, OPENCV_HAL_NOP) \
|
||||
OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \
|
||||
@@ -592,11 +588,7 @@ inline __m256d _lasx_256_castsi256_pd(const __m256i &v)
|
||||
inline _Tpvec v256_setzero_##suffix() \
|
||||
{ return _Tpvec(__lasx_xvreplgr2vr_d(0)); } \
|
||||
inline _Tpvec v256_setall_##suffix(_Tp v) \
|
||||
{ return _Tpvec(_v256_setall_##zsuffix(v)); } \
|
||||
template <> inline _Tpvec v_setzero_() \
|
||||
{ return v256_setzero_##suffix(); } \
|
||||
template <> inline _Tpvec v_setall_(_Tp v) \
|
||||
{ return v256_setall_##suffix(v); } \
|
||||
{ return _Tpvec(_v256_setall_##zsuffix(v)); } \
|
||||
OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32, suffix, cast) \
|
||||
OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32, suffix, cast) \
|
||||
OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, cast) \
|
||||
@@ -658,18 +650,16 @@ inline v_float32x8 v256_shuffle(const v_float32x8 &a)
|
||||
template<int m>
|
||||
inline v_float64x4 v256_shuffle(const v_float64x4 &a)
|
||||
{
|
||||
const int m1 = m & 0b1;
|
||||
const int m2 = m & 0b10;
|
||||
const int m3 = m & 0b100;
|
||||
const int m4 = m & 0b1000;
|
||||
const int m5 = m2 << 1;
|
||||
const int m6 = m3 << 2;
|
||||
const int m7 = m4 << 3;
|
||||
const int m8 = m1 & m5 & m6 & m7;
|
||||
int imm8 = m & 0b0001; //0 or 1
|
||||
if (m & 0x0b0010) imm8 |= 0b0100;
|
||||
//else imm8 |= 0b0000;
|
||||
if (m & 0x0b0100) imm8 |= 0b110000; //2 or 3
|
||||
else imm8 |= 0b100000;
|
||||
if (m & 0x0b1000) imm8 |= 0b11000000;
|
||||
else imm8 |= 0b10000000;
|
||||
|
||||
return v_float64x4(__lasx_xvshuf4i_d(*((__m256i*)&a.val), *((__m256i*)&a.val), m8));
|
||||
return v_float64x4(__lasx_xvpermi_d(*((__m256i*)&a.val), imm8));
|
||||
}
|
||||
|
||||
template<typename _Tpvec>
|
||||
inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1)
|
||||
{
|
||||
@@ -754,51 +744,53 @@ OPENCV_HAL_IMPL_LASX_ZIP(v_float64x4)
|
||||
|
||||
/** Arithmetics **/
|
||||
#define OPENCV_HAL_IMPL_LASX_BIN_OP(bin_op, _Tpvec, intrin) \
|
||||
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(intrin(a.val, b.val)); }
|
||||
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(intrin(a.val, b.val)); } \
|
||||
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
|
||||
{ a.val = intrin(a.val, b.val); return a; }
|
||||
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint8x32, __lasx_xvsadd_bu)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint8x32, __lasx_xvssub_bu)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int8x32, __lasx_xvsadd_b)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int8x32, __lasx_xvssub_b)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint16x16, __lasx_xvsadd_hu)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint16x16, __lasx_xvssub_hu)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int16x16, __lasx_xvsadd_h)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int16x16, __lasx_xvssub_h)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint32x8, __lasx_xvadd_w)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint32x8, __lasx_xvsub_w)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_uint32x8, __lasx_xvmul_w)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int32x8, __lasx_xvadd_w)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int32x8, __lasx_xvsub_w)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_int32x8, __lasx_xvmul_w)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint64x4, __lasx_xvadd_d)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint64x4, __lasx_xvsub_d)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int64x4, __lasx_xvadd_d)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int64x4, __lasx_xvsub_d)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint8x32, __lasx_xvsadd_bu)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint8x32, __lasx_xvssub_bu)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int8x32, __lasx_xvsadd_b)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int8x32, __lasx_xvssub_b)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint16x16, __lasx_xvsadd_hu)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint16x16, __lasx_xvssub_hu)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int16x16, __lasx_xvsadd_h)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int16x16, __lasx_xvssub_h)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint32x8, __lasx_xvadd_w)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint32x8, __lasx_xvsub_w)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_uint32x8, __lasx_xvmul_w)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int32x8, __lasx_xvadd_w)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int32x8, __lasx_xvsub_w)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_int32x8, __lasx_xvmul_w)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint64x4, __lasx_xvadd_d)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint64x4, __lasx_xvsub_d)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int64x4, __lasx_xvadd_d)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int64x4, __lasx_xvsub_d)
|
||||
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_float32x8, __lasx_xvfadd_s)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_float32x8, __lasx_xvfsub_s)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_float32x8, __lasx_xvfmul_s)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_div, v_float32x8, __lasx_xvfdiv_s)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_float64x4, __lasx_xvfadd_d)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_float64x4, __lasx_xvfsub_d)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_float64x4, __lasx_xvfmul_d)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_div, v_float64x4, __lasx_xvfdiv_d)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float32x8, __lasx_xvfadd_s)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float32x8, __lasx_xvfsub_s)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float32x8, __lasx_xvfmul_s)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float32x8, __lasx_xvfdiv_s)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float64x4, __lasx_xvfadd_d)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float64x4, __lasx_xvfsub_d)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float64x4, __lasx_xvfmul_d)
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float64x4, __lasx_xvfdiv_d)
|
||||
|
||||
// saturating multiply 8-bit, 16-bit
|
||||
inline v_uint8x32 v_mul(const v_uint8x32& a, const v_uint8x32& b)
|
||||
inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
|
||||
{
|
||||
v_uint16x16 c, d;
|
||||
v_mul_expand(a, b, c, d);
|
||||
return v_pack(c, d);
|
||||
}
|
||||
inline v_int8x32 v_mul(const v_int8x32& a, const v_int8x32& b)
|
||||
inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
|
||||
{
|
||||
v_int16x16 c, d;
|
||||
v_mul_expand(a, b, c, d);
|
||||
return v_pack(c, d);
|
||||
}
|
||||
inline v_uint16x16 v_mul(const v_uint16x16& a, const v_uint16x16& b)
|
||||
inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
|
||||
{
|
||||
__m256i pl = __lasx_xvmul_h(a.val, b.val);
|
||||
__m256i ph = __lasx_xvmuh_hu(a.val, b.val);
|
||||
@@ -806,7 +798,7 @@ inline v_uint16x16 v_mul(const v_uint16x16& a, const v_uint16x16& b)
|
||||
__m256i p1 = __lasx_xvilvh_h(ph, pl);
|
||||
return v_uint16x16(_v256_packs_epu32(p0, p1));
|
||||
}
|
||||
inline v_int16x16 v_mul(const v_int16x16& a, const v_int16x16& b)
|
||||
inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
|
||||
{
|
||||
__m256i pl = __lasx_xvmul_h(a.val, b.val);
|
||||
__m256i ph = __lasx_xvmuh_h(a.val, b.val);
|
||||
@@ -814,6 +806,14 @@ inline v_int16x16 v_mul(const v_int16x16& a, const v_int16x16& b)
|
||||
__m256i p1 = __lasx_xvilvh_h(ph, pl);
|
||||
return v_int16x16(_lasx_packs_w(p0, p1));
|
||||
}
|
||||
inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
|
||||
{ a = a * b; return a; }
|
||||
inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
|
||||
{ a = a * b; return a; }
|
||||
inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
|
||||
{ a = a * b; return a; }
|
||||
inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
|
||||
{ a = a * b; return a; }
|
||||
|
||||
/** Non-saturating arithmetics **/
|
||||
|
||||
@@ -902,13 +902,13 @@ inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return
|
||||
|
||||
/** Bitwise shifts **/
|
||||
#define OPENCV_HAL_IMPL_LASX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
|
||||
inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \
|
||||
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
|
||||
{ return _Tpuvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
|
||||
inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \
|
||||
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
|
||||
{ return _Tpsvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
|
||||
inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \
|
||||
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
|
||||
{ return _Tpuvec(__lasx_xvsrl_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
|
||||
inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \
|
||||
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
|
||||
{ return _Tpsvec(srai(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
|
||||
template<int imm> \
|
||||
inline _Tpuvec v_shl(const _Tpuvec& a) \
|
||||
@@ -930,10 +930,10 @@ OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4, v_int64x4, d, __lasx_xvsra_d)
|
||||
|
||||
/** Bitwise logic **/
|
||||
#define OPENCV_HAL_IMPL_LASX_LOGIC_OP(_Tpvec, suffix, not_const) \
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_and, _Tpvec, __lasx_xvand_##suffix) \
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_or, _Tpvec, __lasx_xvor_##suffix) \
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(v_xor, _Tpvec, __lasx_xvxor_##suffix) \
|
||||
inline _Tpvec v_not(const _Tpvec& a) \
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix) \
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix) \
|
||||
OPENCV_HAL_IMPL_LASX_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix) \
|
||||
inline _Tpvec operator ~ (const _Tpvec& a) \
|
||||
{ return _Tpvec(__lasx_xvnori_b(a.val, 0)); }
|
||||
|
||||
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint8x32, v, __lasx_xvreplgr2vr_w(-1))
|
||||
@@ -946,14 +946,16 @@ OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint64x4, v, __lasx_xvreplgr2vr_d(-1))
|
||||
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int64x4, v, __lasx_xvreplgr2vr_d(-1))
|
||||
|
||||
#define OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) \
|
||||
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val)))); }
|
||||
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val)))); } \
|
||||
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
|
||||
{ __m256i c = intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val))); a.val = cast(c); return a; }
|
||||
|
||||
#define OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(_Tpvec, suffix, not_const, cast) \
|
||||
OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(v_and, _Tpvec, __lasx_xvand_##suffix, cast) \
|
||||
OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(v_or, _Tpvec, __lasx_xvor_##suffix, cast) \
|
||||
OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(v_xor, _Tpvec, __lasx_xvxor_##suffix, cast) \
|
||||
inline _Tpvec v_not(const _Tpvec& a) \
|
||||
OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix, cast) \
|
||||
OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix, cast) \
|
||||
OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix, cast) \
|
||||
inline _Tpvec operator ~ (const _Tpvec& a) \
|
||||
{ return _Tpvec(__lasx_xvxor_##suffix(*((__m256i*)(&a.val)), not_const)); }
|
||||
|
||||
OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(v_float32x8, v, __lasx_xvreplgr2vr_w(-1), _lasx_256_castsi256_ps)
|
||||
@@ -979,25 +981,25 @@ inline v_float64x4 v_select(const v_float64x4 &mask, const v_float64x4 &a, const
|
||||
|
||||
/** Comparison **/
|
||||
#define OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpvec) \
|
||||
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return v_not(v_eq(a, b)); } \
|
||||
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return v_gt(b, a); } \
|
||||
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return v_not(v_lt(a, b)); } \
|
||||
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return v_ge(b, a); }
|
||||
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return ~(a == b); } \
|
||||
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return b > a; } \
|
||||
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return ~(a < b); } \
|
||||
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return b >= a; }
|
||||
|
||||
#define OPENCV_HAL_IMPL_LASX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) \
|
||||
inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \
|
||||
inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
|
||||
{ return _Tpuvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
|
||||
inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \
|
||||
inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
|
||||
{ \
|
||||
return _Tpuvec(__lasx_xvslt_##usuffix(b.val, a.val)); \
|
||||
} \
|
||||
inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \
|
||||
inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
|
||||
{ return _Tpsvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
|
||||
inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \
|
||||
inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
|
||||
{ return _Tpsvec(__lasx_xvslt_##suffix(b.val, a.val)); } \
|
||||
OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpuvec) \
|
||||
OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpsvec)
|
||||
@@ -1007,37 +1009,37 @@ OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint16x16, v_int16x16, h, hu)
|
||||
OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint32x8, v_int32x8, w, wu)
|
||||
|
||||
#define OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(_Tpvec, suffix) \
|
||||
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
|
||||
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return v_not(v_eq(a, b)); }
|
||||
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return ~(a == b); }
|
||||
|
||||
OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_uint64x4, d)
|
||||
OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_int64x4, d)
|
||||
|
||||
#define OPENCV_HAL_IMPL_LASX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) \
|
||||
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(__lasx_##suffix##_##ssuffix(a.val, b.val)); }
|
||||
|
||||
#define OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(_Tpvec, ssuffix) \
|
||||
OPENCV_HAL_IMPL_LASX_CMP_FLT(v_eq, xvfcmp_ceq, _Tpvec, ssuffix) \
|
||||
OPENCV_HAL_IMPL_LASX_CMP_FLT(v_ne, xvfcmp_cne, _Tpvec, ssuffix) \
|
||||
OPENCV_HAL_IMPL_LASX_CMP_FLT(v_lt, xvfcmp_clt, _Tpvec, ssuffix) \
|
||||
OPENCV_HAL_IMPL_LASX_CMP_FLT(v_le, xvfcmp_cle, _Tpvec, ssuffix)
|
||||
OPENCV_HAL_IMPL_LASX_CMP_FLT(==, xvfcmp_ceq, _Tpvec, ssuffix) \
|
||||
OPENCV_HAL_IMPL_LASX_CMP_FLT(!=, xvfcmp_cne, _Tpvec, ssuffix) \
|
||||
OPENCV_HAL_IMPL_LASX_CMP_FLT(<, xvfcmp_clt, _Tpvec, ssuffix) \
|
||||
OPENCV_HAL_IMPL_LASX_CMP_FLT(<=, xvfcmp_cle, _Tpvec, ssuffix)
|
||||
|
||||
OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float32x8, s)
|
||||
OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float64x4, d)
|
||||
|
||||
inline v_float32x8 v_gt(const v_float32x8 &a, const v_float32x8 &b)
|
||||
inline v_float32x8 operator > (const v_float32x8 &a, const v_float32x8 &b)
|
||||
{ return v_float32x8(__lasx_xvfcmp_clt_s(b.val, a.val)); }
|
||||
|
||||
inline v_float32x8 v_ge(const v_float32x8 &a, const v_float32x8 &b)
|
||||
inline v_float32x8 operator >= (const v_float32x8 &a, const v_float32x8 &b)
|
||||
{ return v_float32x8(__lasx_xvfcmp_cle_s(b.val, a.val)); }
|
||||
|
||||
inline v_float64x4 v_gt(const v_float64x4 &a, const v_float64x4 &b)
|
||||
inline v_float64x4 operator > (const v_float64x4 &a, const v_float64x4 &b)
|
||||
{ return v_float64x4(__lasx_xvfcmp_clt_d(b.val, a.val)); }
|
||||
|
||||
inline v_float64x4 v_ge(const v_float64x4 &a, const v_float64x4 &b)
|
||||
inline v_float64x4 operator >= (const v_float64x4 &a, const v_float64x4 &b)
|
||||
{ return v_float64x4(__lasx_xvfcmp_cle_d(b.val, a.val)); }
|
||||
|
||||
inline v_float32x8 v_not_nan(const v_float32x8& a)
|
||||
@@ -1098,7 +1100,7 @@ inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b)
|
||||
template<int imm>
|
||||
inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
|
||||
{
|
||||
enum {IMM_L = ((imm - 16) & 0xFF) > 31 ? 31 : ((imm - 16) & 0xFF)};
|
||||
enum {IMM_L = (imm - 16) & 0xFF};
|
||||
enum {IMM_R = (16 - imm) & 0xFF};
|
||||
|
||||
if (imm == 0) return a;
|
||||
@@ -1115,7 +1117,7 @@ inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
|
||||
template<int imm>
|
||||
inline v_uint8x32 v_rotate_right(const v_uint8x32& a)
|
||||
{
|
||||
enum {IMM_L = ((imm - 16) & 0xFF) > 31 ? 31 : ((imm - 16) & 0xFF)};
|
||||
enum {IMM_L = (imm - 16) & 0xFF};
|
||||
|
||||
if (imm == 0) return a;
|
||||
if (imm > 32) return v_uint8x32();
|
||||
@@ -1305,9 +1307,9 @@ inline unsigned v_reduce_sum(const v_uint32x8& a)
|
||||
{ return v_reduce_sum(v_reinterpret_as_s32(a)); }
|
||||
|
||||
inline int v_reduce_sum(const v_int16x16& a)
|
||||
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
|
||||
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
|
||||
inline unsigned v_reduce_sum(const v_uint16x16& a)
|
||||
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
|
||||
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
|
||||
|
||||
inline float v_reduce_sum(const v_float32x8& a)
|
||||
{
|
||||
@@ -1375,27 +1377,27 @@ inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
|
||||
inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
|
||||
{
|
||||
v_uint32x8 l, h;
|
||||
v_expand(v_add_wrap(v_sub(a, b), v_sub(b, a)), l, h);
|
||||
return v_reduce_sum(v_add(l, h));
|
||||
v_expand(v_add_wrap(a - b, b - a), l, h);
|
||||
return v_reduce_sum(l + h);
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
|
||||
{
|
||||
v_uint32x8 l, h;
|
||||
v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h);
|
||||
return v_reduce_sum(v_add(l, h));
|
||||
return v_reduce_sum(l + h);
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
|
||||
{
|
||||
return v_reduce_sum(v_sub(v_max(a, b), v_min(a, b)));
|
||||
return v_reduce_sum(v_max(a, b) - v_min(a, b));
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
|
||||
{
|
||||
v_int32x8 m = v_lt(a, b);
|
||||
return v_reduce_sum(v_reinterpret_as_u32(v_sub(v_xor(v_sub(a, b), m), m)));
|
||||
v_int32x8 m = a < b;
|
||||
return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m));
|
||||
}
|
||||
inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
|
||||
{
|
||||
v_float32x8 a_b = v_sub(a, b);
|
||||
v_float32x8 a_b = a - b;
|
||||
return v_reduce_sum(v_float32x8(*((__m256i*)&a_b.val) & __lasx_xvreplgr2vr_w(0x7fffffff)));
|
||||
}
|
||||
|
||||
@@ -1499,9 +1501,9 @@ OPENCV_HAL_IMPL_LASX_CHECK_SHORT(v_int16x16)
|
||||
inline _Tpvec v_sqrt(const _Tpvec& x) \
|
||||
{ return _Tpvec(__lasx_xvfsqrt_##suffix(x.val)); } \
|
||||
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return v_fma(a, a, v_mul(b, b)); } \
|
||||
{ return v_fma(a, a, b * b); } \
|
||||
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return v_sqrt(v_fma(a, a, v_mul(b, b))); }
|
||||
{ return v_sqrt(v_fma(a, a, b*b)); }
|
||||
|
||||
OPENCV_HAL_IMPL_LASX_MULADD(v_float32x8, s)
|
||||
OPENCV_HAL_IMPL_LASX_MULADD(v_float64x4, d)
|
||||
@@ -1552,20 +1554,20 @@ inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
|
||||
{ return (v_uint32x8)__lasx_xvabsd_w(a.val, b.val); }
|
||||
|
||||
inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
|
||||
{ return v_abs(v_sub(a, b)); }
|
||||
{ return v_abs(a - b); }
|
||||
|
||||
inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
|
||||
{ return v_abs(v_sub(a, b)); }
|
||||
{ return v_abs(a - b); }
|
||||
|
||||
/** Saturating absolute difference **/
|
||||
inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b)
|
||||
{
|
||||
v_int8x32 d = v_sub(a, b);
|
||||
v_int8x32 m = v_lt(a, b);
|
||||
return v_sub(v_xor(d, m), m);
|
||||
v_int8x32 d = a - b;
|
||||
v_int8x32 m = a < b;
|
||||
return (d ^ m) - m;
|
||||
}
|
||||
inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b)
|
||||
{ return v_sub(v_max(a, b), v_min(a, b)); }
|
||||
{ return v_max(a, b) - v_min(a, b); }
|
||||
|
||||
////////// Conversions /////////
|
||||
|
||||
@@ -1887,7 +1889,7 @@ inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
|
||||
{ return v_int32x8(__lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val))); }
|
||||
|
||||
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
|
||||
{ return v_add(v_dotprod(a, b), c); }
|
||||
{ return v_dotprod(a, b) + c; }
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
|
||||
@@ -1911,7 +1913,7 @@ inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
|
||||
return v_uint32x8(__lasx_xvadd_w(prod0, prod1));
|
||||
}
|
||||
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
|
||||
{
|
||||
@@ -1922,7 +1924,7 @@ inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
|
||||
return v_int32x8(__lasx_xvadd_w(prod0, prod1));
|
||||
}
|
||||
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
|
||||
@@ -1934,7 +1936,7 @@ inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
|
||||
return v_uint64x4(__lasx_xvadd_d(prod0, prod1));
|
||||
}
|
||||
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
|
||||
{
|
||||
@@ -1946,13 +1948,13 @@ inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
|
||||
}
|
||||
|
||||
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b)
|
||||
{ return v_cvt_f64(v_dotprod(a, b)); }
|
||||
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
//////// Fast Dot Product ////////
|
||||
|
||||
@@ -1989,7 +1991,7 @@ inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16&
|
||||
return v_uint64x4(__lasx_xvadd_d(__lasx_xvilvl_d(prod1, prod0), __lasx_xvilvh_d(prod1, prod0)));
|
||||
}
|
||||
inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
|
||||
{ return v_add(v_dotprod_expand_fast(a, b), c); }
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
|
||||
{
|
||||
@@ -2000,7 +2002,7 @@ inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
|
||||
return v_int64x4(__lasx_xvadd_d(lo, hi));
|
||||
}
|
||||
inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
|
||||
{ return v_add(v_dotprod_expand_fast(a, b), c); }
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b)
|
||||
@@ -2020,7 +2022,7 @@ inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0,
|
||||
v_float32x8 v15 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0x55);
|
||||
v_float32x8 v26 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xAA);
|
||||
v_float32x8 v37 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xFF);
|
||||
return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v_mul(v37, m3))));
|
||||
return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
|
||||
}
|
||||
|
||||
inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0,
|
||||
@@ -3013,20 +3015,6 @@ inline void v_pack_store(hfloat* ptr, const v_float32x8& a)
|
||||
|
||||
inline void v256_cleanup() {}
|
||||
|
||||
#include "intrin_math.hpp"
|
||||
inline v_float32x8 v_exp(const v_float32x8& x) { return v_exp_default_32f<v_float32x8, v_int32x8>(x); }
|
||||
inline v_float32x8 v_log(const v_float32x8& x) { return v_log_default_32f<v_float32x8, v_int32x8>(x); }
|
||||
inline void v_sincos(const v_float32x8& x, v_float32x8& s, v_float32x8& c) { v_sincos_default_32f<v_float32x8, v_int32x8>(x, s, c); }
|
||||
inline v_float32x8 v_sin(const v_float32x8& x) { return v_sin_default_32f<v_float32x8, v_int32x8>(x); }
|
||||
inline v_float32x8 v_cos(const v_float32x8& x) { return v_cos_default_32f<v_float32x8, v_int32x8>(x); }
|
||||
inline v_float32x8 v_erf(const v_float32x8& x) { return v_erf_default_32f<v_float32x8, v_int32x8>(x); }
|
||||
|
||||
inline v_float64x4 v_exp(const v_float64x4& x) { return v_exp_default_64f<v_float64x4, v_int64x4>(x); }
|
||||
inline v_float64x4 v_log(const v_float64x4& x) { return v_log_default_64f<v_float64x4, v_int64x4>(x); }
|
||||
inline void v_sincos(const v_float64x4& x, v_float64x4& s, v_float64x4& c) { v_sincos_default_64f<v_float64x4, v_int64x4>(x, s, c); }
|
||||
inline v_float64x4 v_sin(const v_float64x4& x) { return v_sin_default_64f<v_float64x4, v_int64x4>(x); }
|
||||
inline v_float64x4 v_cos(const v_float64x4& x) { return v_cos_default_64f<v_float64x4, v_int64x4>(x); }
|
||||
|
||||
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
|
||||
|
||||
//! @endcond
|
||||
|
||||
@@ -1,111 +0,0 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html
|
||||
|
||||
// This file has been created for compatibility with older versions of Universal Intrinscs
|
||||
// Binary operators for vector types has been removed since version 4.11
|
||||
// Include this file manually after OpenCV headers if you need these operators
|
||||
|
||||
#ifndef OPENCV_HAL_INTRIN_LEGACY_OPS_HPP
|
||||
#define OPENCV_HAL_INTRIN_LEGACY_OPS_HPP
|
||||
|
||||
#ifdef __OPENCV_BUILD
|
||||
#error "Universal Intrinsics operators are deprecated and should not be used in OpenCV library"
|
||||
#endif
|
||||
|
||||
#ifdef __riscv
|
||||
#warning "Operators might conflict with built-in functions on RISC-V platform"
|
||||
#endif
|
||||
|
||||
#if defined(CV_VERSION) && CV_VERSION_MAJOR == 4 && CV_VERSION_MINOR < 9
|
||||
#warning "Older versions of OpenCV (<4.9) already have Universal Intrinscs operators"
|
||||
#endif
|
||||
|
||||
|
||||
namespace cv { namespace hal {
|
||||
|
||||
#define BIN_OP(OP, FUN) \
|
||||
template <typename R> R operator OP (const R & lhs, const R & rhs) { return FUN(lhs, rhs); }
|
||||
|
||||
#define BIN_A_OP(OP, FUN) \
|
||||
template <typename R> R & operator OP (R & res, const R & val) { res = FUN(res, val); return res; }
|
||||
|
||||
#define UN_OP(OP, FUN) \
|
||||
template <typename R> R operator OP (const R & val) { return FUN(val); }
|
||||
|
||||
BIN_OP(+, v_add)
|
||||
BIN_OP(-, v_sub)
|
||||
BIN_OP(*, v_mul)
|
||||
BIN_OP(/, v_div)
|
||||
BIN_OP(&, v_and)
|
||||
BIN_OP(|, v_or)
|
||||
BIN_OP(^, v_xor)
|
||||
|
||||
BIN_OP(==, v_eq)
|
||||
BIN_OP(!=, v_ne)
|
||||
BIN_OP(<, v_lt)
|
||||
BIN_OP(>, v_gt)
|
||||
BIN_OP(<=, v_le)
|
||||
BIN_OP(>=, v_ge)
|
||||
|
||||
BIN_A_OP(+=, v_add)
|
||||
BIN_A_OP(-=, v_sub)
|
||||
BIN_A_OP(*=, v_mul)
|
||||
BIN_A_OP(/=, v_div)
|
||||
BIN_A_OP(&=, v_and)
|
||||
BIN_A_OP(|=, v_or)
|
||||
BIN_A_OP(^=, v_xor)
|
||||
|
||||
UN_OP(~, v_not)
|
||||
|
||||
// TODO: shift operators?
|
||||
|
||||
}} // cv::hal::
|
||||
|
||||
//==============================================================================
|
||||
|
||||
#ifdef OPENCV_ENABLE_INLINE_INTRIN_OPERATOR_TEST
|
||||
|
||||
namespace cv { namespace hal {
|
||||
|
||||
inline static void opencv_operator_compile_test()
|
||||
{
|
||||
using namespace cv;
|
||||
v_float32 a, b, c;
|
||||
uint8_t shift = 1;
|
||||
a = b + c;
|
||||
a = b - c;
|
||||
a = b * c;
|
||||
a = b / c;
|
||||
a = b & c;
|
||||
a = b | c;
|
||||
a = b ^ c;
|
||||
// a = b >> shift;
|
||||
// a = b << shift;
|
||||
|
||||
a = (b == c);
|
||||
a = (b != c);
|
||||
a = (b < c);}}
|
||||
a = (b > c);
|
||||
a = (b <= c);
|
||||
a = (b >= c);
|
||||
|
||||
a += b;
|
||||
a -= b;
|
||||
a *= b;
|
||||
a /= b;
|
||||
a &= b;
|
||||
a |= b;
|
||||
a ^= b;
|
||||
// a <<= shift;
|
||||
// a >>= shift;
|
||||
|
||||
a = ~b;
|
||||
}
|
||||
|
||||
}} // cv::hal::
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#endif // OPENCV_HAL_INTRIN_LEGACY_OPS_HPP
|
||||
@@ -417,10 +417,6 @@ inline __m128i _lsx_128_castpd_si128(const __m128d& v)
|
||||
{ return _Tpvec(__lsx_vldi(0)); } \
|
||||
inline _Tpvec v_setall_##suffix(_Tp v) \
|
||||
{ return _Tpvec(__lsx_vreplgr2vr_##ssuffix((ctype_s)v)); } \
|
||||
template <> inline _Tpvec v_setzero_() \
|
||||
{ return v_setzero_##suffix(); } \
|
||||
template <> inline _Tpvec v_setall_(_Tp v) \
|
||||
{ return v_setall_##suffix(v); } \
|
||||
OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16, suffix, OPENCV_HAL_NOP) \
|
||||
OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16, suffix, OPENCV_HAL_NOP) \
|
||||
OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8, suffix, OPENCV_HAL_NOP) \
|
||||
@@ -452,10 +448,6 @@ inline __m128d _lsx_128_castsi128_pd(const __m128i &v)
|
||||
{ return _Tpvec(__lsx_vldi(0)); } \
|
||||
inline _Tpvec v_setall_##suffix(_Tp v) \
|
||||
{ return _Tpvec(_v128_setall_##zsuffix(v)); } \
|
||||
template <> inline _Tpvec v_setzero_() \
|
||||
{ return v_setzero_##suffix(); } \
|
||||
template <> inline _Tpvec v_setall_(_Tp v) \
|
||||
{ return v_setall_##suffix(v); } \
|
||||
OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16, suffix, cast) \
|
||||
OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16, suffix, cast) \
|
||||
OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8, suffix, cast) \
|
||||
@@ -533,51 +525,53 @@ OPENCV_HAL_IMPL_LSX_ZIP(v_float64x2)
|
||||
|
||||
/** Arithmetics **/
|
||||
#define OPENCV_HAL_IMPL_LSX_BIN_OP(bin_op, _Tpvec, intrin) \
|
||||
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(intrin(a.val, b.val)); }
|
||||
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(intrin(a.val, b.val)); } \
|
||||
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
|
||||
{ a.val = intrin(a.val, b.val); return a; }
|
||||
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint8x16, __lsx_vsadd_bu)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint8x16, __lsx_vssub_bu)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int8x16, __lsx_vsadd_b)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int8x16, __lsx_vssub_b)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint16x8, __lsx_vsadd_hu)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint16x8, __lsx_vssub_hu)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int16x8, __lsx_vsadd_h)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int16x8, __lsx_vssub_h)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint32x4, __lsx_vadd_w)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint32x4, __lsx_vsub_w)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_uint32x4, __lsx_vmul_w)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int32x4, __lsx_vadd_w)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int32x4, __lsx_vsub_w)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_int32x4, __lsx_vmul_w)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint64x2, __lsx_vadd_d)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint64x2, __lsx_vsub_d)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int64x2, __lsx_vadd_d)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int64x2, __lsx_vsub_d)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint8x16, __lsx_vsadd_bu)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint8x16, __lsx_vssub_bu)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int8x16, __lsx_vsadd_b)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int8x16, __lsx_vssub_b)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint16x8, __lsx_vsadd_hu)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint16x8, __lsx_vssub_hu)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int16x8, __lsx_vsadd_h)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int16x8, __lsx_vssub_h)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint32x4, __lsx_vadd_w)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint32x4, __lsx_vsub_w)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_uint32x4, __lsx_vmul_w)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int32x4, __lsx_vadd_w)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int32x4, __lsx_vsub_w)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_int32x4, __lsx_vmul_w)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint64x2, __lsx_vadd_d)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint64x2, __lsx_vsub_d)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int64x2, __lsx_vadd_d)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int64x2, __lsx_vsub_d)
|
||||
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_float32x4, __lsx_vfadd_s)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_float32x4, __lsx_vfsub_s)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_float32x4, __lsx_vfmul_s)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_div, v_float32x4, __lsx_vfdiv_s)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_float64x2, __lsx_vfadd_d)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_float64x2, __lsx_vfsub_d)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_float64x2, __lsx_vfmul_d)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_div, v_float64x2, __lsx_vfdiv_d)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float32x4, __lsx_vfadd_s)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float32x4, __lsx_vfsub_s)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float32x4, __lsx_vfmul_s)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float32x4, __lsx_vfdiv_s)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float64x2, __lsx_vfadd_d)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float64x2, __lsx_vfsub_d)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float64x2, __lsx_vfmul_d)
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float64x2, __lsx_vfdiv_d)
|
||||
|
||||
// saturating multiply 8-bit, 16-bit
|
||||
inline v_uint8x16 v_mul(const v_uint8x16& a, const v_uint8x16& b)
|
||||
inline v_uint8x16 operator * (const v_uint8x16& a, const v_uint8x16& b)
|
||||
{
|
||||
v_uint16x8 c, d;
|
||||
v_mul_expand(a, b, c, d);
|
||||
return v_pack(c, d);
|
||||
}
|
||||
inline v_int8x16 v_mul(const v_int8x16& a, const v_int8x16& b)
|
||||
inline v_int8x16 operator * (const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
v_int16x8 c, d;
|
||||
v_mul_expand(a, b, c, d);
|
||||
return v_pack(c, d);
|
||||
}
|
||||
inline v_uint16x8 v_mul(const v_uint16x8& a, const v_uint16x8& b)
|
||||
inline v_uint16x8 operator * (const v_uint16x8& a, const v_uint16x8& b)
|
||||
{
|
||||
__m128i a0 = a.val, b0 = b.val;
|
||||
__m128i pev = __lsx_vmulwev_w_hu(a0, b0);
|
||||
@@ -586,7 +580,7 @@ inline v_uint16x8 v_mul(const v_uint16x8& a, const v_uint16x8& b)
|
||||
__m128i ph = __lsx_vilvh_w(pod, pev);
|
||||
return (v_uint16x8)__lsx_vssrlrni_hu_w(ph, pl, 0);
|
||||
}
|
||||
inline v_int16x8 v_mul(const v_int16x8& a, const v_int16x8& b)
|
||||
inline v_int16x8 operator * (const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
__m128i a0 = a.val, b0 = b.val;
|
||||
__m128i pev = __lsx_vmulwev_w_h(a0, b0);
|
||||
@@ -595,6 +589,14 @@ inline v_int16x8 v_mul(const v_int16x8& a, const v_int16x8& b)
|
||||
__m128i ph = __lsx_vilvh_w(pod, pev);
|
||||
return (v_int16x8)__lsx_vssrarni_h_w(ph, pl, 0);
|
||||
}
|
||||
inline v_uint8x16& operator *= (v_uint8x16& a, const v_uint8x16& b)
|
||||
{ a = a * b; return a; }
|
||||
inline v_int8x16& operator *= (v_int8x16& a, const v_int8x16& b)
|
||||
{ a = a * b; return a; }
|
||||
inline v_uint16x8& operator *= (v_uint16x8& a, const v_uint16x8& b)
|
||||
{ a = a * b; return a; }
|
||||
inline v_int16x8& operator *= (v_int16x8& a, const v_int16x8& b)
|
||||
{ a = a * b; return a; }
|
||||
|
||||
/** Non-saturating arithmetics **/
|
||||
|
||||
@@ -679,13 +681,13 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
|
||||
|
||||
/** Bitwise shifts **/
|
||||
#define OPENCV_HAL_IMPL_LSX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
|
||||
inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \
|
||||
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
|
||||
{ return _Tpuvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
|
||||
inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \
|
||||
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
|
||||
{ return _Tpsvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
|
||||
inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \
|
||||
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
|
||||
{ return _Tpuvec(__lsx_vsrl_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
|
||||
inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \
|
||||
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
|
||||
{ return _Tpsvec(srai(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
|
||||
template<int imm> \
|
||||
inline _Tpuvec v_shl(const _Tpuvec& a) \
|
||||
@@ -706,10 +708,10 @@ OPENCV_HAL_IMPL_LSX_SHIFT_OP(v_uint64x2, v_int64x2, d, __lsx_vsra_d)
|
||||
|
||||
/** Bitwise logic **/
|
||||
#define OPENCV_HAL_IMPL_LSX_LOGIC_OP(_Tpvec, suffix) \
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_and, _Tpvec, __lsx_vand_##suffix) \
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_or, _Tpvec, __lsx_vor_##suffix) \
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(v_xor, _Tpvec, __lsx_vxor_##suffix) \
|
||||
inline _Tpvec v_not(const _Tpvec& a) \
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(&, _Tpvec, __lsx_vand_##suffix) \
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(|, _Tpvec, __lsx_vor_##suffix) \
|
||||
OPENCV_HAL_IMPL_LSX_BIN_OP(^, _Tpvec, __lsx_vxor_##suffix) \
|
||||
inline _Tpvec operator ~(const _Tpvec& a) \
|
||||
{ return _Tpvec(__lsx_vnori_b(a.val, 0)); } \
|
||||
|
||||
OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint8x16, v)
|
||||
@@ -722,14 +724,18 @@ OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint64x2, v)
|
||||
OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int64x2, v)
|
||||
|
||||
#define OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) \
|
||||
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(intrin((__m128i)(a.val), (__m128i)(b.val))); }
|
||||
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(intrin((__m128i)(a.val), (__m128i)(b.val))); } \
|
||||
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
|
||||
{ __m128i c = intrin((__m128i)(a.val), (__m128i)b.val); \
|
||||
a.val = cast(c); \
|
||||
return a;}
|
||||
|
||||
#define OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(_Tpvec, cast) \
|
||||
OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(v_and, _Tpvec, __lsx_vand_v, cast) \
|
||||
OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(v_or, _Tpvec, __lsx_vor_v, cast) \
|
||||
OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(v_xor, _Tpvec, __lsx_vxor_v, cast) \
|
||||
inline _Tpvec v_not(const _Tpvec& a) \
|
||||
OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(&, _Tpvec, __lsx_vand_v, cast) \
|
||||
OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(|, _Tpvec, __lsx_vor_v, cast) \
|
||||
OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(^, _Tpvec, __lsx_vxor_v, cast) \
|
||||
inline _Tpvec operator ~ (const _Tpvec& a) \
|
||||
{ return _Tpvec(__lsx_vnori_b((__m128i)(a.val), 0)); } \
|
||||
|
||||
OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(v_float32x4, _lsx_128_castsi128_ps)
|
||||
@@ -754,23 +760,23 @@ inline v_float64x2 v_select(const v_float64x2 &mask, const v_float64x2 &a, const
|
||||
|
||||
/** Comparison **/
|
||||
#define OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpvec) \
|
||||
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return v_not(v_eq(a, b)); } \
|
||||
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return v_gt(b, a); } \
|
||||
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return v_not(v_lt(a, b)); } \
|
||||
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return v_ge(b, a); } \
|
||||
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return ~( a == b ); } \
|
||||
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return b > a ; } \
|
||||
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return ~(a < b); } \
|
||||
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return b >= a; } \
|
||||
|
||||
#define OPENCV_HAL_IMPL_LSX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) \
|
||||
inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \
|
||||
inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
|
||||
{ return _Tpuvec(__lsx_vseq_##suffix(a.val, b.val)); } \
|
||||
inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \
|
||||
inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
|
||||
{ return _Tpuvec(__lsx_vslt_##usuffix(b.val, a.val)); } \
|
||||
inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \
|
||||
inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
|
||||
{ return _Tpsvec(__lsx_vseq_##suffix(a.val, b.val)); } \
|
||||
inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \
|
||||
inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
|
||||
{ return _Tpsvec(__lsx_vslt_##suffix(b.val, a.val)); } \
|
||||
OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpuvec) \
|
||||
OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpsvec)
|
||||
@@ -780,37 +786,37 @@ OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint16x8, v_int16x8, h, hu)
|
||||
OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint32x4, v_int32x4, w, wu)
|
||||
|
||||
#define OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(_Tpvec, suffix) \
|
||||
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(__lsx_vseq_##suffix(a.val, b.val)); } \
|
||||
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return v_not(v_eq(a, b)); }
|
||||
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return ~(a == b); }
|
||||
|
||||
OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_uint64x2, d)
|
||||
OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_int64x2, d)
|
||||
|
||||
#define OPENCV_HAL_IMPL_LSX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) \
|
||||
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(__lsx_##suffix##_##ssuffix(a.val, b.val)); } \
|
||||
|
||||
#define OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(_Tpvec, ssuffix) \
|
||||
OPENCV_HAL_IMPL_LSX_CMP_FLT(v_eq, vfcmp_ceq, _Tpvec, ssuffix) \
|
||||
OPENCV_HAL_IMPL_LSX_CMP_FLT(v_ne, vfcmp_cne, _Tpvec, ssuffix) \
|
||||
OPENCV_HAL_IMPL_LSX_CMP_FLT(v_lt, vfcmp_clt, _Tpvec, ssuffix) \
|
||||
OPENCV_HAL_IMPL_LSX_CMP_FLT(v_le, vfcmp_cle, _Tpvec, ssuffix) \
|
||||
OPENCV_HAL_IMPL_LSX_CMP_FLT(==, vfcmp_ceq, _Tpvec, ssuffix) \
|
||||
OPENCV_HAL_IMPL_LSX_CMP_FLT(!=, vfcmp_cne, _Tpvec, ssuffix) \
|
||||
OPENCV_HAL_IMPL_LSX_CMP_FLT(<, vfcmp_clt, _Tpvec, ssuffix) \
|
||||
OPENCV_HAL_IMPL_LSX_CMP_FLT(<=, vfcmp_cle, _Tpvec, ssuffix) \
|
||||
|
||||
OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float32x4, s)
|
||||
OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float64x2, d)
|
||||
|
||||
inline v_float32x4 v_gt(const v_float32x4 &a, const v_float32x4 &b)
|
||||
inline v_float32x4 operator > (const v_float32x4 &a, const v_float32x4 &b)
|
||||
{ return v_float32x4(__lsx_vfcmp_clt_s(b.val, a.val)); }
|
||||
|
||||
inline v_float32x4 v_ge(const v_float32x4 &a, const v_float32x4 &b)
|
||||
inline v_float32x4 operator >= (const v_float32x4 &a, const v_float32x4 &b)
|
||||
{ return v_float32x4(__lsx_vfcmp_cle_s(b.val, a.val)); }
|
||||
|
||||
inline v_float64x2 v_gt(const v_float64x2 &a, const v_float64x2 &b)
|
||||
inline v_float64x2 operator > (const v_float64x2 &a, const v_float64x2 &b)
|
||||
{ return v_float64x2(__lsx_vfcmp_clt_d(b.val, a.val)); }
|
||||
|
||||
inline v_float64x2 v_ge(const v_float64x2 &a, const v_float64x2 &b)
|
||||
inline v_float64x2 operator >= (const v_float64x2 &a, const v_float64x2 &b)
|
||||
{ return v_float64x2(__lsx_vfcmp_cle_d(b.val, a.val)); }
|
||||
|
||||
inline v_float32x4 v_not_nan(const v_float32x4& a)
|
||||
@@ -1182,7 +1188,7 @@ inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
|
||||
|
||||
inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
|
||||
{
|
||||
v_float32x4 a_b = v_sub(a, b);
|
||||
v_float32x4 a_b = a - b;
|
||||
return v_reduce_sum(v_float32x4((__m128i)a_b.val & __lsx_vreplgr2vr_w(0x7fffffff)));
|
||||
}
|
||||
|
||||
@@ -1289,9 +1295,9 @@ OPENCV_HAL_IMPL_LSX_CHECK(v_float64x2, 3)
|
||||
inline _Tpvec v_sqrt(const _Tpvec& x) \
|
||||
{ return _Tpvec(__lsx_vfsqrt_##suffix(x.val)); } \
|
||||
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return v_fma(a, a, v_mul(b, b)); } \
|
||||
{ return v_fma(a, a, b * b); } \
|
||||
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return v_sqrt(v_fma(a, a, v_mul(b, b))); }
|
||||
{ return v_sqrt(v_fma(a, a, b * b)); }
|
||||
|
||||
OPENCV_HAL_IMPL_LSX_MULADD(v_float32x4, s)
|
||||
OPENCV_HAL_IMPL_LSX_MULADD(v_float64x2, d)
|
||||
@@ -1343,20 +1349,20 @@ inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return (v_uint32x4)__lsx_vabsd_w(a.val, b.val); }
|
||||
|
||||
inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
|
||||
{ return v_abs(v_sub(a, b)); }
|
||||
{ return v_abs(a - b); }
|
||||
|
||||
inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
|
||||
{ return v_abs(v_sub(a, b)); }
|
||||
{ return v_abs(a - b); }
|
||||
|
||||
/** Saturating absolute difference **/
|
||||
inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
v_int8x16 d = v_sub(a, b);
|
||||
v_int8x16 m = v_lt(a, b);
|
||||
return v_sub(v_xor(d, m), m);
|
||||
v_int8x16 d = a - b;
|
||||
v_int8x16 m = a < b;
|
||||
return (d ^ m) - m;
|
||||
}
|
||||
inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
|
||||
{ return v_sub(v_max(a, b), v_min(a, b)); }
|
||||
{ return v_max(a, b) - v_min(a, b); }
|
||||
|
||||
///////// Conversions /////////
|
||||
|
||||
@@ -1667,7 +1673,7 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
|
||||
}
|
||||
|
||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c) ;}
|
||||
{ return v_dotprod_expand(a, b) + c ;}
|
||||
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
@@ -1679,7 +1685,7 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
|
||||
return v_int32x4(__lsx_vadd_w(prod0, prod1));
|
||||
}
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
|
||||
@@ -1692,7 +1698,7 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
|
||||
return v_uint64x2(__lsx_vadd_d(prod0, prod1));
|
||||
}
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
@@ -1704,13 +1710,13 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
|
||||
return v_int64x2(__lsx_vadd_d(prod0, prod1));
|
||||
}
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
//32 >> 64f
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_cvt_f64(v_dotprod(a, b)); }
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
|
||||
///////// Fast Dot Product //////
|
||||
@@ -1749,7 +1755,7 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b
|
||||
return v_uint64x2(__lsx_vilvl_d(__lsx_vhaddw_qu_du(prod0, prod0), __lsx_vhaddw_qu_du(prod1, prod1)));
|
||||
}
|
||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
|
||||
{ return v_add(v_dotprod_expand_fast(a, b), c); }
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
@@ -1761,7 +1767,7 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
return v_int64x2(__lsx_vadd_d(lo, hi));
|
||||
}
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{ return v_add(v_dotprod_expand_fast(a, b), c); }
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
@@ -2523,20 +2529,6 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& a)
|
||||
|
||||
inline void v_cleanup() {}
|
||||
|
||||
#include "intrin_math.hpp"
|
||||
inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
|
||||
inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
|
||||
inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
|
||||
inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
|
||||
inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
|
||||
inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
|
||||
inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
|
||||
|
||||
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
|
||||
|
||||
//! @endcond
|
||||
|
||||
@@ -1,687 +0,0 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html
|
||||
|
||||
|
||||
/* Universal Intrinsics implementation of sin, cos, exp and log
|
||||
|
||||
Inspired by Intel Approximate Math library, and based on the
|
||||
corresponding algorithms of the cephes math library
|
||||
*/
|
||||
|
||||
/* Copyright (C) 2010,2011 RJVB - extensions */
|
||||
/* Copyright (C) 2011 Julien Pommier
|
||||
|
||||
This software is provided 'as-is', without any express or implied
|
||||
warranty. In no event will the authors be held liable for any damages
|
||||
arising from the use of this software.
|
||||
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it
|
||||
freely, subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not
|
||||
claim that you wrote the original software. If you use this software
|
||||
in a product, an acknowledgment in the product documentation would be
|
||||
appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
|
||||
(this is the zlib license)
|
||||
*/
|
||||
#ifndef OPENCV_HAL_INTRIN_MATH_HPP
|
||||
#define OPENCV_HAL_INTRIN_MATH_HPP
|
||||
|
||||
//! @name Exponential
|
||||
//! @{
|
||||
// Implementation is the same as float32 vector.
|
||||
template<typename _TpVec16F, typename _TpVec16S>
|
||||
inline _TpVec16F v_exp_default_16f(const _TpVec16F &x) {
|
||||
const _TpVec16F _vexp_lo_f16 = v_setall_<_TpVec16F>(-10.7421875f);
|
||||
const _TpVec16F _vexp_hi_f16 = v_setall_<_TpVec16F>(11.f);
|
||||
const _TpVec16F _vexp_half_fp16 = v_setall_<_TpVec16F>(0.5f);
|
||||
const _TpVec16F _vexp_one_fp16 = v_setall_<_TpVec16F>(1.f);
|
||||
const _TpVec16F _vexp_LOG2EF_f16 = v_setall_<_TpVec16F>(1.44269504088896341f);
|
||||
const _TpVec16F _vexp_C1_f16 = v_setall_<_TpVec16F>(-6.93359375E-1f);
|
||||
const _TpVec16F _vexp_C2_f16 = v_setall_<_TpVec16F>(2.12194440E-4f);
|
||||
const _TpVec16F _vexp_p0_f16 = v_setall_<_TpVec16F>(1.9875691500E-4f);
|
||||
const _TpVec16F _vexp_p1_f16 = v_setall_<_TpVec16F>(1.3981999507E-3f);
|
||||
const _TpVec16F _vexp_p2_f16 = v_setall_<_TpVec16F>(8.3334519073E-3f);
|
||||
const _TpVec16F _vexp_p3_f16 = v_setall_<_TpVec16F>(4.1665795894E-2f);
|
||||
const _TpVec16F _vexp_p4_f16 = v_setall_<_TpVec16F>(1.6666665459E-1f);
|
||||
const _TpVec16F _vexp_p5_f16 = v_setall_<_TpVec16F>(5.0000001201E-1f);
|
||||
|
||||
_TpVec16F _vexp_, _vexp_x, _vexp_y, _vexp_xx;
|
||||
_TpVec16S _vexp_mm;
|
||||
const _TpVec16S _vexp_bias_s16 = v_setall_<_TpVec16S>((short)0xf);
|
||||
|
||||
// compute exponential of x
|
||||
_vexp_x = v_max(x, _vexp_lo_f16);
|
||||
_vexp_x = v_min(_vexp_x, _vexp_hi_f16);
|
||||
|
||||
_vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f16, _vexp_half_fp16);
|
||||
_vexp_mm = v_floor(_vexp_);
|
||||
_vexp_ = v_cvt_f16(_vexp_mm);
|
||||
_vexp_mm = v_add(_vexp_mm, _vexp_bias_s16);
|
||||
_vexp_mm = v_shl(_vexp_mm, 10);
|
||||
|
||||
_vexp_x = v_fma(_vexp_, _vexp_C1_f16, _vexp_x);
|
||||
_vexp_x = v_fma(_vexp_, _vexp_C2_f16, _vexp_x);
|
||||
_vexp_xx = v_mul(_vexp_x, _vexp_x);
|
||||
|
||||
_vexp_y = v_fma(_vexp_x, _vexp_p0_f16, _vexp_p1_f16);
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f16);
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f16);
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f16);
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f16);
|
||||
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x);
|
||||
_vexp_y = v_add(_vexp_y, _vexp_one_fp16);
|
||||
_vexp_y = v_mul(_vexp_y, v_reinterpret_as_f16(_vexp_mm));
|
||||
|
||||
// exp(NAN) -> NAN
|
||||
_TpVec16F mask_not_nan = v_not_nan(x);
|
||||
return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00)));
|
||||
}
|
||||
|
||||
template<typename _TpVec32F, typename _TpVec32S>
|
||||
inline _TpVec32F v_exp_default_32f(const _TpVec32F &x) {
|
||||
const _TpVec32F _vexp_lo_f32 = v_setall_<_TpVec32F>(-88.3762626647949f);
|
||||
const _TpVec32F _vexp_hi_f32 = v_setall_<_TpVec32F>(89.f);
|
||||
const _TpVec32F _vexp_half_fp32 = v_setall_<_TpVec32F>(0.5f);
|
||||
const _TpVec32F _vexp_one_fp32 = v_setall_<_TpVec32F>(1.f);
|
||||
const _TpVec32F _vexp_LOG2EF_f32 = v_setall_<_TpVec32F>(1.44269504088896341f);
|
||||
const _TpVec32F _vexp_C1_f32 = v_setall_<_TpVec32F>(-6.93359375E-1f);
|
||||
const _TpVec32F _vexp_C2_f32 = v_setall_<_TpVec32F>(2.12194440E-4f);
|
||||
const _TpVec32F _vexp_p0_f32 = v_setall_<_TpVec32F>(1.9875691500E-4f);
|
||||
const _TpVec32F _vexp_p1_f32 = v_setall_<_TpVec32F>(1.3981999507E-3f);
|
||||
const _TpVec32F _vexp_p2_f32 = v_setall_<_TpVec32F>(8.3334519073E-3f);
|
||||
const _TpVec32F _vexp_p3_f32 = v_setall_<_TpVec32F>(4.1665795894E-2f);
|
||||
const _TpVec32F _vexp_p4_f32 = v_setall_<_TpVec32F>(1.6666665459E-1f);
|
||||
const _TpVec32F _vexp_p5_f32 = v_setall_<_TpVec32F>(5.0000001201E-1f);
|
||||
|
||||
_TpVec32F _vexp_, _vexp_x, _vexp_y, _vexp_xx;
|
||||
_TpVec32S _vexp_mm;
|
||||
const _TpVec32S _vexp_bias_s32 = v_setall_<_TpVec32S>((int)0x7f);
|
||||
|
||||
// compute exponential of x
|
||||
_vexp_x = v_max(x, _vexp_lo_f32);
|
||||
_vexp_x = v_min(_vexp_x, _vexp_hi_f32);
|
||||
|
||||
_vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f32, _vexp_half_fp32);
|
||||
_vexp_mm = v_floor(_vexp_);
|
||||
_vexp_ = v_cvt_f32(_vexp_mm);
|
||||
_vexp_mm = v_add(_vexp_mm, _vexp_bias_s32);
|
||||
_vexp_mm = v_shl(_vexp_mm, 23);
|
||||
|
||||
_vexp_x = v_fma(_vexp_, _vexp_C1_f32, _vexp_x);
|
||||
_vexp_x = v_fma(_vexp_, _vexp_C2_f32, _vexp_x);
|
||||
_vexp_xx = v_mul(_vexp_x, _vexp_x);
|
||||
|
||||
_vexp_y = v_fma(_vexp_x, _vexp_p0_f32, _vexp_p1_f32);
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f32);
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f32);
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f32);
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f32);
|
||||
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x);
|
||||
_vexp_y = v_add(_vexp_y, _vexp_one_fp32);
|
||||
_vexp_y = v_mul(_vexp_y, v_reinterpret_as_f32(_vexp_mm));
|
||||
|
||||
// exp(NAN) -> NAN
|
||||
_TpVec32F mask_not_nan = v_not_nan(x);
|
||||
return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000)));
|
||||
}
|
||||
|
||||
template<typename _TpVec64F, typename _TpVec64S>
|
||||
inline _TpVec64F v_exp_default_64f(const _TpVec64F &x) {
|
||||
const _TpVec64F _vexp_lo_f64 = v_setall_<_TpVec64F>(-709.43613930310391424428);
|
||||
const _TpVec64F _vexp_hi_f64 = v_setall_<_TpVec64F>(710.);
|
||||
const _TpVec64F _vexp_half_f64 = v_setall_<_TpVec64F>(0.5);
|
||||
const _TpVec64F _vexp_one_f64 = v_setall_<_TpVec64F>(1.0);
|
||||
const _TpVec64F _vexp_two_f64 = v_setall_<_TpVec64F>(2.0);
|
||||
const _TpVec64F _vexp_LOG2EF_f64 = v_setall_<_TpVec64F>(1.44269504088896340736);
|
||||
const _TpVec64F _vexp_C1_f64 = v_setall_<_TpVec64F>(-6.93145751953125E-1);
|
||||
const _TpVec64F _vexp_C2_f64 = v_setall_<_TpVec64F>(-1.42860682030941723212E-6);
|
||||
const _TpVec64F _vexp_p0_f64 = v_setall_<_TpVec64F>(1.26177193074810590878E-4);
|
||||
const _TpVec64F _vexp_p1_f64 = v_setall_<_TpVec64F>(3.02994407707441961300E-2);
|
||||
const _TpVec64F _vexp_p2_f64 = v_setall_<_TpVec64F>(9.99999999999999999910E-1);
|
||||
const _TpVec64F _vexp_q0_f64 = v_setall_<_TpVec64F>(3.00198505138664455042E-6);
|
||||
const _TpVec64F _vexp_q1_f64 = v_setall_<_TpVec64F>(2.52448340349684104192E-3);
|
||||
const _TpVec64F _vexp_q2_f64 = v_setall_<_TpVec64F>(2.27265548208155028766E-1);
|
||||
const _TpVec64F _vexp_q3_f64 = v_setall_<_TpVec64F>(2.00000000000000000009E0);
|
||||
|
||||
_TpVec64F _vexp_, _vexp_x, _vexp_y, _vexp_z, _vexp_xx;
|
||||
_TpVec64S _vexp_mm;
|
||||
const _TpVec64S _vexp_bias_s64 = v_setall_<_TpVec64S>((int64)0x3ff);
|
||||
|
||||
// compute exponential of x
|
||||
_vexp_x = v_max(x, _vexp_lo_f64);
|
||||
_vexp_x = v_min(_vexp_x, _vexp_hi_f64);
|
||||
|
||||
_vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f64, _vexp_half_f64);
|
||||
_vexp_mm = v_expand_low(v_floor(_vexp_));
|
||||
_vexp_ = v_cvt_f64(_vexp_mm);
|
||||
_vexp_mm = v_add(_vexp_mm, _vexp_bias_s64);
|
||||
_vexp_mm = v_shl(_vexp_mm, 52);
|
||||
|
||||
_vexp_x = v_fma(_vexp_, _vexp_C1_f64, _vexp_x);
|
||||
_vexp_x = v_fma(_vexp_, _vexp_C2_f64, _vexp_x);
|
||||
_vexp_xx = v_mul(_vexp_x, _vexp_x);
|
||||
|
||||
_vexp_y = v_fma(_vexp_xx, _vexp_p0_f64, _vexp_p1_f64);
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_p2_f64);
|
||||
_vexp_y = v_mul(_vexp_y, _vexp_x);
|
||||
|
||||
_vexp_z = v_fma(_vexp_xx, _vexp_q0_f64, _vexp_q1_f64);
|
||||
_vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q2_f64);
|
||||
_vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q3_f64);
|
||||
|
||||
_vexp_z = v_div(_vexp_y, v_sub(_vexp_z, _vexp_y));
|
||||
_vexp_z = v_fma(_vexp_two_f64, _vexp_z, _vexp_one_f64);
|
||||
_vexp_z = v_mul(_vexp_z, v_reinterpret_as_f64(_vexp_mm));
|
||||
|
||||
// exp(NAN) -> NAN
|
||||
_TpVec64F mask_not_nan = v_not_nan(x);
|
||||
return v_select(mask_not_nan, _vexp_z, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7FF8000000000000)));
|
||||
}
|
||||
//! @}
|
||||
|
||||
//! @name Natural Logarithm
|
||||
//! @{
|
||||
template<typename _TpVec16F, typename _TpVec16S>
|
||||
inline _TpVec16F v_log_default_16f(const _TpVec16F &x) {
|
||||
const _TpVec16F _vlog_one_fp16 = v_setall_<_TpVec16F>(1.0f);
|
||||
const _TpVec16F _vlog_SQRTHF_fp16 = v_setall_<_TpVec16F>(0.707106781186547524f);
|
||||
const _TpVec16F _vlog_q1_fp16 = v_setall_<_TpVec16F>(-2.12194440E-4f);
|
||||
const _TpVec16F _vlog_q2_fp16 = v_setall_<_TpVec16F>(0.693359375f);
|
||||
const _TpVec16F _vlog_p0_fp16 = v_setall_<_TpVec16F>(7.0376836292E-2f);
|
||||
const _TpVec16F _vlog_p1_fp16 = v_setall_<_TpVec16F>(-1.1514610310E-1f);
|
||||
const _TpVec16F _vlog_p2_fp16 = v_setall_<_TpVec16F>(1.1676998740E-1f);
|
||||
const _TpVec16F _vlog_p3_fp16 = v_setall_<_TpVec16F>(-1.2420140846E-1f);
|
||||
const _TpVec16F _vlog_p4_fp16 = v_setall_<_TpVec16F>(1.4249322787E-1f);
|
||||
const _TpVec16F _vlog_p5_fp16 = v_setall_<_TpVec16F>(-1.6668057665E-1f);
|
||||
const _TpVec16F _vlog_p6_fp16 = v_setall_<_TpVec16F>(2.0000714765E-1f);
|
||||
const _TpVec16F _vlog_p7_fp16 = v_setall_<_TpVec16F>(-2.4999993993E-1f);
|
||||
const _TpVec16F _vlog_p8_fp16 = v_setall_<_TpVec16F>(3.3333331174E-1f);
|
||||
|
||||
_TpVec16F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp;
|
||||
_TpVec16S _vlog_ux, _vlog_emm0;
|
||||
const _TpVec16S _vlog_inv_mant_mask_s16 = v_setall_<_TpVec16S>((short)~0x7c00);
|
||||
|
||||
_vlog_ux = v_reinterpret_as_s16(x);
|
||||
_vlog_emm0 = v_shr(_vlog_ux, 10);
|
||||
|
||||
_vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s16);
|
||||
_vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s16(v_setall_<_TpVec16F>(0.5f)));
|
||||
_vlog_x = v_reinterpret_as_f16(_vlog_ux);
|
||||
|
||||
_vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec16S>((short)0xf));
|
||||
_vlog_e = v_cvt_f16(_vlog_emm0);
|
||||
|
||||
_vlog_e = v_add(_vlog_e, _vlog_one_fp16);
|
||||
|
||||
_TpVec16F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp16);
|
||||
_vlog_tmp = v_and(_vlog_x, _vlog_mask);
|
||||
_vlog_x = v_sub(_vlog_x, _vlog_one_fp16);
|
||||
_vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp16, _vlog_mask));
|
||||
_vlog_x = v_add(_vlog_x, _vlog_tmp);
|
||||
|
||||
_vlog_z = v_mul(_vlog_x, _vlog_x);
|
||||
|
||||
_vlog_y = v_fma(_vlog_p0_fp16, _vlog_x, _vlog_p1_fp16);
|
||||
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp16);
|
||||
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp16);
|
||||
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp16);
|
||||
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp16);
|
||||
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp16);
|
||||
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp16);
|
||||
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp16);
|
||||
_vlog_y = v_mul(_vlog_y, _vlog_x);
|
||||
_vlog_y = v_mul(_vlog_y, _vlog_z);
|
||||
|
||||
_vlog_y = v_fma(_vlog_e, _vlog_q1_fp16, _vlog_y);
|
||||
|
||||
_vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, v_setall_<_TpVec16F>(0.5f)));
|
||||
|
||||
_vlog_x = v_add(_vlog_x, _vlog_y);
|
||||
_vlog_x = v_fma(_vlog_e, _vlog_q2_fp16, _vlog_x);
|
||||
// log(0) -> -INF
|
||||
_TpVec16F mask_zero = v_eq(x, v_setzero_<_TpVec16F>());
|
||||
_vlog_x = v_select(mask_zero, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0xfc00)), _vlog_x);
|
||||
// log(NEG), log(NAN) -> NAN
|
||||
_TpVec16F mask_not_nan = v_ge(x, v_setzero_<_TpVec16F>());
|
||||
_vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00)));
|
||||
// log(INF) -> INF
|
||||
_TpVec16F mask_inf = v_eq(x, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7c00)));
|
||||
_vlog_x = v_select(mask_inf, x, _vlog_x);
|
||||
return _vlog_x;
|
||||
}
|
||||
|
||||
template<typename _TpVec32F, typename _TpVec32S>
|
||||
inline _TpVec32F v_log_default_32f(const _TpVec32F &x) {
|
||||
const _TpVec32F _vlog_one_fp32 = v_setall_<_TpVec32F>(1.0f);
|
||||
const _TpVec32F _vlog_SQRTHF_fp32 = v_setall_<_TpVec32F>(0.707106781186547524f);
|
||||
const _TpVec32F _vlog_q1_fp32 = v_setall_<_TpVec32F>(-2.12194440E-4f);
|
||||
const _TpVec32F _vlog_q2_fp32 = v_setall_<_TpVec32F>(0.693359375f);
|
||||
const _TpVec32F _vlog_p0_fp32 = v_setall_<_TpVec32F>(7.0376836292E-2f);
|
||||
const _TpVec32F _vlog_p1_fp32 = v_setall_<_TpVec32F>(-1.1514610310E-1f);
|
||||
const _TpVec32F _vlog_p2_fp32 = v_setall_<_TpVec32F>(1.1676998740E-1f);
|
||||
const _TpVec32F _vlog_p3_fp32 = v_setall_<_TpVec32F>(-1.2420140846E-1f);
|
||||
const _TpVec32F _vlog_p4_fp32 = v_setall_<_TpVec32F>(1.4249322787E-1f);
|
||||
const _TpVec32F _vlog_p5_fp32 = v_setall_<_TpVec32F>(-1.6668057665E-1f);
|
||||
const _TpVec32F _vlog_p6_fp32 = v_setall_<_TpVec32F>(2.0000714765E-1f);
|
||||
const _TpVec32F _vlog_p7_fp32 = v_setall_<_TpVec32F>(-2.4999993993E-1f);
|
||||
const _TpVec32F _vlog_p8_fp32 = v_setall_<_TpVec32F>(3.3333331174E-1f);
|
||||
|
||||
_TpVec32F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp;
|
||||
_TpVec32S _vlog_ux, _vlog_emm0;
|
||||
const _TpVec32S _vlog_inv_mant_mask_s32 = v_setall_<_TpVec32S>((int)~0x7f800000);
|
||||
|
||||
_vlog_ux = v_reinterpret_as_s32(x);
|
||||
_vlog_emm0 = v_shr(_vlog_ux, 23);
|
||||
|
||||
_vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s32);
|
||||
_vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s32(v_setall_<_TpVec32F>(0.5f)));
|
||||
_vlog_x = v_reinterpret_as_f32(_vlog_ux);
|
||||
|
||||
_vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec32S>((int)0x7f));
|
||||
_vlog_e = v_cvt_f32(_vlog_emm0);
|
||||
|
||||
_vlog_e = v_add(_vlog_e, _vlog_one_fp32);
|
||||
|
||||
_TpVec32F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp32);
|
||||
_vlog_tmp = v_and(_vlog_x, _vlog_mask);
|
||||
_vlog_x = v_sub(_vlog_x, _vlog_one_fp32);
|
||||
_vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp32, _vlog_mask));
|
||||
_vlog_x = v_add(_vlog_x, _vlog_tmp);
|
||||
|
||||
_vlog_z = v_mul(_vlog_x, _vlog_x);
|
||||
|
||||
_vlog_y = v_fma(_vlog_p0_fp32, _vlog_x, _vlog_p1_fp32);
|
||||
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp32);
|
||||
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp32);
|
||||
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp32);
|
||||
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp32);
|
||||
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp32);
|
||||
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp32);
|
||||
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp32);
|
||||
_vlog_y = v_mul(_vlog_y, _vlog_x);
|
||||
_vlog_y = v_mul(_vlog_y, _vlog_z);
|
||||
|
||||
_vlog_y = v_fma(_vlog_e, _vlog_q1_fp32, _vlog_y);
|
||||
|
||||
_vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, v_setall_<_TpVec32F>(0.5f)));
|
||||
|
||||
_vlog_x = v_add(_vlog_x, _vlog_y);
|
||||
_vlog_x = v_fma(_vlog_e, _vlog_q2_fp32, _vlog_x);
|
||||
// log(0) -> -INF
|
||||
_TpVec32F mask_zero = v_eq(x, v_setzero_<_TpVec32F>());
|
||||
_vlog_x = v_select(mask_zero, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0xff800000)), _vlog_x);
|
||||
// log(NEG), log(NAN) -> NAN
|
||||
_TpVec32F mask_not_nan = v_ge(x, v_setzero_<_TpVec32F>());
|
||||
_vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000)));
|
||||
// log(INF) -> INF
|
||||
_TpVec32F mask_inf = v_eq(x, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7f800000)));
|
||||
_vlog_x = v_select(mask_inf, x, _vlog_x);
|
||||
return _vlog_x;
|
||||
}
|
||||
|
||||
template<typename _TpVec64F, typename _TpVec64S>
|
||||
inline _TpVec64F v_log_default_64f(const _TpVec64F &x) {
|
||||
const _TpVec64F _vlog_one_fp64 = v_setall_<_TpVec64F>(1.0);
|
||||
const _TpVec64F _vlog_SQRTHF_fp64 = v_setall_<_TpVec64F>(0.7071067811865475244);
|
||||
const _TpVec64F _vlog_p0_fp64 = v_setall_<_TpVec64F>(1.01875663804580931796E-4);
|
||||
const _TpVec64F _vlog_p1_fp64 = v_setall_<_TpVec64F>(4.97494994976747001425E-1);
|
||||
const _TpVec64F _vlog_p2_fp64 = v_setall_<_TpVec64F>(4.70579119878881725854);
|
||||
const _TpVec64F _vlog_p3_fp64 = v_setall_<_TpVec64F>(1.44989225341610930846E1);
|
||||
const _TpVec64F _vlog_p4_fp64 = v_setall_<_TpVec64F>(1.79368678507819816313E1);
|
||||
const _TpVec64F _vlog_p5_fp64 = v_setall_<_TpVec64F>(7.70838733755885391666);
|
||||
const _TpVec64F _vlog_q0_fp64 = v_setall_<_TpVec64F>(1.12873587189167450590E1);
|
||||
const _TpVec64F _vlog_q1_fp64 = v_setall_<_TpVec64F>(4.52279145837532221105E1);
|
||||
const _TpVec64F _vlog_q2_fp64 = v_setall_<_TpVec64F>(8.29875266912776603211E1);
|
||||
const _TpVec64F _vlog_q3_fp64 = v_setall_<_TpVec64F>(7.11544750618563894466E1);
|
||||
const _TpVec64F _vlog_q4_fp64 = v_setall_<_TpVec64F>(2.31251620126765340583E1);
|
||||
|
||||
const _TpVec64F _vlog_C0_fp64 = v_setall_<_TpVec64F>(2.121944400546905827679e-4);
|
||||
const _TpVec64F _vlog_C1_fp64 = v_setall_<_TpVec64F>(0.693359375);
|
||||
|
||||
_TpVec64F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp, _vlog_xx;
|
||||
_TpVec64S _vlog_ux, _vlog_emm0;
|
||||
const _TpVec64S _vlog_inv_mant_mask_s64 = v_setall_<_TpVec64S>((int64)~0x7ff0000000000000);
|
||||
|
||||
_vlog_ux = v_reinterpret_as_s64(x);
|
||||
_vlog_emm0 = v_shr(_vlog_ux, 52);
|
||||
|
||||
_vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s64);
|
||||
_vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s64(v_setall_<_TpVec64F>(0.5)));
|
||||
_vlog_x = v_reinterpret_as_f64(_vlog_ux);
|
||||
|
||||
_vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec64S>((int64)0x3ff));
|
||||
_vlog_e = v_cvt_f64(_vlog_emm0);
|
||||
|
||||
_vlog_e = v_add(_vlog_e, _vlog_one_fp64);
|
||||
|
||||
_TpVec64F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp64);
|
||||
_vlog_tmp = v_and(_vlog_x, _vlog_mask);
|
||||
_vlog_x = v_sub(_vlog_x, _vlog_one_fp64);
|
||||
_vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp64, _vlog_mask));
|
||||
_vlog_x = v_add(_vlog_x, _vlog_tmp);
|
||||
|
||||
_vlog_xx = v_mul(_vlog_x, _vlog_x);
|
||||
|
||||
_vlog_y = v_fma(_vlog_p0_fp64, _vlog_x, _vlog_p1_fp64);
|
||||
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp64);
|
||||
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp64);
|
||||
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp64);
|
||||
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp64);
|
||||
_vlog_y = v_mul(_vlog_y, _vlog_x);
|
||||
_vlog_y = v_mul(_vlog_y, _vlog_xx);
|
||||
|
||||
_vlog_z = v_add(_vlog_x, _vlog_q0_fp64);
|
||||
_vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q1_fp64);
|
||||
_vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q2_fp64);
|
||||
_vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q3_fp64);
|
||||
_vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q4_fp64);
|
||||
|
||||
_vlog_z = v_div(_vlog_y, _vlog_z);
|
||||
_vlog_z = v_sub(_vlog_z, v_mul(_vlog_e, _vlog_C0_fp64));
|
||||
_vlog_z = v_sub(_vlog_z, v_mul(_vlog_xx, v_setall_<_TpVec64F>(0.5)));
|
||||
|
||||
_vlog_z = v_add(_vlog_z, _vlog_x);
|
||||
_vlog_z = v_fma(_vlog_e, _vlog_C1_fp64, _vlog_z);
|
||||
|
||||
// log(0) -> -INF
|
||||
_TpVec64F mask_zero = v_eq(x, v_setzero_<_TpVec64F>());
|
||||
_vlog_z = v_select(mask_zero, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0xfff0000000000000)), _vlog_z);
|
||||
// log(NEG), log(NAN) -> NAN
|
||||
_TpVec64F mask_not_nan = v_ge(x, v_setzero_<_TpVec64F>());
|
||||
_vlog_z = v_select(mask_not_nan, _vlog_z, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff8000000000000)));
|
||||
// log(INF) -> INF
|
||||
_TpVec64F mask_inf = v_eq(x, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff0000000000000)));
|
||||
_vlog_z = v_select(mask_inf, x, _vlog_z);
|
||||
return _vlog_z;
|
||||
}
|
||||
//! @}
|
||||
|
||||
//! @name Sine and Cosine
|
||||
//! @{
|
||||
template<typename _TpVec16F, typename _TpVec16S>
|
||||
inline void v_sincos_default_16f(const _TpVec16F &x, _TpVec16F &ysin, _TpVec16F &ycos) {
|
||||
const _TpVec16F v_cephes_FOPI = v_setall_<_TpVec16F>(hfloat(1.27323954473516f)); // 4 / M_PI
|
||||
const _TpVec16F v_minus_DP1 = v_setall_<_TpVec16F>(hfloat(-0.78515625f));
|
||||
const _TpVec16F v_minus_DP2 = v_setall_<_TpVec16F>(hfloat(-2.4187564849853515625E-4f));
|
||||
const _TpVec16F v_minus_DP3 = v_setall_<_TpVec16F>(hfloat(-3.77489497744594108E-8f));
|
||||
const _TpVec16F v_sincof_p0 = v_setall_<_TpVec16F>(hfloat(-1.9515295891E-4f));
|
||||
const _TpVec16F v_sincof_p1 = v_setall_<_TpVec16F>(hfloat(8.3321608736E-3f));
|
||||
const _TpVec16F v_sincof_p2 = v_setall_<_TpVec16F>(hfloat(-1.6666654611E-1f));
|
||||
const _TpVec16F v_coscof_p0 = v_setall_<_TpVec16F>(hfloat(2.443315711809948E-5f));
|
||||
const _TpVec16F v_coscof_p1 = v_setall_<_TpVec16F>(hfloat(-1.388731625493765E-3f));
|
||||
const _TpVec16F v_coscof_p2 = v_setall_<_TpVec16F>(hfloat(4.166664568298827E-2f));
|
||||
const _TpVec16F v_nan = v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00));
|
||||
const _TpVec16F v_neg_zero = v_setall_<_TpVec16F>(hfloat(-0.f));
|
||||
|
||||
_TpVec16F _vx, _vy, sign_mask_sin, sign_mask_cos;
|
||||
_TpVec16S emm2;
|
||||
|
||||
sign_mask_sin = v_lt(x, v_setzero_<_TpVec16F>());
|
||||
_vx = v_abs(x);
|
||||
_vy = v_mul(_vx, v_cephes_FOPI);
|
||||
|
||||
emm2 = v_trunc(_vy);
|
||||
emm2 = v_add(emm2, v_setall_<_TpVec16S>((short)1));
|
||||
emm2 = v_and(emm2, v_setall_<_TpVec16S>((short)~1));
|
||||
_vy = v_cvt_f16(emm2);
|
||||
|
||||
_TpVec16F poly_mask = v_reinterpret_as_f16(v_eq(v_and(emm2, v_setall_<_TpVec16S>((short)2)), v_setall_<_TpVec16S>((short)0)));
|
||||
|
||||
_vx = v_fma(_vy, v_minus_DP1, _vx);
|
||||
_vx = v_fma(_vy, v_minus_DP2, _vx);
|
||||
_vx = v_fma(_vy, v_minus_DP3, _vx);
|
||||
|
||||
sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f16(v_eq(v_and(emm2, v_setall_<_TpVec16S>((short)4)), v_setall_<_TpVec16S>((short)0))));
|
||||
sign_mask_cos = v_reinterpret_as_f16(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec16S>((short)2)), v_setall_<_TpVec16S>((short)4)), v_setall_<_TpVec16S>((short)0)));
|
||||
|
||||
_TpVec16F _vxx = v_mul(_vx, _vx);
|
||||
_TpVec16F y1, y2;
|
||||
|
||||
y1 = v_fma(v_coscof_p0, _vxx, v_coscof_p1);
|
||||
y1 = v_fma(y1, _vxx, v_coscof_p2);
|
||||
y1 = v_fma(y1, _vxx, v_setall_<_TpVec16F>(hfloat(-0.5f)));
|
||||
y1 = v_fma(y1, _vxx, v_setall_<_TpVec16F>(hfloat(1.f)));
|
||||
|
||||
y2 = v_fma(v_sincof_p0, _vxx, v_sincof_p1);
|
||||
y2 = v_fma(y2, _vxx, v_sincof_p2);
|
||||
y2 = v_mul(y2, _vxx);
|
||||
y2 = v_fma(y2, _vx, _vx);
|
||||
|
||||
ysin = v_select(poly_mask, y2, y1);
|
||||
ycos = v_select(poly_mask, y1, y2);
|
||||
ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin));
|
||||
ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos);
|
||||
|
||||
// sincos(NAN) -> NAN, sincos(±INF) -> NAN
|
||||
_TpVec16F mask_inf = v_eq(_vx, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7c00)));
|
||||
_TpVec16F mask_nan = v_or(mask_inf, v_ne(x, x));
|
||||
ysin = v_select(mask_nan, v_nan, ysin);
|
||||
ycos = v_select(mask_nan, v_nan, ycos);
|
||||
}
|
||||
|
||||
template<typename _TpVec16F, typename _TpVec16S>
|
||||
inline _TpVec16F v_sin_default_16f(const _TpVec16F &x) {
|
||||
_TpVec16F ysin, ycos;
|
||||
v_sincos_default_16f<_TpVec16F, _TpVec16S>(x, ysin, ycos);
|
||||
return ysin;
|
||||
}
|
||||
|
||||
template<typename _TpVec16F, typename _TpVec16S>
|
||||
inline _TpVec16F v_cos_default_16f(const _TpVec16F &x) {
|
||||
_TpVec16F ysin, ycos;
|
||||
v_sincos_default_16f<_TpVec16F, _TpVec16S>(x, ysin, ycos);
|
||||
return ycos;
|
||||
}
|
||||
|
||||
|
||||
template<typename _TpVec32F, typename _TpVec32S>
|
||||
inline void v_sincos_default_32f(const _TpVec32F &x, _TpVec32F &ysin, _TpVec32F &ycos) {
|
||||
const _TpVec32F v_cephes_FOPI = v_setall_<_TpVec32F>(1.27323954473516f); // 4 / M_PI
|
||||
const _TpVec32F v_minus_DP1 = v_setall_<_TpVec32F>(-0.78515625f);
|
||||
const _TpVec32F v_minus_DP2 = v_setall_<_TpVec32F>(-2.4187564849853515625E-4f);
|
||||
const _TpVec32F v_minus_DP3 = v_setall_<_TpVec32F>(-3.77489497744594108E-8f);
|
||||
const _TpVec32F v_sincof_p0 = v_setall_<_TpVec32F>(-1.9515295891E-4f);
|
||||
const _TpVec32F v_sincof_p1 = v_setall_<_TpVec32F>(8.3321608736E-3f);
|
||||
const _TpVec32F v_sincof_p2 = v_setall_<_TpVec32F>(-1.6666654611E-1f);
|
||||
const _TpVec32F v_coscof_p0 = v_setall_<_TpVec32F>(2.443315711809948E-5f);
|
||||
const _TpVec32F v_coscof_p1 = v_setall_<_TpVec32F>(-1.388731625493765E-3f);
|
||||
const _TpVec32F v_coscof_p2 = v_setall_<_TpVec32F>(4.166664568298827E-2f);
|
||||
const _TpVec32F v_nan = v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000));
|
||||
const _TpVec32F v_neg_zero = v_setall_<_TpVec32F>(-0.f);
|
||||
|
||||
_TpVec32F _vx, _vy, sign_mask_sin, sign_mask_cos;
|
||||
_TpVec32S emm2;
|
||||
|
||||
sign_mask_sin = v_lt(x, v_setzero_<_TpVec32F>());
|
||||
_vx = v_abs(x);
|
||||
_vy = v_mul(_vx, v_cephes_FOPI);
|
||||
|
||||
emm2 = v_trunc(_vy);
|
||||
emm2 = v_add(emm2, v_setall_<_TpVec32S>(1));
|
||||
emm2 = v_and(emm2, v_setall_<_TpVec32S>(~1));
|
||||
_vy = v_cvt_f32(emm2);
|
||||
|
||||
_TpVec32F poly_mask = v_reinterpret_as_f32(v_eq(v_and(emm2, v_setall_<_TpVec32S>(2)), v_setall_<_TpVec32S>(0)));
|
||||
|
||||
_vx = v_fma(_vy, v_minus_DP1, _vx);
|
||||
_vx = v_fma(_vy, v_minus_DP2, _vx);
|
||||
_vx = v_fma(_vy, v_minus_DP3, _vx);
|
||||
|
||||
sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f32(v_eq(v_and(emm2, v_setall_<_TpVec32S>(4)), v_setall_<_TpVec32S>(0))));
|
||||
sign_mask_cos = v_reinterpret_as_f32(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec32S>(2)), v_setall_<_TpVec32S>(4)), v_setall_<_TpVec32S>(0)));
|
||||
|
||||
_TpVec32F _vxx = v_mul(_vx, _vx);
|
||||
_TpVec32F y1, y2;
|
||||
|
||||
y1 = v_fma(v_coscof_p0, _vxx, v_coscof_p1);
|
||||
y1 = v_fma(y1, _vxx, v_coscof_p2);
|
||||
y1 = v_fma(y1, _vxx, v_setall_<_TpVec32F>(-0.5f));
|
||||
y1 = v_fma(y1, _vxx, v_setall_<_TpVec32F>(1.f));
|
||||
|
||||
y2 = v_fma(v_sincof_p0, _vxx, v_sincof_p1);
|
||||
y2 = v_fma(y2, _vxx, v_sincof_p2);
|
||||
y2 = v_mul(y2, _vxx);
|
||||
y2 = v_fma(y2, _vx, _vx);
|
||||
|
||||
ysin = v_select(poly_mask, y2, y1);
|
||||
ycos = v_select(poly_mask, y1, y2);
|
||||
ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin));
|
||||
ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos);
|
||||
|
||||
// sincos(NAN) -> NAN, sincos(±INF) -> NAN
|
||||
_TpVec32F mask_inf = v_eq(_vx, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7f800000)));
|
||||
_TpVec32F mask_nan = v_or(mask_inf, v_ne(x, x));
|
||||
ysin = v_select(mask_nan, v_nan, ysin);
|
||||
ycos = v_select(mask_nan, v_nan, ycos);
|
||||
}
|
||||
|
||||
template<typename _TpVec32F, typename _TpVec32S>
|
||||
inline _TpVec32F v_sin_default_32f(const _TpVec32F &x) {
|
||||
_TpVec32F ysin, ycos;
|
||||
v_sincos_default_32f<_TpVec32F, _TpVec32S>(x, ysin, ycos);
|
||||
return ysin;
|
||||
}
|
||||
|
||||
template<typename _TpVec32F, typename _TpVec32S>
|
||||
inline _TpVec32F v_cos_default_32f(const _TpVec32F &x) {
|
||||
_TpVec32F ysin, ycos;
|
||||
v_sincos_default_32f<_TpVec32F, _TpVec32S>(x, ysin, ycos);
|
||||
return ycos;
|
||||
}
|
||||
|
||||
template<typename _TpVec64F, typename _TpVec64S>
|
||||
inline void v_sincos_default_64f(const _TpVec64F &x, _TpVec64F &ysin, _TpVec64F &ycos) {
|
||||
const _TpVec64F v_cephes_FOPI = v_setall_<_TpVec64F>(1.2732395447351626861510701069801148); // 4 / M_PI
|
||||
const _TpVec64F v_minus_DP1 = v_setall_<_TpVec64F>(-7.853981554508209228515625E-1);
|
||||
const _TpVec64F v_minus_DP2 = v_setall_<_TpVec64F>(-7.94662735614792836714E-9);
|
||||
const _TpVec64F v_minus_DP3 = v_setall_<_TpVec64F>(-3.06161699786838294307E-17);
|
||||
const _TpVec64F v_sin_C1 = v_setall_<_TpVec64F>(1.58962301576546568060E-10);
|
||||
const _TpVec64F v_sin_C2 = v_setall_<_TpVec64F>(-2.50507477628578072866E-8);
|
||||
const _TpVec64F v_sin_C3 = v_setall_<_TpVec64F>(2.75573136213857245213E-6);
|
||||
const _TpVec64F v_sin_C4 = v_setall_<_TpVec64F>(-1.98412698295895385996E-4);
|
||||
const _TpVec64F v_sin_C5 = v_setall_<_TpVec64F>(8.33333333332211858878E-3);
|
||||
const _TpVec64F v_sin_C6 = v_setall_<_TpVec64F>(-1.66666666666666307295E-1);
|
||||
const _TpVec64F v_cos_C1 = v_setall_<_TpVec64F>(-1.13585365213876817300E-11);
|
||||
const _TpVec64F v_cos_C2 = v_setall_<_TpVec64F>(2.08757008419747316778E-9);
|
||||
const _TpVec64F v_cos_C3 = v_setall_<_TpVec64F>(-2.75573141792967388112E-7);
|
||||
const _TpVec64F v_cos_C4 = v_setall_<_TpVec64F>(2.48015872888517045348E-5);
|
||||
const _TpVec64F v_cos_C5 = v_setall_<_TpVec64F>(-1.38888888888730564116E-3);
|
||||
const _TpVec64F v_cos_C6 = v_setall_<_TpVec64F>(4.16666666666665929218E-2);
|
||||
const _TpVec64F v_nan = v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff8000000000000));
|
||||
const _TpVec64F v_neg_zero = v_setall_<_TpVec64F>(-0.0);
|
||||
|
||||
_TpVec64F _vx, _vy, sign_mask_sin, sign_mask_cos;
|
||||
_TpVec64S emm2;
|
||||
|
||||
sign_mask_sin = v_lt(x, v_setzero_<_TpVec64F>());
|
||||
_vx = v_abs(x);
|
||||
_vy = v_mul(_vx, v_cephes_FOPI);
|
||||
|
||||
emm2 = v_expand_low(v_trunc(_vy));
|
||||
emm2 = v_add(emm2, v_setall_<_TpVec64S>((int64)1));
|
||||
emm2 = v_and(emm2, v_setall_<_TpVec64S>((int64)~1));
|
||||
_vy = v_cvt_f64(emm2);
|
||||
|
||||
_TpVec64F poly_mask = v_reinterpret_as_f64(v_eq(v_and(emm2, v_setall_<_TpVec64S>((int64)2)), v_setall_<_TpVec64S>((int64)0)));
|
||||
|
||||
_vx = v_fma(_vy, v_minus_DP1, _vx);
|
||||
_vx = v_fma(_vy, v_minus_DP2, _vx);
|
||||
_vx = v_fma(_vy, v_minus_DP3, _vx);
|
||||
|
||||
sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f64(v_eq(v_and(emm2, v_setall_<_TpVec64S>((int64)4)), v_setall_<_TpVec64S>((int64)0))));
|
||||
sign_mask_cos = v_reinterpret_as_f64(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec64S>((int64)2)), v_setall_<_TpVec64S>((int64)4)), v_setall_<_TpVec64S>((int64)0)));
|
||||
|
||||
_TpVec64F _vxx = v_mul(_vx, _vx);
|
||||
_TpVec64F y1, y2;
|
||||
|
||||
y1 = v_fma(v_cos_C1, _vxx, v_cos_C2);
|
||||
y1 = v_fma(y1, _vxx, v_cos_C3);
|
||||
y1 = v_fma(y1, _vxx, v_cos_C4);
|
||||
y1 = v_fma(y1, _vxx, v_cos_C5);
|
||||
y1 = v_fma(y1, _vxx, v_cos_C6);
|
||||
y1 = v_fma(y1, _vxx, v_setall_<_TpVec64F>(-0.5));
|
||||
y1 = v_fma(y1, _vxx, v_setall_<_TpVec64F>(1.0));
|
||||
|
||||
y2 = v_fma(v_sin_C1, _vxx, v_sin_C2);
|
||||
y2 = v_fma(y2, _vxx, v_sin_C3);
|
||||
y2 = v_fma(y2, _vxx, v_sin_C4);
|
||||
y2 = v_fma(y2, _vxx, v_sin_C5);
|
||||
y2 = v_fma(y2, _vxx, v_sin_C6);
|
||||
y2 = v_mul(y2, _vxx);
|
||||
y2 = v_fma(y2, _vx, _vx);
|
||||
|
||||
ysin = v_select(poly_mask, y2, y1);
|
||||
ycos = v_select(poly_mask, y1, y2);
|
||||
ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin));
|
||||
ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos);
|
||||
|
||||
// sincos(NAN) -> NAN, sincos(±INF) -> NAN
|
||||
_TpVec64F mask_inf = v_eq(_vx, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff0000000000000)));
|
||||
_TpVec64F mask_nan = v_or(mask_inf, v_ne(x, x));
|
||||
ysin = v_select(mask_nan, v_nan, ysin);
|
||||
ycos = v_select(mask_nan, v_nan, ycos);
|
||||
}
|
||||
|
||||
template<typename _TpVec64F, typename _TpVec64S>
|
||||
inline _TpVec64F v_sin_default_64f(const _TpVec64F &x) {
|
||||
_TpVec64F ysin, ycos;
|
||||
v_sincos_default_64f<_TpVec64F, _TpVec64S>(x, ysin, ycos);
|
||||
return ysin;
|
||||
}
|
||||
|
||||
template<typename _TpVec64F, typename _TpVec64S>
|
||||
inline _TpVec64F v_cos_default_64f(const _TpVec64F &x) {
|
||||
_TpVec64F ysin, ycos;
|
||||
v_sincos_default_64f<_TpVec64F, _TpVec64S>(x, ysin, ycos);
|
||||
return ycos;
|
||||
}
|
||||
//! @}
|
||||
|
||||
|
||||
/* This implementation is derived from the approximation approach of Error Function (Erf) from PyTorch
|
||||
https://github.com/pytorch/pytorch/blob/9c50ecc84b9a6e699a7f058891b889aafbf976c7/aten/src/ATen/cpu/vec/vec512/vec512_float.h#L189-L220
|
||||
*/
|
||||
|
||||
//! @name Error Function
|
||||
//! @{
|
||||
template<typename _TpVec32F, typename _TpVec32S>
|
||||
inline _TpVec32F v_erf_default_32f(const _TpVec32F &v) {
|
||||
const _TpVec32F coef0 = v_setall_<_TpVec32F>(0.3275911f),
|
||||
coef1 = v_setall_<_TpVec32F>(1.061405429f),
|
||||
coef2 = v_setall_<_TpVec32F>(-1.453152027f),
|
||||
coef3 = v_setall_<_TpVec32F>(1.421413741f),
|
||||
coef4 = v_setall_<_TpVec32F>(-0.284496736f),
|
||||
coef5 = v_setall_<_TpVec32F>(0.254829592f),
|
||||
ones = v_setall_<_TpVec32F>(1.0f),
|
||||
neg_zeros = v_setall_<_TpVec32F>(-0.f);
|
||||
_TpVec32F t = v_abs(v);
|
||||
// sign(v)
|
||||
_TpVec32F sign_mask = v_and(neg_zeros, v);
|
||||
|
||||
t = v_div(ones, v_fma(coef0, t, ones));
|
||||
_TpVec32F r = v_fma(coef1, t, coef2);
|
||||
r = v_fma(r, t, coef3);
|
||||
r = v_fma(r, t, coef4);
|
||||
r = v_fma(r, t, coef5);
|
||||
// - v * v
|
||||
_TpVec32F v2 = v_mul(v, v);
|
||||
_TpVec32F mv2 = v_xor(neg_zeros, v2);
|
||||
// - exp(- v * v)
|
||||
_TpVec32F exp = v_exp_default_32f<_TpVec32F, _TpVec32S>(mv2);
|
||||
_TpVec32F neg_exp = v_xor(neg_zeros, exp);
|
||||
_TpVec32F res = v_mul(t, neg_exp);
|
||||
res = v_fma(r, res, ones);
|
||||
return v_xor(sign_mask, res);
|
||||
}
|
||||
//! @}
|
||||
|
||||
#endif // OPENCV_HAL_INTRIN_MATH_HPP
|
||||
@@ -235,8 +235,6 @@ struct v_float64x2
|
||||
#define OPENCV_HAL_IMPL_MSA_INIT(_Tpv, _Tp, suffix) \
|
||||
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(msa_dupq_n_##suffix((_Tp)0)); } \
|
||||
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(msa_dupq_n_##suffix(v)); } \
|
||||
template <> inline v_##_Tpv v_setzero_() { return v_setzero_##suffix(); } \
|
||||
template <> inline v_##_Tpv v_setall_(_Tp v) { return v_setall_##suffix(v); } \
|
||||
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(MSA_TPV_REINTERPRET(v16u8, v.val)); } \
|
||||
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(MSA_TPV_REINTERPRET(v16i8, v.val)); } \
|
||||
inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(MSA_TPV_REINTERPRET(v8u16, v.val)); } \
|
||||
@@ -347,46 +345,53 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
|
||||
}
|
||||
|
||||
#define OPENCV_HAL_IMPL_MSA_BIN_OP(bin_op, _Tpvec, intrin) \
|
||||
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
return _Tpvec(intrin(a.val, b.val)); \
|
||||
} \
|
||||
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
a.val = intrin(a.val, b.val); \
|
||||
return a; \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint8x16, msa_qaddq_u8)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint8x16, msa_qsubq_u8)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int8x16, msa_qaddq_s8)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int8x16, msa_qsubq_s8)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint16x8, msa_qaddq_u16)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint16x8, msa_qsubq_u16)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int16x8, msa_qaddq_s16)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int16x8, msa_qsubq_s16)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int32x4, msa_addq_s32)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int32x4, msa_subq_s32)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_int32x4, msa_mulq_s32)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint32x4, msa_addq_u32)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint32x4, msa_subq_u32)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_uint32x4, msa_mulq_u32)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_float32x4, msa_addq_f32)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_float32x4, msa_subq_f32)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_float32x4, msa_mulq_f32)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int64x2, msa_addq_s64)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int64x2, msa_subq_s64)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint64x2, msa_addq_u64)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint64x2, msa_subq_u64)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_div, v_float32x4, msa_divq_f32)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_float64x2, msa_addq_f64)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_float64x2, msa_subq_f64)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_float64x2, msa_mulq_f64)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_div, v_float64x2, msa_divq_f64)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint8x16, msa_qaddq_u8)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint8x16, msa_qsubq_u8)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int8x16, msa_qaddq_s8)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int8x16, msa_qsubq_s8)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint16x8, msa_qaddq_u16)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint16x8, msa_qsubq_u16)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int16x8, msa_qaddq_s16)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int16x8, msa_qsubq_s16)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int32x4, msa_addq_s32)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int32x4, msa_subq_s32)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_int32x4, msa_mulq_s32)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint32x4, msa_addq_u32)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint32x4, msa_subq_u32)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_uint32x4, msa_mulq_u32)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float32x4, msa_addq_f32)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float32x4, msa_subq_f32)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float32x4, msa_mulq_f32)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int64x2, msa_addq_s64)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int64x2, msa_subq_s64)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint64x2, msa_addq_u64)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint64x2, msa_subq_u64)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float32x4, msa_divq_f32)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float64x2, msa_addq_f64)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float64x2, msa_subq_f64)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float64x2, msa_mulq_f64)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float64x2, msa_divq_f64)
|
||||
|
||||
// saturating multiply 8-bit, 16-bit
|
||||
#define OPENCV_HAL_IMPL_MSA_MUL_SAT(_Tpvec, _Tpwvec) \
|
||||
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
_Tpwvec c, d; \
|
||||
v_mul_expand(a, b, c, d); \
|
||||
return v_pack(c, d); \
|
||||
}
|
||||
} \
|
||||
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
|
||||
{a = a * b; return a; }
|
||||
|
||||
OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int8x16, v_int16x8)
|
||||
OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint8x16, v_uint16x8)
|
||||
@@ -541,13 +546,13 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
|
||||
return v_int64x2(msa_hadd_s64(prod, prod));
|
||||
}
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_cvt_f64(v_dotprod(a, b)); }
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
|
||||
//////// Fast Dot Product ////////
|
||||
@@ -591,10 +596,10 @@ inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b,
|
||||
{ return v_dotprod_expand(a, b, c); }
|
||||
|
||||
#define OPENCV_HAL_IMPL_MSA_LOGIC_OP(_Tpvec, _Tpv, suffix) \
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_and, _Tpvec, msa_andq_##suffix) \
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_or, _Tpvec, msa_orrq_##suffix) \
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(v_xor, _Tpvec, msa_eorq_##suffix) \
|
||||
inline _Tpvec v_not(const _Tpvec& a) \
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(&, _Tpvec, msa_andq_##suffix) \
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(|, _Tpvec, msa_orrq_##suffix) \
|
||||
OPENCV_HAL_IMPL_MSA_BIN_OP(^, _Tpvec, msa_eorq_##suffix) \
|
||||
inline _Tpvec operator ~ (const _Tpvec& a) \
|
||||
{ \
|
||||
return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_u8(MSA_TPV_REINTERPRET(v16u8, a.val)))); \
|
||||
}
|
||||
@@ -609,16 +614,21 @@ OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint64x2, v2u64, u64)
|
||||
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int64x2, v2i64, s64)
|
||||
|
||||
#define OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(bin_op, intrin) \
|
||||
inline v_float32x4 bin_op(const v_float32x4& a, const v_float32x4& b) \
|
||||
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
|
||||
{ \
|
||||
return v_float32x4(MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val)))); \
|
||||
} \
|
||||
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
|
||||
{ \
|
||||
a.val = MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val))); \
|
||||
return a; \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(v_and, msa_andq_s32)
|
||||
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(v_or, msa_orrq_s32)
|
||||
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(v_xor, msa_eorq_s32)
|
||||
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(&, msa_andq_s32)
|
||||
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(|, msa_orrq_s32)
|
||||
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(^, msa_eorq_s32)
|
||||
|
||||
inline v_float32x4 v_not(const v_float32x4& a)
|
||||
inline v_float32x4 operator ~ (const v_float32x4& a)
|
||||
{
|
||||
return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
|
||||
}
|
||||
@@ -649,16 +659,21 @@ OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_sqrt, msa_sqrtq_f64)
|
||||
OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_invsqrt, msa_rsqrtq_f64)
|
||||
|
||||
#define OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(bin_op, intrin) \
|
||||
inline v_float64x2 bin_op(const v_float64x2& a, const v_float64x2& b) \
|
||||
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
|
||||
{ \
|
||||
return v_float64x2(MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val)))); \
|
||||
} \
|
||||
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
|
||||
{ \
|
||||
a.val = MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val))); \
|
||||
return a; \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(v_and, msa_andq_s64)
|
||||
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(v_or, msa_orrq_s64)
|
||||
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(v_xor, msa_eorq_s64)
|
||||
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(&, msa_andq_s64)
|
||||
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(|, msa_orrq_s64)
|
||||
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(^, msa_eorq_s64)
|
||||
|
||||
inline v_float64x2 v_not(const v_float64x2& a)
|
||||
inline v_float64x2 operator ~ (const v_float64x2& a)
|
||||
{
|
||||
return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
|
||||
}
|
||||
@@ -689,17 +704,17 @@ OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_min, msa_minq_f64)
|
||||
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_max, msa_maxq_f64)
|
||||
|
||||
#define OPENCV_HAL_IMPL_MSA_INT_CMP_OP(_Tpvec, _Tpv, suffix, not_suffix) \
|
||||
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ceqq_##suffix(a.val, b.val))); } \
|
||||
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_##not_suffix(msa_ceqq_##suffix(a.val, b.val)))); } \
|
||||
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cltq_##suffix(a.val, b.val))); } \
|
||||
inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgtq_##suffix(a.val, b.val))); } \
|
||||
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cleq_##suffix(a.val, b.val))); } \
|
||||
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgeq_##suffix(a.val, b.val))); }
|
||||
|
||||
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint8x16, v16u8, u8, u8)
|
||||
@@ -806,9 +821,9 @@ inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_
|
||||
|
||||
// trade efficiency for convenience
|
||||
#define OPENCV_HAL_IMPL_MSA_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
|
||||
inline _Tpvec v_shl(const _Tpvec& a, int n) \
|
||||
inline _Tpvec operator << (const _Tpvec& a, int n) \
|
||||
{ return _Tpvec(msa_shlq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
|
||||
inline _Tpvec v_shr(const _Tpvec& a, int n) \
|
||||
inline _Tpvec operator >> (const _Tpvec& a, int n) \
|
||||
{ return _Tpvec(msa_shrq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
|
||||
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
|
||||
{ return _Tpvec(msa_shlq_n_##suffix(a.val, n)); } \
|
||||
@@ -1863,20 +1878,6 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
|
||||
|
||||
inline void v_cleanup() {}
|
||||
|
||||
#include "intrin_math.hpp"
|
||||
inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
|
||||
inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
|
||||
inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
|
||||
inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
|
||||
inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
|
||||
inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
|
||||
inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
|
||||
|
||||
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
|
||||
|
||||
//! @endcond
|
||||
|
||||
@@ -56,7 +56,7 @@ namespace cv
|
||||
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
|
||||
|
||||
#define CV_SIMD128 1
|
||||
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
#define CV_SIMD128_64F 1
|
||||
#else
|
||||
#define CV_SIMD128_64F 0
|
||||
@@ -72,7 +72,7 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
|
||||
//
|
||||
// [1] https://developer.arm.com/documentation/101028/0012/13--Advanced-SIMD--Neon--intrinsics
|
||||
// [2] https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
|
||||
#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64) || defined(_M_ARM64EC)
|
||||
#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
|
||||
#define CV_NEON_AARCH64 1
|
||||
#else
|
||||
#define CV_NEON_AARCH64 0
|
||||
@@ -381,8 +381,6 @@ private:
|
||||
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
|
||||
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
|
||||
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
|
||||
template <> inline v_##_Tpv v_setzero_() { return v_setzero_##suffix(); } \
|
||||
template <> inline v_##_Tpv v_setall_(_Tp v) { return v_setall_##suffix(v); } \
|
||||
inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
|
||||
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
|
||||
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
|
||||
@@ -888,10 +886,9 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b
|
||||
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
int16x8_t p0 = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
|
||||
int16x8_t p1 = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
|
||||
int32x4_t s0 = vaddl_s16(vget_low_s16(p0), vget_low_s16(p1));
|
||||
return v_int32x4(vaddq_s32(s0, vaddl_s16(vget_high_s16(p0), vget_high_s16(p1))));
|
||||
int16x8_t prod = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
|
||||
prod = vmlal_s8(prod, vget_high_s8(a.val), vget_high_s8(b.val));
|
||||
return v_int32x4(vaddl_s16(vget_low_s16(prod), vget_high_s16(prod)));
|
||||
}
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
|
||||
{
|
||||
@@ -1081,7 +1078,7 @@ OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
|
||||
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
|
||||
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
|
||||
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
|
||||
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
static inline uint64x2_t vmvnq_u64(uint64x2_t a)
|
||||
{
|
||||
uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
|
||||
@@ -1823,7 +1820,7 @@ inline v_int32x4 v_load_expand_q(const schar* ptr)
|
||||
return v_int32x4(vmovl_s16(v1));
|
||||
}
|
||||
|
||||
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
|
||||
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
|
||||
{ \
|
||||
@@ -2649,28 +2646,6 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
|
||||
|
||||
inline void v_cleanup() {}
|
||||
|
||||
#include "intrin_math.hpp"
|
||||
#if defined(CV_SIMD_FP16) && CV_SIMD_FP16
|
||||
inline v_float16x8 v_exp(const v_float16x8& x) { return v_exp_default_16f<v_float16x8, v_int16x8>(x); }
|
||||
inline v_float16x8 v_log(const v_float16x8& x) { return v_log_default_16f<v_float16x8, v_int16x8>(x); }
|
||||
inline void v_sincos(const v_float16x8& x, v_float16x8& s, v_float16x8& c) { v_sincos_default_16f<v_float16x8, v_int16x8>(x, s, c); }
|
||||
inline v_float16x8 v_sin(const v_float16x8& x) { return v_sin_default_16f<v_float16x8, v_int16x8>(x); }
|
||||
inline v_float16x8 v_cos(const v_float16x8& x) { return v_cos_default_16f<v_float16x8, v_int16x8>(x); }
|
||||
#endif
|
||||
inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
|
||||
inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
#if CV_SIMD128_64F
|
||||
inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
|
||||
inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
|
||||
inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
|
||||
inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
|
||||
inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
|
||||
#endif
|
||||
|
||||
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
|
||||
|
||||
//! @endcond
|
||||
|
||||
3345
3rdpart/OpenCV/include/opencv2/core/hal/intrin_rvv.hpp
Normal file
3345
3rdpart/OpenCV/include/opencv2/core/hal/intrin_rvv.hpp
Normal file
File diff suppressed because it is too large
Load Diff
@@ -355,12 +355,10 @@ inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& v) { return v_float64
|
||||
|
||||
#define OPENCV_HAL_IMPL_RISCVV_INIT_SET(__Tp, _Tp, suffix, len, num) \
|
||||
inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num(vmv_v_x_##len##m1(0, num)); } \
|
||||
inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); } \
|
||||
template <> inline v_##_Tp##x##num v_setzero_() { return v_setzero_##suffix(); } \
|
||||
template <> inline v_##_Tp##x##num v_setall_(__Tp v) { return v_setall_##suffix(v); }
|
||||
inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); }
|
||||
|
||||
OPENCV_HAL_IMPL_RISCVV_INIT_SET(uchar, uint8, u8, u8, 16)
|
||||
OPENCV_HAL_IMPL_RISCVV_INIT_SET(schar, int8, s8, i8, 16)
|
||||
OPENCV_HAL_IMPL_RISCVV_INIT_SET(char, int8, s8, i8, 16)
|
||||
OPENCV_HAL_IMPL_RISCVV_INIT_SET(ushort, uint16, u16, u16, 8)
|
||||
OPENCV_HAL_IMPL_RISCVV_INIT_SET(short, int16, s16, i16, 8)
|
||||
OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned int, uint32, u32, u32, 4)
|
||||
@@ -373,57 +371,72 @@ inline v_float32x4 v_setall_f32(float v) { return v_float32x4(vfmv_v_f_f32m1(v,
|
||||
inline v_float64x2 v_setzero_f64() { return v_float64x2(vfmv_v_f_f64m1(0, 2)); }
|
||||
inline v_float64x2 v_setall_f64(double v) { return v_float64x2(vfmv_v_f_f64m1(v, 2)); }
|
||||
|
||||
template <> inline v_float32x4 v_setzero_() { return v_setzero_f32(); }
|
||||
template <> inline v_float32x4 v_setall_(float v) { return v_setall_f32(v); }
|
||||
|
||||
template <> inline v_float64x2 v_setzero_() { return v_setzero_f64(); }
|
||||
template <> inline v_float64x2 v_setall_(double v) { return v_setall_f64(v); }
|
||||
|
||||
#define OPENCV_HAL_IMPL_RISCVV_BIN_OP(bin_op, _Tpvec, intrin) \
|
||||
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
return _Tpvec(intrin(a.val, b.val)); \
|
||||
} \
|
||||
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
a.val = intrin(a.val, b.val); \
|
||||
return a; \
|
||||
}
|
||||
|
||||
#define OPENCV_HAL_IMPL_RISCVV_BIN_OPN(bin_op, _Tpvec, intrin, num) \
|
||||
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
return _Tpvec(intrin(a.val, b.val, num)); \
|
||||
} \
|
||||
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
a.val = intrin(a.val, b.val, num); \
|
||||
return a; \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint8x16, vsaddu_vv_u8m1, 16)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint8x16, vssubu_vv_u8m1, 16)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int8x16, vsadd_vv_i8m1, 16)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int8x16, vssub_vv_i8m1, 16)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint16x8, vsaddu_vv_u16m1, 8)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint16x8, vssubu_vv_u16m1, 8)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int16x8, vsadd_vv_i16m1, 8)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int16x8, vssub_vv_i16m1, 8)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int32x4, vadd_vv_i32m1, 4)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int32x4, vsub_vv_i32m1, 4)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_int32x4, vmul_vv_i32m1, 4)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint32x4, vadd_vv_u32m1, 4)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint32x4, vsub_vv_u32m1, 4)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_uint32x4, vmul_vv_u32m1, 4)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int64x2, vadd_vv_i64m1, 2)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int64x2, vsub_vv_i64m1, 2)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint64x2, vadd_vv_u64m1, 2)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint64x2, vsub_vv_u64m1, 2)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_float32x4, vfadd_vv_f32m1, 4)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_float32x4, vfsub_vv_f32m1, 4)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_float32x4, vfmul_vv_f32m1, 4)
|
||||
inline v_float32x4 v_div(const v_float32x4& a, const v_float32x4& b)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint8x16, vsaddu_vv_u8m1, 16)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint8x16, vssubu_vv_u8m1, 16)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int8x16, vsadd_vv_i8m1, 16)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int8x16, vssub_vv_i8m1, 16)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint16x8, vsaddu_vv_u16m1, 8)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint16x8, vssubu_vv_u16m1, 8)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int16x8, vsadd_vv_i16m1, 8)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int16x8, vssub_vv_i16m1, 8)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int32x4, vadd_vv_i32m1, 4)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int32x4, vsub_vv_i32m1, 4)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_int32x4, vmul_vv_i32m1, 4)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint32x4, vadd_vv_u32m1, 4)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint32x4, vsub_vv_u32m1, 4)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_uint32x4, vmul_vv_u32m1, 4)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vadd_vv_i64m1, 2)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vsub_vv_i64m1, 2)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint64x2, vadd_vv_u64m1, 2)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint64x2, vsub_vv_u64m1, 2)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float32x4, vfadd_vv_f32m1, 4)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float32x4, vfsub_vv_f32m1, 4)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float32x4, vfmul_vv_f32m1, 4)
|
||||
inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
|
||||
{
|
||||
return v_float32x4(vfdiv_vv_f32m1(a.val, b.val, 4));
|
||||
}
|
||||
inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
|
||||
{
|
||||
a.val = vfdiv_vv_f32m1(a.val, b.val, 4);
|
||||
return a;
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_float64x2, vfadd_vv_f64m1, 2)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_float64x2, vfsub_vv_f64m1, 2)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_float64x2, vfmul_vv_f64m1, 2)
|
||||
inline v_float64x2 v_div(const v_float64x2& a, const v_float64x2& b)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float64x2, vfadd_vv_f64m1, 2)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float64x2, vfsub_vv_f64m1, 2)
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float64x2, vfmul_vv_f64m1, 2)
|
||||
inline v_float64x2 operator / (const v_float64x2& a, const v_float64x2& b)
|
||||
{
|
||||
return v_float64x2(vfdiv_vv_f64m1(a.val, b.val, 2));
|
||||
}
|
||||
inline v_float64x2& operator /= (v_float64x2& a, const v_float64x2& b)
|
||||
{
|
||||
a.val = vfdiv_vv_f64m1(a.val, b.val, 2);
|
||||
return a;
|
||||
}
|
||||
// TODO: exp, log, sin, cos
|
||||
|
||||
#define OPENCV_HAL_IMPL_RISCVV_BIN_FUNC(_Tpvec, func, intrin) \
|
||||
@@ -549,10 +562,10 @@ inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_
|
||||
}
|
||||
|
||||
#define OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(_Tpvec, suffix, num) \
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_and, _Tpvec, vand_vv_##suffix, num) \
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_or, _Tpvec, vor_vv_##suffix, num) \
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_xor, _Tpvec, vxor_vv_##suffix, num) \
|
||||
inline _Tpvec v_not(const _Tpvec & a) \
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(&, _Tpvec, vand_vv_##suffix, num) \
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(|, _Tpvec, vor_vv_##suffix, num) \
|
||||
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(^, _Tpvec, vxor_vv_##suffix, num) \
|
||||
inline _Tpvec operator ~ (const _Tpvec & a) \
|
||||
{ \
|
||||
return _Tpvec(vnot_v_##suffix(a.val, num)); \
|
||||
}
|
||||
@@ -567,31 +580,41 @@ OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int32x4, i32m1, 4)
|
||||
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int64x2, i64m1, 2)
|
||||
|
||||
#define OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(bin_op, intrin) \
|
||||
inline v_float32x4 bin_op(const v_float32x4& a, const v_float32x4& b) \
|
||||
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
|
||||
{ \
|
||||
return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4))); \
|
||||
} \
|
||||
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
|
||||
{ \
|
||||
a.val = vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4)); \
|
||||
return a; \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(v_and, vand_vv_i32m1)
|
||||
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(v_or, vor_vv_i32m1)
|
||||
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(v_xor, vxor_vv_i32m1)
|
||||
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(&, vand_vv_i32m1)
|
||||
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(|, vor_vv_i32m1)
|
||||
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(^, vxor_vv_i32m1)
|
||||
|
||||
inline v_float32x4 v_not(const v_float32x4& a)
|
||||
inline v_float32x4 operator ~ (const v_float32x4& a)
|
||||
{
|
||||
return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 4)));
|
||||
}
|
||||
|
||||
#define OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(bin_op, intrin) \
|
||||
inline v_float64x2 bin_op(const v_float64x2& a, const v_float64x2& b) \
|
||||
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
|
||||
{ \
|
||||
return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2))); \
|
||||
} \
|
||||
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
|
||||
{ \
|
||||
a.val = vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2)); \
|
||||
return a; \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(v_and, vand_vv_i64m1)
|
||||
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(v_or, vor_vv_i64m1)
|
||||
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(v_xor, vxor_vv_i64m1)
|
||||
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(&, vand_vv_i64m1)
|
||||
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(|, vor_vv_i64m1)
|
||||
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(^, vxor_vv_i64m1)
|
||||
|
||||
inline v_float64x2 v_not(const v_float64x2& a)
|
||||
inline v_float64x2 operator ~ (const v_float64x2& a)
|
||||
{
|
||||
return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a.val), 2)));
|
||||
}
|
||||
@@ -1151,32 +1174,32 @@ OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int32x4, v_uint32x4)
|
||||
OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint32x4, v_uint32x4)
|
||||
|
||||
#define OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(_Tpvec, _Tp, _T, num, uv) \
|
||||
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
vbool##_T##_t mask = vmseq_vv_##_Tp##_b##_T(a.val, b.val, num); \
|
||||
return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
|
||||
} \
|
||||
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
vbool##_T##_t mask = vmsne_vv_##_Tp##_b##_T(a.val, b.val, num); \
|
||||
return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
|
||||
} \
|
||||
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(a.val, b.val, num); \
|
||||
return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
|
||||
} \
|
||||
inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(b.val, a.val, num); \
|
||||
return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
|
||||
} \
|
||||
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(a.val, b.val, num); \
|
||||
return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
|
||||
} \
|
||||
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(b.val, a.val, num); \
|
||||
return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
|
||||
@@ -1192,37 +1215,37 @@ OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint32x4, u32m1, 32, 4, u_vv_)
|
||||
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint64x2, u64m1, 64, 2, u_vv_)
|
||||
|
||||
//TODO: ==
|
||||
inline v_float32x4 v_eq(const v_float32x4& a, const v_float32x4& b)
|
||||
inline v_float32x4 operator == (const v_float32x4& a, const v_float32x4& b)
|
||||
{
|
||||
vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, b.val, 4);
|
||||
vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
|
||||
return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
|
||||
}
|
||||
inline v_float32x4 v_ne(const v_float32x4& a, const v_float32x4& b)
|
||||
inline v_float32x4 operator != (const v_float32x4& a, const v_float32x4& b)
|
||||
{
|
||||
vbool32_t mask = vmfne_vv_f32m1_b32(a.val, b.val, 4);
|
||||
vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
|
||||
return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
|
||||
}
|
||||
inline v_float32x4 v_lt(const v_float32x4& a, const v_float32x4& b)
|
||||
inline v_float32x4 operator < (const v_float32x4& a, const v_float32x4& b)
|
||||
{
|
||||
vbool32_t mask = vmflt_vv_f32m1_b32(a.val, b.val, 4);
|
||||
vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
|
||||
return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
|
||||
}
|
||||
inline v_float32x4 v_le(const v_float32x4& a, const v_float32x4& b)
|
||||
inline v_float32x4 operator <= (const v_float32x4& a, const v_float32x4& b)
|
||||
{
|
||||
vbool32_t mask = vmfle_vv_f32m1_b32(a.val, b.val, 4);
|
||||
vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
|
||||
return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
|
||||
}
|
||||
inline v_float32x4 v_gt(const v_float32x4& a, const v_float32x4& b)
|
||||
inline v_float32x4 operator > (const v_float32x4& a, const v_float32x4& b)
|
||||
{
|
||||
vbool32_t mask = vmfgt_vv_f32m1_b32(a.val, b.val, 4);
|
||||
vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
|
||||
return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
|
||||
}
|
||||
inline v_float32x4 v_ge(const v_float32x4& a, const v_float32x4& b)
|
||||
inline v_float32x4 operator >= (const v_float32x4& a, const v_float32x4& b)
|
||||
{
|
||||
vbool32_t mask = vmfge_vv_f32m1_b32(a.val, b.val, 4);
|
||||
vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
|
||||
@@ -1236,37 +1259,37 @@ inline v_float32x4 v_not_nan(const v_float32x4& a)
|
||||
}
|
||||
|
||||
//TODO: ==
|
||||
inline v_float64x2 v_eq(const v_float64x2& a, const v_float64x2& b)
|
||||
inline v_float64x2 operator == (const v_float64x2& a, const v_float64x2& b)
|
||||
{
|
||||
vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, b.val, 2);
|
||||
vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
|
||||
return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
|
||||
}
|
||||
inline v_float64x2 v_ne(const v_float64x2& a, const v_float64x2& b)
|
||||
inline v_float64x2 operator != (const v_float64x2& a, const v_float64x2& b)
|
||||
{
|
||||
vbool64_t mask = vmfne_vv_f64m1_b64(a.val, b.val, 2);
|
||||
vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
|
||||
return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
|
||||
}
|
||||
inline v_float64x2 v_lt(const v_float64x2& a, const v_float64x2& b)
|
||||
inline v_float64x2 operator < (const v_float64x2& a, const v_float64x2& b)
|
||||
{
|
||||
vbool64_t mask = vmflt_vv_f64m1_b64(a.val, b.val, 2);
|
||||
vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
|
||||
return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
|
||||
}
|
||||
inline v_float64x2 v_le(const v_float64x2& a, const v_float64x2& b)
|
||||
inline v_float64x2 operator <= (const v_float64x2& a, const v_float64x2& b)
|
||||
{
|
||||
vbool64_t mask = vmfle_vv_f64m1_b64(a.val, b.val, 2);
|
||||
vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
|
||||
return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
|
||||
}
|
||||
inline v_float64x2 v_gt(const v_float64x2& a, const v_float64x2& b)
|
||||
inline v_float64x2 operator > (const v_float64x2& a, const v_float64x2& b)
|
||||
{
|
||||
vbool64_t mask = vmfgt_vv_f64m1_b64(a.val, b.val, 2);
|
||||
vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
|
||||
return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
|
||||
}
|
||||
inline v_float64x2 v_ge(const v_float64x2& a, const v_float64x2& b)
|
||||
inline v_float64x2 operator >= (const v_float64x2& a, const v_float64x2& b)
|
||||
{
|
||||
vbool64_t mask = vmfge_vv_f64m1_b64(a.val, b.val, 2);
|
||||
vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
|
||||
@@ -1308,13 +1331,13 @@ OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(float, f32)
|
||||
|
||||
|
||||
#define OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(_Tpvec, suffix, _T, num) \
|
||||
inline _Tpvec v_shl(const _Tpvec& a, int n) \
|
||||
inline _Tpvec operator << (const _Tpvec& a, int n) \
|
||||
{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); } \
|
||||
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
|
||||
{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); }
|
||||
|
||||
#define OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(_Tpvec, suffix, _T, num, intric) \
|
||||
inline _Tpvec v_shr(const _Tpvec& a, int n) \
|
||||
inline _Tpvec operator >> (const _Tpvec& a, int n) \
|
||||
{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); } \
|
||||
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
|
||||
{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); }\
|
||||
@@ -2014,11 +2037,13 @@ OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short)
|
||||
|
||||
// saturating multiply 8-bit, 16-bit
|
||||
#define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, num, mul, cvt) \
|
||||
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
auto res = mul(a.val, b.val, num); \
|
||||
return _Tpvec(cvt(res, 0, num)); \
|
||||
}
|
||||
} \
|
||||
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
|
||||
{ a = a * b; return a; }
|
||||
|
||||
OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16, 16, vwmul_vv_i16m2, vnclip_wx_i8m1)
|
||||
OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, 16, vwmulu_vv_u16m2, vnclipu_wx_u8m1)
|
||||
@@ -2820,7 +2845,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_cvt_f64(v_dotprod(a, b)); }
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
|
||||
const v_float64x2& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
{
|
||||
vint64m2_t v1 = vwmul_vv_i64m2(a.val, b.val, 4);
|
||||
@@ -2829,7 +2854,7 @@ inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
}
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
|
||||
{ v_float64x2 res = v_dotprod_expand_fast(a, b);
|
||||
return v_add(res, c); }
|
||||
return res + c; }
|
||||
#endif
|
||||
////// FP16 support ///////
|
||||
#if __riscv_v == 7000
|
||||
@@ -2866,20 +2891,6 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
|
||||
|
||||
inline void v_cleanup() {}
|
||||
|
||||
#include "intrin_math.hpp"
|
||||
inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
|
||||
inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
|
||||
inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
|
||||
inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
|
||||
inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
|
||||
inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
|
||||
inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
|
||||
|
||||
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
|
||||
|
||||
//! @endcond
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,768 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
// Copied from
|
||||
// https://github.com/riscv-non-isa/rvv-intrinsic-doc/tree/master/auto-generated/rvv-v0p10-compatible-headers
|
||||
|
||||
#ifndef __RVV_0P10_COMPATIBLE_HEADERS_OVERLOADED_NON_POLICY_H
|
||||
#define __RVV_0P10_COMPATIBLE_HEADERS_OVERLOADED_NON_POLICY_H
|
||||
|
||||
|
||||
// The maximum number of parameters is 20, this is held by segment load
|
||||
// instructions with a NFIELD (NF) of 8. 20 is contributed by 8 vector register
|
||||
// pointers passed, 1 vector mask register, 8 passthrough register for
|
||||
// undisturbed policy, and 3 for address base, byte index, vl.
|
||||
#define _GET_OVERRIDE(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13,\
|
||||
_14, _15, _16, _17, _18, _19, _20, NAME, ...) NAME
|
||||
|
||||
|
||||
#if __has_include ("riscv_vector.h")
|
||||
#include <riscv_vector.h>
|
||||
#endif
|
||||
#ifndef __RISCV_VECTOR_H
|
||||
#include_next <riscv_vector.h>
|
||||
#endif
|
||||
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
#define vmerge(mask, op1, op2, vl) __riscv_vmerge((op1), (op2), (mask), (vl))
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
#define vfmerge(mask, op1, op2, vl) __riscv_vfmerge((op1), (op2), (mask), (vl))
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
// masked functions
|
||||
#define vcompress(mask, dest, src, vl) __riscv_vcompress_tu((dest), (src), (mask), (vl))
|
||||
// Reinterpret between different type under the same SEW/LMUL
|
||||
// Reinterpret between different SEW under the same LMUL
|
||||
#define vse16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vse16, __riscv_vse16, 2, 1)(__VA_ARGS__)
|
||||
#define vse32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vse32, __riscv_vse32, 2, 1)(__VA_ARGS__)
|
||||
#define vse64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vse64, __riscv_vse64, 2, 1)(__VA_ARGS__)
|
||||
#define vse8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vse8, __riscv_vse8, 2, 1)(__VA_ARGS__)
|
||||
#define vsse16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsse16, __riscv_vsse16, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsse32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsse32, __riscv_vsse32, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsse64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsse64, __riscv_vsse64, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsse8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsse8, __riscv_vsse8, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vloxei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vloxei8_tumu, 4, __riscv_vloxei8, 2, 1)(__VA_ARGS__)
|
||||
#define vloxei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vloxei16_tumu, 4, __riscv_vloxei16, 2, 1)(__VA_ARGS__)
|
||||
#define vloxei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vloxei32_tumu, 4, __riscv_vloxei32, 2, 1)(__VA_ARGS__)
|
||||
#define vloxei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vloxei64_tumu, 4, __riscv_vloxei64, 2, 1)(__VA_ARGS__)
|
||||
#define vluxei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vluxei8_tumu, 4, __riscv_vluxei8, 2, 1)(__VA_ARGS__)
|
||||
#define vluxei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vluxei16_tumu, 4, __riscv_vluxei16, 2, 1)(__VA_ARGS__)
|
||||
#define vluxei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vluxei32_tumu, 4, __riscv_vluxei32, 2, 1)(__VA_ARGS__)
|
||||
#define vluxei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vluxei64_tumu, 4, __riscv_vluxei64, 2, 1)(__VA_ARGS__)
|
||||
#define vsoxei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsoxei8, __riscv_vsoxei8, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsoxei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsoxei16, __riscv_vsoxei16, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsoxei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsoxei32, __riscv_vsoxei32, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsoxei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsoxei64, __riscv_vsoxei64, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsuxei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsuxei8, __riscv_vsuxei8, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsuxei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsuxei16, __riscv_vsuxei16, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsuxei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsuxei32, __riscv_vsuxei32, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsuxei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsuxei64, __riscv_vsuxei64, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg2e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsseg2e16, __riscv_vsseg2e16, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg3e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsseg3e16, __riscv_vsseg3e16, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg4e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsseg4e16, __riscv_vsseg4e16, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg5e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsseg5e16, __riscv_vsseg5e16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg6e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsseg6e16, __riscv_vsseg6e16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg7e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsseg7e16, __riscv_vsseg7e16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg8e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsseg8e16, __riscv_vsseg8e16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg2e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsseg2e32, __riscv_vsseg2e32, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg3e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsseg3e32, __riscv_vsseg3e32, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg4e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsseg4e32, __riscv_vsseg4e32, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg5e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsseg5e32, __riscv_vsseg5e32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg6e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsseg6e32, __riscv_vsseg6e32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg7e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsseg7e32, __riscv_vsseg7e32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg8e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsseg8e32, __riscv_vsseg8e32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg2e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsseg2e64, __riscv_vsseg2e64, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg3e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsseg3e64, __riscv_vsseg3e64, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg4e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsseg4e64, __riscv_vsseg4e64, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg5e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsseg5e64, __riscv_vsseg5e64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg6e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsseg6e64, __riscv_vsseg6e64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg7e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsseg7e64, __riscv_vsseg7e64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg8e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsseg8e64, __riscv_vsseg8e64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg2e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsseg2e8, __riscv_vsseg2e8, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg3e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsseg3e8, __riscv_vsseg3e8, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg4e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsseg4e8, __riscv_vsseg4e8, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg5e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsseg5e8, __riscv_vsseg5e8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg6e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsseg6e8, __riscv_vsseg6e8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg7e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsseg7e8, __riscv_vsseg7e8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vsseg8e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsseg8e8, __riscv_vsseg8e8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg2e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vssseg2e16, __riscv_vssseg2e16, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg3e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vssseg3e16, __riscv_vssseg3e16, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg4e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vssseg4e16, __riscv_vssseg4e16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg5e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vssseg5e16, __riscv_vssseg5e16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg6e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vssseg6e16, __riscv_vssseg6e16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg7e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vssseg7e16, __riscv_vssseg7e16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg8e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vssseg8e16, __riscv_vssseg8e16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg2e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vssseg2e32, __riscv_vssseg2e32, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg3e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vssseg3e32, __riscv_vssseg3e32, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg4e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vssseg4e32, __riscv_vssseg4e32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg5e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vssseg5e32, __riscv_vssseg5e32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg6e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vssseg6e32, __riscv_vssseg6e32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg7e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vssseg7e32, __riscv_vssseg7e32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg8e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vssseg8e32, __riscv_vssseg8e32, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg2e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vssseg2e64, __riscv_vssseg2e64, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg3e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vssseg3e64, __riscv_vssseg3e64, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg4e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vssseg4e64, __riscv_vssseg4e64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg5e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vssseg5e64, __riscv_vssseg5e64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg6e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vssseg6e64, __riscv_vssseg6e64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg7e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vssseg7e64, __riscv_vssseg7e64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg8e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vssseg8e64, __riscv_vssseg8e64, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg2e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vssseg2e8, __riscv_vssseg2e8, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg3e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vssseg3e8, __riscv_vssseg3e8, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg4e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vssseg4e8, __riscv_vssseg4e8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg5e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vssseg5e8, __riscv_vssseg5e8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg6e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vssseg6e8, __riscv_vssseg6e8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg7e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vssseg7e8, __riscv_vssseg7e8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vssseg8e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vssseg8e8, __riscv_vssseg8e8, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vloxseg2ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vloxseg2ei8_tumu, 7, 6, __riscv_vloxseg2ei8, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vloxseg3ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg3ei8_tumu, 9, 8, 7, __riscv_vloxseg3ei8, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vloxseg4ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vloxseg4ei8_tumu, 11, 10, 9, 8, __riscv_vloxseg4ei8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vloxseg5ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vloxseg5ei8_tumu, 13, 12, 11, 10, 9, __riscv_vloxseg5ei8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vloxseg6ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vloxseg6ei8_tumu, 15, 14, 13, 12, 11, 10, __riscv_vloxseg6ei8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vloxseg7ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vloxseg7ei8_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg7ei8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vloxseg8ei8(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vloxseg8ei8_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vloxseg8ei8, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vloxseg2ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vloxseg2ei16_tumu, 7, 6, __riscv_vloxseg2ei16, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vloxseg3ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg3ei16_tumu, 9, 8, 7, __riscv_vloxseg3ei16, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vloxseg4ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vloxseg4ei16_tumu, 11, 10, 9, 8, __riscv_vloxseg4ei16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
|
||||
// Compatibility shims for indexed segment loads:
//   vloxseg<N>ei<W>  — ordered-indexed segment load
//   vluxseg<N>ei<W>  — unordered-indexed segment load
// _GET_OVERRIDE is an argument-counting dispatcher: the extra comma-separated
// arguments shift the selector so that the intrinsic name sitting at the slot
// matching the caller's argument count is picked.  For each macro the later
// name is the unmasked intrinsic and the earlier one is the masked "_tumu"
// (tail-undisturbed, mask-undisturbed) policy variant; the bare numbers pad
// argument-count slots that have no valid overload.
// Stray "|" / "||||" separator lines (diff-rendering residue, invalid C++)
// have been removed; the macro bodies themselves are unchanged.
#define vloxseg5ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vloxseg5ei16_tumu, 13, 12, 11, 10, 9, __riscv_vloxseg5ei16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg6ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vloxseg6ei16_tumu, 15, 14, 13, 12, 11, 10, __riscv_vloxseg6ei16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg7ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vloxseg7ei16_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg7ei16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg8ei16(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vloxseg8ei16_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vloxseg8ei16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg2ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vloxseg2ei32_tumu, 7, 6, __riscv_vloxseg2ei32, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg3ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg3ei32_tumu, 9, 8, 7, __riscv_vloxseg3ei32, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg4ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vloxseg4ei32_tumu, 11, 10, 9, 8, __riscv_vloxseg4ei32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg5ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vloxseg5ei32_tumu, 13, 12, 11, 10, 9, __riscv_vloxseg5ei32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg6ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vloxseg6ei32_tumu, 15, 14, 13, 12, 11, 10, __riscv_vloxseg6ei32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg7ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vloxseg7ei32_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg7ei32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg8ei32(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vloxseg8ei32_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vloxseg8ei32, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg2ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vloxseg2ei64_tumu, 7, 6, __riscv_vloxseg2ei64, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg3ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg3ei64_tumu, 9, 8, 7, __riscv_vloxseg3ei64, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg4ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vloxseg4ei64_tumu, 11, 10, 9, 8, __riscv_vloxseg4ei64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg5ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vloxseg5ei64_tumu, 13, 12, 11, 10, 9, __riscv_vloxseg5ei64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg6ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vloxseg6ei64_tumu, 15, 14, 13, 12, 11, 10, __riscv_vloxseg6ei64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg7ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vloxseg7ei64_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg7ei64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg8ei64(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vloxseg8ei64_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vloxseg8ei64, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)

// Unordered-indexed segment loads (same dispatch scheme as above).
#define vluxseg2ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vluxseg2ei8_tumu, 7, 6, __riscv_vluxseg2ei8, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg3ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg3ei8_tumu, 9, 8, 7, __riscv_vluxseg3ei8, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg4ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vluxseg4ei8_tumu, 11, 10, 9, 8, __riscv_vluxseg4ei8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg5ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vluxseg5ei8_tumu, 13, 12, 11, 10, 9, __riscv_vluxseg5ei8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg6ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vluxseg6ei8_tumu, 15, 14, 13, 12, 11, 10, __riscv_vluxseg6ei8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg7ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vluxseg7ei8_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg7ei8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg8ei8(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vluxseg8ei8_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vluxseg8ei8, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg2ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vluxseg2ei16_tumu, 7, 6, __riscv_vluxseg2ei16, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg3ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg3ei16_tumu, 9, 8, 7, __riscv_vluxseg3ei16, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg4ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vluxseg4ei16_tumu, 11, 10, 9, 8, __riscv_vluxseg4ei16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg5ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vluxseg5ei16_tumu, 13, 12, 11, 10, 9, __riscv_vluxseg5ei16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg6ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vluxseg6ei16_tumu, 15, 14, 13, 12, 11, 10, __riscv_vluxseg6ei16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg7ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vluxseg7ei16_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg7ei16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg8ei16(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vluxseg8ei16_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vluxseg8ei16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg2ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vluxseg2ei32_tumu, 7, 6, __riscv_vluxseg2ei32, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg3ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg3ei32_tumu, 9, 8, 7, __riscv_vluxseg3ei32, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg4ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vluxseg4ei32_tumu, 11, 10, 9, 8, __riscv_vluxseg4ei32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg5ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vluxseg5ei32_tumu, 13, 12, 11, 10, 9, __riscv_vluxseg5ei32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg6ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vluxseg6ei32_tumu, 15, 14, 13, 12, 11, 10, __riscv_vluxseg6ei32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg7ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vluxseg7ei32_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg7ei32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg8ei32(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vluxseg8ei32_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vluxseg8ei32, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg2ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vluxseg2ei64_tumu, 7, 6, __riscv_vluxseg2ei64, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg3ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg3ei64_tumu, 9, 8, 7, __riscv_vluxseg3ei64, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg4ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vluxseg4ei64_tumu, 11, 10, 9, 8, __riscv_vluxseg4ei64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg5ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vluxseg5ei64_tumu, 13, 12, 11, 10, 9, __riscv_vluxseg5ei64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg6ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vluxseg6ei64_tumu, 15, 14, 13, 12, 11, 10, __riscv_vluxseg6ei64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg7ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vluxseg7ei64_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg7ei64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg8ei64(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vluxseg8ei64_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vluxseg8ei64, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
// Compatibility shims for indexed segment stores:
//   vsoxseg<N>ei<W> — ordered-indexed segment store
//   vsuxseg<N>ei<W> — unordered-indexed segment store
// Same argument-count dispatch via _GET_OVERRIDE as the load shims.  Unlike
// the loads, both selected slots name the same unmasked intrinsic here (the
// masked store form takes the mask as an extra leading argument and lands on
// the earlier slot, so the intrinsic's own overloading resolves it).
// Stray "|" / "||||" separator lines (diff-rendering residue, invalid C++)
// have been removed; the macro bodies themselves are unchanged.
#define vsoxseg2ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsoxseg2ei8, __riscv_vsoxseg2ei8, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg3ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsoxseg3ei8, __riscv_vsoxseg3ei8, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg4ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsoxseg4ei8, __riscv_vsoxseg4ei8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg5ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsoxseg5ei8, __riscv_vsoxseg5ei8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg6ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsoxseg6ei8, __riscv_vsoxseg6ei8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg7ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsoxseg7ei8, __riscv_vsoxseg7ei8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg8ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsoxseg8ei8, __riscv_vsoxseg8ei8, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg2ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsoxseg2ei16, __riscv_vsoxseg2ei16, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg3ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsoxseg3ei16, __riscv_vsoxseg3ei16, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg4ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsoxseg4ei16, __riscv_vsoxseg4ei16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg5ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsoxseg5ei16, __riscv_vsoxseg5ei16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg6ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsoxseg6ei16, __riscv_vsoxseg6ei16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg7ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsoxseg7ei16, __riscv_vsoxseg7ei16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg8ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsoxseg8ei16, __riscv_vsoxseg8ei16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg2ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsoxseg2ei32, __riscv_vsoxseg2ei32, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg3ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsoxseg3ei32, __riscv_vsoxseg3ei32, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg4ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsoxseg4ei32, __riscv_vsoxseg4ei32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg5ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsoxseg5ei32, __riscv_vsoxseg5ei32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg6ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsoxseg6ei32, __riscv_vsoxseg6ei32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg7ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsoxseg7ei32, __riscv_vsoxseg7ei32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg8ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsoxseg8ei32, __riscv_vsoxseg8ei32, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg2ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsoxseg2ei64, __riscv_vsoxseg2ei64, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg3ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsoxseg3ei64, __riscv_vsoxseg3ei64, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg4ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsoxseg4ei64, __riscv_vsoxseg4ei64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg5ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsoxseg5ei64, __riscv_vsoxseg5ei64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg6ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsoxseg6ei64, __riscv_vsoxseg6ei64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg7ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsoxseg7ei64, __riscv_vsoxseg7ei64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg8ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsoxseg8ei64, __riscv_vsoxseg8ei64, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)

// Unordered-indexed segment stores (same scheme).
#define vsuxseg2ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsuxseg2ei8, __riscv_vsuxseg2ei8, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg3ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsuxseg3ei8, __riscv_vsuxseg3ei8, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg4ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsuxseg4ei8, __riscv_vsuxseg4ei8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg5ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsuxseg5ei8, __riscv_vsuxseg5ei8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg6ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsuxseg6ei8, __riscv_vsuxseg6ei8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg7ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsuxseg7ei8, __riscv_vsuxseg7ei8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg8ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsuxseg8ei8, __riscv_vsuxseg8ei8, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg2ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsuxseg2ei16, __riscv_vsuxseg2ei16, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg3ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsuxseg3ei16, __riscv_vsuxseg3ei16, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg4ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsuxseg4ei16, __riscv_vsuxseg4ei16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg5ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsuxseg5ei16, __riscv_vsuxseg5ei16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg6ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsuxseg6ei16, __riscv_vsuxseg6ei16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg7ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsuxseg7ei16, __riscv_vsuxseg7ei16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg8ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsuxseg8ei16, __riscv_vsuxseg8ei16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg2ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsuxseg2ei32, __riscv_vsuxseg2ei32, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg3ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsuxseg3ei32, __riscv_vsuxseg3ei32, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg4ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsuxseg4ei32, __riscv_vsuxseg4ei32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg5ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsuxseg5ei32, __riscv_vsuxseg5ei32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg6ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsuxseg6ei32, __riscv_vsuxseg6ei32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg7ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsuxseg7ei32, __riscv_vsuxseg7ei32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg8ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsuxseg8ei32, __riscv_vsuxseg8ei32, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg2ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsuxseg2ei64, __riscv_vsuxseg2ei64, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg3ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsuxseg3ei64, __riscv_vsuxseg3ei64, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg4ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsuxseg4ei64, __riscv_vsuxseg4ei64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg5ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsuxseg5ei64, __riscv_vsuxseg5ei64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg6ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsuxseg6ei64, __riscv_vsuxseg6ei64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg7ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsuxseg7ei64, __riscv_vsuxseg7ei64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg8ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsuxseg8ei64, __riscv_vsuxseg8ei64, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
// Integer add/subtract family.  Each shim dispatches on argument count via
// _GET_OVERRIDE: the second name is the unmasked intrinsic, the first the
// masked "_tumu" (tail-undisturbed, mask-undisturbed) policy variant.
// Stray "|" / "||||" separator lines (diff-rendering residue, invalid C++)
// have been removed; the macro bodies themselves are unchanged.
#define vadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vadd_tumu, 4, __riscv_vadd, 2, 1)(__VA_ARGS__)
#define vsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsub_tumu, 4, __riscv_vsub, 2, 1)(__VA_ARGS__)
#define vrsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vrsub_tumu, 4, __riscv_vrsub, 2, 1)(__VA_ARGS__)
// Unary negate: one fewer operand, so the selected slots sit one to the right.
#define vneg(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vneg_tumu, 3, __riscv_vneg, 1)(__VA_ARGS__)

// Widening signed add/subtract: _vv (vector-vector), _vx (vector-scalar),
// _wv / _wx (wide first operand).
#define vwadd_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwadd_vv_tumu, 4, __riscv_vwadd_vv, 2, 1)(__VA_ARGS__)
#define vwadd_vx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwadd_vx_tumu, 4, __riscv_vwadd_vx, 2, 1)(__VA_ARGS__)
#define vwadd_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwadd_wv_tumu, 4, __riscv_vwadd_wv, 2, 1)(__VA_ARGS__)
#define vwadd_wx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwadd_wx_tumu, 4, __riscv_vwadd_wx, 2, 1)(__VA_ARGS__)
#define vwsub_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsub_vv_tumu, 4, __riscv_vwsub_vv, 2, 1)(__VA_ARGS__)
#define vwsub_vx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsub_vx_tumu, 4, __riscv_vwsub_vx, 2, 1)(__VA_ARGS__)
#define vwsub_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsub_wv_tumu, 4, __riscv_vwsub_wv, 2, 1)(__VA_ARGS__)
#define vwsub_wx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsub_wx_tumu, 4, __riscv_vwsub_wx, 2, 1)(__VA_ARGS__)

// Widening unsigned add/subtract.
#define vwaddu_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwaddu_vv_tumu, 4, __riscv_vwaddu_vv, 2, 1)(__VA_ARGS__)
#define vwaddu_vx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwaddu_vx_tumu, 4, __riscv_vwaddu_vx, 2, 1)(__VA_ARGS__)
#define vwaddu_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwaddu_wv_tumu, 4, __riscv_vwaddu_wv, 2, 1)(__VA_ARGS__)
#define vwaddu_wx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwaddu_wx_tumu, 4, __riscv_vwaddu_wx, 2, 1)(__VA_ARGS__)
#define vwsubu_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsubu_vv_tumu, 4, __riscv_vwsubu_vv, 2, 1)(__VA_ARGS__)
#define vwsubu_vx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsubu_vx_tumu, 4, __riscv_vwsubu_vx, 2, 1)(__VA_ARGS__)
#define vwsubu_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsubu_wv_tumu, 4, __riscv_vwsubu_wv, 2, 1)(__VA_ARGS__)
#define vwsubu_wx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsubu_wx_tumu, 4, __riscv_vwsubu_wx, 2, 1)(__VA_ARGS__)

// Unary sign/zero extension (vf2/vf4/vf8 = x2/x4/x8 widening).
#define vsext_vf2(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vsext_vf2_tumu, 3, __riscv_vsext_vf2, 1)(__VA_ARGS__)
#define vsext_vf4(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vsext_vf4_tumu, 3, __riscv_vsext_vf4, 1)(__VA_ARGS__)
#define vsext_vf8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vsext_vf8_tumu, 3, __riscv_vsext_vf8, 1)(__VA_ARGS__)
#define vzext_vf2(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vzext_vf2_tumu, 3, __riscv_vzext_vf2, 1)(__VA_ARGS__)
#define vzext_vf4(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vzext_vf4_tumu, 3, __riscv_vzext_vf4, 1)(__VA_ARGS__)
#define vzext_vf8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vzext_vf8_tumu, 3, __riscv_vzext_vf8, 1)(__VA_ARGS__)

// Carry/borrow ops have no masked policy variants — plain pass-through.
#define vadc(...) __riscv_vadc(__VA_ARGS__)
#define vsbc(...) __riscv_vsbc(__VA_ARGS__)
#define vmadc(...) __riscv_vmadc(__VA_ARGS__)
#define vmsbc(...) __riscv_vmsbc(__VA_ARGS__)
// Bitwise logic, shifts and integer comparisons.  Same _GET_OVERRIDE
// argument-count dispatch; note the comparisons select "_mu"
// (mask-undisturbed) variants instead of "_tumu", since their result is a
// mask register, not a tail-policied vector.
// Stray "|" / "||||" separator lines (diff-rendering residue, invalid C++)
// have been removed; the macro bodies themselves are unchanged.
#define vand(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vand_tumu, 4, __riscv_vand, 2, 1)(__VA_ARGS__)
#define vor(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vor_tumu, 4, __riscv_vor, 2, 1)(__VA_ARGS__)
#define vxor(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vxor_tumu, 4, __riscv_vxor, 2, 1)(__VA_ARGS__)
// Unary complement: slots shifted one place (one fewer operand).
#define vnot(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vnot_tumu, 3, __riscv_vnot, 1)(__VA_ARGS__)

// Shifts: logical left, arithmetic/logical right, plus narrowing right shifts.
#define vsll(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsll_tumu, 4, __riscv_vsll, 2, 1)(__VA_ARGS__)
#define vsra(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsra_tumu, 4, __riscv_vsra, 2, 1)(__VA_ARGS__)
#define vsrl(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsrl_tumu, 4, __riscv_vsrl, 2, 1)(__VA_ARGS__)
#define vnsra(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnsra_tumu, 4, __riscv_vnsra, 2, 1)(__VA_ARGS__)
#define vnsrl(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnsrl_tumu, 4, __riscv_vnsrl, 2, 1)(__VA_ARGS__)

// Integer comparisons (signed, then unsigned); mask-producing, hence "_mu".
#define vmseq(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmseq_mu, 4, __riscv_vmseq, 2, 1)(__VA_ARGS__)
#define vmsne(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsne_mu, 4, __riscv_vmsne, 2, 1)(__VA_ARGS__)
#define vmslt(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmslt_mu, 4, __riscv_vmslt, 2, 1)(__VA_ARGS__)
#define vmsle(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsle_mu, 4, __riscv_vmsle, 2, 1)(__VA_ARGS__)
#define vmsgt(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsgt_mu, 4, __riscv_vmsgt, 2, 1)(__VA_ARGS__)
#define vmsge(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsge_mu, 4, __riscv_vmsge, 2, 1)(__VA_ARGS__)
#define vmsltu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsltu_mu, 4, __riscv_vmsltu, 2, 1)(__VA_ARGS__)
#define vmsleu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsleu_mu, 4, __riscv_vmsleu, 2, 1)(__VA_ARGS__)
#define vmsgtu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsgtu_mu, 4, __riscv_vmsgtu, 2, 1)(__VA_ARGS__)
#define vmsgeu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsgeu_mu, 4, __riscv_vmsgeu, 2, 1)(__VA_ARGS__)
// Min/max, multiply/divide, widening multiply, integer FMA, moves and
// fixed-point ops.  Same _GET_OVERRIDE argument-count dispatch as above.
// Stray "|" / "||||" separator lines (diff-rendering residue, invalid C++)
// have been removed; the macro bodies themselves are unchanged.
#define vmin(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmin_tumu, 4, __riscv_vmin, 2, 1)(__VA_ARGS__)
#define vmax(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmax_tumu, 4, __riscv_vmax, 2, 1)(__VA_ARGS__)
#define vminu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vminu_tumu, 4, __riscv_vminu, 2, 1)(__VA_ARGS__)
#define vmaxu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmaxu_tumu, 4, __riscv_vmaxu, 2, 1)(__VA_ARGS__)

// Multiply / high-half multiply / divide / remainder.
#define vmul(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmul_tumu, 4, __riscv_vmul, 2, 1)(__VA_ARGS__)
#define vmulh(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmulh_tumu, 4, __riscv_vmulh, 2, 1)(__VA_ARGS__)
#define vmulhsu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmulhsu_tumu, 4, __riscv_vmulhsu, 2, 1)(__VA_ARGS__)
#define vmulhu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmulhu_tumu, 4, __riscv_vmulhu, 2, 1)(__VA_ARGS__)
#define vdiv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vdiv_tumu, 4, __riscv_vdiv, 2, 1)(__VA_ARGS__)
#define vrem(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vrem_tumu, 4, __riscv_vrem, 2, 1)(__VA_ARGS__)
#define vdivu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vdivu_tumu, 4, __riscv_vdivu, 2, 1)(__VA_ARGS__)
#define vremu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vremu_tumu, 4, __riscv_vremu, 2, 1)(__VA_ARGS__)

// Widening multiply.
#define vwmul(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmul_tumu, 4, __riscv_vwmul, 2, 1)(__VA_ARGS__)
#define vwmulsu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmulsu_tumu, 4, __riscv_vwmulsu, 2, 1)(__VA_ARGS__)
#define vwmulu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmulu_tumu, 4, __riscv_vwmulu, 2, 1)(__VA_ARGS__)

// Integer multiply-accumulate: unmasked call maps to "_tu" (tail-undisturbed)
// because the accumulator operand must be preserved in the tail.
#define vmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmacc_tumu, __riscv_vmacc_tu, 3, 2, 1)(__VA_ARGS__)
#define vnmsac(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnmsac_tumu, __riscv_vnmsac_tu, 3, 2, 1)(__VA_ARGS__)
#define vmadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmadd_tumu, __riscv_vmadd_tu, 3, 2, 1)(__VA_ARGS__)
#define vnmsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnmsub_tumu, __riscv_vnmsub_tu, 3, 2, 1)(__VA_ARGS__)
#define vwmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmacc_tumu, __riscv_vwmacc_tu, 3, 2, 1)(__VA_ARGS__)
#define vwmaccsu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmaccsu_tumu, __riscv_vwmaccsu_tu, 3, 2, 1)(__VA_ARGS__)
#define vwmaccus(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmaccus_tumu, __riscv_vwmaccus_tu, 3, 2, 1)(__VA_ARGS__)
#define vwmaccu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmaccu_tumu, __riscv_vwmaccu_tu, 3, 2, 1)(__VA_ARGS__)

// Vector move: no policy variants, plain pass-through.
#define vmv_v(...) __riscv_vmv_v(__VA_ARGS__)

// Saturating and averaging add/subtract.
#define vsadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsadd_tumu, 4, __riscv_vsadd, 2, 1)(__VA_ARGS__)
#define vssub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vssub_tumu, 4, __riscv_vssub, 2, 1)(__VA_ARGS__)
#define vsaddu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsaddu_tumu, 4, __riscv_vsaddu, 2, 1)(__VA_ARGS__)
#define vssubu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vssubu_tumu, 4, __riscv_vssubu, 2, 1)(__VA_ARGS__)
#define vaadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vaadd_tumu, 4, __riscv_vaadd, 2, 1)(__VA_ARGS__)
#define vasub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vasub_tumu, 4, __riscv_vasub, 2, 1)(__VA_ARGS__)
#define vaaddu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vaaddu_tumu, 4, __riscv_vaaddu, 2, 1)(__VA_ARGS__)
#define vasubu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vasubu_tumu, 4, __riscv_vasubu, 2, 1)(__VA_ARGS__)
// NOTE(review): vsmul maps to the "_mu" policy variant while its neighbours
// use "_tumu" — presumably intentional for the intrinsics version targeted;
// confirm against the RVV intrinsics spec before "fixing".
#define vsmul(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsmul_mu, 4, __riscv_vsmul, 2, 1)(__VA_ARGS__)
#define vssra(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vssra_tumu, 4, __riscv_vssra, 2, 1)(__VA_ARGS__)
#define vssrl(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vssrl_tumu, 4, __riscv_vssrl, 2, 1)(__VA_ARGS__)
#define vnclip(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnclip_tumu, 4, __riscv_vnclip, 2, 1)(__VA_ARGS__)
#define vnclipu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnclipu_tumu, 4, __riscv_vnclipu, 2, 1)(__VA_ARGS__)
// Floating-point arithmetic family.  Same _GET_OVERRIDE argument-count
// dispatch: later name = unmasked intrinsic, earlier name = masked "_tumu"
// variant; the FMA shims map the unmasked call to "_tu" to preserve the
// accumulator in the tail, mirroring the integer FMA shims.
// Stray "|" / "||||" separator lines (diff-rendering residue, invalid C++)
// have been removed; the macro bodies themselves are unchanged.
#define vfadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfadd_tumu, 4, __riscv_vfadd, 2, 1)(__VA_ARGS__)
#define vfsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfsub_tumu, 4, __riscv_vfsub, 2, 1)(__VA_ARGS__)
#define vfrsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfrsub_tumu, 4, __riscv_vfrsub, 2, 1)(__VA_ARGS__)
// Unary negate: one fewer operand, slots shifted one place.
#define vfneg(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfneg_tumu, 3, __riscv_vfneg, 1)(__VA_ARGS__)

// Widening FP add/subtract: _vv / _vf (scalar), _wv / _wf (wide first operand).
#define vfwadd_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwadd_vv_tumu, 4, __riscv_vfwadd_vv, 2, 1)(__VA_ARGS__)
#define vfwadd_vf(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwadd_vf_tumu, 4, __riscv_vfwadd_vf, 2, 1)(__VA_ARGS__)
#define vfwadd_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwadd_wv_tumu, 4, __riscv_vfwadd_wv, 2, 1)(__VA_ARGS__)
#define vfwadd_wf(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwadd_wf_tumu, 4, __riscv_vfwadd_wf, 2, 1)(__VA_ARGS__)
#define vfwsub_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwsub_vv_tumu, 4, __riscv_vfwsub_vv, 2, 1)(__VA_ARGS__)
#define vfwsub_vf(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwsub_vf_tumu, 4, __riscv_vfwsub_vf, 2, 1)(__VA_ARGS__)
#define vfwsub_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwsub_wv_tumu, 4, __riscv_vfwsub_wv, 2, 1)(__VA_ARGS__)
#define vfwsub_wf(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwsub_wf_tumu, 4, __riscv_vfwsub_wf, 2, 1)(__VA_ARGS__)

// FP multiply/divide (vfrdiv = reversed operands), widening multiply.
#define vfmul(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmul_tumu, 4, __riscv_vfmul, 2, 1)(__VA_ARGS__)
#define vfdiv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfdiv_tumu, 4, __riscv_vfdiv, 2, 1)(__VA_ARGS__)
#define vfrdiv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfrdiv_tumu, 4, __riscv_vfrdiv, 2, 1)(__VA_ARGS__)
#define vfwmul(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwmul_tumu, 4, __riscv_vfwmul, 2, 1)(__VA_ARGS__)

// FP fused multiply-accumulate: unmasked → "_tu", masked → "_tumu".
#define vfmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmacc_tumu, __riscv_vfmacc_tu, 3, 2, 1)(__VA_ARGS__)
#define vfnmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfnmacc_tumu, __riscv_vfnmacc_tu, 3, 2, 1)(__VA_ARGS__)
#define vfmsac(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmsac_tumu, __riscv_vfmsac_tu, 3, 2, 1)(__VA_ARGS__)
#define vfnmsac(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfnmsac_tumu, __riscv_vfnmsac_tu, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vfmadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmadd_tumu, __riscv_vfmadd_tu, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vfnmadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfnmadd_tumu, __riscv_vfnmadd_tu, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vfmsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmsub_tumu, __riscv_vfmsub_tu, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vfnmsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfnmsub_tumu, __riscv_vfnmsub_tu, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vfwmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwmacc_tumu, __riscv_vfwmacc_tu, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vfwnmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwnmacc_tumu, __riscv_vfwnmacc_tu, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vfwmsac(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwmsac_tumu, __riscv_vfwmsac_tu, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vfwnmsac(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwnmsac_tumu, __riscv_vfwnmsac_tu, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vfsqrt(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfsqrt_tumu, 3, __riscv_vfsqrt, 1)(__VA_ARGS__)
|
||||
#define vfrsqrt7(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfrsqrt7_tumu, 3, __riscv_vfrsqrt7, 1)(__VA_ARGS__)
|
||||
#define vfrec7(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfrec7_tumu, 3, __riscv_vfrec7, 1)(__VA_ARGS__)
|
||||
#define vfmin(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmin_tumu, 4, __riscv_vfmin, 2, 1)(__VA_ARGS__)
|
||||
#define vfmax(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmax_tumu, 4, __riscv_vfmax, 2, 1)(__VA_ARGS__)
|
||||
#define vfsgnj(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfsgnj_tumu, 4, __riscv_vfsgnj, 2, 1)(__VA_ARGS__)
|
||||
#define vfsgnjn(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfsgnjn_tumu, 4, __riscv_vfsgnjn, 2, 1)(__VA_ARGS__)
|
||||
#define vfsgnjx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfsgnjx_tumu, 4, __riscv_vfsgnjx, 2, 1)(__VA_ARGS__)
|
||||
#define vfabs(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfabs_tumu, 3, __riscv_vfabs, 1)(__VA_ARGS__)
|
||||
#define vmfeq(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmfeq_mu, 4, __riscv_vmfeq, 2, 1)(__VA_ARGS__)
|
||||
#define vmfne(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmfne_mu, 4, __riscv_vmfne, 2, 1)(__VA_ARGS__)
|
||||
#define vmflt(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmflt_mu, 4, __riscv_vmflt, 2, 1)(__VA_ARGS__)
|
||||
#define vmfle(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmfle_mu, 4, __riscv_vmfle, 2, 1)(__VA_ARGS__)
|
||||
#define vmfgt(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmfgt_mu, 4, __riscv_vmfgt, 2, 1)(__VA_ARGS__)
|
||||
#define vmfge(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmfge_mu, 4, __riscv_vmfge, 2, 1)(__VA_ARGS__)
|
||||
#define vfclass(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfclass_tumu, 3, __riscv_vfclass, 1)(__VA_ARGS__)
|
||||
#define vfcvt_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfcvt_x_tumu, 3, __riscv_vfcvt_x, 1)(__VA_ARGS__)
|
||||
#define vfcvt_rtz_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfcvt_rtz_x_tumu, 3, __riscv_vfcvt_rtz_x, 1)(__VA_ARGS__)
|
||||
#define vfcvt_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfcvt_xu_tumu, 3, __riscv_vfcvt_xu, 1)(__VA_ARGS__)
|
||||
#define vfcvt_rtz_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfcvt_rtz_xu_tumu, 3, __riscv_vfcvt_rtz_xu, 1)(__VA_ARGS__)
|
||||
#define vfcvt_f(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfcvt_f_tumu, 3, __riscv_vfcvt_f, 1)(__VA_ARGS__)
|
||||
#define vwcvt_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vwcvt_x_tumu, 3, __riscv_vwcvt_x, 1)(__VA_ARGS__)
|
||||
#define vwcvtu_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vwcvtu_x_tumu, 3, __riscv_vwcvtu_x, 1)(__VA_ARGS__)
|
||||
#define vfwcvt_f(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfwcvt_f_tumu, 3, __riscv_vfwcvt_f, 1)(__VA_ARGS__)
|
||||
#define vfwcvt_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfwcvt_x_tumu, 3, __riscv_vfwcvt_x, 1)(__VA_ARGS__)
|
||||
#define vfwcvt_rtz_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfwcvt_rtz_x_tumu, 3, __riscv_vfwcvt_rtz_x, 1)(__VA_ARGS__)
|
||||
#define vfwcvt_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfwcvt_xu_tumu, 3, __riscv_vfwcvt_xu, 1)(__VA_ARGS__)
|
||||
#define vfwcvt_rtz_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfwcvt_rtz_xu_tumu, 3, __riscv_vfwcvt_rtz_xu, 1)(__VA_ARGS__)
|
||||
#define vfncvt_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_x_tumu, 3, __riscv_vfncvt_x, 1)(__VA_ARGS__)
|
||||
#define vfncvt_rtz_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_rtz_x_tumu, 3, __riscv_vfncvt_rtz_x, 1)(__VA_ARGS__)
|
||||
#define vncvt_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vncvt_x_tumu, 3, __riscv_vncvt_x, 1)(__VA_ARGS__)
|
||||
#define vfncvt_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_xu_tumu, 3, __riscv_vfncvt_xu, 1)(__VA_ARGS__)
|
||||
#define vfncvt_rtz_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_rtz_xu_tumu, 3, __riscv_vfncvt_rtz_xu, 1)(__VA_ARGS__)
|
||||
#define vfncvt_f(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_f_tumu, 3, __riscv_vfncvt_f, 1)(__VA_ARGS__)
|
||||
#define vfncvt_rod_f(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_rod_f_tumu, 3, __riscv_vfncvt_rod_f, 1)(__VA_ARGS__)
|
||||
#define vredsum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredsum_tum, __riscv_vredsum_tu, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vredmax(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredmax_tum, __riscv_vredmax_tu, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vredmin(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredmin_tum, __riscv_vredmin_tu, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vredand(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredand_tum, __riscv_vredand_tu, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vredor(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredor_tum, __riscv_vredor_tu, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vredxor(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredxor_tum, __riscv_vredxor_tu, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vredmaxu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredmaxu_tum, __riscv_vredmaxu_tu, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vredminu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredminu_tum, __riscv_vredminu_tu, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vwredsum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwredsum_tum, __riscv_vwredsum_tu, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vwredsumu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwredsumu_tum, __riscv_vwredsumu_tu, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vfredosum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfredosum_tum, __riscv_vfredosum_tu, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vfredusum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfredusum_tum, __riscv_vfredusum_tu, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vfredmax(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfredmax_tum, __riscv_vfredmax_tu, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vfredmin(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfredmin_tum, __riscv_vfredmin_tu, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vfwredosum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwredosum_tum, __riscv_vfwredosum_tu, 3, 2, 1)(__VA_ARGS__)
|
||||
#define vfwredusum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwredusum_tum, __riscv_vfwredusum_tu, 3, 2, 1)(__VA_ARGS__)
|
||||
// Mask-register operations, scalar<->vector moves, and permutation ops.
// Mask logical ops have no masked variant, so they forward directly.
// (Stray '|' / '||||' residue lines removed; defines themselves unchanged.)
#define vsm(...) __riscv_vsm(__VA_ARGS__)
#define vmand(...) __riscv_vmand(__VA_ARGS__)
#define vmnand(...) __riscv_vmnand(__VA_ARGS__)
#define vmandn(...) __riscv_vmandn(__VA_ARGS__)
#define vmxor(...) __riscv_vmxor(__VA_ARGS__)
#define vmor(...) __riscv_vmor(__VA_ARGS__)
#define vmnor(...) __riscv_vmnor(__VA_ARGS__)
#define vmorn(...) __riscv_vmorn(__VA_ARGS__)
#define vmxnor(...) __riscv_vmxnor(__VA_ARGS__)
#define vmmv(...) __riscv_vmmv(__VA_ARGS__)
#define vmnot(...) __riscv_vmnot(__VA_ARGS__)

// vcpop/vfirst: same intrinsic for both arities (mask operand optional).
#define vcpop(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, __riscv_vcpop, __riscv_vcpop, 1)(__VA_ARGS__)
#define vfirst(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, __riscv_vfirst, __riscv_vfirst, 1)(__VA_ARGS__)

// set-before/including/only-first mask ops: masked form uses the _mu policy.
#define vmsbf(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vmsbf_mu, 3, __riscv_vmsbf, 1)(__VA_ARGS__)
#define vmsif(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vmsif_mu, 3, __riscv_vmsif, 1)(__VA_ARGS__)
#define vmsof(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vmsof_mu, 3, __riscv_vmsof, 1)(__VA_ARGS__)

// scalar moves: the scalar->vector direction uses _tu to keep the tail elements.
#define vfmv_f(...) __riscv_vfmv_f(__VA_ARGS__)
#define vfmv_s(...) __riscv_vfmv_s_tu(__VA_ARGS__)
#define vmv_x(...) __riscv_vmv_x(__VA_ARGS__)
#define vmv_s(...) __riscv_vmv_s_tu(__VA_ARGS__)

// slides and gathers.
#define vslideup(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vslideup_tumu, __riscv_vslideup_tu, 3, 2, 1)(__VA_ARGS__)
#define vslidedown(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vslidedown_tumu, __riscv_vslidedown_tu, 3, 2, 1)(__VA_ARGS__)
#define vfslide1up(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfslide1up_tumu, 4, __riscv_vfslide1up, 2, 1)(__VA_ARGS__)
#define vfslide1down(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfslide1down_tumu, 4, __riscv_vfslide1down, 2, 1)(__VA_ARGS__)
#define vslide1up(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vslide1up_tumu, 4, __riscv_vslide1up, 2, 1)(__VA_ARGS__)
#define vslide1down(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vslide1down_tumu, 4, __riscv_vslide1down, 2, 1)(__VA_ARGS__)
#define vrgather(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vrgather_tumu, 4, __riscv_vrgather, 2, 1)(__VA_ARGS__)
#define vrgatherei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vrgatherei16_tumu, 4, __riscv_vrgatherei16, 2, 1)(__VA_ARGS__)
// vreinterpret_*: bit-preserving type reinterpretation between element types of
// the same total register width; suffix encodes destination element type and
// LMUL (mf8..m8). Direct forwards — no masked variants exist.
// (Stray '|' / '||||' residue lines removed; defines themselves unchanged.)
#define vreinterpret_u8mf8(...) __riscv_vreinterpret_u8mf8(__VA_ARGS__)
#define vreinterpret_u8mf4(...) __riscv_vreinterpret_u8mf4(__VA_ARGS__)
#define vreinterpret_u8mf2(...) __riscv_vreinterpret_u8mf2(__VA_ARGS__)
#define vreinterpret_u8m1(...) __riscv_vreinterpret_u8m1(__VA_ARGS__)
#define vreinterpret_u8m2(...) __riscv_vreinterpret_u8m2(__VA_ARGS__)
#define vreinterpret_u8m4(...) __riscv_vreinterpret_u8m4(__VA_ARGS__)
#define vreinterpret_u8m8(...) __riscv_vreinterpret_u8m8(__VA_ARGS__)
#define vreinterpret_i8mf8(...) __riscv_vreinterpret_i8mf8(__VA_ARGS__)
#define vreinterpret_i8mf4(...) __riscv_vreinterpret_i8mf4(__VA_ARGS__)
#define vreinterpret_i8mf2(...) __riscv_vreinterpret_i8mf2(__VA_ARGS__)
#define vreinterpret_i8m1(...) __riscv_vreinterpret_i8m1(__VA_ARGS__)
#define vreinterpret_i8m2(...) __riscv_vreinterpret_i8m2(__VA_ARGS__)
#define vreinterpret_i8m4(...) __riscv_vreinterpret_i8m4(__VA_ARGS__)
#define vreinterpret_i8m8(...) __riscv_vreinterpret_i8m8(__VA_ARGS__)
#define vreinterpret_f16mf4(...) __riscv_vreinterpret_f16mf4(__VA_ARGS__)
#define vreinterpret_f16mf2(...) __riscv_vreinterpret_f16mf2(__VA_ARGS__)
#define vreinterpret_f16m1(...) __riscv_vreinterpret_f16m1(__VA_ARGS__)
#define vreinterpret_f16m2(...) __riscv_vreinterpret_f16m2(__VA_ARGS__)
#define vreinterpret_f16m4(...) __riscv_vreinterpret_f16m4(__VA_ARGS__)
#define vreinterpret_f16m8(...) __riscv_vreinterpret_f16m8(__VA_ARGS__)
#define vreinterpret_u16mf4(...) __riscv_vreinterpret_u16mf4(__VA_ARGS__)
#define vreinterpret_u16mf2(...) __riscv_vreinterpret_u16mf2(__VA_ARGS__)
#define vreinterpret_u16m1(...) __riscv_vreinterpret_u16m1(__VA_ARGS__)
#define vreinterpret_u16m2(...) __riscv_vreinterpret_u16m2(__VA_ARGS__)
#define vreinterpret_u16m4(...) __riscv_vreinterpret_u16m4(__VA_ARGS__)
#define vreinterpret_u16m8(...) __riscv_vreinterpret_u16m8(__VA_ARGS__)
#define vreinterpret_i16mf4(...) __riscv_vreinterpret_i16mf4(__VA_ARGS__)
#define vreinterpret_i16mf2(...) __riscv_vreinterpret_i16mf2(__VA_ARGS__)
#define vreinterpret_i16m1(...) __riscv_vreinterpret_i16m1(__VA_ARGS__)
#define vreinterpret_i16m2(...) __riscv_vreinterpret_i16m2(__VA_ARGS__)
#define vreinterpret_i16m4(...) __riscv_vreinterpret_i16m4(__VA_ARGS__)
#define vreinterpret_i16m8(...) __riscv_vreinterpret_i16m8(__VA_ARGS__)
#define vreinterpret_f32mf2(...) __riscv_vreinterpret_f32mf2(__VA_ARGS__)
#define vreinterpret_f32m1(...) __riscv_vreinterpret_f32m1(__VA_ARGS__)
#define vreinterpret_f32m2(...) __riscv_vreinterpret_f32m2(__VA_ARGS__)
#define vreinterpret_f32m4(...) __riscv_vreinterpret_f32m4(__VA_ARGS__)
#define vreinterpret_f32m8(...) __riscv_vreinterpret_f32m8(__VA_ARGS__)
#define vreinterpret_u32mf2(...) __riscv_vreinterpret_u32mf2(__VA_ARGS__)
#define vreinterpret_u32m1(...) __riscv_vreinterpret_u32m1(__VA_ARGS__)
#define vreinterpret_u32m2(...) __riscv_vreinterpret_u32m2(__VA_ARGS__)
#define vreinterpret_u32m4(...) __riscv_vreinterpret_u32m4(__VA_ARGS__)
#define vreinterpret_u32m8(...) __riscv_vreinterpret_u32m8(__VA_ARGS__)
#define vreinterpret_i32mf2(...) __riscv_vreinterpret_i32mf2(__VA_ARGS__)
#define vreinterpret_i32m1(...) __riscv_vreinterpret_i32m1(__VA_ARGS__)
#define vreinterpret_i32m2(...) __riscv_vreinterpret_i32m2(__VA_ARGS__)
#define vreinterpret_i32m4(...) __riscv_vreinterpret_i32m4(__VA_ARGS__)
#define vreinterpret_i32m8(...) __riscv_vreinterpret_i32m8(__VA_ARGS__)
#define vreinterpret_f64m1(...) __riscv_vreinterpret_f64m1(__VA_ARGS__)
#define vreinterpret_f64m2(...) __riscv_vreinterpret_f64m2(__VA_ARGS__)
#define vreinterpret_f64m4(...) __riscv_vreinterpret_f64m4(__VA_ARGS__)
#define vreinterpret_f64m8(...) __riscv_vreinterpret_f64m8(__VA_ARGS__)
#define vreinterpret_u64m1(...) __riscv_vreinterpret_u64m1(__VA_ARGS__)
#define vreinterpret_u64m2(...) __riscv_vreinterpret_u64m2(__VA_ARGS__)
#define vreinterpret_u64m4(...) __riscv_vreinterpret_u64m4(__VA_ARGS__)
#define vreinterpret_u64m8(...) __riscv_vreinterpret_u64m8(__VA_ARGS__)
#define vreinterpret_i64m1(...) __riscv_vreinterpret_i64m1(__VA_ARGS__)
#define vreinterpret_i64m2(...) __riscv_vreinterpret_i64m2(__VA_ARGS__)
#define vreinterpret_i64m4(...) __riscv_vreinterpret_i64m4(__VA_ARGS__)
#define vreinterpret_i64m8(...) __riscv_vreinterpret_i64m8(__VA_ARGS__)
// vlmul_ext_*: widen a vector's register-group size (LMUL) to the LMUL named in
// the suffix; element values are preserved, added registers are undefined.
// Direct forwards — no masked variants exist.
// (Stray '|' / '||||' residue lines removed; defines themselves unchanged.)
#define vlmul_ext_f16mf2(...) __riscv_vlmul_ext_f16mf2(__VA_ARGS__)
#define vlmul_ext_f16m1(...) __riscv_vlmul_ext_f16m1(__VA_ARGS__)
#define vlmul_ext_f16m2(...) __riscv_vlmul_ext_f16m2(__VA_ARGS__)
#define vlmul_ext_f16m4(...) __riscv_vlmul_ext_f16m4(__VA_ARGS__)
#define vlmul_ext_f16m8(...) __riscv_vlmul_ext_f16m8(__VA_ARGS__)
#define vlmul_ext_f32m1(...) __riscv_vlmul_ext_f32m1(__VA_ARGS__)
#define vlmul_ext_f32m2(...) __riscv_vlmul_ext_f32m2(__VA_ARGS__)
#define vlmul_ext_f32m4(...) __riscv_vlmul_ext_f32m4(__VA_ARGS__)
#define vlmul_ext_f32m8(...) __riscv_vlmul_ext_f32m8(__VA_ARGS__)
#define vlmul_ext_f64m2(...) __riscv_vlmul_ext_f64m2(__VA_ARGS__)
#define vlmul_ext_f64m4(...) __riscv_vlmul_ext_f64m4(__VA_ARGS__)
#define vlmul_ext_f64m8(...) __riscv_vlmul_ext_f64m8(__VA_ARGS__)
#define vlmul_ext_i8mf4(...) __riscv_vlmul_ext_i8mf4(__VA_ARGS__)
#define vlmul_ext_i8mf2(...) __riscv_vlmul_ext_i8mf2(__VA_ARGS__)
#define vlmul_ext_i8m1(...) __riscv_vlmul_ext_i8m1(__VA_ARGS__)
#define vlmul_ext_i8m2(...) __riscv_vlmul_ext_i8m2(__VA_ARGS__)
#define vlmul_ext_i8m4(...) __riscv_vlmul_ext_i8m4(__VA_ARGS__)
#define vlmul_ext_i8m8(...) __riscv_vlmul_ext_i8m8(__VA_ARGS__)
#define vlmul_ext_i16mf2(...) __riscv_vlmul_ext_i16mf2(__VA_ARGS__)
#define vlmul_ext_i16m1(...) __riscv_vlmul_ext_i16m1(__VA_ARGS__)
#define vlmul_ext_i16m2(...) __riscv_vlmul_ext_i16m2(__VA_ARGS__)
#define vlmul_ext_i16m4(...) __riscv_vlmul_ext_i16m4(__VA_ARGS__)
#define vlmul_ext_i16m8(...) __riscv_vlmul_ext_i16m8(__VA_ARGS__)
#define vlmul_ext_i32m1(...) __riscv_vlmul_ext_i32m1(__VA_ARGS__)
#define vlmul_ext_i32m2(...) __riscv_vlmul_ext_i32m2(__VA_ARGS__)
#define vlmul_ext_i32m4(...) __riscv_vlmul_ext_i32m4(__VA_ARGS__)
#define vlmul_ext_i32m8(...) __riscv_vlmul_ext_i32m8(__VA_ARGS__)
#define vlmul_ext_i64m2(...) __riscv_vlmul_ext_i64m2(__VA_ARGS__)
#define vlmul_ext_i64m4(...) __riscv_vlmul_ext_i64m4(__VA_ARGS__)
#define vlmul_ext_i64m8(...) __riscv_vlmul_ext_i64m8(__VA_ARGS__)
#define vlmul_ext_u8mf4(...) __riscv_vlmul_ext_u8mf4(__VA_ARGS__)
#define vlmul_ext_u8mf2(...) __riscv_vlmul_ext_u8mf2(__VA_ARGS__)
#define vlmul_ext_u8m1(...) __riscv_vlmul_ext_u8m1(__VA_ARGS__)
#define vlmul_ext_u8m2(...) __riscv_vlmul_ext_u8m2(__VA_ARGS__)
#define vlmul_ext_u8m4(...) __riscv_vlmul_ext_u8m4(__VA_ARGS__)
#define vlmul_ext_u8m8(...) __riscv_vlmul_ext_u8m8(__VA_ARGS__)
#define vlmul_ext_u16mf2(...) __riscv_vlmul_ext_u16mf2(__VA_ARGS__)
#define vlmul_ext_u16m1(...) __riscv_vlmul_ext_u16m1(__VA_ARGS__)
#define vlmul_ext_u16m2(...) __riscv_vlmul_ext_u16m2(__VA_ARGS__)
#define vlmul_ext_u16m4(...) __riscv_vlmul_ext_u16m4(__VA_ARGS__)
#define vlmul_ext_u16m8(...) __riscv_vlmul_ext_u16m8(__VA_ARGS__)
#define vlmul_ext_u32m1(...) __riscv_vlmul_ext_u32m1(__VA_ARGS__)
#define vlmul_ext_u32m2(...) __riscv_vlmul_ext_u32m2(__VA_ARGS__)
#define vlmul_ext_u32m4(...) __riscv_vlmul_ext_u32m4(__VA_ARGS__)
#define vlmul_ext_u32m8(...) __riscv_vlmul_ext_u32m8(__VA_ARGS__)
#define vlmul_ext_u64m2(...) __riscv_vlmul_ext_u64m2(__VA_ARGS__)
#define vlmul_ext_u64m4(...) __riscv_vlmul_ext_u64m4(__VA_ARGS__)
#define vlmul_ext_u64m8(...) __riscv_vlmul_ext_u64m8(__VA_ARGS__)
// vlmul_trunc_*: narrow a vector's register-group size (LMUL) to the LMUL named
// in the suffix, keeping the lowest-numbered part. Direct forwards.
// (Stray '|' / '||||' residue lines removed; defines themselves unchanged.)
#define vlmul_trunc_f16mf4(...) __riscv_vlmul_trunc_f16mf4(__VA_ARGS__)
#define vlmul_trunc_f16mf2(...) __riscv_vlmul_trunc_f16mf2(__VA_ARGS__)
#define vlmul_trunc_f16m1(...) __riscv_vlmul_trunc_f16m1(__VA_ARGS__)
#define vlmul_trunc_f16m2(...) __riscv_vlmul_trunc_f16m2(__VA_ARGS__)
#define vlmul_trunc_f16m4(...) __riscv_vlmul_trunc_f16m4(__VA_ARGS__)
#define vlmul_trunc_f32mf2(...) __riscv_vlmul_trunc_f32mf2(__VA_ARGS__)
#define vlmul_trunc_f32m1(...) __riscv_vlmul_trunc_f32m1(__VA_ARGS__)
#define vlmul_trunc_f32m2(...) __riscv_vlmul_trunc_f32m2(__VA_ARGS__)
#define vlmul_trunc_f32m4(...) __riscv_vlmul_trunc_f32m4(__VA_ARGS__)
#define vlmul_trunc_f64m1(...) __riscv_vlmul_trunc_f64m1(__VA_ARGS__)
#define vlmul_trunc_f64m2(...) __riscv_vlmul_trunc_f64m2(__VA_ARGS__)
#define vlmul_trunc_f64m4(...) __riscv_vlmul_trunc_f64m4(__VA_ARGS__)
#define vlmul_trunc_i8mf8(...) __riscv_vlmul_trunc_i8mf8(__VA_ARGS__)
#define vlmul_trunc_i8mf4(...) __riscv_vlmul_trunc_i8mf4(__VA_ARGS__)
#define vlmul_trunc_i8mf2(...) __riscv_vlmul_trunc_i8mf2(__VA_ARGS__)
#define vlmul_trunc_i8m1(...) __riscv_vlmul_trunc_i8m1(__VA_ARGS__)
#define vlmul_trunc_i8m2(...) __riscv_vlmul_trunc_i8m2(__VA_ARGS__)
#define vlmul_trunc_i8m4(...) __riscv_vlmul_trunc_i8m4(__VA_ARGS__)
#define vlmul_trunc_i16mf4(...) __riscv_vlmul_trunc_i16mf4(__VA_ARGS__)
#define vlmul_trunc_i16mf2(...) __riscv_vlmul_trunc_i16mf2(__VA_ARGS__)
#define vlmul_trunc_i16m1(...) __riscv_vlmul_trunc_i16m1(__VA_ARGS__)
#define vlmul_trunc_i16m2(...) __riscv_vlmul_trunc_i16m2(__VA_ARGS__)
#define vlmul_trunc_i16m4(...) __riscv_vlmul_trunc_i16m4(__VA_ARGS__)
#define vlmul_trunc_i32mf2(...) __riscv_vlmul_trunc_i32mf2(__VA_ARGS__)
#define vlmul_trunc_i32m1(...) __riscv_vlmul_trunc_i32m1(__VA_ARGS__)
#define vlmul_trunc_i32m2(...) __riscv_vlmul_trunc_i32m2(__VA_ARGS__)
#define vlmul_trunc_i32m4(...) __riscv_vlmul_trunc_i32m4(__VA_ARGS__)
#define vlmul_trunc_i64m1(...) __riscv_vlmul_trunc_i64m1(__VA_ARGS__)
#define vlmul_trunc_i64m2(...) __riscv_vlmul_trunc_i64m2(__VA_ARGS__)
#define vlmul_trunc_i64m4(...) __riscv_vlmul_trunc_i64m4(__VA_ARGS__)
#define vlmul_trunc_u8mf8(...) __riscv_vlmul_trunc_u8mf8(__VA_ARGS__)
#define vlmul_trunc_u8mf4(...) __riscv_vlmul_trunc_u8mf4(__VA_ARGS__)
#define vlmul_trunc_u8mf2(...) __riscv_vlmul_trunc_u8mf2(__VA_ARGS__)
#define vlmul_trunc_u8m1(...) __riscv_vlmul_trunc_u8m1(__VA_ARGS__)
#define vlmul_trunc_u8m2(...) __riscv_vlmul_trunc_u8m2(__VA_ARGS__)
#define vlmul_trunc_u8m4(...) __riscv_vlmul_trunc_u8m4(__VA_ARGS__)
#define vlmul_trunc_u16mf4(...) __riscv_vlmul_trunc_u16mf4(__VA_ARGS__)
#define vlmul_trunc_u16mf2(...) __riscv_vlmul_trunc_u16mf2(__VA_ARGS__)
#define vlmul_trunc_u16m1(...) __riscv_vlmul_trunc_u16m1(__VA_ARGS__)
#define vlmul_trunc_u16m2(...) __riscv_vlmul_trunc_u16m2(__VA_ARGS__)
#define vlmul_trunc_u16m4(...) __riscv_vlmul_trunc_u16m4(__VA_ARGS__)
#define vlmul_trunc_u32mf2(...) __riscv_vlmul_trunc_u32mf2(__VA_ARGS__)
#define vlmul_trunc_u32m1(...) __riscv_vlmul_trunc_u32m1(__VA_ARGS__)
#define vlmul_trunc_u32m2(...) __riscv_vlmul_trunc_u32m2(__VA_ARGS__)
#define vlmul_trunc_u32m4(...) __riscv_vlmul_trunc_u32m4(__VA_ARGS__)
#define vlmul_trunc_u64m1(...) __riscv_vlmul_trunc_u64m1(__VA_ARGS__)
#define vlmul_trunc_u64m2(...) __riscv_vlmul_trunc_u64m2(__VA_ARGS__)
#define vlmul_trunc_u64m4(...) __riscv_vlmul_trunc_u64m4(__VA_ARGS__)
// vset / vget_*: insert into / extract from a larger register group; the vget
// suffix names the extracted element type and LMUL. Direct forwards.
// (Stray '|' / '||||' residue lines removed; defines themselves unchanged.)
#define vset(...) __riscv_vset(__VA_ARGS__)
#define vget_f16m1(...) __riscv_vget_f16m1(__VA_ARGS__)
#define vget_f16m2(...) __riscv_vget_f16m2(__VA_ARGS__)
#define vget_f16m4(...) __riscv_vget_f16m4(__VA_ARGS__)
#define vget_f32m1(...) __riscv_vget_f32m1(__VA_ARGS__)
#define vget_f32m2(...) __riscv_vget_f32m2(__VA_ARGS__)
#define vget_f32m4(...) __riscv_vget_f32m4(__VA_ARGS__)
#define vget_f64m1(...) __riscv_vget_f64m1(__VA_ARGS__)
#define vget_f64m2(...) __riscv_vget_f64m2(__VA_ARGS__)
#define vget_f64m4(...) __riscv_vget_f64m4(__VA_ARGS__)
#define vget_i8m1(...) __riscv_vget_i8m1(__VA_ARGS__)
#define vget_i8m2(...) __riscv_vget_i8m2(__VA_ARGS__)
#define vget_i8m4(...) __riscv_vget_i8m4(__VA_ARGS__)
#define vget_i16m1(...) __riscv_vget_i16m1(__VA_ARGS__)
#define vget_i16m2(...) __riscv_vget_i16m2(__VA_ARGS__)
#define vget_i16m4(...) __riscv_vget_i16m4(__VA_ARGS__)
#define vget_i32m1(...) __riscv_vget_i32m1(__VA_ARGS__)
#define vget_i32m2(...) __riscv_vget_i32m2(__VA_ARGS__)
#define vget_i32m4(...) __riscv_vget_i32m4(__VA_ARGS__)
#define vget_i64m1(...) __riscv_vget_i64m1(__VA_ARGS__)
#define vget_i64m2(...) __riscv_vget_i64m2(__VA_ARGS__)
#define vget_i64m4(...) __riscv_vget_i64m4(__VA_ARGS__)
#define vget_u8m1(...) __riscv_vget_u8m1(__VA_ARGS__)
#define vget_u8m2(...) __riscv_vget_u8m2(__VA_ARGS__)
#define vget_u8m4(...) __riscv_vget_u8m4(__VA_ARGS__)
#define vget_u16m1(...) __riscv_vget_u16m1(__VA_ARGS__)
#define vget_u16m2(...) __riscv_vget_u16m2(__VA_ARGS__)
#define vget_u16m4(...) __riscv_vget_u16m4(__VA_ARGS__)
#define vget_u32m1(...) __riscv_vget_u32m1(__VA_ARGS__)
#define vget_u32m2(...) __riscv_vget_u32m2(__VA_ARGS__)
#define vget_u32m4(...) __riscv_vget_u32m4(__VA_ARGS__)
#define vget_u64m1(...) __riscv_vget_u64m1(__VA_ARGS__)
#define vget_u64m2(...) __riscv_vget_u64m2(__VA_ARGS__)
#define vget_u64m4(...) __riscv_vget_u64m4(__VA_ARGS__)
// Loads. All loads forward to the _tumu (tail-undisturbed, mask-undisturbed)
// policy variant so masked-off and tail elements keep the maskedoff operand:
//   vleN    - unit-stride load of N-bit elements
//   vlseN   - strided load
//   vleNff  - fault-only-first load (trims vl at the first faulting element)
//   vlsegKeN[ff] - K-field segment (interleaved/deinterleaving) load
// (Stray '|' / '||||' residue lines removed; defines themselves unchanged.)
#define vle16(...) __riscv_vle16_tumu(__VA_ARGS__)
#define vle32(...) __riscv_vle32_tumu(__VA_ARGS__)
#define vle64(...) __riscv_vle64_tumu(__VA_ARGS__)
#define vle8(...) __riscv_vle8_tumu(__VA_ARGS__)
#define vlse16(...) __riscv_vlse16_tumu(__VA_ARGS__)
#define vlse32(...) __riscv_vlse32_tumu(__VA_ARGS__)
#define vlse64(...) __riscv_vlse64_tumu(__VA_ARGS__)
#define vlse8(...) __riscv_vlse8_tumu(__VA_ARGS__)
#define vle16ff(...) __riscv_vle16ff_tumu(__VA_ARGS__)
#define vle32ff(...) __riscv_vle32ff_tumu(__VA_ARGS__)
#define vle64ff(...) __riscv_vle64ff_tumu(__VA_ARGS__)
#define vle8ff(...) __riscv_vle8ff_tumu(__VA_ARGS__)
#define vlseg2e16(...) __riscv_vlseg2e16_tumu(__VA_ARGS__)
#define vlseg3e16(...) __riscv_vlseg3e16_tumu(__VA_ARGS__)
#define vlseg4e16(...) __riscv_vlseg4e16_tumu(__VA_ARGS__)
#define vlseg5e16(...) __riscv_vlseg5e16_tumu(__VA_ARGS__)
#define vlseg6e16(...) __riscv_vlseg6e16_tumu(__VA_ARGS__)
#define vlseg7e16(...) __riscv_vlseg7e16_tumu(__VA_ARGS__)
#define vlseg8e16(...) __riscv_vlseg8e16_tumu(__VA_ARGS__)
#define vlseg2e32(...) __riscv_vlseg2e32_tumu(__VA_ARGS__)
#define vlseg3e32(...) __riscv_vlseg3e32_tumu(__VA_ARGS__)
#define vlseg4e32(...) __riscv_vlseg4e32_tumu(__VA_ARGS__)
#define vlseg5e32(...) __riscv_vlseg5e32_tumu(__VA_ARGS__)
#define vlseg6e32(...) __riscv_vlseg6e32_tumu(__VA_ARGS__)
#define vlseg7e32(...) __riscv_vlseg7e32_tumu(__VA_ARGS__)
#define vlseg8e32(...) __riscv_vlseg8e32_tumu(__VA_ARGS__)
#define vlseg2e64(...) __riscv_vlseg2e64_tumu(__VA_ARGS__)
#define vlseg3e64(...) __riscv_vlseg3e64_tumu(__VA_ARGS__)
#define vlseg4e64(...) __riscv_vlseg4e64_tumu(__VA_ARGS__)
#define vlseg5e64(...) __riscv_vlseg5e64_tumu(__VA_ARGS__)
#define vlseg6e64(...) __riscv_vlseg6e64_tumu(__VA_ARGS__)
#define vlseg7e64(...) __riscv_vlseg7e64_tumu(__VA_ARGS__)
#define vlseg8e64(...) __riscv_vlseg8e64_tumu(__VA_ARGS__)
#define vlseg2e16ff(...) __riscv_vlseg2e16ff_tumu(__VA_ARGS__)
#define vlseg3e16ff(...) __riscv_vlseg3e16ff_tumu(__VA_ARGS__)
#define vlseg4e16ff(...) __riscv_vlseg4e16ff_tumu(__VA_ARGS__)
#define vlseg5e16ff(...) __riscv_vlseg5e16ff_tumu(__VA_ARGS__)
#define vlseg6e16ff(...) __riscv_vlseg6e16ff_tumu(__VA_ARGS__)
#define vlseg7e16ff(...) __riscv_vlseg7e16ff_tumu(__VA_ARGS__)
#define vlseg8e16ff(...) __riscv_vlseg8e16ff_tumu(__VA_ARGS__)
#define vlseg2e32ff(...) __riscv_vlseg2e32ff_tumu(__VA_ARGS__)
#define vlseg3e32ff(...) __riscv_vlseg3e32ff_tumu(__VA_ARGS__)
#define vlseg4e32ff(...) __riscv_vlseg4e32ff_tumu(__VA_ARGS__)
#define vlseg5e32ff(...) __riscv_vlseg5e32ff_tumu(__VA_ARGS__)
#define vlseg6e32ff(...) __riscv_vlseg6e32ff_tumu(__VA_ARGS__)
#define vlseg7e32ff(...) __riscv_vlseg7e32ff_tumu(__VA_ARGS__)
#define vlseg8e32ff(...) __riscv_vlseg8e32ff_tumu(__VA_ARGS__)
#define vlseg2e64ff(...) __riscv_vlseg2e64ff_tumu(__VA_ARGS__)
#define vlseg3e64ff(...) __riscv_vlseg3e64ff_tumu(__VA_ARGS__)
#define vlseg4e64ff(...) __riscv_vlseg4e64ff_tumu(__VA_ARGS__)
#define vlseg5e64ff(...) __riscv_vlseg5e64ff_tumu(__VA_ARGS__)
#define vlseg6e64ff(...) __riscv_vlseg6e64ff_tumu(__VA_ARGS__)
#define vlseg7e64ff(...) __riscv_vlseg7e64ff_tumu(__VA_ARGS__)
#define vlseg8e64ff(...) __riscv_vlseg8e64ff_tumu(__VA_ARGS__)
#define vlseg2e8(...) __riscv_vlseg2e8_tumu(__VA_ARGS__)
#define vlseg3e8(...) __riscv_vlseg3e8_tumu(__VA_ARGS__)
#define vlseg4e8(...) __riscv_vlseg4e8_tumu(__VA_ARGS__)
#define vlseg5e8(...) __riscv_vlseg5e8_tumu(__VA_ARGS__)
#define vlseg6e8(...) __riscv_vlseg6e8_tumu(__VA_ARGS__)
#define vlseg7e8(...) __riscv_vlseg7e8_tumu(__VA_ARGS__)
#define vlseg8e8(...) __riscv_vlseg8e8_tumu(__VA_ARGS__)
#define vlseg2e8ff(...) __riscv_vlseg2e8ff_tumu(__VA_ARGS__)
#define vlseg3e8ff(...) __riscv_vlseg3e8ff_tumu(__VA_ARGS__)
#define vlseg4e8ff(...) __riscv_vlseg4e8ff_tumu(__VA_ARGS__)
#define vlseg5e8ff(...) __riscv_vlseg5e8ff_tumu(__VA_ARGS__)
#define vlseg6e8ff(...) __riscv_vlseg6e8ff_tumu(__VA_ARGS__)
#define vlseg7e8ff(...) __riscv_vlseg7e8ff_tumu(__VA_ARGS__)
#define vlseg8e8ff(...) __riscv_vlseg8e8ff_tumu(__VA_ARGS__)
#define vlsseg2e16(...) __riscv_vlsseg2e16_tumu(__VA_ARGS__)
|
||||
#define vlsseg3e16(...) __riscv_vlsseg3e16_tumu(__VA_ARGS__)
|
||||
#define vlsseg4e16(...) __riscv_vlsseg4e16_tumu(__VA_ARGS__)
|
||||
#define vlsseg5e16(...) __riscv_vlsseg5e16_tumu(__VA_ARGS__)
|
||||
#define vlsseg6e16(...) __riscv_vlsseg6e16_tumu(__VA_ARGS__)
|
||||
#define vlsseg7e16(...) __riscv_vlsseg7e16_tumu(__VA_ARGS__)
|
||||
#define vlsseg8e16(...) __riscv_vlsseg8e16_tumu(__VA_ARGS__)
|
||||
#define vlsseg2e32(...) __riscv_vlsseg2e32_tumu(__VA_ARGS__)
|
||||
#define vlsseg3e32(...) __riscv_vlsseg3e32_tumu(__VA_ARGS__)
|
||||
#define vlsseg4e32(...) __riscv_vlsseg4e32_tumu(__VA_ARGS__)
|
||||
#define vlsseg5e32(...) __riscv_vlsseg5e32_tumu(__VA_ARGS__)
|
||||
#define vlsseg6e32(...) __riscv_vlsseg6e32_tumu(__VA_ARGS__)
|
||||
#define vlsseg7e32(...) __riscv_vlsseg7e32_tumu(__VA_ARGS__)
|
||||
#define vlsseg8e32(...) __riscv_vlsseg8e32_tumu(__VA_ARGS__)
|
||||
#define vlsseg2e64(...) __riscv_vlsseg2e64_tumu(__VA_ARGS__)
|
||||
#define vlsseg3e64(...) __riscv_vlsseg3e64_tumu(__VA_ARGS__)
|
||||
#define vlsseg4e64(...) __riscv_vlsseg4e64_tumu(__VA_ARGS__)
|
||||
#define vlsseg5e64(...) __riscv_vlsseg5e64_tumu(__VA_ARGS__)
|
||||
#define vlsseg6e64(...) __riscv_vlsseg6e64_tumu(__VA_ARGS__)
|
||||
#define vlsseg7e64(...) __riscv_vlsseg7e64_tumu(__VA_ARGS__)
|
||||
#define vlsseg8e64(...) __riscv_vlsseg8e64_tumu(__VA_ARGS__)
|
||||
#define vlsseg2e8(...) __riscv_vlsseg2e8_tumu(__VA_ARGS__)
|
||||
#define vlsseg3e8(...) __riscv_vlsseg3e8_tumu(__VA_ARGS__)
|
||||
#define vlsseg4e8(...) __riscv_vlsseg4e8_tumu(__VA_ARGS__)
|
||||
#define vlsseg5e8(...) __riscv_vlsseg5e8_tumu(__VA_ARGS__)
|
||||
#define vlsseg6e8(...) __riscv_vlsseg6e8_tumu(__VA_ARGS__)
|
||||
#define vlsseg7e8(...) __riscv_vlsseg7e8_tumu(__VA_ARGS__)
|
||||
#define vlsseg8e8(...) __riscv_vlsseg8e8_tumu(__VA_ARGS__)
|
||||
#define viota(...) __riscv_viota_tumu(__VA_ARGS__)
|
||||
#define vid(...) __riscv_vid_tumu(__VA_ARGS__)
|
||||
#endif
|
||||
@@ -0,0 +1,33 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
// 0.11 -> 0.12 compatibility
|
||||
|
||||
#ifndef _RVV_IMPLICIT_VXRM
|
||||
#define _RVV_IMPLICIT_VXRM __RISCV_VXRM_RNU
|
||||
#endif
|
||||
|
||||
// NOTE: masked should go first to avoid extra substitution (3 arg -> 4 arg -> 5 arg)
|
||||
|
||||
// masked
|
||||
#define __riscv_vaadd(_1, _2, _3, _4) __riscv_vaadd(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
|
||||
#define __riscv_vasub(_1, _2, _3, _4) __riscv_vasub(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
|
||||
#define __riscv_vaaddu(_1, _2, _3, _4) __riscv_vaaddu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
|
||||
#define __riscv_vasubu(_1, _2, _3, _4) __riscv_vasubu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
|
||||
#define __riscv_vsmul(_1, _2, _3, _4) __riscv_vsmul(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
|
||||
#define __riscv_vssra(_1, _2, _3, _4) __riscv_vssra(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
|
||||
#define __riscv_vssrl(_1, _2, _3, _4) __riscv_vssrl(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
|
||||
#define __riscv_vnclip(_1, _2, _3, _4) __riscv_vnclip(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
|
||||
#define __riscv_vnclipu(_1, _2, _3, _4) __riscv_vnclipu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
|
||||
|
||||
// unmasked
|
||||
#define __riscv_vaadd(_1, _2, _3) __riscv_vaadd(_1, _2, _RVV_IMPLICIT_VXRM, _3)
|
||||
#define __riscv_vasub(_1, _2, _3) __riscv_vasub(_1, _2, _RVV_IMPLICIT_VXRM, _3)
|
||||
#define __riscv_vaaddu(_1, _2, _3) __riscv_vaaddu(_1, _2, _RVV_IMPLICIT_VXRM, _3)
|
||||
#define __riscv_vasubu(_1, _2, _3) __riscv_vasubu(_1, _2, _RVV_IMPLICIT_VXRM, _3)
|
||||
#define __riscv_vsmul(_1, _2, _3) __riscv_vsmul(_1, _2, _RVV_IMPLICIT_VXRM, _3)
|
||||
#define __riscv_vssra(_1, _2, _3) __riscv_vssra(_1, _2, _RVV_IMPLICIT_VXRM, _3)
|
||||
#define __riscv_vssrl(_1, _2, _3) __riscv_vssrl(_1, _2, _RVV_IMPLICIT_VXRM, _3)
|
||||
#define __riscv_vnclip(_1, _2, _3) __riscv_vnclip(_1, _2, _RVV_IMPLICIT_VXRM, _3)
|
||||
#define __riscv_vnclipu(_1, _2, _3) __riscv_vnclipu(_1, _2, _RVV_IMPLICIT_VXRM, _3)
|
||||
@@ -0,0 +1,213 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_HAL_INTRIN_RVV_COMPAT_OVERLOAD_HPP
|
||||
#define OPENCV_HAL_INTRIN_RVV_COMPAT_OVERLOAD_HPP
|
||||
|
||||
// This file requires VTraits to be defined for vector types
|
||||
|
||||
#define OPENCV_HAL_IMPL_RVV_FUN_AND(REG, SUF) \
|
||||
inline static REG vand(const REG & op1, const REG & op2, size_t vl) \
|
||||
{ \
|
||||
return vand_vv_##SUF(op1, op2, vl); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_RVV_FUN_AND(vint8m1_t, i8m1)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_AND(vuint8m1_t, u8m1)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_AND(vint16m1_t, i16m1)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_AND(vuint16m1_t, u16m1)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_AND(vint32m1_t, i32m1)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_AND(vuint32m1_t, u32m1)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_AND(vint64m1_t, i64m1)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_AND(vuint64m1_t, u64m1)
|
||||
|
||||
#define OPENCV_HAL_IMPL_RVV_FUN_LOXEI(REG, SUF, INDX, ISUF) \
|
||||
inline static REG vloxe##ISUF(const VTraits<REG>::lane_type *base, INDX bindex, size_t vl) \
|
||||
{ \
|
||||
return vloxe##ISUF##_v_##SUF(base, bindex, vl); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m1_t, i8m1, vuint8m1_t, i8)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m2_t, i8m2, vuint8m2_t, i8)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m4_t, i8m4, vuint8m4_t, i8)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m8_t, i8m8, vuint8m8_t, i8)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m1_t, i8m1, vuint32m4_t, i32)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m2_t, i8m2, vuint32m8_t, i32)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint16m1_t, i16m1, vuint32m2_t, i32)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint32m1_t, i32m1, vuint32m1_t, i32)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint32m2_t, i32m2, vuint32m2_t, i32)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint32m4_t, i32m4, vuint32m4_t, i32)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint32m8_t, i32m8, vuint32m8_t, i32)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint64m1_t, i64m1, vuint32mf2_t, i32)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m1_t, u8m1, vuint8m1_t, i8)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m2_t, u8m2, vuint8m2_t, i8)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m4_t, u8m4, vuint8m4_t, i8)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m8_t, u8m8, vuint8m8_t, i8)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vfloat32m1_t, f32m1, vuint32m1_t, i32)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint32m1_t, u32m1, vuint32m1_t, i32)
|
||||
#if CV_SIMD_SCALABLE_64F
|
||||
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vfloat64m1_t, f64m1, vuint32mf2_t, i32)
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_IMPL_RVV_FUN_MUL(REG, SUF) \
|
||||
inline static REG##m1_t vmul(const REG##m1_t & op1, const REG##m1_t & op2, size_t vl) \
|
||||
{ \
|
||||
return vmul_vv_##SUF##m1(op1, op2, vl); \
|
||||
} \
|
||||
inline static REG##m1_t vmul(const REG##m1_t & op1, VTraits<REG##m1_t>::lane_type op2, size_t vl) \
|
||||
{ \
|
||||
return vmul_vx_##SUF##m1(op1, op2, vl); \
|
||||
} \
|
||||
inline static REG##m2_t vmul(const REG##m2_t & op1, const REG##m2_t & op2, size_t vl) \
|
||||
{ \
|
||||
return vmul_vv_##SUF##m2(op1, op2, vl); \
|
||||
} \
|
||||
inline static REG##m2_t vmul(const REG##m2_t & op1, VTraits<REG##m2_t>::lane_type op2, size_t vl) \
|
||||
{ \
|
||||
return vmul_vx_##SUF##m2(op1, op2, vl); \
|
||||
} \
|
||||
inline static REG##m4_t vmul(const REG##m4_t & op1, const REG##m4_t & op2, size_t vl) \
|
||||
{ \
|
||||
return vmul_vv_##SUF##m4(op1, op2, vl); \
|
||||
} \
|
||||
inline static REG##m4_t vmul(const REG##m4_t & op1, VTraits<REG##m4_t>::lane_type op2, size_t vl) \
|
||||
{ \
|
||||
return vmul_vx_##SUF##m4(op1, op2, vl); \
|
||||
} \
|
||||
inline static REG##m8_t vmul(const REG##m8_t & op1, const REG##m8_t & op2, size_t vl) \
|
||||
{ \
|
||||
return vmul_vv_##SUF##m8(op1, op2, vl); \
|
||||
} \
|
||||
inline static REG##m8_t vmul(const REG##m8_t & op1, VTraits<REG##m8_t>::lane_type op2, size_t vl) \
|
||||
{ \
|
||||
return vmul_vx_##SUF##m8(op1, op2, vl); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_RVV_FUN_MUL(vint8, i8)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_MUL(vuint8, u8)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_MUL(vint16, i16)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_MUL(vuint16, u16)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_MUL(vint32, i32)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_MUL(vuint32, u32)
|
||||
|
||||
#define OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(REG1, SUF1, REG2, SUF2) \
|
||||
inline static REG1##m1_t vreinterpret_##SUF1##m1(const REG2##m1_t & src) \
|
||||
{\
|
||||
return vreinterpret_v_##SUF2##m1_##SUF1##m1(src); \
|
||||
} \
|
||||
inline static REG1##m2_t vreinterpret_##SUF1##m2(const REG2##m2_t & src) \
|
||||
{\
|
||||
return vreinterpret_v_##SUF2##m2_##SUF1##m2(src); \
|
||||
} \
|
||||
inline static REG1##m4_t vreinterpret_##SUF1##m4(const REG2##m4_t & src) \
|
||||
{\
|
||||
return vreinterpret_v_##SUF2##m4_##SUF1##m4(src); \
|
||||
} \
|
||||
inline static REG1##m8_t vreinterpret_##SUF1##m8(const REG2##m8_t & src) \
|
||||
{\
|
||||
return vreinterpret_v_##SUF2##m8_##SUF1##m8(src); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vint8, i8, vuint8, u8)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vint16, i16, vuint16, u16)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vint32, i32, vuint32, u32)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vfloat32, f32, vuint32, u32)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vfloat32, f32, vint32, i32)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint32, u32, vfloat32, f32)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vint32, i32, vfloat32, f32)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint8, u8, vint8, i8)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint8, u8, vuint16, u16)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint8, u8, vuint32, u32)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint8, u8, vuint64, u64)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint16, u16, vint16, i16)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint16, u16, vuint8, u8)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint16, u16, vuint32, u32)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint16, u16, vuint64, u64)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint32, u32, vint32, i32)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint32, u32, vuint8, u8)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint32, u32, vuint16, u16)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint32, u32, vuint64, u64)
|
||||
|
||||
#define OPENCV_HAL_IMPL_RVV_FUN_STORE(REG, SUF, SZ) \
|
||||
inline static void vse##SZ(VTraits<REG>::lane_type *base, REG value, size_t vl) \
|
||||
{ \
|
||||
return vse##SZ##_v_##SUF##m1(base, value, vl); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_uint8, u8, 8)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_int8, i8, 8)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_uint16, u16, 16)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_int16, i16, 16)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_uint32, u32, 32)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_int32, i32, 32)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_uint64, u64, 64)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_int64, i64, 64)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_float32, f32, 32)
|
||||
#if CV_SIMD_SCALABLE_64F
|
||||
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_float64, f64, 64)
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(REG, SUF) \
|
||||
inline static VTraits<REG>::lane_type vmv_x(const REG & reg) \
|
||||
{\
|
||||
return vmv_x_s_##SUF##m1_##SUF(reg); \
|
||||
}
|
||||
#define OPENCV_HAL_IMPL_RVV_FUN_EXTRACT_F(REG, SUF) \
|
||||
inline static VTraits<REG>::lane_type vfmv_f(const REG & reg) \
|
||||
{\
|
||||
return vfmv_f_s_##SUF##m1_##SUF(reg); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_uint8, u8)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_int8, i8)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_uint16, u16)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_int16, i16)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_uint32, u32)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_int32, i32)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_uint64, u64)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_int64, i64)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT_F(v_float32, f32)
|
||||
#if CV_SIMD_SCALABLE_64F
|
||||
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT_F(v_float64, f64)
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_IMPL_RVV_FUN_SLIDE(REG, SUF) \
|
||||
inline static REG vslidedown(const REG & dst, const REG & src, size_t offset, size_t vl) \
|
||||
{ \
|
||||
return vslidedown_vx_##SUF##m1(dst, src, offset, vl); \
|
||||
} \
|
||||
inline static REG vslideup(const REG & dst, const REG & src, size_t offset, size_t vl) \
|
||||
{ \
|
||||
return vslideup_vx_##SUF##m1(dst, src, offset, vl); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_uint8, u8)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_int8, i8)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_uint16, u16)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_int16, i16)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_uint32, u32)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_int32, i32)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_float32, f32)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_uint64, u64)
|
||||
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_int64, i64)
|
||||
#if CV_SIMD_SCALABLE_64F
|
||||
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_float64, f64)
|
||||
#endif
|
||||
|
||||
inline static vuint32mf2_t vmul(const vuint32mf2_t & op1, uint32_t op2, size_t vl)
|
||||
{
|
||||
return vmul_vx_u32mf2(op1, op2, vl);
|
||||
}
|
||||
|
||||
inline static vuint32mf2_t vreinterpret_u32mf2(const vint32mf2_t& val)
|
||||
{
|
||||
return vreinterpret_v_i32mf2_u32mf2(val);
|
||||
}
|
||||
|
||||
inline static vuint32mf2_t vreinterpret_u32mf2(const vuint16mf2_t& val)
|
||||
{
|
||||
return vreinterpret_v_u16mf2_u32mf2(val);
|
||||
}
|
||||
|
||||
#endif //OPENCV_HAL_INTRIN_RVV_COMPAT_OVERLOAD_HPP
|
||||
File diff suppressed because it is too large
Load Diff
@@ -347,8 +347,6 @@ namespace hal_sse_internal
|
||||
#define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
|
||||
inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
|
||||
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
|
||||
template <> inline _Tpvec v_setzero_() { return v_setzero_##suffix(); } \
|
||||
template <> inline _Tpvec v_setall_(_Tp v) { return v_setall_##suffix(v); } \
|
||||
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
|
||||
{ return _Tpvec(cast(a.val)); }
|
||||
|
||||
@@ -366,11 +364,6 @@ inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); }
|
||||
inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); }
|
||||
inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); }
|
||||
|
||||
template <> inline v_uint64x2 v_setzero_() { return v_setzero_u64(); }
|
||||
template <> inline v_int64x2 v_setzero_() { return v_setzero_s64(); }
|
||||
template <> inline v_uint64x2 v_setall_(uint64 val) { return v_setall_u64(val); }
|
||||
template <> inline v_int64x2 v_setall_(int64 val) { return v_setall_s64(val); }
|
||||
|
||||
template<typename _Tpvec> inline
|
||||
v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); }
|
||||
template<typename _Tpvec> inline
|
||||
@@ -742,46 +735,53 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
|
||||
}
|
||||
|
||||
#define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
|
||||
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
return _Tpvec(intrin(a.val, b.val)); \
|
||||
} \
|
||||
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
a.val = intrin(a.val, b.val); \
|
||||
return a; \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint8x16, _mm_adds_epu8)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint8x16, _mm_subs_epu8)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int8x16, _mm_adds_epi8)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int8x16, _mm_subs_epi8)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint16x8, _mm_adds_epu16)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint16x8, _mm_subs_epu16)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int16x8, _mm_adds_epi16)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int16x8, _mm_subs_epi16)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint32x4, _mm_add_epi32)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint32x4, _mm_sub_epi32)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_uint32x4, _v128_mullo_epi32)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int32x4, _mm_add_epi32)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int32x4, _mm_sub_epi32)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_int32x4, _v128_mullo_epi32)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_float32x4, _mm_add_ps)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_float32x4, _mm_sub_ps)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_float32x4, _mm_mul_ps)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_div, v_float32x4, _mm_div_ps)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_float64x2, _mm_add_pd)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_float64x2, _mm_sub_pd)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_float64x2, _mm_mul_pd)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_div, v_float64x2, _mm_div_pd)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint64x2, _mm_add_epi64)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint64x2, _mm_sub_epi64)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int64x2, _mm_add_epi64)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int64x2, _mm_sub_epi64)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint32x4, _v128_mullo_epi32)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int32x4, _v128_mullo_epi32)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)
|
||||
|
||||
// saturating multiply 8-bit, 16-bit
|
||||
#define OPENCV_HAL_IMPL_SSE_MUL_SAT(_Tpvec, _Tpwvec) \
|
||||
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
_Tpwvec c, d; \
|
||||
v_mul_expand(a, b, c, d); \
|
||||
return v_pack(c, d); \
|
||||
}
|
||||
} \
|
||||
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
|
||||
{ a = a * b; return a; }
|
||||
|
||||
OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint8x16, v_uint16x8)
|
||||
OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int8x16, v_int16x8)
|
||||
@@ -845,7 +845,7 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { return v_
|
||||
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
|
||||
{ return v_int32x4(_mm_madd_epi16(a.val, b.val)); }
|
||||
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
|
||||
{ return v_add(v_dotprod(a, b), c); }
|
||||
{ return v_dotprod(a, b) + c; }
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
|
||||
@@ -872,7 +872,7 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
|
||||
#endif
|
||||
}
|
||||
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
|
||||
{ return v_add(v_dotprod(a, b), c); }
|
||||
{ return v_dotprod(a, b) + c; }
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
|
||||
@@ -886,7 +886,7 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
|
||||
return v_uint32x4(_mm_add_epi32(p0, p1));
|
||||
}
|
||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
@@ -899,7 +899,7 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
|
||||
return v_int32x4(_mm_add_epi32(p0, p1));
|
||||
}
|
||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
|
||||
@@ -911,14 +911,14 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
|
||||
v_expand(c, c0, c1);
|
||||
v_expand(d, d0, d1);
|
||||
|
||||
c0 = v_add(c0, c1); d0 = v_add(d0, d1);
|
||||
c0 += c1; d0 += d1;
|
||||
return v_uint64x2(_mm_add_epi64(
|
||||
_mm_unpacklo_epi64(c0.val, d0.val),
|
||||
_mm_unpackhi_epi64(c0.val, d0.val)
|
||||
));
|
||||
}
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
@@ -931,7 +931,7 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
|
||||
));
|
||||
}
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
|
||||
@@ -939,8 +939,8 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
|
||||
#if CV_SSE4_1
|
||||
return v_cvt_f64(v_dotprod(a, b));
|
||||
#else
|
||||
v_float64x2 c = v_mul(v_cvt_f64(a), v_cvt_f64(b));
|
||||
v_float64x2 d = v_mul(v_cvt_f64_high(a), v_cvt_f64_high(b));
|
||||
v_float64x2 c = v_cvt_f64(a) * v_cvt_f64(b);
|
||||
v_float64x2 d = v_cvt_f64_high(a) * v_cvt_f64_high(b);
|
||||
|
||||
return v_float64x2(_mm_add_pd(
|
||||
_mm_unpacklo_pd(c.val, d.val),
|
||||
@@ -949,7 +949,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
|
||||
#endif
|
||||
}
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
//////// Fast Dot Product ////////
|
||||
|
||||
@@ -957,13 +957,13 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, cons
|
||||
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
|
||||
{ return v_add(v_dotprod(a, b), c); }
|
||||
{ return v_dotprod(a, b) + c; }
|
||||
|
||||
// 32 >> 64
|
||||
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
|
||||
{ return v_add(v_dotprod_fast(a, b), c); }
|
||||
{ return v_dotprod_fast(a, b) + c; }
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
|
||||
@@ -977,7 +977,7 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b
|
||||
return v_uint32x4(_mm_add_epi32(p0, p1));
|
||||
}
|
||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
||||
{ return v_add(v_dotprod_expand_fast(a, b), c); }
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
@@ -994,7 +994,7 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
|
||||
#endif
|
||||
}
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
|
||||
{ return v_add(v_dotprod_expand_fast(a, b), c); }
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
|
||||
@@ -1006,34 +1006,34 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b
|
||||
v_expand(c, c0, c1);
|
||||
v_expand(d, d0, d1);
|
||||
|
||||
c0 = v_add(c0, c1); d0 = v_add(d0, d1);
|
||||
return v_add(c0, d0);
|
||||
c0 += c1; d0 += d1;
|
||||
return c0 + d0;
|
||||
}
|
||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
|
||||
{ return v_add(v_dotprod_expand_fast(a, b), c); }
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
v_int32x4 prod = v_dotprod(a, b);
|
||||
v_int64x2 c, d;
|
||||
v_expand(prod, c, d);
|
||||
return v_add(c, d);
|
||||
return c + d;
|
||||
}
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{ return v_add(v_dotprod_expand_fast(a, b), c); }
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c);
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_mul(v_cvt_f64_high(a), v_cvt_f64_high(b))); }
|
||||
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
|
||||
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }
|
||||
|
||||
#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_and, _Tpvec, _mm_and_##suffix) \
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_or, _Tpvec, _mm_or_##suffix) \
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(v_xor, _Tpvec, _mm_xor_##suffix) \
|
||||
inline _Tpvec v_not(const _Tpvec& a) \
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
|
||||
inline _Tpvec operator ~ (const _Tpvec& a) \
|
||||
{ \
|
||||
return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \
|
||||
}
|
||||
@@ -1182,58 +1182,58 @@ inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b)
|
||||
}
|
||||
|
||||
#define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \
|
||||
inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \
|
||||
inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
|
||||
{ return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
|
||||
inline _Tpuvec v_ne(const _Tpuvec& a, const _Tpuvec& b) \
|
||||
inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
|
||||
{ \
|
||||
__m128i not_mask = _mm_set1_epi32(-1); \
|
||||
return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
|
||||
} \
|
||||
inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \
|
||||
inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
|
||||
{ return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
|
||||
inline _Tpsvec v_ne(const _Tpsvec& a, const _Tpsvec& b) \
|
||||
inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
|
||||
{ \
|
||||
__m128i not_mask = _mm_set1_epi32(-1); \
|
||||
return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
|
||||
} \
|
||||
inline _Tpuvec v_lt(const _Tpuvec& a, const _Tpuvec& b) \
|
||||
inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
|
||||
{ \
|
||||
__m128i smask = _mm_set1_##suffix(sbit); \
|
||||
return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \
|
||||
} \
|
||||
inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \
|
||||
inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
|
||||
{ \
|
||||
__m128i smask = _mm_set1_##suffix(sbit); \
|
||||
return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \
|
||||
} \
|
||||
inline _Tpuvec v_le(const _Tpuvec& a, const _Tpuvec& b) \
|
||||
inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
|
||||
{ \
|
||||
__m128i smask = _mm_set1_##suffix(sbit); \
|
||||
__m128i not_mask = _mm_set1_epi32(-1); \
|
||||
__m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \
|
||||
return _Tpuvec(_mm_xor_si128(res, not_mask)); \
|
||||
} \
|
||||
inline _Tpuvec v_ge(const _Tpuvec& a, const _Tpuvec& b) \
|
||||
inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
|
||||
{ \
|
||||
__m128i smask = _mm_set1_##suffix(sbit); \
|
||||
__m128i not_mask = _mm_set1_epi32(-1); \
|
||||
__m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \
|
||||
return _Tpuvec(_mm_xor_si128(res, not_mask)); \
|
||||
} \
|
||||
inline _Tpsvec v_lt(const _Tpsvec& a, const _Tpsvec& b) \
|
||||
inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
|
||||
{ \
|
||||
return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \
|
||||
} \
|
||||
inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \
|
||||
inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
|
||||
{ \
|
||||
return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
|
||||
} \
|
||||
inline _Tpsvec v_le(const _Tpsvec& a, const _Tpsvec& b) \
|
||||
inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
|
||||
{ \
|
||||
__m128i not_mask = _mm_set1_epi32(-1); \
|
||||
return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \
|
||||
} \
|
||||
inline _Tpsvec v_ge(const _Tpsvec& a, const _Tpsvec& b) \
|
||||
inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
|
||||
{ \
|
||||
__m128i not_mask = _mm_set1_epi32(-1); \
|
||||
return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \
|
||||
@@ -1244,17 +1244,17 @@ OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768)
|
||||
OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000)
|
||||
|
||||
#define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \
|
||||
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
|
||||
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
|
||||
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
|
||||
inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
|
||||
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
|
||||
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }
|
||||
|
||||
OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
|
||||
@@ -1262,17 +1262,17 @@ OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)
|
||||
|
||||
#if CV_SSE4_1
|
||||
#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
|
||||
inline _Tpvec v_eq (const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(_mm_cmpeq_epi64(a.val, b.val)); } \
|
||||
inline _Tpvec v_ne (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return v_not(v_eq(a, b)); }
|
||||
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return ~(a == b); }
|
||||
#else
|
||||
#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
|
||||
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ __m128i cmp = _mm_cmpeq_epi32(a.val, b.val); \
|
||||
return _Tpvec(_mm_and_si128(cmp, _mm_shuffle_epi32(cmp, _MM_SHUFFLE(2, 3, 0, 1)))); } \
|
||||
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return v_not(v_eq(a, b)); }
|
||||
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return ~(a == b); }
|
||||
#endif
|
||||
|
||||
OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2)
|
||||
@@ -1311,17 +1311,17 @@ inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
|
||||
/** Absolute difference **/
|
||||
|
||||
inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
|
||||
{ return v_add_wrap(a - b, b - a); }
|
||||
inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
|
||||
{ return v_add_wrap(a - b, b - a); }
|
||||
inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
|
||||
{ return v_sub(v_max(a, b), v_min(a, b)); }
|
||||
{ return v_max(a, b) - v_min(a, b); }
|
||||
|
||||
inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
v_int8x16 d = v_sub_wrap(a, b);
|
||||
v_int8x16 m = v_lt(a, b);
|
||||
return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m));
|
||||
v_int8x16 m = a < b;
|
||||
return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
|
||||
}
|
||||
inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
@@ -1329,25 +1329,25 @@ inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
|
||||
}
|
||||
inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
|
||||
{
|
||||
v_int32x4 d = v_sub(a, b);
|
||||
v_int32x4 m = v_lt(a, b);
|
||||
return v_reinterpret_as_u32(v_sub(v_xor(d, m), m));
|
||||
v_int32x4 d = a - b;
|
||||
v_int32x4 m = a < b;
|
||||
return v_reinterpret_as_u32((d ^ m) - m);
|
||||
}
|
||||
|
||||
/** Saturating absolute difference **/
|
||||
inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
v_int8x16 d = v_sub(a, b);
|
||||
v_int8x16 m = v_lt(a, b);
|
||||
return v_sub(v_xor(d, m), m);
|
||||
v_int8x16 d = a - b;
|
||||
v_int8x16 m = a < b;
|
||||
return (d ^ m) - m;
|
||||
}
|
||||
inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
|
||||
{ return v_sub(v_max(a, b), v_min(a, b)); }
|
||||
{ return v_max(a, b) - v_min(a, b); }
|
||||
|
||||
|
||||
inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
|
||||
{
|
||||
return v_add(v_mul(a, b), c);
|
||||
return a * b + c;
|
||||
}
|
||||
|
||||
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
|
||||
@@ -1381,12 +1381,12 @@ inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
|
||||
} \
|
||||
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
_Tpvec res = v_fma(a, a, v_mul(b, b)); \
|
||||
_Tpvec res = v_fma(a, a, b*b); \
|
||||
return _Tpvec(_mm_sqrt_##suffix(res.val)); \
|
||||
} \
|
||||
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
return v_fma(a, a, v_mul(b, b)); \
|
||||
return v_fma(a, a, b*b); \
|
||||
} \
|
||||
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
|
||||
{ \
|
||||
@@ -1397,19 +1397,19 @@ OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((
|
||||
OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1))
|
||||
|
||||
#define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
|
||||
inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \
|
||||
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
|
||||
{ \
|
||||
return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
|
||||
} \
|
||||
inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \
|
||||
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
|
||||
{ \
|
||||
return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
|
||||
} \
|
||||
inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \
|
||||
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
|
||||
{ \
|
||||
return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
|
||||
} \
|
||||
inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \
|
||||
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
|
||||
{ \
|
||||
return _Tpsvec(srai(a.val, imm)); \
|
||||
} \
|
||||
@@ -1711,9 +1711,9 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_N
|
||||
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
|
||||
|
||||
inline int v_reduce_sum(const v_int16x8& a)
|
||||
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
|
||||
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
|
||||
inline unsigned v_reduce_sum(const v_uint16x8& a)
|
||||
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
|
||||
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
|
||||
|
||||
inline uint64 v_reduce_sum(const v_uint64x2& a)
|
||||
{
|
||||
@@ -1770,13 +1770,13 @@ inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{
|
||||
v_uint32x4 l, h;
|
||||
v_expand(v_absdiff(a, b), l, h);
|
||||
return v_reduce_sum(v_add(l, h));
|
||||
return v_reduce_sum(l + h);
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
v_uint32x4 l, h;
|
||||
v_expand(v_absdiff(a, b), l, h);
|
||||
return v_reduce_sum(v_add(l, h));
|
||||
return v_reduce_sum(l + h);
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
|
||||
{
|
||||
@@ -1805,15 +1805,15 @@ inline v_uint8x16 v_popcount(const v_uint8x16& a)
|
||||
inline v_uint16x8 v_popcount(const v_uint16x8& a)
|
||||
{
|
||||
v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
|
||||
p = v_add(p, v_rotate_right<1>(p));
|
||||
return v_and(v_reinterpret_as_u16(p), v_setall_u16(0x00ff));
|
||||
p += v_rotate_right<1>(p);
|
||||
return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff);
|
||||
}
|
||||
inline v_uint32x4 v_popcount(const v_uint32x4& a)
|
||||
{
|
||||
v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
|
||||
p = v_add(p, v_rotate_right<1>(p));
|
||||
p = v_add(p, v_rotate_right<2>(p));
|
||||
return v_and(v_reinterpret_as_u32(p), v_setall_u32(0x000000ff));
|
||||
p += v_rotate_right<1>(p);
|
||||
p += v_rotate_right<2>(p);
|
||||
return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff);
|
||||
}
|
||||
inline v_uint64x2 v_popcount(const v_uint64x2& a)
|
||||
{
|
||||
@@ -3459,21 +3459,6 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
|
||||
|
||||
inline void v_cleanup() {}
|
||||
|
||||
#include "intrin_math.hpp"
|
||||
inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
|
||||
inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
|
||||
inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
|
||||
inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
|
||||
inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
|
||||
inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
|
||||
inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
|
||||
|
||||
|
||||
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
|
||||
|
||||
//! @endcond
|
||||
|
||||
@@ -261,8 +261,6 @@ OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_float64x2, double)
|
||||
#define OPENCV_HAL_IMPL_VSX_INITVEC(_Tpvec, _Tp, suffix, cast) \
|
||||
inline _Tpvec v_setzero_##suffix() { return _Tpvec(vec_splats((_Tp)0)); } \
|
||||
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(vec_splats((_Tp)v));} \
|
||||
template <> inline _Tpvec v_setzero_() { return v_setzero_##suffix(); } \
|
||||
template <> inline _Tpvec v_setall_(_Tp v) { return v_setall_##suffix(v); } \
|
||||
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0 &a) \
|
||||
{ return _Tpvec((cast)a.val); }
|
||||
|
||||
@@ -515,44 +513,48 @@ inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d)
|
||||
/* Element-wise binary and unary operations */
|
||||
/** Arithmetics **/
|
||||
#define OPENCV_HAL_IMPL_VSX_BIN_OP(bin_op, _Tpvec, intrin) \
|
||||
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(intrin(a.val, b.val)); }
|
||||
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(intrin(a.val, b.val)); } \
|
||||
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
|
||||
{ a.val = intrin(a.val, b.val); return a; }
|
||||
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint8x16, vec_adds)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint8x16, vec_subs)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int8x16, vec_adds)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int8x16, vec_subs)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint16x8, vec_adds)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint16x8, vec_subs)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int16x8, vec_adds)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int16x8, vec_subs)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint32x4, vec_add)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint32x4, vec_sub)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_uint32x4, vec_mul)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int32x4, vec_add)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int32x4, vec_sub)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_int32x4, vec_mul)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_float32x4, vec_add)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_float32x4, vec_sub)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_float32x4, vec_mul)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_div, v_float32x4, vec_div)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_float64x2, vec_add)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_float64x2, vec_sub)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_float64x2, vec_mul)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_div, v_float64x2, vec_div)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint64x2, vec_add)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint64x2, vec_sub)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int64x2, vec_add)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int64x2, vec_sub)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint8x16, vec_adds)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint8x16, vec_subs)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int8x16, vec_adds)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int8x16, vec_subs)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint16x8, vec_adds)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint16x8, vec_subs)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int16x8, vec_adds)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int16x8, vec_subs)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint32x4, vec_add)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint32x4, vec_sub)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint32x4, vec_mul)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int32x4, vec_add)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int32x4, vec_sub)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int32x4, vec_mul)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float32x4, vec_add)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float32x4, vec_sub)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float32x4, vec_mul)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float32x4, vec_div)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float64x2, vec_add)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float64x2, vec_sub)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float64x2, vec_mul)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float64x2, vec_div)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint64x2, vec_add)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint64x2, vec_sub)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int64x2, vec_add)
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int64x2, vec_sub)
|
||||
|
||||
// saturating multiply
|
||||
#define OPENCV_HAL_IMPL_VSX_MUL_SAT(_Tpvec, _Tpwvec) \
|
||||
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
_Tpwvec c, d; \
|
||||
v_mul_expand(a, b, c, d); \
|
||||
return v_pack(c, d); \
|
||||
}
|
||||
} \
|
||||
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
|
||||
{ a = a * b; return a; }
|
||||
|
||||
OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int8x16, v_int16x8)
|
||||
OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint8x16, v_uint16x8)
|
||||
@@ -594,9 +596,9 @@ OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_mul_wrap, vec_mul)
|
||||
|
||||
/** Bitwise shifts **/
|
||||
#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \
|
||||
inline _Tpvec v_shl(const _Tpvec& a, int imm) \
|
||||
inline _Tpvec operator << (const _Tpvec& a, int imm) \
|
||||
{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
|
||||
inline _Tpvec v_shr(const _Tpvec& a, int imm) \
|
||||
inline _Tpvec operator >> (const _Tpvec& a, int imm) \
|
||||
{ return _Tpvec(shr(a.val, splfunc(imm))); } \
|
||||
template<int imm> inline _Tpvec v_shl(const _Tpvec& a) \
|
||||
{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
|
||||
@@ -615,10 +617,10 @@ OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_sra, vec_udword2_sp)
|
||||
|
||||
/** Bitwise logic **/
|
||||
#define OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec) \
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_and, _Tpvec, vec_and) \
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_or, _Tpvec, vec_or) \
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(v_xor, _Tpvec, vec_xor) \
|
||||
inline _Tpvec v_not(const _Tpvec& a) \
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(&, _Tpvec, vec_and) \
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(|, _Tpvec, vec_or) \
|
||||
OPENCV_HAL_IMPL_VSX_BIN_OP(^, _Tpvec, vec_xor) \
|
||||
inline _Tpvec operator ~ (const _Tpvec& a) \
|
||||
{ return _Tpvec(vec_not(a.val)); }
|
||||
|
||||
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint8x16)
|
||||
@@ -648,17 +650,17 @@ OPENCV_HAL_IMPL_VSX_SELECT(v_float64x2, vec_bdword2_c)
|
||||
|
||||
/** Comparison **/
|
||||
#define OPENCV_HAL_IMPL_VSX_INT_CMP_OP(_Tpvec) \
|
||||
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(vec_cmpeq(a.val, b.val)); } \
|
||||
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(vec_cmpne(a.val, b.val)); } \
|
||||
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(vec_cmplt(a.val, b.val)); } \
|
||||
inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(vec_cmpgt(a.val, b.val)); } \
|
||||
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(vec_cmple(a.val, b.val)); } \
|
||||
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
|
||||
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(vec_cmpge(a.val, b.val)); }
|
||||
|
||||
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint8x16)
|
||||
@@ -1058,7 +1060,7 @@ OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4)
|
||||
OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2)
|
||||
|
||||
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
|
||||
{ return v_add(v_mul(a, b), c); }
|
||||
{ return a * b + c; }
|
||||
|
||||
// TODO: exp, log, sin, cos
|
||||
|
||||
@@ -1087,12 +1089,12 @@ inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
|
||||
inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
|
||||
{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
|
||||
inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_reinterpret_as_u32(v_sub(v_max(a, b), v_min(a, b))); }
|
||||
{ return v_reinterpret_as_u32(v_max(a, b) - v_min(a, b)); }
|
||||
|
||||
inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
|
||||
{ return v_abs(v_sub(a, b)); }
|
||||
{ return v_abs(a - b); }
|
||||
inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
|
||||
{ return v_abs(v_sub(a, b)); }
|
||||
{ return v_abs(a - b); }
|
||||
|
||||
/** Absolute difference for signed integers **/
|
||||
inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
|
||||
@@ -1440,7 +1442,7 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
|
||||
return v_int64x2(vec_add(even, odd));
|
||||
}
|
||||
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
|
||||
{ return v_add(v_dotprod(a, b), c); }
|
||||
{ return v_dotprod(a, b) + c; }
|
||||
|
||||
// 8 >> 32
|
||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
||||
@@ -1483,7 +1485,7 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
|
||||
return v_uint64x2(vec_add(s0, s1));
|
||||
}
|
||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
@@ -1493,13 +1495,13 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
|
||||
return v_int64x2(vec_add(vec_mergeh(c.val, d.val), vec_mergel(c.val, d.val)));
|
||||
}
|
||||
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_cvt_f64(v_dotprod(a, b)); }
|
||||
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
|
||||
{ return v_add(v_dotprod_expand(a, b), c); }
|
||||
{ return v_dotprod_expand(a, b) + c; }
|
||||
|
||||
//////// Fast Dot Product ////////
|
||||
|
||||
@@ -1507,7 +1509,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, cons
|
||||
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
|
||||
{ return v_add(v_int32x4(vec_msum(a.val, b.val, vec_int4_z)), c); }
|
||||
{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)) + c; }
|
||||
// 32 >> 64
|
||||
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
{ return v_dotprod(a, b); }
|
||||
@@ -1518,7 +1520,7 @@ inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_
|
||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{ return v_dotprod_expand(a, b); }
|
||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
||||
{ return v_add(v_uint32x4(vec_msum(a.val, b.val, vec_uint4_z)), c); }
|
||||
{ return v_uint32x4(vec_msum(a.val, b.val, vec_uint4_z)) + c; }
|
||||
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
@@ -1529,7 +1531,7 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
|
||||
return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, vec_int4_z)));
|
||||
}
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
|
||||
{ return v_add(v_dotprod_expand_fast(a, b), c); }
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
// 16 >> 64
|
||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
|
||||
@@ -1542,10 +1544,10 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
|
||||
v_int32x4 prod = v_dotprod(a, b);
|
||||
v_int64x2 c, d;
|
||||
v_expand(prod, c, d);
|
||||
return v_add(c, d);
|
||||
return c + d;
|
||||
}
|
||||
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
|
||||
{ return v_add(v_dotprod_expand_fast(a, b), c); }
|
||||
{ return v_dotprod_expand_fast(a, b) + c; }
|
||||
|
||||
// 32 >> 64f
|
||||
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
|
||||
@@ -1596,19 +1598,6 @@ template<int i, typename Tvec>
|
||||
inline Tvec v_broadcast_element(const Tvec& v)
|
||||
{ return Tvec(vec_splat(v.val, i)); }
|
||||
|
||||
#include "intrin_math.hpp"
|
||||
inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
|
||||
inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
|
||||
|
||||
inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
|
||||
inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
|
||||
inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
|
||||
inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
|
||||
inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
|
||||
|
||||
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user