fix: make zerocolor edge

lennlouisgeek
2026-02-04 03:18:20 +08:00
parent ca3545b8b0
commit 001685b633
157 changed files with 31832 additions and 32681 deletions

View File

@@ -64,7 +64,7 @@
namespace {
inline unsigned int trailingZeros32(unsigned int value) {
#if defined(_MSC_VER)
#if (_MSC_VER < 1700) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC)
#if (_MSC_VER < 1700) || defined(_M_ARM) || defined(_M_ARM64)
unsigned long index = 0;
_BitScanForward(&index, value);
return (unsigned int)index;
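// Illustration (not part of the diff): a hedged, portable sketch of what
// trailingZeros32 computes for value != 0, namely the index of the lowest
// set bit, which the MSVC branch above obtains from _BitScanForward.
static unsigned trailing_zeros32_portable(unsigned value) {
    unsigned n = 0;
    while ((value & 1u) == 0u) { value >>= 1; ++n; } // shift until the low bit is set
    return n;
}
// trailing_zeros32_portable(0x28) == 3 (0x28 is binary 101000).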
@@ -191,19 +191,6 @@ CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(double, int64, uint64, double, void, double)
#endif // CV_CPU_OPTIMIZATION_HAL_NAMESPACE
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
template <typename _VecTp> inline _VecTp v_setzero_();
template <typename _VecTp> inline _VecTp v_setall_(uchar);
template <typename _VecTp> inline _VecTp v_setall_(schar);
template <typename _VecTp> inline _VecTp v_setall_(ushort);
template <typename _VecTp> inline _VecTp v_setall_(short);
template <typename _VecTp> inline _VecTp v_setall_(unsigned);
template <typename _VecTp> inline _VecTp v_setall_(int);
template <typename _VecTp> inline _VecTp v_setall_(uint64);
template <typename _VecTp> inline _VecTp v_setall_(int64);
template <typename _VecTp> inline _VecTp v_setall_(float);
template <typename _VecTp> inline _VecTp v_setall_(double);
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
#endif
@@ -249,7 +236,11 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
#include "opencv2/core/hal/intrin_wasm.hpp"
#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP)
#if defined(CV_RVV_SCALABLE)
#include "opencv2/core/hal/intrin_rvv_scalable.hpp"
#else
#include "opencv2/core/hal/intrin_rvv.hpp"
#endif
#elif CV_LSX && !defined(CV_FORCE_SIMD128_CPP)
@@ -730,15 +721,314 @@ namespace CV__SIMD_NAMESPACE {
/** @brief SIMD processing state cleanup call */
inline void vx_cleanup() { VXPREFIX(_cleanup)(); }
#if !CV_SIMD_SCALABLE
#if !CV_SIMD_SCALABLE && !(CV_NEON && !defined(CV_FORCE_SIMD128_CPP))
// Compatibility layer
#if !(CV_NEON && !defined(CV_FORCE_SIMD128_CPP))
template<typename T> struct VTraits {
static inline int vlanes() { return T::nlanes; }
enum { nlanes = T::nlanes, max_nlanes = T::nlanes };
using lane_type = typename T::lane_type;
};
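// Illustration (not part of the diff): a minimal stand-in showing what the
// VTraits shim above does: it forwards a fixed-width type's compile-time
// lane count through the same query that scalable backends answer at run
// time. FakeVec/FakeVTraits are hypothetical names, not OpenCV API.
struct FakeVec { enum { nlanes = 4 }; using lane_type = float; };
template <typename T> struct FakeVTraits {
    static inline int vlanes() { return T::nlanes; }     // run-time query
    enum { nlanes = T::nlanes, max_nlanes = T::nlanes }; // compile-time constants
    using lane_type = typename T::lane_type;
};
// FakeVTraits<FakeVec>::vlanes() == 4; lane_type is float.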
#define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \
inline _Tpvec v_add(const _Tpvec& a, const _Tpvec& b) \
{ \
return a + b; \
} \
inline _Tpvec v_sub(const _Tpvec& a, const _Tpvec& b) \
{ \
return a - b; \
} \
template<typename... Args> \
inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
return v_add(f1 + f2, vf...); \
}
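// Illustration (not part of the diff): the variadic v_add generated above
// folds its arguments left to right. A hedged scalar sketch of the same
// recursion, with v_add_sketch standing in for the wrapper:
inline int v_add_sketch(int a, int b) { return a + b; }
template <typename... Args>
inline int v_add_sketch(int f1, int f2, const Args&... vf) {
    return v_add_sketch(f1 + f2, vf...); // same shape as the macro body
}
// v_add_sketch(1, 2, 3, 4) == 10: ((1 + 2) + 3) + 4.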
#define OPENCV_HAL_WRAP_SHIFT_OP(_Tpvec) \
inline _Tpvec v_shr(const _Tpvec& a, int n) \
{ \
return a >> n; \
} \
inline _Tpvec v_shl(const _Tpvec& a, int n) \
{ \
return a << n; \
}
OPENCV_HAL_WRAP_SHIFT_OP(v_uint16)
OPENCV_HAL_WRAP_SHIFT_OP(v_uint32)
OPENCV_HAL_WRAP_SHIFT_OP(v_uint64)
OPENCV_HAL_WRAP_SHIFT_OP(v_int16)
OPENCV_HAL_WRAP_SHIFT_OP(v_int32)
OPENCV_HAL_WRAP_SHIFT_OP(v_int64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
// when we use CV_SIMD128 with 256/512 bit SIMD (e.g. AVX2 or AVX512)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x2)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4)
OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x8)
OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x4)
OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x2)
OPENCV_HAL_WRAP_SHIFT_OP(v_int16x8)
OPENCV_HAL_WRAP_SHIFT_OP(v_int32x4)
OPENCV_HAL_WRAP_SHIFT_OP(v_int64x2)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
// when we use CV_SIMD256 with 512 bit SIMD (e.g. AVX512)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8)
OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x16)
OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x8)
OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x4)
OPENCV_HAL_WRAP_SHIFT_OP(v_int16x16)
OPENCV_HAL_WRAP_SHIFT_OP(v_int32x8)
OPENCV_HAL_WRAP_SHIFT_OP(v_int64x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_BIN_OP_LOGIC(_Tpvec) \
inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) \
{ \
return a & b; \
} \
inline _Tpvec v_or(const _Tpvec& a, const _Tpvec& b) \
{ \
return a | b; \
} \
inline _Tpvec v_xor(const _Tpvec& a, const _Tpvec& b) \
{ \
return a ^ b; \
}
#define OPENCV_HAL_WRAP_NOT_OP(_Tpvec) \
inline _Tpvec v_not(const _Tpvec& a) \
{ \
return ~a; \
}
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32)
OPENCV_HAL_WRAP_NOT_OP(v_uint8)
OPENCV_HAL_WRAP_NOT_OP(v_uint16)
OPENCV_HAL_WRAP_NOT_OP(v_uint32)
OPENCV_HAL_WRAP_NOT_OP(v_uint64)
OPENCV_HAL_WRAP_NOT_OP(v_int8)
OPENCV_HAL_WRAP_NOT_OP(v_int16)
OPENCV_HAL_WRAP_NOT_OP(v_int32)
OPENCV_HAL_WRAP_NOT_OP(v_int64)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x2)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x2)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x4)
OPENCV_HAL_WRAP_NOT_OP(v_uint8x16)
OPENCV_HAL_WRAP_NOT_OP(v_uint16x8)
OPENCV_HAL_WRAP_NOT_OP(v_uint32x4)
OPENCV_HAL_WRAP_NOT_OP(v_uint64x2)
OPENCV_HAL_WRAP_NOT_OP(v_int8x16)
OPENCV_HAL_WRAP_NOT_OP(v_int16x8)
OPENCV_HAL_WRAP_NOT_OP(v_int32x4)
OPENCV_HAL_WRAP_NOT_OP(v_int64x2)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x4)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x4)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x8)
OPENCV_HAL_WRAP_NOT_OP(v_uint8x32)
OPENCV_HAL_WRAP_NOT_OP(v_uint16x16)
OPENCV_HAL_WRAP_NOT_OP(v_uint32x8)
OPENCV_HAL_WRAP_NOT_OP(v_uint64x4)
OPENCV_HAL_WRAP_NOT_OP(v_int8x32)
OPENCV_HAL_WRAP_NOT_OP(v_int16x16)
OPENCV_HAL_WRAP_NOT_OP(v_int32x8)
OPENCV_HAL_WRAP_NOT_OP(v_int64x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
{ \
return a * b; \
} \
template<typename... Args> \
inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
return v_mul(f1 * f2, vf...); \
}
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_BIN_OP_DIV(_Tpvec) \
inline _Tpvec v_div(const _Tpvec& a, const _Tpvec& b) \
{ \
return a / b; \
}
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_CMP_OP(_Tpvec, intrin, op) \
inline _Tpvec v_##intrin(const _Tpvec& a, const _Tpvec& b) \
{ \
return a op b; \
}
#define OPENCV_HAL_WRAP_EQ_OP(_Tpvec) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
{ \
return a == b; \
} \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ \
return a != b; \
}
#define OPENCV_HAL_WRAP_CMP(_Tpvec) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, eq, ==) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ne, !=) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, lt, <) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, gt, >) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, le, <=) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ge, >=)
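// Illustration (not part of the diff): each wrapped comparison returns a
// per-lane mask, all bits set for "true" and all clear for "false"; the
// 64-bit types below get only v_eq/v_ne (OPENCV_HAL_WRAP_EQ_OP), since
// ordered 64-bit integer compares are missing on some wrapped baselines.
// A hedged scalar sketch of the mask convention:
inline int lane_lt_sketch(int a, int b) {
    return a < b ? -1 : 0; // -1 == all bits set, composable with & | ^
}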
OPENCV_HAL_WRAP_CMP(v_uint8)
OPENCV_HAL_WRAP_CMP(v_uint16)
OPENCV_HAL_WRAP_CMP(v_uint32)
OPENCV_HAL_WRAP_EQ_OP(v_uint64)
OPENCV_HAL_WRAP_CMP(v_int8)
OPENCV_HAL_WRAP_CMP(v_int16)
OPENCV_HAL_WRAP_CMP(v_int32)
OPENCV_HAL_WRAP_EQ_OP(v_int64)
OPENCV_HAL_WRAP_CMP(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_CMP(v_uint8x16)
OPENCV_HAL_WRAP_CMP(v_uint16x8)
OPENCV_HAL_WRAP_CMP(v_uint32x4)
OPENCV_HAL_WRAP_EQ_OP(v_uint64x2)
OPENCV_HAL_WRAP_CMP(v_int8x16)
OPENCV_HAL_WRAP_CMP(v_int16x8)
OPENCV_HAL_WRAP_CMP(v_int32x4)
OPENCV_HAL_WRAP_EQ_OP(v_int64x2)
OPENCV_HAL_WRAP_CMP(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_CMP(v_uint8x32)
OPENCV_HAL_WRAP_CMP(v_uint16x16)
OPENCV_HAL_WRAP_CMP(v_uint32x8)
OPENCV_HAL_WRAP_EQ_OP(v_uint64x4)
OPENCV_HAL_WRAP_CMP(v_int8x32)
OPENCV_HAL_WRAP_CMP(v_int16x16)
OPENCV_HAL_WRAP_CMP(v_int32x8)
OPENCV_HAL_WRAP_EQ_OP(v_int64x4)
OPENCV_HAL_WRAP_CMP(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64x4)
#endif
#endif
//////////// get0 ////////////
#define OPENCV_HAL_WRAP_GRT0(_Tpvec) \
inline typename VTraits<_Tpvec>::lane_type v_get0(const _Tpvec& v) \
@@ -786,96 +1076,6 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_GRT0(v_float64x4)
#endif
#endif
#endif
#define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \
template<typename... Args> \
inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const _Tpvec& f3, const Args&... vf) { \
return v_add(v_add(f1, f2), f3, vf...); \
}
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
// when we use CV_SIMD128 with 256/512 bit SIMD (e.g. AVX2 or AVX512)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x2)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
// when we use CV_SIMD256 with 512 bit SIMD (e.g. AVX512)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
template<typename... Args> \
inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const _Tpvec& f3, const Args&... vf) { \
return v_mul(v_mul(f1, f2), f3, vf...); \
}
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \
inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \
@@ -946,6 +1146,74 @@ namespace CV__SIMD_NAMESPACE {
#endif //!CV_SIMD_SCALABLE
#if (CV_NEON /* || CV_others */) && !defined(CV_FORCE_SIMD128_CPP)
// Compatibility layer for backends that have already removed the operator overloads.
#define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \
template<typename... Args> \
inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
return v_add(v_add(f1, f2), vf...); \
}
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
#endif
#define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
template<typename... Args> \
inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
return v_mul(v_mul(f1, f2), vf...); \
}
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
#endif
#define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \
inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \
{ \
return v_extract_n<VTraits<_Tpvec>::nlanes-1>(v); \
}
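// Illustration (not part of the diff): v_extract_highest reads the last
// lane. A hedged array sketch of the same index computation:
template <typename T, int N>
inline T extract_highest_sketch(const T (&v)[N]) { return v[N - 1]; }
// int lanes[4] = {1, 2, 3, 4}; extract_highest_sketch(lanes) == 4.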
OPENCV_HAL_WRAP_EXTRACT(v_uint8)
OPENCV_HAL_WRAP_EXTRACT(v_int8)
OPENCV_HAL_WRAP_EXTRACT(v_uint16)
OPENCV_HAL_WRAP_EXTRACT(v_int16)
OPENCV_HAL_WRAP_EXTRACT(v_uint32)
OPENCV_HAL_WRAP_EXTRACT(v_int32)
OPENCV_HAL_WRAP_EXTRACT(v_uint64)
OPENCV_HAL_WRAP_EXTRACT(v_int64)
OPENCV_HAL_WRAP_EXTRACT(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_EXTRACT(v_float64)
#endif
#define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \
inline _Tpvec v_broadcast_highest(const _Tpvec& v) \
{ \
return v_broadcast_element<VTraits<_Tpvec>::nlanes-1>(v); \
}
OPENCV_HAL_WRAP_BROADCAST(v_uint32)
OPENCV_HAL_WRAP_BROADCAST(v_int32)
OPENCV_HAL_WRAP_BROADCAST(v_float32)
#endif //CV_NEON
//! @cond IGNORED
// backward compatibility

View File

@@ -447,10 +447,6 @@ OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float64x4, double, pd, __m128d)
{ return _Tpvec(_mm256_setzero_si256()); } \
inline _Tpvec v256_setall_##suffix(_Tp v) \
{ return _Tpvec(_mm256_set1_##ssuffix((ctype_s)v)); } \
template <> inline _Tpvec v_setzero_() \
{ return v256_setzero_##suffix(); } \
template <> inline _Tpvec v_setall_(_Tp v) \
{ return v256_setall_##suffix(v); } \
OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, OPENCV_HAL_NOP) \
OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, OPENCV_HAL_NOP) \
OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \
@@ -476,10 +472,6 @@ OPENCV_HAL_IMPL_AVX_INIT(v_int64x4, int64, s64, epi64x, int64)
{ return _Tpvec(_mm256_setzero_##zsuffix()); } \
inline _Tpvec v256_setall_##suffix(_Tp v) \
{ return _Tpvec(_mm256_set1_##zsuffix(v)); } \
template <> inline _Tpvec v_setzero_() \
{ return v256_setzero_##suffix(); } \
template <> inline _Tpvec v_setall_(_Tp v) \
{ return v256_setall_##suffix(v); } \
OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, cast) \
OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, cast) \
OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, cast) \
@@ -681,51 +673,53 @@ OPENCV_HAL_IMPL_AVX_ZIP(v_float64x4)
/** Arithmetics **/
#define OPENCV_HAL_IMPL_AVX_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); } \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ a.val = intrin(a.val, b.val); return a; }
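// Illustration (not part of the diff): a hedged sketch of what one
// instantiation of the macro above expands to, with bin_op = '+' and
// intrin = _mm256_adds_epu8:
//   inline v_uint8x32 operator+(const v_uint8x32& a, const v_uint8x32& b)
//   { return v_uint8x32(_mm256_adds_epu8(a.val, b.val)); }
//   inline v_uint8x32& operator+=(v_uint8x32& a, const v_uint8x32& b)
//   { a.val = _mm256_adds_epu8(a.val, b.val); return a; }
// i.e. the operator form wraps the saturating AVX2 intrinsic directly.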
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint8x32, _mm256_adds_epu8)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint8x32, _mm256_subs_epu8)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int8x32, _mm256_adds_epi8)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int8x32, _mm256_subs_epi8)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint16x16, _mm256_adds_epu16)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint16x16, _mm256_subs_epu16)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int16x16, _mm256_adds_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int16x16, _mm256_subs_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint32x8, _mm256_add_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint32x8, _mm256_sub_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_uint32x8, _mm256_mullo_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int32x8, _mm256_add_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int32x8, _mm256_sub_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_int32x8, _mm256_mullo_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint64x4, _mm256_add_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint64x4, _mm256_sub_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int64x4, _mm256_add_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int64x4, _mm256_sub_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint8x32, _mm256_adds_epu8)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint8x32, _mm256_subs_epu8)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32, _mm256_adds_epi8)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32, _mm256_subs_epi8)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16, _mm256_adds_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16, _mm256_subs_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8, _mm256_add_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8, _mm256_sub_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8, _mm256_mullo_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int32x8, _mm256_add_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int32x8, _mm256_sub_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int32x8, _mm256_mullo_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint64x4, _mm256_add_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint64x4, _mm256_sub_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int64x4, _mm256_add_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int64x4, _mm256_sub_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_float32x8, _mm256_add_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_float32x8, _mm256_sub_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_float32x8, _mm256_mul_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_div, v_float32x8, _mm256_div_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_float64x4, _mm256_add_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_float64x4, _mm256_sub_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_float64x4, _mm256_mul_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_div, v_float64x4, _mm256_div_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float32x8, _mm256_add_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float32x8, _mm256_sub_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float32x8, _mm256_mul_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float32x8, _mm256_div_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float64x4, _mm256_add_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd)
// saturating multiply 8-bit, 16-bit
inline v_uint8x32 v_mul(const v_uint8x32& a, const v_uint8x32& b)
inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
{
v_uint16x16 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
inline v_int8x32 v_mul(const v_int8x32& a, const v_int8x32& b)
inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
{
v_int16x16 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
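// Illustration (not part of the diff): 8-bit lanes have no native AVX2
// multiply, so the code above widens, multiplies, and saturating-packs.
// A hedged scalar sketch of the same scheme for one unsigned lane:
#include <cstdint>
inline uint8_t sat_mul_u8_sketch(uint8_t a, uint8_t b) {
    uint16_t wide = uint16_t(a) * uint16_t(b);        // the v_mul_expand step
    return wide > 255 ? uint8_t(255) : uint8_t(wide); // the v_pack saturation
}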
inline v_uint16x16 v_mul(const v_uint16x16& a, const v_uint16x16& b)
inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
{
__m256i pl = _mm256_mullo_epi16(a.val, b.val);
__m256i ph = _mm256_mulhi_epu16(a.val, b.val);
@@ -733,7 +727,7 @@ inline v_uint16x16 v_mul(const v_uint16x16& a, const v_uint16x16& b)
__m256i p1 = _mm256_unpackhi_epi16(pl, ph);
return v_uint16x16(_v256_packs_epu32(p0, p1));
}
inline v_int16x16 v_mul(const v_int16x16& a, const v_int16x16& b)
inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
{
__m256i pl = _mm256_mullo_epi16(a.val, b.val);
__m256i ph = _mm256_mulhi_epi16(a.val, b.val);
@@ -741,6 +735,14 @@ inline v_int16x16 v_mul(const v_int16x16& a, const v_int16x16& b)
__m256i p1 = _mm256_unpackhi_epi16(pl, ph);
return v_int16x16(_mm256_packs_epi32(p0, p1));
}
inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
{ a = a * b; return a; }
inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
{ a = a * b; return a; }
inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
{ a = a * b; return a; }
inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
{ a = a * b; return a; }
/** Non-saturating arithmetics **/
#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
@@ -831,13 +833,13 @@ inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
{ return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \
inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
{ return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \
inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
{ return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \
inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
{ return _Tpsvec(srai(a.val, imm)); } \
template<int imm> \
inline _Tpuvec v_shl(const _Tpuvec& a) \
@@ -865,11 +867,11 @@ OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint64x4, v_int64x4, epi64, _mm256_srai_epi64xx
/** Bitwise logic **/
#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \
OPENCV_HAL_IMPL_AVX_BIN_OP(v_and, _Tpvec, _mm256_and_##suffix) \
OPENCV_HAL_IMPL_AVX_BIN_OP(v_or, _Tpvec, _mm256_or_##suffix) \
OPENCV_HAL_IMPL_AVX_BIN_OP(v_xor, _Tpvec, _mm256_xor_##suffix) \
inline _Tpvec v_not(const _Tpvec& a) \
#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \
OPENCV_HAL_IMPL_AVX_BIN_OP(&, _Tpvec, _mm256_and_##suffix) \
OPENCV_HAL_IMPL_AVX_BIN_OP(|, _Tpvec, _mm256_or_##suffix) \
OPENCV_HAL_IMPL_AVX_BIN_OP(^, _Tpvec, _mm256_xor_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ return _Tpvec(_mm256_xor_##suffix(a.val, not_const)); }
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint8x32, si256, _mm256_set1_epi32(-1))
@@ -898,29 +900,29 @@ OPENCV_HAL_IMPL_AVX_SELECT(v_float32x8, ps)
OPENCV_HAL_IMPL_AVX_SELECT(v_float64x4, pd)
/** Comparison **/
#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_eq(a, b)); } \
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
{ return v_gt(b, a); } \
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_lt(a, b)); } \
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
{ return v_ge(b, a); }
#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return b > a; } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a < b); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return b >= a; }
#define OPENCV_HAL_IMPL_AVX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, sbit) \
inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
{ return _Tpuvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
{ \
__m256i smask = _mm256_set1_##suffix(sbit); \
return _Tpuvec(_mm256_cmpgt_##suffix( \
_mm256_xor_si256(a.val, smask), \
_mm256_xor_si256(b.val, smask))); \
} \
inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(_mm256_cmpgt_##suffix(a.val, b.val)); } \
OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpuvec) \
OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpsvec)
@@ -930,25 +932,25 @@ OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint16x16, v_int16x16, epi16, (short)-32768)
OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint32x8, v_int32x8, epi32, (int)0x80000000)
#define OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(_Tpvec) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm256_cmpeq_epi64(a.val, b.val)); } \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_eq(a, b)); }
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); }
OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_uint64x4)
OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4)
#define OPENCV_HAL_IMPL_AVX_CMP_FLT(bin_op, imm8, _Tpvec, suffix) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm256_cmp_##suffix(a.val, b.val, imm8)); }
#define OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(_Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(v_eq, _CMP_EQ_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(v_ne, _CMP_NEQ_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(v_lt, _CMP_LT_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(v_gt, _CMP_GT_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(v_le, _CMP_LE_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(v_ge, _CMP_GE_OQ, _Tpvec, suffix)
OPENCV_HAL_IMPL_AVX_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, suffix)
OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps)
OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd)
@@ -1214,9 +1216,9 @@ inline unsigned v_reduce_sum(const v_uint32x8& a)
{ return v_reduce_sum(v_reinterpret_as_s32(a)); }
inline int v_reduce_sum(const v_int16x16& a)
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline unsigned v_reduce_sum(const v_uint16x16& a)
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline float v_reduce_sum(const v_float32x8& a)
{
@@ -1271,27 +1273,27 @@ inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
{
v_uint32x8 l, h;
v_expand(v_add_wrap(v_sub(a, b), v_sub(b, a)), l, h);
return v_reduce_sum(v_add(l, h));
v_expand(v_add_wrap(a - b, b - a), l, h);
return v_reduce_sum(l + h);
}
inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
{
v_uint32x8 l, h;
v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h);
return v_reduce_sum(v_add(l, h));
return v_reduce_sum(l + h);
}
inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
{
return v_reduce_sum(v_sub(v_max(a, b), v_min(a, b)));
return v_reduce_sum(v_max(a, b) - v_min(a, b));
}
inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
{
v_int32x8 m = v_lt(a, b);
return v_reduce_sum(v_reinterpret_as_u32(v_sub(v_xor(v_sub(a, b), m), m)));
v_int32x8 m = a < b;
return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m));
}
inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
{
return v_reduce_sum(v_and(v_sub(a, b), v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)))));
return v_reduce_sum((a - b) & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
}
/** Popcount **/
@@ -1306,15 +1308,15 @@ inline v_uint8x32 v_popcount(const v_uint8x32& a)
inline v_uint16x16 v_popcount(const v_uint16x16& a)
{
v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
p = v_add(p, v_rotate_right<1>(p));
return v_and(v_reinterpret_as_u16(p), v256_setall_u16(0x00ff));
p += v_rotate_right<1>(p);
return v_reinterpret_as_u16(p) & v256_setall_u16(0x00ff);
}
inline v_uint32x8 v_popcount(const v_uint32x8& a)
{
v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
p = v_add(p, v_rotate_right<1>(p));
p = v_add(p, v_rotate_right<2>(p));
return v_and(v_reinterpret_as_u32(p), v256_setall_u32(0x000000ff));
p += v_rotate_right<1>(p);
p += v_rotate_right<2>(p);
return v_reinterpret_as_u32(p) & v256_setall_u32(0x000000ff);
}
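// Illustration (not part of the diff): the rotate-and-add ladder above sums
// four per-byte counts into each 32-bit lane. A hedged scalar sketch:
#include <cstdint>
inline uint32_t sum_bytes_sketch(uint32_t p) { // p holds four 8-bit popcounts
    p += p >> 8;      // like v_rotate_right<1> over bytes
    p += p >> 16;     // like v_rotate_right<2> over bytes
    return p & 0xFFu; // keep the accumulated low byte, like the final mask
}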
inline v_uint64x4 v_popcount(const v_uint64x4& a)
{
@@ -1406,9 +1408,9 @@ OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_int16x16)
inline _Tpvec v_sqrt(const _Tpvec& x) \
{ return _Tpvec(_mm256_sqrt_##suffix(x.val)); } \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_fma(a, a, v_mul(b, b)); } \
{ return v_fma(a, a, b * b); } \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_sqrt(v_fma(a, a, v_mul(b, b))); }
{ return v_sqrt(v_fma(a, a, b*b)); }
OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps)
OPENCV_HAL_IMPL_AVX_MULADD(v_float64x4, pd)
@@ -1417,7 +1419,7 @@ OPENCV_HAL_IMPL_AVX_MISC(v_float64x4, pd)
inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
{
return v_add(v_mul(a, b), c);
return a * b + c;
}
inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
@@ -1427,16 +1429,16 @@ inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x
inline v_float32x8 v_invsqrt(const v_float32x8& x)
{
v_float32x8 half = v_mul(x, v256_setall_f32(0.5));
v_float32x8 half = x * v256_setall_f32(0.5);
v_float32x8 t = v_float32x8(_mm256_rsqrt_ps(x.val));
// todo: _mm256_fnmsub_ps
t = v_mul(t, v_sub(v256_setall_f32(1.5), v_mul(v_mul(t, t), half)));
t *= v256_setall_f32(1.5) - ((t * t) * half);
return t;
}
inline v_float64x4 v_invsqrt(const v_float64x4& x)
{
return v_div(v256_setall_f64(1.), v_sqrt(x));
return v256_setall_f64(1.) / v_sqrt(x);
}
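// Illustration (not part of the diff): the float path above refines the
// hardware rsqrt estimate with one Newton-Raphson step,
// t' = t * (1.5 - 0.5 * x * t * t). A hedged scalar sketch:
#include <cmath>
inline float invsqrt_sketch(float x) {
    float t = 1.0f / std::sqrt(x);          // stand-in for the _mm256_rsqrt_ps estimate
    return t * (1.5f - (x * 0.5f) * t * t); // one Newton-Raphson iteration
}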
/** Absolute values **/
@@ -1449,23 +1451,23 @@ OPENCV_HAL_IMPL_AVX_ABS(int16x16, epi16)
OPENCV_HAL_IMPL_AVX_ABS(int32x8, epi32)
inline v_float32x8 v_abs(const v_float32x8& x)
{ return v_and(x, v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)))); }
{ return x & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); }
inline v_float64x4 v_abs(const v_float64x4& x)
{ return v_and(x, v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1)))); }
{ return x & v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1))); }
/** Absolute difference **/
inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b)
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
{ return v_add_wrap(a - b, b - a); }
inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b)
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
{ return v_add_wrap(a - b, b - a); }
inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b)
{ return v_sub(v_max(a, b), v_min(a, b)); }
{ return v_max(a, b) - v_min(a, b); }
inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b)
{
v_int8x32 d = v_sub_wrap(a, b);
v_int8x32 m = v_lt(a, b);
return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m));
v_int8x32 m = a < b;
return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
}
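// Illustration (not part of the diff): (d ^ m) - m is a branch-free
// conditional negate: m is all ones exactly when a < b, and
// (d ^ -1) - (-1) equals -d. A hedged scalar sketch (wraparound at the
// extremes is ignored here; the vector code uses v_sub_wrap for that):
#include <cstdint>
inline uint32_t absdiff_s32_sketch(int32_t a, int32_t b) {
    int32_t d = a - b;
    int32_t m = -(int32_t)(a < b);  // 0 or -1, like the compare mask
    return (uint32_t)((d ^ m) - m); // negates d when m == -1
}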
inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
@@ -1473,26 +1475,26 @@ inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
{
v_int32x8 d = v_sub(a, b);
v_int32x8 m = v_lt(a, b);
return v_reinterpret_as_u32(v_sub(v_xor(d, m), m));
v_int32x8 d = a - b;
v_int32x8 m = a < b;
return v_reinterpret_as_u32((d ^ m) - m);
}
inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
{ return v_abs(v_sub(a, b)); }
{ return v_abs(a - b); }
inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
{ return v_abs(v_sub(a, b)); }
{ return v_abs(a - b); }
/** Saturating absolute difference **/
inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b)
{
v_int8x32 d = v_sub(a, b);
v_int8x32 m = v_lt(a, b);
return v_sub(v_xor(d, m), m);
v_int8x32 d = a - b;
v_int8x32 m = a < b;
return (d ^ m) - m;
}
inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b)
{ return v_sub(v_max(a, b), v_min(a, b)); }
{ return v_max(a, b) - v_min(a, b); }
////////// Conversions /////////
@@ -1787,7 +1789,7 @@ inline v_float32x8 v_pack_triplets(const v_float32x8& vec)
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
{ return v_int32x8(_mm256_madd_epi16(a.val, b.val)); }
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
{ return v_add(v_dotprod(a, b), c); }
{ return v_dotprod(a, b) + c; }
// 32 >> 64
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
@@ -1797,7 +1799,7 @@ inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
return v_int64x4(_mm256_add_epi64(even, odd));
}
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
{ return v_add(v_dotprod(a, b), c); }
{ return v_dotprod(a, b) + c; }
// 8 >> 32
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
@@ -1814,7 +1816,7 @@ inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
return v_uint32x8(_mm256_add_epi32(prod0, prod1));
}
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
{
@@ -1829,7 +1831,7 @@ inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
return v_int32x8(_mm256_add_epi32(prod0, prod1));
}
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
// 16 >> 64
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
@@ -1853,7 +1855,7 @@ inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
));
}
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
{
@@ -1869,13 +1871,13 @@ inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
));
}
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
// 32 >> 64f
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
//////// Fast Dot Product ////////
@@ -1921,7 +1923,7 @@ inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16&
return v_uint64x4(_mm256_add_epi64(p15_, p9d_));
}
inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
{
@@ -1932,7 +1934,7 @@ inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
return v_int64x4(_mm256_add_epi64(lo, hi));
}
inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
// 32 >> 64f
inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b)
@@ -1951,7 +1953,7 @@ inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0,
v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
v_float32x8 v37 = OPENCV_HAL_AVX_SPLAT2_PS(v, 3);
return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v_mul(v37, m3))));
return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
}
inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0,
@@ -2056,43 +2058,43 @@ v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b)
{
// we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
return v_pack_u(v_reinterpret_as_s16(v_shr(v_add(a, delta), n)),
v_reinterpret_as_s16(v_shr(v_add(b, delta), n)));
return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
v_reinterpret_as_s16((b + delta) >> n));
}
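// Illustration (not part of the diff): delta implements round-to-nearest
// before the shift, (a + (1 << (n-1))) >> n. A hedged scalar sketch,
// assuming n > 0 as the comment above states:
#include <cstdint>
template <int n>
inline uint16_t rshr_round_sketch(uint16_t a) {
    return uint16_t((a + (1u << (n - 1))) >> n); // add half the divisor, then truncate
}
// rshr_round_sketch<4>(24) == 2 (24/16 = 1.5 rounds up); rshr_round_sketch<4>(23) == 1.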
template<int n> inline
void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
{
v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
v_pack_u_store(ptr, v_reinterpret_as_s16(v_shr(v_add(a, delta), n)));
v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
}
template<int n> inline
v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b)
{
v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
return v_pack_u((a + delta) >> n, (b + delta) >> n);
}
template<int n> inline
void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
{
v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
v_pack_u_store(ptr, v_shr(v_add(a, delta), n));
v_pack_u_store(ptr, (a + delta) >> n);
}
template<int n> inline
v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b)
{
v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
return v_pack((a + delta) >> n, (b + delta) >> n);
}
template<int n> inline
void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
{
v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
v_pack_store(ptr, v_shr(v_add(a, delta), n));
v_pack_store(ptr, (a + delta) >> n);
}
// 32
@@ -2125,43 +2127,43 @@ v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b)
{
// we assume that n > 0, and so the shifted 32-bit values can be treated as signed numbers.
v_uint32x8 delta = v256_setall_u32(1 << (n-1));
return v_pack_u(v_reinterpret_as_s32(v_shr(v_add(a, delta), n)),
v_reinterpret_as_s32(v_shr(v_add(b, delta), n)));
return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
v_reinterpret_as_s32((b + delta) >> n));
}
template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
{
v_uint32x8 delta = v256_setall_u32(1 << (n-1));
v_pack_u_store(ptr, v_reinterpret_as_s32(v_shr(v_add(a, delta), n)));
v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
}
template<int n> inline
v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b)
{
v_int32x8 delta = v256_setall_s32(1 << (n-1));
return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
return v_pack_u((a + delta) >> n, (b + delta) >> n);
}
template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
{
v_int32x8 delta = v256_setall_s32(1 << (n-1));
v_pack_u_store(ptr, v_shr(v_add(a, delta), n));
v_pack_u_store(ptr, (a + delta) >> n);
}
template<int n> inline
v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b)
{
v_int32x8 delta = v256_setall_s32(1 << (n-1));
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
return v_pack((a + delta) >> n, (b + delta) >> n);
}
template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x8& a)
{
v_int32x8 delta = v256_setall_s32(1 << (n-1));
v_pack_store(ptr, v_shr(v_add(a, delta), n));
v_pack_store(ptr, (a + delta) >> n);
}
// 64
@@ -2190,28 +2192,28 @@ template<int n> inline
v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)
{
v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
return v_pack((a + delta) >> n, (b + delta) >> n);
}
template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
{
v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
v_pack_store(ptr, v_shr(v_add(a, delta), n));
v_pack_store(ptr, (a + delta) >> n);
}
template<int n> inline
v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
{
v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
return v_pack((a + delta) >> n, (b + delta) >> n);
}
template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x4& a)
{
v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
v_pack_store(ptr, v_shr(v_add(a, delta), n));
v_pack_store(ptr, (a + delta) >> n);
}
// pack boolean
@@ -3166,20 +3168,6 @@ inline void v_pack_store(hfloat* ptr, const v_float32x8& a)
inline void v256_cleanup() { _mm256_zeroall(); }
#include "intrin_math.hpp"
inline v_float32x8 v_exp(const v_float32x8& x) { return v_exp_default_32f<v_float32x8, v_int32x8>(x); }
inline v_float32x8 v_log(const v_float32x8& x) { return v_log_default_32f<v_float32x8, v_int32x8>(x); }
inline void v_sincos(const v_float32x8& x, v_float32x8& s, v_float32x8& c) { v_sincos_default_32f<v_float32x8, v_int32x8>(x, s, c); }
inline v_float32x8 v_sin(const v_float32x8& x) { return v_sin_default_32f<v_float32x8, v_int32x8>(x); }
inline v_float32x8 v_cos(const v_float32x8& x) { return v_cos_default_32f<v_float32x8, v_int32x8>(x); }
inline v_float32x8 v_erf(const v_float32x8& x) { return v_erf_default_32f<v_float32x8, v_int32x8>(x); }
inline v_float64x4 v_exp(const v_float64x4& x) { return v_exp_default_64f<v_float64x4, v_int64x4>(x); }
inline v_float64x4 v_log(const v_float64x4& x) { return v_log_default_64f<v_float64x4, v_int64x4>(x); }
inline void v_sincos(const v_float64x4& x, v_float64x4& s, v_float64x4& c) { v_sincos_default_64f<v_float64x4, v_int64x4>(x, s, c); }
inline v_float64x4 v_sin(const v_float64x4& x) { return v_sin_default_64f<v_float64x4, v_int64x4>(x); }
inline v_float64x4 v_cos(const v_float64x4& x) { return v_cos_default_64f<v_float64x4, v_int64x4>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond

View File

@@ -458,10 +458,6 @@ OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float64x8, double, pd, __m256d)
{ return _Tpvec(_mm512_setzero_si512()); } \
inline _Tpvec v512_setall_##suffix(_Tp v) \
{ return _Tpvec(_mm512_set1_##ssuffix((ctype_s)v)); } \
template <> inline _Tpvec v_setzero_() \
{ return v512_setzero_##suffix(); } \
template <> inline _Tpvec v_setall_(_Tp v) \
{ return v512_setall_##suffix(v); } \
OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, OPENCV_HAL_NOP) \
OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, OPENCV_HAL_NOP) \
OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, OPENCV_HAL_NOP) \
@@ -487,10 +483,6 @@ OPENCV_HAL_IMPL_AVX512_INIT(v_int64x8, int64, s64, epi64, int64)
{ return _Tpvec(_mm512_setzero_##zsuffix()); } \
inline _Tpvec v512_setall_##suffix(_Tp v) \
{ return _Tpvec(_mm512_set1_##zsuffix(v)); } \
template <> inline _Tpvec v_setzero_() \
{ return v512_setzero_##suffix(); } \
template <> inline _Tpvec v_setall_(_Tp v) \
{ return v512_setall_##suffix(v); } \
OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, cast) \
OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, cast) \
OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, cast) \
@@ -671,56 +663,58 @@ inline v_int8x64 v_mul_wrap(const v_int8x64& a, const v_int8x64& b)
}
#define OPENCV_HAL_IMPL_AVX512_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); } \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ a.val = intrin(a.val, b.val); return a; }
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint32x16, _mm512_add_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint32x16, _mm512_sub_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int32x16, _mm512_add_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int32x16, _mm512_sub_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint64x8, _mm512_add_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint64x8, _mm512_sub_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int64x8, _mm512_add_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int64x8, _mm512_sub_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint32x16, _mm512_add_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint32x16, _mm512_sub_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int32x16, _mm512_add_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int32x16, _mm512_sub_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint64x8, _mm512_add_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint64x8, _mm512_sub_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int64x8, _mm512_add_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int64x8, _mm512_sub_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_uint32x16, _mm512_mullo_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_int32x16, _mm512_mullo_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_uint64x8, _mm512_mullo_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_int64x8, _mm512_mullo_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint32x16, _mm512_mullo_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int32x16, _mm512_mullo_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint64x8, _mm512_mullo_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int64x8, _mm512_mullo_epi64)
/** Saturating arithmetics **/
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint8x64, _mm512_adds_epu8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint8x64, _mm512_subs_epu8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int8x64, _mm512_adds_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int8x64, _mm512_subs_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint16x32, _mm512_adds_epu16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint16x32, _mm512_subs_epu16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int16x32, _mm512_adds_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int16x32, _mm512_subs_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint8x64, _mm512_adds_epu8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint8x64, _mm512_subs_epu8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int8x64, _mm512_adds_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int8x64, _mm512_subs_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint16x32, _mm512_adds_epu16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint16x32, _mm512_subs_epu16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int16x32, _mm512_adds_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int16x32, _mm512_subs_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_float32x16, _mm512_add_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_float32x16, _mm512_sub_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_float32x16, _mm512_mul_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_div, v_float32x16, _mm512_div_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_float64x8, _mm512_add_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_float64x8, _mm512_sub_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_float64x8, _mm512_mul_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_div, v_float64x8, _mm512_div_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float32x16, _mm512_add_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float32x16, _mm512_sub_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float32x16, _mm512_mul_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float32x16, _mm512_div_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float64x8, _mm512_add_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float64x8, _mm512_sub_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float64x8, _mm512_mul_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float64x8, _mm512_div_pd)
// saturating multiply
inline v_uint8x64 v_mul(const v_uint8x64& a, const v_uint8x64& b)
inline v_uint8x64 operator * (const v_uint8x64& a, const v_uint8x64& b)
{
v_uint16x32 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
inline v_int8x64 v_mul(const v_int8x64& a, const v_int8x64& b)
inline v_int8x64 operator * (const v_int8x64& a, const v_int8x64& b)
{
v_int16x32 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
inline v_uint16x32 v_mul(const v_uint16x32& a, const v_uint16x32& b)
inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b)
{
__m512i pl = _mm512_mullo_epi16(a.val, b.val);
__m512i ph = _mm512_mulhi_epu16(a.val, b.val);
@@ -730,7 +724,7 @@ inline v_uint16x32 v_mul(const v_uint16x32& a, const v_uint16x32& b)
const __m512i m = _mm512_set1_epi32(65535);
return v_uint16x32(_mm512_packus_epi32(_mm512_min_epu32(p0, m), _mm512_min_epu32(p1, m)));
}
inline v_int16x32 v_mul(const v_int16x32& a, const v_int16x32& b)
inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b)
{
__m512i pl = _mm512_mullo_epi16(a.val, b.val);
__m512i ph = _mm512_mulhi_epi16(a.val, b.val);
@@ -739,6 +733,15 @@ inline v_int16x32 v_mul(const v_int16x32& a, const v_int16x32& b)
return v_int16x32(_mm512_packs_epi32(p0, p1));
}
inline v_uint8x64& operator *= (v_uint8x64& a, const v_uint8x64& b)
{ a = a * b; return a; }
inline v_int8x64& operator *= (v_int8x64& a, const v_int8x64& b)
{ a = a * b; return a; }
inline v_uint16x32& operator *= (v_uint16x32& a, const v_uint16x32& b)
{ a = a * b; return a; }
inline v_int16x32& operator *= (v_int16x32& a, const v_int16x32& b)
{ a = a * b; return a; }
inline v_int16x32 v_mul_hi(const v_int16x32& a, const v_int16x32& b) { return v_int16x32(_mm512_mulhi_epi16(a.val, b.val)); }
inline v_uint16x32 v_mul_hi(const v_uint16x32& a, const v_uint16x32& b) { return v_uint16x32(_mm512_mulhi_epu16(a.val, b.val)); }
@@ -799,13 +802,13 @@ inline void v_mul_expand(const v_int32x16& a, const v_int32x16& b,
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_AVX512_SHIFT_OP(_Tpuvec, _Tpsvec, suffix) \
inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
{ return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \
inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
{ return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \
inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
{ return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \
inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
{ return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); } \
template<int imm> \
inline _Tpuvec v_shl(const _Tpuvec& a) \
@@ -827,10 +830,10 @@ OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint64x8, v_int64x8, epi64)
/** Bitwise logic **/
#define OPENCV_HAL_IMPL_AVX512_LOGIC_OP(_Tpvec, suffix, not_const) \
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_and, _Tpvec, _mm512_and_##suffix) \
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_or, _Tpvec, _mm512_or_##suffix) \
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_xor, _Tpvec, _mm512_xor_##suffix) \
inline _Tpvec v_not(const _Tpvec& a) \
OPENCV_HAL_IMPL_AVX512_BIN_OP(&, _Tpvec, _mm512_and_##suffix) \
OPENCV_HAL_IMPL_AVX512_BIN_OP(|, _Tpvec, _mm512_or_##suffix) \
OPENCV_HAL_IMPL_AVX512_BIN_OP(^, _Tpvec, _mm512_xor_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ return _Tpvec(_mm512_xor_##suffix(a.val, not_const)); }
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint8x64, si512, _mm512_set1_epi32(-1))
@@ -862,16 +865,16 @@ OPENCV_HAL_IMPL_AVX512_SELECT(v_float64x8, pd, pd)
/** Comparison **/
#define OPENCV_HAL_IMPL_AVX512_CMP_INT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval)); }
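// Illustration (not part of the diff): AVX-512 compares produce a k-mask
// (one bit per lane), so the macro above rebuilds the all-ones/all-zeros
// lane mask that the other backends return, via maskz_set1. A hedged
// scalar sketch of that expansion:
#include <cstdint>
inline void kmask_to_lanes_sketch(uint16_t k, int32_t out[16]) {
    for (int i = 0; i < 16; ++i)
        out[i] = ((k >> i) & 1) ? -1 : 0; // set1(-1) under the mask, else zero
}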
#define OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(_Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(v_eq, _MM_CMPINT_EQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(v_ne, _MM_CMPINT_NE, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(v_lt, _MM_CMPINT_LT, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(v_gt, _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(v_le, _MM_CMPINT_LE, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(v_ge, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval)
OPENCV_HAL_IMPL_AVX512_CMP_INT(==, _MM_CMPINT_EQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(!=, _MM_CMPINT_NE, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(<, _MM_CMPINT_LT, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(>, _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(<=, _MM_CMPINT_LE, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(>=, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint8x64, epu8, epi8, (char)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int8x64, epi8, epi8, (char)-1)
@@ -883,16 +886,16 @@ OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint64x8, epu64, epi64, (int64)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int64x8, epi64, epi64, (int64)-1)
#define OPENCV_HAL_IMPL_AVX512_CMP_FLT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm512_castsi512_##sufcmp(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval))); }
#define OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(_Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_eq, _CMP_EQ_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_ne, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_lt, _CMP_LT_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_gt, _CMP_GT_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_le, _CMP_LE_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_ge, _CMP_GE_OQ, _Tpvec, sufcmp, sufset, tval)
OPENCV_HAL_IMPL_AVX512_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, sufcmp, sufset, tval)
OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float32x16, ps, epi32, (int)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float64x8, pd, epi64, (int64)-1)
@@ -1247,9 +1250,9 @@ OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, min, v_int16x32, min_epi16)
OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, max, v_int16x32, max_epi16)
inline int v_reduce_sum(const v_int16x32& a)
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline uint v_reduce_sum(const v_uint16x32& a)
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
#define OPENCV_HAL_IMPL_AVX512_REDUCE_64(sctype, func, _Tpvec, ifunc) \
inline sctype v_reduce_##func(const _Tpvec& a) \
@@ -1303,17 +1306,17 @@ inline unsigned v_reduce_sad(const v_int8x64& a, const v_int8x64& b)
return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
}
inline unsigned v_reduce_sad(const v_uint16x32& a, const v_uint16x32& b)
{ return v_reduce_sum(v_add_wrap(v_sub(a, b), v_sub(b, a))); }
{ return v_reduce_sum(v_add_wrap(a - b, b - a)); }
inline unsigned v_reduce_sad(const v_int16x32& a, const v_int16x32& b)
{ return v_reduce_sum(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)))); }
inline unsigned v_reduce_sad(const v_uint32x16& a, const v_uint32x16& b)
{ return v_reduce_sum(v_sub(v_max(a, b), v_min(a, b))); }
{ return v_reduce_sum(v_max(a, b) - v_min(a, b)); }
inline unsigned v_reduce_sad(const v_int32x16& a, const v_int32x16& b)
{ return v_reduce_sum(v_reinterpret_as_u32(v_sub(v_max(a, b), v_min(a, b)))); }
{ return v_reduce_sum(v_reinterpret_as_u32(v_max(a, b) - v_min(a, b))); }
inline float v_reduce_sad(const v_float32x16& a, const v_float32x16& b)
{ return v_reduce_sum(v_and(v_sub(a, b), v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff))))); }
{ return v_reduce_sum((a - b) & v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)))); }
inline double v_reduce_sad(const v_float64x8& a, const v_float64x8& b)
{ return v_reduce_sum(v_and(v_sub(a, b), v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff))))); }
{ return v_reduce_sum((a - b) & v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff)))); }
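The float variants above take the absolute value by clearing the IEEE-754 sign bit with a constant mask.
A scalar sketch of that trick (illustrative, not the library code):

#include <cstdint>
#include <cstring>

// Scalar counterpart of the 0x7fffffff masking above: clearing the sign bit
// of the IEEE-754 encoding yields |x|.
static float abs_by_mask(float x)
{
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits)); // bit-exact reinterpretation
    bits &= 0x7fffffffu;                  // drop the sign bit
    std::memcpy(&x, &bits, sizeof(x));
    return x;
}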
/** Popcount **/
inline v_uint8x64 v_popcount(const v_int8x64& a)
@@ -1348,8 +1351,8 @@ inline v_uint16x32 v_popcount(const v_int16x32& a)
_mm512_popcnt_epi32(_mm512_unpackhi_epi16(a.val, zero))));
#else
v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a));
p = v_add(p, v_rotate_right<1>(p));
return v_and(v_reinterpret_as_u16(p), v512_setall_u16(0x00ff));
p += v_rotate_right<1>(p);
return v_reinterpret_as_u16(p) & v512_setall_u16(0x00ff);
#endif
}
inline v_uint32x16 v_popcount(const v_int32x16& a)
@@ -1358,9 +1361,9 @@ inline v_uint32x16 v_popcount(const v_int32x16& a)
return v_uint32x16(_mm512_popcnt_epi32(a.val));
#else
v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a));
p = v_add(p, v_rotate_right<1>(p));
p = v_add(p, v_rotate_right<2>(p));
return v_and(v_reinterpret_as_u32(p), v512_setall_u32(0x000000ff));
p += v_rotate_right<1>(p);
p += v_rotate_right<2>(p);
return v_reinterpret_as_u32(p) & v512_setall_u32(0x000000ff);
#endif
}
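In the fallback branches above, per-byte popcounts are widened by summing neighbouring lanes and masking
off everything except each lane's own total. A scalar sketch of the same widening for one 32-bit lane:

#include <cstdint>

// Widen four per-byte popcounts into a single count for a 32-bit lane.
static uint32_t popcount32_sketch(uint32_t x)
{
    uint32_t p = 0;
    for (int b = 0; b < 4; b++)        // portable per-byte popcount
    {
        uint32_t byte = (x >> (8 * b)) & 0xffu, c = 0;
        while (byte) { c += byte & 1u; byte >>= 1; }
        p |= c << (8 * b);
    }
    p += p >> 8;                       // counterpart of v_rotate_right<1>
    p += p >> 16;                      // counterpart of v_rotate_right<2>
    return p & 0x000000ffu;            // keep only this lane's total
}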
inline v_uint64x8 v_popcount(const v_int64x8& a)
@@ -1400,9 +1403,9 @@ inline v_uint64x8 v_popcount(const v_uint64x8& a) { return v_popcount(v_reinte
inline _Tpvec v_sqrt(const _Tpvec& x) \
{ return _Tpvec(_mm512_sqrt_##suffix(x.val)); } \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_fma(a, a, v_mul(b, b)); } \
{ return v_fma(a, a, b * b); } \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_sqrt(v_fma(a, a, v_mul(b, b))); }
{ return v_sqrt(v_fma(a, a, b * b)); }
OPENCV_HAL_IMPL_AVX512_MULADD(v_float32x16, ps)
OPENCV_HAL_IMPL_AVX512_MULADD(v_float64x8, pd)
@@ -1410,7 +1413,7 @@ OPENCV_HAL_IMPL_AVX512_MISC(v_float32x16, ps)
OPENCV_HAL_IMPL_AVX512_MISC(v_float64x8, pd)
inline v_int32x16 v_fma(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
{ return v_add(v_mul(a, b), c); }
{ return a * b + c; }
inline v_int32x16 v_muladd(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
{ return v_fma(a, b, c); }
@@ -1419,9 +1422,9 @@ inline v_float32x16 v_invsqrt(const v_float32x16& x)
#if CV_AVX_512ER
return v_float32x16(_mm512_rsqrt28_ps(x.val));
#else
v_float32x16 half = v_mul(x, v512_setall_f32(0.5));
v_float32x16 half = x * v512_setall_f32(0.5);
v_float32x16 t = v_float32x16(_mm512_rsqrt14_ps(x.val));
t = v_mul(t, v_sub(v512_setall_f32(1.5), v_mul(v_mul(t, t), half)));
t *= v512_setall_f32(1.5) - ((t * t) * half);
return t;
#endif
}
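The fallback branch refines the hardware estimate with one Newton-Raphson step for 1/sqrt(x); in scalar
form (a sketch, assuming finite positive x):

// One Newton-Raphson step: t' = t * (1.5 - 0.5*x*t*t) converges to 1/sqrt(x),
// roughly doubling the number of correct bits per step.
static float rsqrt_refine(float x, float t)
{
    return t * (1.5f - (x * 0.5f) * (t * t));
}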
@@ -1431,7 +1434,7 @@ inline v_float64x8 v_invsqrt(const v_float64x8& x)
#if CV_AVX_512ER
return v_float64x8(_mm512_rsqrt28_pd(x.val));
#else
return v_div(v512_setall_f64(1.), v_sqrt(x));
return v512_setall_f64(1.) / v_sqrt(x);
// v_float64x8 half = x * v512_setall_f64(0.5);
// v_float64x8 t = v_float64x8(_mm512_rsqrt14_pd(x.val));
// t *= v512_setall_f64(1.5) - ((t * t) * half);
@@ -1479,17 +1482,17 @@ inline v_float64x8 v_abs(const v_float64x8& x)
/** Absolute difference **/
inline v_uint8x64 v_absdiff(const v_uint8x64& a, const v_uint8x64& b)
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
{ return v_add_wrap(a - b, b - a); }
inline v_uint16x32 v_absdiff(const v_uint16x32& a, const v_uint16x32& b)
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
{ return v_add_wrap(a - b, b - a); }
inline v_uint32x16 v_absdiff(const v_uint32x16& a, const v_uint32x16& b)
{ return v_sub(v_max(a, b), v_min(a, b)); }
{ return v_max(a, b) - v_min(a, b); }
inline v_uint8x64 v_absdiff(const v_int8x64& a, const v_int8x64& b)
{
v_int8x64 d = v_sub_wrap(a, b);
v_int8x64 m = v_lt(a, b);
return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m));
v_int8x64 m = a < b;
return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
}
inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b)
@@ -1497,26 +1500,26 @@ inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b)
inline v_uint32x16 v_absdiff(const v_int32x16& a, const v_int32x16& b)
{
v_int32x16 d = v_sub(a, b);
v_int32x16 m = v_lt(a, b);
return v_reinterpret_as_u32(v_sub(v_xor(d, m), m));
v_int32x16 d = a - b;
v_int32x16 m = a < b;
return v_reinterpret_as_u32((d ^ m) - m);
}
inline v_float32x16 v_absdiff(const v_float32x16& a, const v_float32x16& b)
{ return v_abs(v_sub(a, b)); }
{ return v_abs(a - b); }
inline v_float64x8 v_absdiff(const v_float64x8& a, const v_float64x8& b)
{ return v_abs(v_sub(a, b)); }
{ return v_abs(a - b); }
/** Saturating absolute difference **/
inline v_int8x64 v_absdiffs(const v_int8x64& a, const v_int8x64& b)
{
v_int8x64 d = v_sub(a, b);
v_int8x64 m = v_lt(a, b);
return v_sub(v_xor(d, m), m);
v_int8x64 d = a - b;
v_int8x64 m = a < b;
return (d ^ m) - m;
}
inline v_int16x32 v_absdiffs(const v_int16x32& a, const v_int16x32& b)
{ return v_sub(v_max(a, b), v_min(a, b)); }
{ return v_max(a, b) - v_min(a, b); }
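The (d ^ m) - m pattern above is branch-free conditional negation: when the comparison mask m is all
ones, the XOR flips every bit and subtracting -1 adds one, i.e. two's-complement negation. A scalar sketch:

#include <cstdint>

// Branch-free |a - b| for signed inputs, mirroring (d ^ m) - m above.
static int32_t absdiff_sketch(int32_t a, int32_t b)
{
    int32_t d = (int32_t)((uint32_t)a - (uint32_t)b); // wrapping subtract, as in the vector code
    int32_t m = (a < b) ? -1 : 0;                     // all-ones mask, like the vector compare
    return (d ^ m) - m;                               // negates d exactly when the mask is set
}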
////////// Conversions /////////
@@ -1815,7 +1818,7 @@ inline v_float32x16 v_pack_triplets(const v_float32x16& vec)
inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b)
{ return v_int32x16(_mm512_madd_epi16(a.val, b.val)); }
inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c)
{ return v_add(v_dotprod(a, b), c); }
{ return v_dotprod(a, b) + c; }
// 32 >> 64
inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b)
@@ -1825,7 +1828,7 @@ inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b)
return v_int64x8(_mm512_add_epi64(even, odd));
}
inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
{ return v_add(v_dotprod(a, b), c); }
{ return v_dotprod(a, b) + c; }
// 8 >> 32
inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b)
@@ -1841,7 +1844,7 @@ inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b)
return v_uint32x16(_mm512_add_epi32(prod0, prod1));
}
inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b)
{
@@ -1856,7 +1859,7 @@ inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b)
return v_int32x16(_mm512_add_epi32(prod0, prod1));
}
inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
// 16 >> 64
inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b)
@@ -1880,7 +1883,7 @@ inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b)
));
}
inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b)
{
@@ -1890,13 +1893,13 @@ inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b)
return v_int64x8(_mm512_add_epi64(even, odd));
}
inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
// 32 >> 64f
inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
//////// Fast Dot Product ////////
@@ -1941,7 +1944,7 @@ inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32&
return v_uint64x8(_mm512_add_epi64(p15_, p9d_));
}
inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b)
{ return v_dotprod_expand(a, b); }
@@ -1952,7 +1955,7 @@ inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b,
inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b)
{ return v_dotprod_expand(a, b); }
inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
#define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \
@@ -1966,7 +1969,7 @@ inline v_float32x16 v_matmul(const v_float32x16& v,
v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1);
v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2);
v_float32x16 v37 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 3);
return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v_mul(v37, m3))));
return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
}
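v_matmul broadcasts each component of v and chains FMAs, treating the matrix as four column vectors.
The scalar equivalent (a sketch):

// dst = m0*v[0] + m1*v[1] + m2*v[2] + m3*v[3], i.e. a 4x4 matrix stored as
// four columns times a 4-component vector.
static void matmul4_sketch(const float v[4], const float m0[4], const float m1[4],
                           const float m2[4], const float m3[4], float dst[4])
{
    for (int i = 0; i < 4; i++)
        dst[i] = m0[i] * v[0] + m1[i] * v[1] + m2[i] * v[2] + m3[i] * v[3];
}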
inline v_float32x16 v_matmuladd(const v_float32x16& v,
@@ -2067,43 +2070,43 @@ v_uint8x64 v_rshr_pack(const v_uint16x32& a, const v_uint16x32& b)
{
// we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
return v_pack_u(v_reinterpret_as_s16(v_shr(v_add(a, delta), n)),
v_reinterpret_as_s16(v_shr(v_add(b, delta), n)));
return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
v_reinterpret_as_s16((b + delta) >> n));
}
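Adding delta = 1 << (n-1) before the shift makes the pack round to nearest instead of truncating; the
scalar equivalent (assuming n > 0, as the comment above notes):

// Rounding right shift behind v_rshr_pack: add half of 2^n, then shift.
static unsigned rshr_round(unsigned v, int n)
{
    return (v + (1u << (n - 1))) >> n;
}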
template<int n> inline
void v_rshr_pack_store(uchar* ptr, const v_uint16x32& a)
{
v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
v_pack_u_store(ptr, v_reinterpret_as_s16(v_shr(v_add(a, delta), n)));
v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
}
template<int n> inline
v_uint8x64 v_rshr_pack_u(const v_int16x32& a, const v_int16x32& b)
{
v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
return v_pack_u((a + delta) >> n, (b + delta) >> n);
}
template<int n> inline
void v_rshr_pack_u_store(uchar* ptr, const v_int16x32& a)
{
v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
v_pack_u_store(ptr, v_shr(v_add(a, delta), n));
v_pack_u_store(ptr, (a + delta) >> n);
}
template<int n> inline
v_int8x64 v_rshr_pack(const v_int16x32& a, const v_int16x32& b)
{
v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
return v_pack((a + delta) >> n, (b + delta) >> n);
}
template<int n> inline
void v_rshr_pack_store(schar* ptr, const v_int16x32& a)
{
v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
v_pack_store(ptr, v_shr(v_add(a, delta), n));
v_pack_store(ptr, (a + delta) >> n);
}
// 32
@@ -2136,43 +2139,43 @@ template<int n> inline
v_uint16x32 v_rshr_pack(const v_uint32x16& a, const v_uint32x16& b)
{
v_uint32x16 delta = v512_setall_u32(1 << (n-1));
return v_pack_u(v_reinterpret_as_s32(v_shr(v_add(a, delta), n)),
v_reinterpret_as_s32(v_shr(v_add(b, delta), n)));
return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
v_reinterpret_as_s32((b + delta) >> n));
}
template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x16& a)
{
v_uint32x16 delta = v512_setall_u32(1 << (n-1));
v_pack_u_store(ptr, v_reinterpret_as_s32(v_shr(v_add(a, delta), n)));
v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
}
template<int n> inline
v_uint16x32 v_rshr_pack_u(const v_int32x16& a, const v_int32x16& b)
{
v_int32x16 delta = v512_setall_s32(1 << (n-1));
return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
return v_pack_u((a + delta) >> n, (b + delta) >> n);
}
template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x16& a)
{
v_int32x16 delta = v512_setall_s32(1 << (n-1));
v_pack_u_store(ptr, v_shr(v_add(a, delta), n));
v_pack_u_store(ptr, (a + delta) >> n);
}
template<int n> inline
v_int16x32 v_rshr_pack(const v_int32x16& a, const v_int32x16& b)
{
v_int32x16 delta = v512_setall_s32(1 << (n-1));
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
return v_pack((a + delta) >> n, (b + delta) >> n);
}
template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x16& a)
{
v_int32x16 delta = v512_setall_s32(1 << (n-1));
v_pack_store(ptr, v_shr(v_add(a, delta), n));
v_pack_store(ptr, (a + delta) >> n);
}
// 64
@@ -2193,28 +2196,28 @@ template<int n> inline
v_uint32x16 v_rshr_pack(const v_uint64x8& a, const v_uint64x8& b)
{
v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
return v_pack((a + delta) >> n, (b + delta) >> n);
}
template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x8& a)
{
v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
v_pack_store(ptr, v_shr(v_add(a, delta), n));
v_pack_store(ptr, (a + delta) >> n);
}
template<int n> inline
v_int32x16 v_rshr_pack(const v_int64x8& a, const v_int64x8& b)
{
v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
return v_pack((a + delta) >> n, (b + delta) >> n);
}
template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x8& a)
{
v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
v_pack_store(ptr, v_shr(v_add(a, delta), n));
v_pack_store(ptr, (a + delta) >> n);
}
// pack boolean
@@ -3078,20 +3081,6 @@ inline int v_scan_forward(const v_float64x8& a) { return trailingZeros32(v_signm
inline void v512_cleanup() { _mm256_zeroall(); }
#include "intrin_math.hpp"
inline v_float32x16 v_exp(const v_float32x16& x) { return v_exp_default_32f<v_float32x16, v_int32x16>(x); }
inline v_float32x16 v_log(const v_float32x16& x) { return v_log_default_32f<v_float32x16, v_int32x16>(x); }
inline void v_sincos(const v_float32x16& x, v_float32x16& s, v_float32x16& c) { v_sincos_default_32f<v_float32x16, v_int32x16>(x, s, c); }
inline v_float32x16 v_sin(const v_float32x16& x) { return v_sin_default_32f<v_float32x16, v_int32x16>(x); }
inline v_float32x16 v_cos(const v_float32x16& x) { return v_cos_default_32f<v_float32x16, v_int32x16>(x); }
inline v_float32x16 v_erf(const v_float32x16& x) { return v_erf_default_32f<v_float32x16, v_int32x16>(x); }
inline v_float64x8 v_exp(const v_float64x8& x) { return v_exp_default_64f<v_float64x8, v_int64x8>(x); }
inline v_float64x8 v_log(const v_float64x8& x) { return v_log_default_64f<v_float64x8, v_int64x8>(x); }
inline void v_sincos(const v_float64x8& x, v_float64x8& s, v_float64x8& c) { v_sincos_default_64f<v_float64x8, v_int64x8>(x, s, c); }
inline v_float64x8 v_sin(const v_float64x8& x) { return v_sin_default_64f<v_float64x8, v_int64x8>(x); }
inline v_float64x8 v_cos(const v_float64x8& x) { return v_cos_default_64f<v_float64x8, v_int64x8>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond

View File

@@ -81,26 +81,9 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
"Universal intrinsics" is a types and functions set intended to simplify vectorization of code on
different platforms. Currently a few different SIMD extensions on different architectures are supported.
OpenCV Universal Intrinsics support the following instruction sets:
- *128 bit* registers of various types support is implemented for a wide range of architectures including
- x86(SSE/SSE2/SSE4.2),
- ARM(NEON): 64-bit float (64F) requires AArch64,
- PowerPC(VSX),
- MIPS(MSA),
- LoongArch(LSX),
- RISC-V(RVV 0.7.1): Fixed-length implementation,
- WASM: 64-bit float (64F) is not supported,
- *256 bit* registers are supported on
- x86(AVX2),
- LoongArch (LASX),
- *512 bit* registers are supported on
- x86(AVX512),
- *Vector Length Agnostic (VLA)* registers are supported on
- RISC-V(RVV 1.0)
- ARM(SVE/SVE2): Powered by Arm KleidiCV integration (OpenCV 4.11+),
Support for 128-bit registers of various types is implemented for a wide range of architectures,
including x86(__SSE/SSE2/SSE4.2__), ARM(__NEON__), PowerPC(__VSX__) and MIPS(__MSA__).
256-bit registers are supported on x86(__AVX2__), and 512-bit registers are supported on x86(__AVX512__).
If no SIMD extension is available during compilation, the fallback C++ implementation of the intrinsics
is chosen, and the code works as expected, although it may be slower.
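A minimal usage sketch (assuming `len` is a multiple of the lane count; with no SIMD extension the same
code runs through the scalar fallback):

@code
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

// Element-wise sum of two float arrays with universal intrinsics.
void add_arrays(const float* a, const float* b, float* dst, int len)
{
    const int step = v_float32::nlanes;   // 4/8/16 lanes, depending on the ISA
    for (int i = 0; i < len; i += step)
    {
        v_float32 x = vx_load(a + i);
        v_float32 y = vx_load(b + i);
        v_store(dst + i, x + y);
    }
}
@endcode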
@@ -242,30 +225,32 @@ These operations allow to reorder or recombine elements in one or multiple vecto
Element-wise binary and unary operations.
- Arithmetics:
@ref v_add,
@ref v_sub,
@ref v_mul,
@ref v_div,
@ref operator +(const v_reg &a, const v_reg &b) "+",
@ref operator -(const v_reg &a, const v_reg &b) "-",
@ref operator *(const v_reg &a, const v_reg &b) "*",
@ref operator /(const v_reg &a, const v_reg &b) "/",
@ref v_mul_expand
- Non-saturating arithmetics: @ref v_add_wrap, @ref v_sub_wrap
- Bitwise shifts:
@ref operator <<(const v_reg &a, int s) "<<",
@ref operator >>(const v_reg &a, int s) ">>",
@ref v_shl, @ref v_shr
- Bitwise logic:
@ref v_and,
@ref v_or,
@ref v_xor,
@ref v_not
@ref operator &(const v_reg &a, const v_reg &b) "&",
@ref operator |(const v_reg &a, const v_reg &b) "|",
@ref operator ^(const v_reg &a, const v_reg &b) "^",
@ref operator ~(const v_reg &a) "~"
- Comparison:
@ref v_gt,
@ref v_ge,
@ref v_lt,
@ref v_le,
@ref v_eq,
@ref v_ne
@ref operator >(const v_reg &a, const v_reg &b) ">",
@ref operator >=(const v_reg &a, const v_reg &b) ">=",
@ref operator <(const v_reg &a, const v_reg &b) "<",
@ref operator <=(const v_reg &a, const v_reg &b) "<=",
@ref operator ==(const v_reg &a, const v_reg &b) "==",
@ref operator !=(const v_reg &a, const v_reg &b) "!="
- min/max: @ref v_min, @ref v_max
@@ -278,8 +263,7 @@ Most of these operations return only one value.
### Other math
- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude, @ref v_exp, @ref v_log,
@ref v_erf, @ref v_sin, @ref v_cos
- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude
- Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs
### Conversions
@@ -379,9 +363,6 @@ Floating point:
|reverse | x | x |
|extract_n | x | x |
|broadcast_element | x | |
|exp | x | x |
|log | x | x |
|sin, cos | x | x |
@{ */
@@ -589,43 +570,50 @@ enum {
/** @brief Add values
For all types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_add(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator+(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator+=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
/** @brief Subtract values
For all types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_sub(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator-(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator-=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
/** @brief Multiply values
For 16- and 32-bit integer types and floating types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_mul(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator*(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator*=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
/** @brief Divide values
For floating types only. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_div(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator/(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator/=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
/** @brief Bitwise AND
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_and(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator&(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator&=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_or(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
/** @brief Bitwise XOR
Only for integer types.*/
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_xor(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator^(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator^=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
/** @brief Bitwise NOT
Only for integer types.*/
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_not(const v_reg<_Tp, n>& a);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator~(const v_reg<_Tp, n>& a);
#ifndef CV_DOXYGEN
@@ -648,26 +636,33 @@ __CV_EXPAND(macro_name(double, __VA_ARGS__)) \
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, __VA_ARGS__) \
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, __VA_ARGS__) \
#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op, func) \
#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op) \
template<int n> inline \
v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
v_reg<_Tp, n> operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
v_reg<_Tp, n> c; \
for( int i = 0; i < n; i++ ) \
c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
return c; \
} \
template<int n> inline \
v_reg<_Tp, n>& operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
for( int i = 0; i < n; i++ ) \
a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
return a; \
}
#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op, func) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op, func)
#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op)
CV__HAL_INTRIN_IMPL_BIN_OP(+, v_add)
CV__HAL_INTRIN_IMPL_BIN_OP(-, v_sub)
CV__HAL_INTRIN_IMPL_BIN_OP(*, v_mul)
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /, v_div)
CV__HAL_INTRIN_IMPL_BIN_OP(+)
CV__HAL_INTRIN_IMPL_BIN_OP(-)
CV__HAL_INTRIN_IMPL_BIN_OP(*)
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /)
#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op, func) \
#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op) \
template<int n> CV_INLINE \
v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
v_reg<_Tp, n> c; \
typedef typename V_TypeTraits<_Tp>::int_type itype; \
@@ -675,20 +670,29 @@ v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
return c; \
} \
template<int n> CV_INLINE \
v_reg<_Tp, n>& operator bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
typedef typename V_TypeTraits<_Tp>::int_type itype; \
for( int i = 0; i < n; i++ ) \
a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
return a; \
}
#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op, func) \
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op, func) \
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op, func) /* TODO: FIXIT remove this after masks refactoring */
#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op) \
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) \
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) /* TODO: FIXIT remove this after masks refactoring */
CV__HAL_INTRIN_IMPL_BIT_OP(&, v_and)
CV__HAL_INTRIN_IMPL_BIT_OP(|, v_or)
CV__HAL_INTRIN_IMPL_BIT_OP(^, v_xor)
CV__HAL_INTRIN_IMPL_BIT_OP(&)
CV__HAL_INTRIN_IMPL_BIT_OP(|)
CV__HAL_INTRIN_IMPL_BIT_OP(^)
#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy, dummy2) \
#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy) \
template<int n> CV_INLINE \
v_reg<_Tp, n> v_not(const v_reg<_Tp, n>& a) \
v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \
{ \
v_reg<_Tp, n> c; \
for( int i = 0; i < n; i++ ) \
@@ -696,7 +700,7 @@ v_reg<_Tp, n> v_not(const v_reg<_Tp, n>& a) \
return c; \
} \
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~, v_not)
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~)
#endif // !CV_DOXYGEN
@@ -717,85 +721,12 @@ template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a)
Only for floating point types.*/
OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
/**
* @brief Exponential \f$ e^x \f$ of elements
*
* Only for floating point types. Core implementation steps:
* 1. Decompose Input: Rewrite the input as \f$ e^x = 2^{x \cdot \log_2 e} \f$ and split the exponent into integer and fractional parts:
* \f$ x \cdot \log_2 e = n + f \f$, where \f$ n \f$ is the integer part and \f$ f \f$ is the fractional part.
* 2. Compute \f$ 2^n \f$: Calculated by shifting the bits.
* 3. Adjust Fractional Part: Compute \f$ f \cdot \ln2 \f$ to convert the fractional part to base \f$ e \f$.
* \f$ C1 \f$ and \f$ C2 \f$ are used to adjust the fractional part.
* 4. Polynomial Approximation for \f$ e^{f \cdot \ln2} \f$: The closer the fractional part is to 0, the more accurate the result.
* - For float16 and float32, use a Taylor series with 6 terms.
* - For float64, use a Padé approximation with 4 terms.
* 5. Combine Results: Multiply the two parts together to get the final result:
* \f$ e^x = 2^n \cdot e^{f \cdot \ln2} \f$.
*
* @note The precision of the calculation depends on the implementation and the data type of the input vector.
*/
OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
#define OPENCV_HAL_MATH_HAVE_EXP 1
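As a scalar illustration of the decomposition described above (the constants and the polynomial here are
illustrative, not the exact ones used by the implementation):

#include <cmath>

// e^x = 2^n * e^(f*ln2), where x*log2(e) = n + f with n integral and f in [0, 1).
static double exp_sketch(double x)
{
    double t = x * 1.4426950408889634;        // x * log2(e)
    double n = std::floor(t);                 // integer part
    double g = (t - n) * 0.6931471805599453;  // fractional part converted back to base e
    // short Taylor polynomial for e^g; accurate because g is small
    double p = 1 + g * (1 + g * (0.5 + g * (1.0 / 6 + g * (1.0 / 24 + g / 120))));
    return std::ldexp(p, (int)n);             // multiply by 2^n via the exponent field
}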
/**
* @brief Natural logarithm \f$ \log(x) \f$ of elements
*
* Only for floating point types. Core implementation steps:
* 1. Decompose Input: Use the binary representation to decompose the input into a mantissa part \f$ m \f$ and an exponent part \f$ e \f$, such that \f$ \log(x) = \log(m \cdot 2^e) = \log(m) + e \cdot \ln(2) \f$.
* 2. Adjust Mantissa and Exponent Parts: If the mantissa is less than \f$ \sqrt{0.5} \f$, adjust the exponent and mantissa to ensure the mantissa is in the range \f$ (\sqrt{0.5}, \sqrt{2}) \f$ for better approximation.
* 3. Polynomial Approximation for \f$ \log(m) \f$: The closer \f$ m \f$ is to 1, the more accurate the result.
* - For float16 and float32, use a Taylor series with 9 terms.
* - For float64, use a Padé approximation with 6 terms.
* 4. Combine Results: Add the two parts together to get the final result.
*
* @note The precision of the calculation depends on the implementation and the data type of the input.
*
* @note Similar to the behavior of std::log(), \f$ \ln(0) = -\infty \f$.
*/
OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
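A matching scalar sketch of the logarithm decomposition (std::log1p stands in for the polynomial step):

#include <cmath>

// log(x) = log(m) + e*ln2 for x = m * 2^e, with m re-centered near 1.
static double log_sketch(double x)
{
    int e;
    double m = std::frexp(x, &e);                 // x = m * 2^e, m in [0.5, 1)
    if (m < 0.7071067811865476) { m *= 2; e--; }  // keep m in (sqrt(0.5), sqrt(2))
    return std::log1p(m - 1) + e * 0.6931471805599453;
}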
/**
* @brief Error function.
*
* @note Only FP32 precision is supported for now.
*/
OPENCV_HAL_IMPL_MATH_FUNC(v_erf, std::erf, _Tp)
/**
* @brief Compute sine \f$ sin(x) \f$ and cosine \f$ cos(x) \f$ of elements at the same time
*
* Only for floating point types. Core implementation steps:
* 1. Input Normalization: Scale the periodicity from 2π to 4 and reduce the angle to the range \f$ [0, \frac{\pi}{4}] \f$ using periodicity and trigonometric identities.
* 2. Polynomial Approximation for \f$ sin(x) \f$ and \f$ cos(x) \f$:
* - For float16 and float32, use a Taylor series with 4 terms for sine and 5 terms for cosine.
* - For float64, use a Taylor series with 7 terms for sine and 8 terms for cosine.
* 3. Select Results: select and convert the final sine and cosine values for the original input angle.
*
* @note The precision of the calculation depends on the implementation and the data type of the input vector.
*/
template<typename _Tp, int n>
inline void v_sincos(const v_reg<_Tp, n>& x, v_reg<_Tp, n>& s, v_reg<_Tp, n>& c)
{
for( int i = 0; i < n; i++ )
{
s.s[i] = std::sin(x.s[i]);
c.s[i] = std::cos(x.s[i]);
}
}
/**
* @brief Sine \f$ sin(x) \f$ of elements
*
* Only for floating point types. Core implementation is the same as @ref v_sincos.
*/
//! @cond IGNORED
OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
/**
* @brief Cosine \f$ cos(x) \f$ of elements
*
* Only for floating point types. Core implementation is the same as @ref v_sincos.
*/
OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
//! @endcond
/** @brief Absolute value of elements
@@ -918,9 +849,9 @@ inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_CMP_OP(cmp_op, func) \
#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
template<typename _Tp, int n> \
inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
typedef typename V_TypeTraits<_Tp>::int_type itype; \
v_reg<_Tp, n> c; \
@@ -932,28 +863,28 @@ inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
/** @brief Less-than comparison
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(<, v_lt)
OPENCV_HAL_IMPL_CMP_OP(<)
/** @brief Greater-than comparison
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(>, v_gt)
OPENCV_HAL_IMPL_CMP_OP(>)
/** @brief Less-than or equal comparison
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(<=, v_le)
OPENCV_HAL_IMPL_CMP_OP(<=)
/** @brief Greater-than or equal comparison
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(>=, v_ge)
OPENCV_HAL_IMPL_CMP_OP(>=)
/** @brief Equal comparison */
OPENCV_HAL_IMPL_CMP_OP(==, v_eq)
OPENCV_HAL_IMPL_CMP_OP(==)
/** @brief Not equal comparison */
OPENCV_HAL_IMPL_CMP_OP(!=, v_ne)
OPENCV_HAL_IMPL_CMP_OP(!=)
template<int n>
inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a)
@@ -1322,8 +1253,8 @@ template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op, func) \
template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, int imm) \
#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
{ \
v_reg<_Tp, n> c; \
for( int i = 0; i < n; i++ ) \
@@ -1334,12 +1265,12 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a,
/** @brief Bitwise shift left
For 16-, 32- and 64-bit integer values. */
OPENCV_HAL_IMPL_SHIFT_OP(<<, v_shl)
OPENCV_HAL_IMPL_SHIFT_OP(<< )
/** @brief Bitwise shift right
For 16-, 32- and 64-bit integer values. */
OPENCV_HAL_IMPL_SHIFT_OP(>>, v_shr)
OPENCV_HAL_IMPL_SHIFT_OP(>> )
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
@@ -2848,8 +2779,7 @@ inline void v_transpose4x4( v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, prefix, suffix) \
inline _Tpvec prefix##_setzero_##suffix() { return _Tpvec::zero(); } \
template <> inline _Tpvec v_setzero_() { return _Tpvec::zero(); }
inline _Tpvec prefix##_setzero_##suffix() { return _Tpvec::zero(); }
//! @name Init with zero
//! @{
@@ -2895,8 +2825,7 @@ OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x8, v512, s64)
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, prefix, suffix) \
inline _Tpvec prefix##_setall_##suffix(_Tp val) { return _Tpvec::all(val); } \
template <> inline _Tpvec v_setall_(_Tp val) { return _Tpvec::all(val); }
inline _Tpvec prefix##_setall_##suffix(_Tp val) { return _Tpvec::all(val); }
//! @name Init with value
//! @{
@@ -2965,7 +2894,7 @@ OPENCV_HAL_IMPL_C_REINTERPRET(int64, s64)
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_C_SHIFTL(_Tp) \
template<int shift, int n> inline v_reg<_Tp, n> v_shl(const v_reg<_Tp, n>& a) \
{ return v_shl(a, shift); }
{ return a << shift; }
//! @name Left shift
//! @{
@@ -2982,7 +2911,7 @@ OPENCV_HAL_IMPL_C_SHIFTL(int64)
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_C_SHIFTR(_Tp) \
template<int shift, int n> inline v_reg<_Tp, n> v_shr(const v_reg<_Tp, n>& a) \
{ return v_shr(a, shift); }
{ return a >> shift; }
//! @name Right shift
//! @{
@@ -3308,7 +3237,7 @@ inline v_reg<float, n> v_matmuladd(const v_reg<float, n>& v,
template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b)
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_mul(v_cvt_f64_high(a), v_cvt_f64_high(b))); }
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b,
const v_reg<double, n/2>& c)
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }

View File

@@ -557,10 +557,6 @@ inline __m256i _lasx_256_castpd_si256(const __m256d& v)
{ return _Tpvec(__lasx_xvreplgr2vr_d(0)); } \
inline _Tpvec v256_setall_##suffix(_Tp v) \
{ return _Tpvec(__lasx_xvreplgr2vr_##ssuffix((ctype_s)v)); } \
template <> inline _Tpvec v_setzero_() \
{ return v256_setzero_##suffix(); } \
template <> inline _Tpvec v_setall_(_Tp v) \
{ return v256_setall_##suffix(v); } \
OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32, suffix, OPENCV_HAL_NOP) \
OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32, suffix, OPENCV_HAL_NOP) \
OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \
@@ -592,11 +588,7 @@ inline __m256d _lasx_256_castsi256_pd(const __m256i &v)
inline _Tpvec v256_setzero_##suffix() \
{ return _Tpvec(__lasx_xvreplgr2vr_d(0)); } \
inline _Tpvec v256_setall_##suffix(_Tp v) \
{ return _Tpvec(_v256_setall_##zsuffix(v)); } \
template <> inline _Tpvec v_setzero_() \
{ return v256_setzero_##suffix(); } \
template <> inline _Tpvec v_setall_(_Tp v) \
{ return v256_setall_##suffix(v); } \
{ return _Tpvec(_v256_setall_##zsuffix(v)); } \
OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32, suffix, cast) \
OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32, suffix, cast) \
OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, cast) \
@@ -658,18 +650,16 @@ inline v_float32x8 v256_shuffle(const v_float32x8 &a)
template<int m>
inline v_float64x4 v256_shuffle(const v_float64x4 &a)
{
const int m1 = m & 0b1;
const int m2 = m & 0b10;
const int m3 = m & 0b100;
const int m4 = m & 0b1000;
const int m5 = m2 << 1;
const int m6 = m3 << 2;
const int m7 = m4 << 3;
const int m8 = m1 & m5 & m6 & m7;
int imm8 = m & 0b0001; //0 or 1
if (m & 0x0b0010) imm8 |= 0b0100;
//else imm8 |= 0b0000;
if (m & 0x0b0100) imm8 |= 0b110000; //2 or 3
else imm8 |= 0b100000;
if (m & 0x0b1000) imm8 |= 0b11000000;
else imm8 |= 0b10000000;
return v_float64x4(__lasx_xvshuf4i_d(*((__m256i*)&a.val), *((__m256i*)&a.val), m8));
return v_float64x4(__lasx_xvpermi_d(*((__m256i*)&a.val), imm8));
}
template<typename _Tpvec>
inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1)
{
@@ -754,51 +744,53 @@ OPENCV_HAL_IMPL_LASX_ZIP(v_float64x4)
/** Arithmetics **/
#define OPENCV_HAL_IMPL_LASX_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); } \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ a.val = intrin(a.val, b.val); return a; }
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint8x32, __lasx_xvsadd_bu)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint8x32, __lasx_xvssub_bu)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int8x32, __lasx_xvsadd_b)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int8x32, __lasx_xvssub_b)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint16x16, __lasx_xvsadd_hu)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint16x16, __lasx_xvssub_hu)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int16x16, __lasx_xvsadd_h)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int16x16, __lasx_xvssub_h)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint32x8, __lasx_xvadd_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint32x8, __lasx_xvsub_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_uint32x8, __lasx_xvmul_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int32x8, __lasx_xvadd_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int32x8, __lasx_xvsub_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_int32x8, __lasx_xvmul_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint64x4, __lasx_xvadd_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint64x4, __lasx_xvsub_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int64x4, __lasx_xvadd_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int64x4, __lasx_xvsub_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint8x32, __lasx_xvsadd_bu)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint8x32, __lasx_xvssub_bu)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int8x32, __lasx_xvsadd_b)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int8x32, __lasx_xvssub_b)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint16x16, __lasx_xvsadd_hu)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint16x16, __lasx_xvssub_hu)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int16x16, __lasx_xvsadd_h)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int16x16, __lasx_xvssub_h)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint32x8, __lasx_xvadd_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint32x8, __lasx_xvsub_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_uint32x8, __lasx_xvmul_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int32x8, __lasx_xvadd_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int32x8, __lasx_xvsub_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_int32x8, __lasx_xvmul_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint64x4, __lasx_xvadd_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint64x4, __lasx_xvsub_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int64x4, __lasx_xvadd_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int64x4, __lasx_xvsub_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_float32x8, __lasx_xvfadd_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_float32x8, __lasx_xvfsub_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_float32x8, __lasx_xvfmul_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_div, v_float32x8, __lasx_xvfdiv_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_float64x4, __lasx_xvfadd_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_float64x4, __lasx_xvfsub_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_float64x4, __lasx_xvfmul_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_div, v_float64x4, __lasx_xvfdiv_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float32x8, __lasx_xvfadd_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float32x8, __lasx_xvfsub_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float32x8, __lasx_xvfmul_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float32x8, __lasx_xvfdiv_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float64x4, __lasx_xvfadd_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float64x4, __lasx_xvfsub_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float64x4, __lasx_xvfmul_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float64x4, __lasx_xvfdiv_d)
// saturating multiply 8-bit, 16-bit
inline v_uint8x32 v_mul(const v_uint8x32& a, const v_uint8x32& b)
inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
{
v_uint16x16 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
inline v_int8x32 v_mul(const v_int8x32& a, const v_int8x32& b)
inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
{
v_int16x16 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
inline v_uint16x16 v_mul(const v_uint16x16& a, const v_uint16x16& b)
inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
{
__m256i pl = __lasx_xvmul_h(a.val, b.val);
__m256i ph = __lasx_xvmuh_hu(a.val, b.val);
@@ -806,7 +798,7 @@ inline v_uint16x16 v_mul(const v_uint16x16& a, const v_uint16x16& b)
__m256i p1 = __lasx_xvilvh_h(ph, pl);
return v_uint16x16(_v256_packs_epu32(p0, p1));
}
inline v_int16x16 v_mul(const v_int16x16& a, const v_int16x16& b)
inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
{
__m256i pl = __lasx_xvmul_h(a.val, b.val);
__m256i ph = __lasx_xvmuh_h(a.val, b.val);
@@ -814,6 +806,14 @@ inline v_int16x16 v_mul(const v_int16x16& a, const v_int16x16& b)
__m256i p1 = __lasx_xvilvh_h(ph, pl);
return v_int16x16(_lasx_packs_w(p0, p1));
}
inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
{ a = a * b; return a; }
inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
{ a = a * b; return a; }
inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
{ a = a * b; return a; }
inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
{ a = a * b; return a; }
/** Non-saturating arithmetics **/
@@ -902,13 +902,13 @@ inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_LASX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
{ return _Tpuvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
{ return _Tpsvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
{ return _Tpuvec(__lasx_xvsrl_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
{ return _Tpsvec(srai(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
template<int imm> \
inline _Tpuvec v_shl(const _Tpuvec& a) \
@@ -930,10 +930,10 @@ OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4, v_int64x4, d, __lasx_xvsra_d)
/** Bitwise logic **/
#define OPENCV_HAL_IMPL_LASX_LOGIC_OP(_Tpvec, suffix, not_const) \
OPENCV_HAL_IMPL_LASX_BIN_OP(v_and, _Tpvec, __lasx_xvand_##suffix) \
OPENCV_HAL_IMPL_LASX_BIN_OP(v_or, _Tpvec, __lasx_xvor_##suffix) \
OPENCV_HAL_IMPL_LASX_BIN_OP(v_xor, _Tpvec, __lasx_xvxor_##suffix) \
inline _Tpvec v_not(const _Tpvec& a) \
OPENCV_HAL_IMPL_LASX_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix) \
OPENCV_HAL_IMPL_LASX_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix) \
OPENCV_HAL_IMPL_LASX_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ return _Tpvec(__lasx_xvnori_b(a.val, 0)); }
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint8x32, v, __lasx_xvreplgr2vr_w(-1))
@@ -946,14 +946,16 @@ OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint64x4, v, __lasx_xvreplgr2vr_d(-1))
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int64x4, v, __lasx_xvreplgr2vr_d(-1))
#define OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val)))); }
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val)))); } \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ __m256i c = intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val))); a.val = cast(c); return a; }
#define OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(_Tpvec, suffix, not_const, cast) \
OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(v_and, _Tpvec, __lasx_xvand_##suffix, cast) \
OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(v_or, _Tpvec, __lasx_xvor_##suffix, cast) \
OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(v_xor, _Tpvec, __lasx_xvxor_##suffix, cast) \
inline _Tpvec v_not(const _Tpvec& a) \
OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix, cast) \
OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix, cast) \
OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix, cast) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ return _Tpvec(__lasx_xvxor_##suffix(*((__m256i*)(&a.val)), not_const)); }
OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(v_float32x8, v, __lasx_xvreplgr2vr_w(-1), _lasx_256_castsi256_ps)
@@ -979,25 +981,25 @@ inline v_float64x4 v_select(const v_float64x4 &mask, const v_float64x4 &a, const
/** Comparison **/
#define OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpvec) \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_eq(a, b)); } \
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
{ return v_gt(b, a); } \
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_lt(a, b)); } \
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
{ return v_ge(b, a); }
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return b > a; } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a < b); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return b >= a; }
#define OPENCV_HAL_IMPL_LASX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) \
inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
{ return _Tpuvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
{ \
return _Tpuvec(__lasx_xvslt_##usuffix(b.val, a.val)); \
} \
inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(__lasx_xvslt_##suffix(b.val, a.val)); } \
OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpuvec) \
OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpsvec)
@@ -1007,37 +1009,37 @@ OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint16x16, v_int16x16, h, hu)
OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint32x8, v_int32x8, w, wu)
#define OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(_Tpvec, suffix) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_eq(a, b)); }
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); }
OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_uint64x4, d)
OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_int64x4, d)
#define OPENCV_HAL_IMPL_LASX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(__lasx_##suffix##_##ssuffix(a.val, b.val)); }
#define OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(_Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LASX_CMP_FLT(v_eq, xvfcmp_ceq, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LASX_CMP_FLT(v_ne, xvfcmp_cne, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LASX_CMP_FLT(v_lt, xvfcmp_clt, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LASX_CMP_FLT(v_le, xvfcmp_cle, _Tpvec, ssuffix)
OPENCV_HAL_IMPL_LASX_CMP_FLT(==, xvfcmp_ceq, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LASX_CMP_FLT(!=, xvfcmp_cne, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LASX_CMP_FLT(<, xvfcmp_clt, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LASX_CMP_FLT(<=, xvfcmp_cle, _Tpvec, ssuffix)
OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float32x8, s)
OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float64x4, d)
inline v_float32x8 v_gt(const v_float32x8 &a, const v_float32x8 &b)
inline v_float32x8 operator > (const v_float32x8 &a, const v_float32x8 &b)
{ return v_float32x8(__lasx_xvfcmp_clt_s(b.val, a.val)); }
inline v_float32x8 v_ge(const v_float32x8 &a, const v_float32x8 &b)
inline v_float32x8 operator >= (const v_float32x8 &a, const v_float32x8 &b)
{ return v_float32x8(__lasx_xvfcmp_cle_s(b.val, a.val)); }
inline v_float64x4 v_gt(const v_float64x4 &a, const v_float64x4 &b)
inline v_float64x4 operator > (const v_float64x4 &a, const v_float64x4 &b)
{ return v_float64x4(__lasx_xvfcmp_clt_d(b.val, a.val)); }
inline v_float64x4 v_ge(const v_float64x4 &a, const v_float64x4 &b)
inline v_float64x4 operator >= (const v_float64x4 &a, const v_float64x4 &b)
{ return v_float64x4(__lasx_xvfcmp_cle_d(b.val, a.val)); }
inline v_float32x8 v_not_nan(const v_float32x8& a)
@@ -1098,7 +1100,7 @@ inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b)
template<int imm>
inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
{
enum {IMM_L = ((imm - 16) & 0xFF) > 31 ? 31 : ((imm - 16) & 0xFF)};
enum {IMM_L = (imm - 16) & 0xFF};
enum {IMM_R = (16 - imm) & 0xFF};
if (imm == 0) return a;
@@ -1115,7 +1117,7 @@ inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
template<int imm>
inline v_uint8x32 v_rotate_right(const v_uint8x32& a)
{
enum {IMM_L = ((imm - 16) & 0xFF) > 31 ? 31 : ((imm - 16) & 0xFF)};
enum {IMM_L = (imm - 16) & 0xFF};
if (imm == 0) return a;
if (imm > 32) return v_uint8x32();
@@ -1305,9 +1307,9 @@ inline unsigned v_reduce_sum(const v_uint32x8& a)
{ return v_reduce_sum(v_reinterpret_as_s32(a)); }
inline int v_reduce_sum(const v_int16x16& a)
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline unsigned v_reduce_sum(const v_uint16x16& a)
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline float v_reduce_sum(const v_float32x8& a)
{
@@ -1375,27 +1377,27 @@ inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
{
v_uint32x8 l, h;
v_expand(v_add_wrap(v_sub(a, b), v_sub(b, a)), l, h);
return v_reduce_sum(v_add(l, h));
v_expand(v_add_wrap(a - b, b - a), l, h);
return v_reduce_sum(l + h);
}
inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
{
v_uint32x8 l, h;
v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h);
return v_reduce_sum(v_add(l, h));
return v_reduce_sum(l + h);
}
inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
{
return v_reduce_sum(v_sub(v_max(a, b), v_min(a, b)));
return v_reduce_sum(v_max(a, b) - v_min(a, b));
}
inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
{
v_int32x8 m = v_lt(a, b);
return v_reduce_sum(v_reinterpret_as_u32(v_sub(v_xor(v_sub(a, b), m), m)));
v_int32x8 m = a < b;
return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m));
}
inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
{
v_float32x8 a_b = v_sub(a, b);
v_float32x8 a_b = a - b;
return v_reduce_sum(v_float32x8(*((__m256i*)&a_b.val) & __lasx_xvreplgr2vr_w(0x7fffffff)));
}
@@ -1499,9 +1501,9 @@ OPENCV_HAL_IMPL_LASX_CHECK_SHORT(v_int16x16)
inline _Tpvec v_sqrt(const _Tpvec& x) \
{ return _Tpvec(__lasx_xvfsqrt_##suffix(x.val)); } \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_fma(a, a, v_mul(b, b)); } \
{ return v_fma(a, a, b * b); } \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_sqrt(v_fma(a, a, v_mul(b, b))); }
{ return v_sqrt(v_fma(a, a, b*b)); }
OPENCV_HAL_IMPL_LASX_MULADD(v_float32x8, s)
OPENCV_HAL_IMPL_LASX_MULADD(v_float64x4, d)
@@ -1552,20 +1554,20 @@ inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
{ return (v_uint32x8)__lasx_xvabsd_w(a.val, b.val); }
inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
{ return v_abs(v_sub(a, b)); }
{ return v_abs(a - b); }
inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
{ return v_abs(v_sub(a, b)); }
{ return v_abs(a - b); }
/** Saturating absolute difference **/
inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b)
{
v_int8x32 d = v_sub(a, b);
v_int8x32 m = v_lt(a, b);
return v_sub(v_xor(d, m), m);
v_int8x32 d = a - b;
v_int8x32 m = a < b;
return (d ^ m) - m;
}
inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b)
{ return v_sub(v_max(a, b), v_min(a, b)); }
{ return v_max(a, b) - v_min(a, b); }
////////// Conversions /////////
@@ -1887,7 +1889,7 @@ inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
{ return v_int32x8(__lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val))); }
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
{ return v_add(v_dotprod(a, b), c); }
{ return v_dotprod(a, b) + c; }
// 32 >> 64
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
@@ -1911,7 +1913,7 @@ inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
return v_uint32x8(__lasx_xvadd_w(prod0, prod1));
}
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
{
@@ -1922,7 +1924,7 @@ inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
return v_int32x8(__lasx_xvadd_w(prod0, prod1));
}
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
// 16 >> 64
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
@@ -1934,7 +1936,7 @@ inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
return v_uint64x4(__lasx_xvadd_d(prod0, prod1));
}
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
{
@@ -1946,13 +1948,13 @@ inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
}
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
// 32 >> 64f
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
//////// Fast Dot Product ////////
@@ -1989,7 +1991,7 @@ inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16&
return v_uint64x4(__lasx_xvadd_d(__lasx_xvilvl_d(prod1, prod0), __lasx_xvilvh_d(prod1, prod0)));
}
inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
{
@@ -2000,7 +2002,7 @@ inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
return v_int64x4(__lasx_xvadd_d(lo, hi));
}
inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
// 32 >> 64f
inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b)
@@ -2020,7 +2022,7 @@ inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0,
v_float32x8 v15 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0x55);
v_float32x8 v26 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xAA);
v_float32x8 v37 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xFF);
return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v_mul(v37, m3))));
return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
}
inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0,
@@ -3013,20 +3015,6 @@ inline void v_pack_store(hfloat* ptr, const v_float32x8& a)
inline void v256_cleanup() {}
#include "intrin_math.hpp"
inline v_float32x8 v_exp(const v_float32x8& x) { return v_exp_default_32f<v_float32x8, v_int32x8>(x); }
inline v_float32x8 v_log(const v_float32x8& x) { return v_log_default_32f<v_float32x8, v_int32x8>(x); }
inline void v_sincos(const v_float32x8& x, v_float32x8& s, v_float32x8& c) { v_sincos_default_32f<v_float32x8, v_int32x8>(x, s, c); }
inline v_float32x8 v_sin(const v_float32x8& x) { return v_sin_default_32f<v_float32x8, v_int32x8>(x); }
inline v_float32x8 v_cos(const v_float32x8& x) { return v_cos_default_32f<v_float32x8, v_int32x8>(x); }
inline v_float32x8 v_erf(const v_float32x8& x) { return v_erf_default_32f<v_float32x8, v_int32x8>(x); }
inline v_float64x4 v_exp(const v_float64x4& x) { return v_exp_default_64f<v_float64x4, v_int64x4>(x); }
inline v_float64x4 v_log(const v_float64x4& x) { return v_log_default_64f<v_float64x4, v_int64x4>(x); }
inline void v_sincos(const v_float64x4& x, v_float64x4& s, v_float64x4& c) { v_sincos_default_64f<v_float64x4, v_int64x4>(x, s, c); }
inline v_float64x4 v_sin(const v_float64x4& x) { return v_sin_default_64f<v_float64x4, v_int64x4>(x); }
inline v_float64x4 v_cos(const v_float64x4& x) { return v_cos_default_64f<v_float64x4, v_int64x4>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond

View File

@@ -1,111 +0,0 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
// This file has been created for compatibility with older versions of Universal Intrinsics
// Binary operators for vector types have been removed since version 4.11
// Include this file manually after OpenCV headers if you need these operators
#ifndef OPENCV_HAL_INTRIN_LEGACY_OPS_HPP
#define OPENCV_HAL_INTRIN_LEGACY_OPS_HPP
#ifdef __OPENCV_BUILD
#error "Universal Intrinsics operators are deprecated and should not be used in OpenCV library"
#endif
#ifdef __riscv
#warning "Operators might conflict with built-in functions on RISC-V platform"
#endif
#if defined(CV_VERSION) && CV_VERSION_MAJOR == 4 && CV_VERSION_MINOR < 9
#warning "Older versions of OpenCV (<4.9) already have Universal Intrinscs operators"
#endif
namespace cv { namespace hal {
#define BIN_OP(OP, FUN) \
template <typename R> R operator OP (const R & lhs, const R & rhs) { return FUN(lhs, rhs); }
#define BIN_A_OP(OP, FUN) \
template <typename R> R & operator OP (R & res, const R & val) { res = FUN(res, val); return res; }
#define UN_OP(OP, FUN) \
template <typename R> R operator OP (const R & val) { return FUN(val); }
BIN_OP(+, v_add)
BIN_OP(-, v_sub)
BIN_OP(*, v_mul)
BIN_OP(/, v_div)
BIN_OP(&, v_and)
BIN_OP(|, v_or)
BIN_OP(^, v_xor)
BIN_OP(==, v_eq)
BIN_OP(!=, v_ne)
BIN_OP(<, v_lt)
BIN_OP(>, v_gt)
BIN_OP(<=, v_le)
BIN_OP(>=, v_ge)
BIN_A_OP(+=, v_add)
BIN_A_OP(-=, v_sub)
BIN_A_OP(*=, v_mul)
BIN_A_OP(/=, v_div)
BIN_A_OP(&=, v_and)
BIN_A_OP(|=, v_or)
BIN_A_OP(^=, v_xor)
UN_OP(~, v_not)
// TODO: shift operators?
}} // cv::hal::
//==============================================================================
#ifdef OPENCV_ENABLE_INLINE_INTRIN_OPERATOR_TEST
namespace cv { namespace hal {
inline static void opencv_operator_compile_test()
{
using namespace cv;
v_float32 a, b, c;
uint8_t shift = 1;
a = b + c;
a = b - c;
a = b * c;
a = b / c;
a = b & c;
a = b | c;
a = b ^ c;
// a = b >> shift;
// a = b << shift;
a = (b == c);
a = (b != c);
a = (b < c);
a = (b > c);
a = (b <= c);
a = (b >= c);
a += b;
a -= b;
a *= b;
a /= b;
a &= b;
a |= b;
a ^= b;
// a <<= shift;
// a >>= shift;
a = ~b;
}
}} // cv::hal::
#endif
#endif // OPENCV_HAL_INTRIN_LEGACY_OPS_HPP
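If a downstream project still needed the removed infix operators, the intended use of this (now deleted) shim would look roughly like the following. This is a sketch, not part of the commit: it assumes the header is installed under opencv2/core/hal/ as before, that v_float32 is visible via `using namespace cv`, and `axpy` is a hypothetical helper name.

// hypothetical consumer code, built against OpenCV >= 4.11
#include <opencv2/core/hal/intrin.hpp>            // vector types first
#include <opencv2/core/hal/intrin_legacy_ops.hpp> // then the operator shims
using namespace cv;

inline v_float32 axpy(const v_float32& a, const v_float32& x, const v_float32& y)
{
    return a * x + y; // resolves to v_mul / v_add through the BIN_OP templates
}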

View File

@@ -417,10 +417,6 @@ inline __m128i _lsx_128_castpd_si128(const __m128d& v)
{ return _Tpvec(__lsx_vldi(0)); } \
inline _Tpvec v_setall_##suffix(_Tp v) \
{ return _Tpvec(__lsx_vreplgr2vr_##ssuffix((ctype_s)v)); } \
template <> inline _Tpvec v_setzero_() \
{ return v_setzero_##suffix(); } \
template <> inline _Tpvec v_setall_(_Tp v) \
{ return v_setall_##suffix(v); } \
OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16, suffix, OPENCV_HAL_NOP) \
OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16, suffix, OPENCV_HAL_NOP) \
OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8, suffix, OPENCV_HAL_NOP) \
@@ -452,10 +448,6 @@ inline __m128d _lsx_128_castsi128_pd(const __m128i &v)
{ return _Tpvec(__lsx_vldi(0)); } \
inline _Tpvec v_setall_##suffix(_Tp v) \
{ return _Tpvec(_v128_setall_##zsuffix(v)); } \
template <> inline _Tpvec v_setzero_() \
{ return v_setzero_##suffix(); } \
template <> inline _Tpvec v_setall_(_Tp v) \
{ return v_setall_##suffix(v); } \
OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16, suffix, cast) \
OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16, suffix, cast) \
OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8, suffix, cast) \
@@ -533,51 +525,53 @@ OPENCV_HAL_IMPL_LSX_ZIP(v_float64x2)
/** Arithmetics **/
#define OPENCV_HAL_IMPL_LSX_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); } \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ a.val = intrin(a.val, b.val); return a; }
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint8x16, __lsx_vsadd_bu)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint8x16, __lsx_vssub_bu)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int8x16, __lsx_vsadd_b)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int8x16, __lsx_vssub_b)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint16x8, __lsx_vsadd_hu)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint16x8, __lsx_vssub_hu)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int16x8, __lsx_vsadd_h)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int16x8, __lsx_vssub_h)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint32x4, __lsx_vadd_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint32x4, __lsx_vsub_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_uint32x4, __lsx_vmul_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int32x4, __lsx_vadd_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int32x4, __lsx_vsub_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_int32x4, __lsx_vmul_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint64x2, __lsx_vadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint64x2, __lsx_vsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int64x2, __lsx_vadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int64x2, __lsx_vsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint8x16, __lsx_vsadd_bu)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint8x16, __lsx_vssub_bu)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int8x16, __lsx_vsadd_b)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int8x16, __lsx_vssub_b)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint16x8, __lsx_vsadd_hu)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint16x8, __lsx_vssub_hu)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int16x8, __lsx_vsadd_h)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int16x8, __lsx_vssub_h)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint32x4, __lsx_vadd_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint32x4, __lsx_vsub_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_uint32x4, __lsx_vmul_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int32x4, __lsx_vadd_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int32x4, __lsx_vsub_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_int32x4, __lsx_vmul_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint64x2, __lsx_vadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint64x2, __lsx_vsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int64x2, __lsx_vadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int64x2, __lsx_vsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_float32x4, __lsx_vfadd_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_float32x4, __lsx_vfsub_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_float32x4, __lsx_vfmul_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_div, v_float32x4, __lsx_vfdiv_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_float64x2, __lsx_vfadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_float64x2, __lsx_vfsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_float64x2, __lsx_vfmul_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_div, v_float64x2, __lsx_vfdiv_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float32x4, __lsx_vfadd_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float32x4, __lsx_vfsub_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float32x4, __lsx_vfmul_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float32x4, __lsx_vfdiv_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float64x2, __lsx_vfadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float64x2, __lsx_vfsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float64x2, __lsx_vfmul_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float64x2, __lsx_vfdiv_d)
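For reviewers of the hunk above: after this change an instantiation such as OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float32x4, __lsx_vfadd_s) expands to approximately the pair below (a sketch of the token pasting, not compiler output). Note the compound-assignment form, which the named v_add/v_sub entry points never provided.

inline v_float32x4 operator + (const v_float32x4& a, const v_float32x4& b)
{ return v_float32x4(__lsx_vfadd_s(a.val, b.val)); }
inline v_float32x4& operator += (v_float32x4& a, const v_float32x4& b)
{ a.val = __lsx_vfadd_s(a.val, b.val); return a; }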
// saturating multiply 8-bit, 16-bit
inline v_uint8x16 v_mul(const v_uint8x16& a, const v_uint8x16& b)
inline v_uint8x16 operator * (const v_uint8x16& a, const v_uint8x16& b)
{
v_uint16x8 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
inline v_int8x16 v_mul(const v_int8x16& a, const v_int8x16& b)
inline v_int8x16 operator * (const v_int8x16& a, const v_int8x16& b)
{
v_int16x8 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
inline v_uint16x8 v_mul(const v_uint16x8& a, const v_uint16x8& b)
inline v_uint16x8 operator * (const v_uint16x8& a, const v_uint16x8& b)
{
__m128i a0 = a.val, b0 = b.val;
__m128i pev = __lsx_vmulwev_w_hu(a0, b0);
@@ -586,7 +580,7 @@ inline v_uint16x8 v_mul(const v_uint16x8& a, const v_uint16x8& b)
__m128i ph = __lsx_vilvh_w(pod, pev);
return (v_uint16x8)__lsx_vssrlrni_hu_w(ph, pl, 0);
}
inline v_int16x8 v_mul(const v_int16x8& a, const v_int16x8& b)
inline v_int16x8 operator * (const v_int16x8& a, const v_int16x8& b)
{
__m128i a0 = a.val, b0 = b.val;
__m128i pev = __lsx_vmulwev_w_h(a0, b0);
@@ -595,6 +589,14 @@ inline v_int16x8 v_mul(const v_int16x8& a, const v_int16x8& b)
__m128i ph = __lsx_vilvh_w(pod, pev);
return (v_int16x8)__lsx_vssrarni_h_w(ph, pl, 0);
}
inline v_uint8x16& operator *= (v_uint8x16& a, const v_uint8x16& b)
{ a = a * b; return a; }
inline v_int8x16& operator *= (v_int8x16& a, const v_int8x16& b)
{ a = a * b; return a; }
inline v_uint16x8& operator *= (v_uint16x8& a, const v_uint16x8& b)
{ a = a * b; return a; }
inline v_int16x8& operator *= (v_int16x8& a, const v_int16x8& b)
{ a = a * b; return a; }
/** Non-saturating arithmetics **/
@@ -679,13 +681,13 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_LSX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
{ return _Tpuvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
{ return _Tpsvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
{ return _Tpuvec(__lsx_vsrl_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
{ return _Tpsvec(srai(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
template<int imm> \
inline _Tpuvec v_shl(const _Tpuvec& a) \
@@ -706,10 +708,10 @@ OPENCV_HAL_IMPL_LSX_SHIFT_OP(v_uint64x2, v_int64x2, d, __lsx_vsra_d)
/** Bitwise logic **/
#define OPENCV_HAL_IMPL_LSX_LOGIC_OP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_LSX_BIN_OP(v_and, _Tpvec, __lsx_vand_##suffix) \
OPENCV_HAL_IMPL_LSX_BIN_OP(v_or, _Tpvec, __lsx_vor_##suffix) \
OPENCV_HAL_IMPL_LSX_BIN_OP(v_xor, _Tpvec, __lsx_vxor_##suffix) \
inline _Tpvec v_not(const _Tpvec& a) \
OPENCV_HAL_IMPL_LSX_BIN_OP(&, _Tpvec, __lsx_vand_##suffix) \
OPENCV_HAL_IMPL_LSX_BIN_OP(|, _Tpvec, __lsx_vor_##suffix) \
OPENCV_HAL_IMPL_LSX_BIN_OP(^, _Tpvec, __lsx_vxor_##suffix) \
inline _Tpvec operator ~(const _Tpvec& a) \
{ return _Tpvec(__lsx_vnori_b(a.val, 0)); } \
OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint8x16, v)
@@ -722,14 +724,18 @@ OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint64x2, v)
OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int64x2, v)
#define OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin((__m128i)(a.val), (__m128i)(b.val))); }
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin((__m128i)(a.val), (__m128i)(b.val))); } \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ __m128i c = intrin((__m128i)(a.val), (__m128i)b.val); \
a.val = cast(c); \
return a;}
#define OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(_Tpvec, cast) \
OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(v_and, _Tpvec, __lsx_vand_v, cast) \
OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(v_or, _Tpvec, __lsx_vor_v, cast) \
OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(v_xor, _Tpvec, __lsx_vxor_v, cast) \
inline _Tpvec v_not(const _Tpvec& a) \
OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(&, _Tpvec, __lsx_vand_v, cast) \
OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(|, _Tpvec, __lsx_vor_v, cast) \
OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(^, _Tpvec, __lsx_vxor_v, cast) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ return _Tpvec(__lsx_vnori_b((__m128i)(a.val), 0)); } \
OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(v_float32x4, _lsx_128_castsi128_ps)
@@ -754,23 +760,23 @@ inline v_float64x2 v_select(const v_float64x2 &mask, const v_float64x2 &a, const
/** Comparison **/
#define OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpvec) \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_eq(a, b)); } \
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
{ return v_gt(b, a); } \
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_lt(a, b)); } \
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
{ return v_ge(b, a); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~( a == b ); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return b > a ; } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a < b); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return b >= a; } \
#define OPENCV_HAL_IMPL_LSX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) \
inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
{ return _Tpuvec(__lsx_vseq_##suffix(a.val, b.val)); } \
inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
{ return _Tpuvec(__lsx_vslt_##usuffix(b.val, a.val)); } \
inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(__lsx_vseq_##suffix(a.val, b.val)); } \
inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(__lsx_vslt_##suffix(b.val, a.val)); } \
OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpuvec) \
OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpsvec)
@@ -780,37 +786,37 @@ OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint16x8, v_int16x8, h, hu)
OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint32x4, v_int32x4, w, wu)
#define OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(_Tpvec, suffix) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(__lsx_vseq_##suffix(a.val, b.val)); } \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_eq(a, b)); }
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); }
OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_uint64x2, d)
OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_int64x2, d)
#define OPENCV_HAL_IMPL_LSX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(__lsx_##suffix##_##ssuffix(a.val, b.val)); } \
#define OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(_Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_FLT(v_eq, vfcmp_ceq, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_FLT(v_ne, vfcmp_cne, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_FLT(v_lt, vfcmp_clt, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_FLT(v_le, vfcmp_cle, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_FLT(==, vfcmp_ceq, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_FLT(!=, vfcmp_cne, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_FLT(<, vfcmp_clt, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_FLT(<=, vfcmp_cle, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float32x4, s)
OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float64x2, d)
inline v_float32x4 v_gt(const v_float32x4 &a, const v_float32x4 &b)
inline v_float32x4 operator > (const v_float32x4 &a, const v_float32x4 &b)
{ return v_float32x4(__lsx_vfcmp_clt_s(b.val, a.val)); }
inline v_float32x4 v_ge(const v_float32x4 &a, const v_float32x4 &b)
inline v_float32x4 operator >= (const v_float32x4 &a, const v_float32x4 &b)
{ return v_float32x4(__lsx_vfcmp_cle_s(b.val, a.val)); }
inline v_float64x2 v_gt(const v_float64x2 &a, const v_float64x2 &b)
inline v_float64x2 operator > (const v_float64x2 &a, const v_float64x2 &b)
{ return v_float64x2(__lsx_vfcmp_clt_d(b.val, a.val)); }
inline v_float64x2 v_ge(const v_float64x2 &a, const v_float64x2 &b)
inline v_float64x2 operator >= (const v_float64x2 &a, const v_float64x2 &b)
{ return v_float64x2(__lsx_vfcmp_cle_d(b.val, a.val)); }
inline v_float32x4 v_not_nan(const v_float32x4& a)
@@ -1182,7 +1188,7 @@ inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
{
v_float32x4 a_b = v_sub(a, b);
v_float32x4 a_b = a - b;
return v_reduce_sum(v_float32x4((__m128i)a_b.val & __lsx_vreplgr2vr_w(0x7fffffff)));
}
@@ -1289,9 +1295,9 @@ OPENCV_HAL_IMPL_LSX_CHECK(v_float64x2, 3)
inline _Tpvec v_sqrt(const _Tpvec& x) \
{ return _Tpvec(__lsx_vfsqrt_##suffix(x.val)); } \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_fma(a, a, v_mul(b, b)); } \
{ return v_fma(a, a, b * b); } \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_sqrt(v_fma(a, a, v_mul(b, b))); }
{ return v_sqrt(v_fma(a, a, b * b)); }
OPENCV_HAL_IMPL_LSX_MULADD(v_float32x4, s)
OPENCV_HAL_IMPL_LSX_MULADD(v_float64x2, d)
@@ -1343,20 +1349,20 @@ inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{ return (v_uint32x4)__lsx_vabsd_w(a.val, b.val); }
inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
{ return v_abs(v_sub(a, b)); }
{ return v_abs(a - b); }
inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
{ return v_abs(v_sub(a, b)); }
{ return v_abs(a - b); }
/** Saturating absolute difference **/
inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
{
v_int8x16 d = v_sub(a, b);
v_int8x16 m = v_lt(a, b);
return v_sub(v_xor(d, m), m);
v_int8x16 d = a - b;
v_int8x16 m = a < b;
return (d ^ m) - m;
}
inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
{ return v_sub(v_max(a, b), v_min(a, b)); }
{ return v_max(a, b) - v_min(a, b); }
///////// Conversions /////////
@@ -1667,7 +1673,7 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
}
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_add(v_dotprod_expand(a, b), c) ;}
{ return v_dotprod_expand(a, b) + c ;}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
@@ -1679,7 +1685,7 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
return v_int32x4(__lsx_vadd_w(prod0, prod1));
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
// 16 >> 64
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
@@ -1692,7 +1698,7 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
return v_uint64x2(__lsx_vadd_d(prod0, prod1));
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
@@ -1704,13 +1710,13 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
return v_int64x2(__lsx_vadd_d(prod0, prod1));
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
//32 >> 64f
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
///////// Fast Dot Product //////
@@ -1749,7 +1755,7 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b
return v_uint64x2(__lsx_vilvl_d(__lsx_vhaddw_qu_du(prod0, prod0), __lsx_vhaddw_qu_du(prod1, prod1)));
}
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{
@@ -1761,7 +1767,7 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
return v_int64x2(__lsx_vadd_d(lo, hi));
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
// 32 >> 64f
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
@@ -2523,20 +2529,6 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& a)
inline void v_cleanup() {}
#include "intrin_math.hpp"
inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond

View File

@@ -1,687 +0,0 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
/* Universal Intrinsics implementation of sin, cos, exp and log
Inspired by Intel Approximate Math library, and based on the
corresponding algorithms of the cephes math library
*/
/* Copyright (C) 2010,2011 RJVB - extensions */
/* Copyright (C) 2011 Julien Pommier
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
(this is the zlib license)
*/
#ifndef OPENCV_HAL_INTRIN_MATH_HPP
#define OPENCV_HAL_INTRIN_MATH_HPP
//! @name Exponential
//! @{
// Implementation mirrors the float32 vector version below.
template<typename _TpVec16F, typename _TpVec16S>
inline _TpVec16F v_exp_default_16f(const _TpVec16F &x) {
const _TpVec16F _vexp_lo_f16 = v_setall_<_TpVec16F>(-10.7421875f);
const _TpVec16F _vexp_hi_f16 = v_setall_<_TpVec16F>(11.f);
const _TpVec16F _vexp_half_fp16 = v_setall_<_TpVec16F>(0.5f);
const _TpVec16F _vexp_one_fp16 = v_setall_<_TpVec16F>(1.f);
const _TpVec16F _vexp_LOG2EF_f16 = v_setall_<_TpVec16F>(1.44269504088896341f);
const _TpVec16F _vexp_C1_f16 = v_setall_<_TpVec16F>(-6.93359375E-1f);
const _TpVec16F _vexp_C2_f16 = v_setall_<_TpVec16F>(2.12194440E-4f);
const _TpVec16F _vexp_p0_f16 = v_setall_<_TpVec16F>(1.9875691500E-4f);
const _TpVec16F _vexp_p1_f16 = v_setall_<_TpVec16F>(1.3981999507E-3f);
const _TpVec16F _vexp_p2_f16 = v_setall_<_TpVec16F>(8.3334519073E-3f);
const _TpVec16F _vexp_p3_f16 = v_setall_<_TpVec16F>(4.1665795894E-2f);
const _TpVec16F _vexp_p4_f16 = v_setall_<_TpVec16F>(1.6666665459E-1f);
const _TpVec16F _vexp_p5_f16 = v_setall_<_TpVec16F>(5.0000001201E-1f);
_TpVec16F _vexp_, _vexp_x, _vexp_y, _vexp_xx;
_TpVec16S _vexp_mm;
const _TpVec16S _vexp_bias_s16 = v_setall_<_TpVec16S>((short)0xf);
// compute exponential of x
_vexp_x = v_max(x, _vexp_lo_f16);
_vexp_x = v_min(_vexp_x, _vexp_hi_f16);
_vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f16, _vexp_half_fp16);
_vexp_mm = v_floor(_vexp_);
_vexp_ = v_cvt_f16(_vexp_mm);
_vexp_mm = v_add(_vexp_mm, _vexp_bias_s16);
_vexp_mm = v_shl(_vexp_mm, 10);
_vexp_x = v_fma(_vexp_, _vexp_C1_f16, _vexp_x);
_vexp_x = v_fma(_vexp_, _vexp_C2_f16, _vexp_x);
_vexp_xx = v_mul(_vexp_x, _vexp_x);
_vexp_y = v_fma(_vexp_x, _vexp_p0_f16, _vexp_p1_f16);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f16);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f16);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f16);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f16);
_vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x);
_vexp_y = v_add(_vexp_y, _vexp_one_fp16);
_vexp_y = v_mul(_vexp_y, v_reinterpret_as_f16(_vexp_mm));
// exp(NAN) -> NAN
_TpVec16F mask_not_nan = v_not_nan(x);
return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00)));
}
template<typename _TpVec32F, typename _TpVec32S>
inline _TpVec32F v_exp_default_32f(const _TpVec32F &x) {
const _TpVec32F _vexp_lo_f32 = v_setall_<_TpVec32F>(-88.3762626647949f);
const _TpVec32F _vexp_hi_f32 = v_setall_<_TpVec32F>(89.f);
const _TpVec32F _vexp_half_fp32 = v_setall_<_TpVec32F>(0.5f);
const _TpVec32F _vexp_one_fp32 = v_setall_<_TpVec32F>(1.f);
const _TpVec32F _vexp_LOG2EF_f32 = v_setall_<_TpVec32F>(1.44269504088896341f);
const _TpVec32F _vexp_C1_f32 = v_setall_<_TpVec32F>(-6.93359375E-1f);
const _TpVec32F _vexp_C2_f32 = v_setall_<_TpVec32F>(2.12194440E-4f);
const _TpVec32F _vexp_p0_f32 = v_setall_<_TpVec32F>(1.9875691500E-4f);
const _TpVec32F _vexp_p1_f32 = v_setall_<_TpVec32F>(1.3981999507E-3f);
const _TpVec32F _vexp_p2_f32 = v_setall_<_TpVec32F>(8.3334519073E-3f);
const _TpVec32F _vexp_p3_f32 = v_setall_<_TpVec32F>(4.1665795894E-2f);
const _TpVec32F _vexp_p4_f32 = v_setall_<_TpVec32F>(1.6666665459E-1f);
const _TpVec32F _vexp_p5_f32 = v_setall_<_TpVec32F>(5.0000001201E-1f);
_TpVec32F _vexp_, _vexp_x, _vexp_y, _vexp_xx;
_TpVec32S _vexp_mm;
const _TpVec32S _vexp_bias_s32 = v_setall_<_TpVec32S>((int)0x7f);
// compute exponential of x
_vexp_x = v_max(x, _vexp_lo_f32);
_vexp_x = v_min(_vexp_x, _vexp_hi_f32);
_vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f32, _vexp_half_fp32);
_vexp_mm = v_floor(_vexp_);
_vexp_ = v_cvt_f32(_vexp_mm);
_vexp_mm = v_add(_vexp_mm, _vexp_bias_s32);
_vexp_mm = v_shl(_vexp_mm, 23);
_vexp_x = v_fma(_vexp_, _vexp_C1_f32, _vexp_x);
_vexp_x = v_fma(_vexp_, _vexp_C2_f32, _vexp_x);
_vexp_xx = v_mul(_vexp_x, _vexp_x);
_vexp_y = v_fma(_vexp_x, _vexp_p0_f32, _vexp_p1_f32);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f32);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f32);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f32);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f32);
_vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x);
_vexp_y = v_add(_vexp_y, _vexp_one_fp32);
_vexp_y = v_mul(_vexp_y, v_reinterpret_as_f32(_vexp_mm));
// exp(NAN) -> NAN
_TpVec32F mask_not_nan = v_not_nan(x);
return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000)));
}
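Because this file is removed by the commit, a scalar restatement of the float32 path may help check what the vector code computed: clamp, n = round(x·log2 e), subtract n·ln 2 in two parts, a degree-5 polynomial, then scale by 2^n through the exponent bits. exp_ref is a hypothetical name, and the NaN fixup from the v_select tail is omitted.

#include <cmath>
#include <cstdint>
#include <cstring>

static float exp_ref(float x) // scalar sketch of v_exp_default_32f
{
    x = std::fmin(std::fmax(x, -88.3762626647949f), 89.f);         // _vexp_lo / _vexp_hi clamp
    float n = std::floor(std::fma(x, 1.44269504088896341f, 0.5f)); // round(x / ln 2)
    x = std::fma(n, -6.93359375E-1f, x);  // x -= n*ln2, high part (C1)
    x = std::fma(n,  2.12194440E-4f, x);  // x -= n*ln2, low part (C2)
    float y = 1.9875691500E-4f;           // degree-5 polynomial, p0..p5
    y = std::fma(y, x, 1.3981999507E-3f);
    y = std::fma(y, x, 8.3334519073E-3f);
    y = std::fma(y, x, 4.1665795894E-2f);
    y = std::fma(y, x, 1.6666665459E-1f);
    y = std::fma(y, x, 5.0000001201E-1f);
    y = std::fma(y, x * x, x) + 1.f;
    int32_t bits = ((int32_t)n + 0x7f) << 23; // 2^n via the exponent field
    float scale;
    std::memcpy(&scale, &bits, sizeof(scale));
    return y * scale;
}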
template<typename _TpVec64F, typename _TpVec64S>
inline _TpVec64F v_exp_default_64f(const _TpVec64F &x) {
const _TpVec64F _vexp_lo_f64 = v_setall_<_TpVec64F>(-709.43613930310391424428);
const _TpVec64F _vexp_hi_f64 = v_setall_<_TpVec64F>(710.);
const _TpVec64F _vexp_half_f64 = v_setall_<_TpVec64F>(0.5);
const _TpVec64F _vexp_one_f64 = v_setall_<_TpVec64F>(1.0);
const _TpVec64F _vexp_two_f64 = v_setall_<_TpVec64F>(2.0);
const _TpVec64F _vexp_LOG2EF_f64 = v_setall_<_TpVec64F>(1.44269504088896340736);
const _TpVec64F _vexp_C1_f64 = v_setall_<_TpVec64F>(-6.93145751953125E-1);
const _TpVec64F _vexp_C2_f64 = v_setall_<_TpVec64F>(-1.42860682030941723212E-6);
const _TpVec64F _vexp_p0_f64 = v_setall_<_TpVec64F>(1.26177193074810590878E-4);
const _TpVec64F _vexp_p1_f64 = v_setall_<_TpVec64F>(3.02994407707441961300E-2);
const _TpVec64F _vexp_p2_f64 = v_setall_<_TpVec64F>(9.99999999999999999910E-1);
const _TpVec64F _vexp_q0_f64 = v_setall_<_TpVec64F>(3.00198505138664455042E-6);
const _TpVec64F _vexp_q1_f64 = v_setall_<_TpVec64F>(2.52448340349684104192E-3);
const _TpVec64F _vexp_q2_f64 = v_setall_<_TpVec64F>(2.27265548208155028766E-1);
const _TpVec64F _vexp_q3_f64 = v_setall_<_TpVec64F>(2.00000000000000000009E0);
_TpVec64F _vexp_, _vexp_x, _vexp_y, _vexp_z, _vexp_xx;
_TpVec64S _vexp_mm;
const _TpVec64S _vexp_bias_s64 = v_setall_<_TpVec64S>((int64)0x3ff);
// compute exponential of x
_vexp_x = v_max(x, _vexp_lo_f64);
_vexp_x = v_min(_vexp_x, _vexp_hi_f64);
_vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f64, _vexp_half_f64);
_vexp_mm = v_expand_low(v_floor(_vexp_));
_vexp_ = v_cvt_f64(_vexp_mm);
_vexp_mm = v_add(_vexp_mm, _vexp_bias_s64);
_vexp_mm = v_shl(_vexp_mm, 52);
_vexp_x = v_fma(_vexp_, _vexp_C1_f64, _vexp_x);
_vexp_x = v_fma(_vexp_, _vexp_C2_f64, _vexp_x);
_vexp_xx = v_mul(_vexp_x, _vexp_x);
_vexp_y = v_fma(_vexp_xx, _vexp_p0_f64, _vexp_p1_f64);
_vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_p2_f64);
_vexp_y = v_mul(_vexp_y, _vexp_x);
_vexp_z = v_fma(_vexp_xx, _vexp_q0_f64, _vexp_q1_f64);
_vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q2_f64);
_vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q3_f64);
_vexp_z = v_div(_vexp_y, v_sub(_vexp_z, _vexp_y));
_vexp_z = v_fma(_vexp_two_f64, _vexp_z, _vexp_one_f64);
_vexp_z = v_mul(_vexp_z, v_reinterpret_as_f64(_vexp_mm));
// exp(NAN) -> NAN
_TpVec64F mask_not_nan = v_not_nan(x);
return v_select(mask_not_nan, _vexp_z, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7FF8000000000000)));
}
//! @}
//! @name Natural Logarithm
//! @{
template<typename _TpVec16F, typename _TpVec16S>
inline _TpVec16F v_log_default_16f(const _TpVec16F &x) {
const _TpVec16F _vlog_one_fp16 = v_setall_<_TpVec16F>(1.0f);
const _TpVec16F _vlog_SQRTHF_fp16 = v_setall_<_TpVec16F>(0.707106781186547524f);
const _TpVec16F _vlog_q1_fp16 = v_setall_<_TpVec16F>(-2.12194440E-4f);
const _TpVec16F _vlog_q2_fp16 = v_setall_<_TpVec16F>(0.693359375f);
const _TpVec16F _vlog_p0_fp16 = v_setall_<_TpVec16F>(7.0376836292E-2f);
const _TpVec16F _vlog_p1_fp16 = v_setall_<_TpVec16F>(-1.1514610310E-1f);
const _TpVec16F _vlog_p2_fp16 = v_setall_<_TpVec16F>(1.1676998740E-1f);
const _TpVec16F _vlog_p3_fp16 = v_setall_<_TpVec16F>(-1.2420140846E-1f);
const _TpVec16F _vlog_p4_fp16 = v_setall_<_TpVec16F>(1.4249322787E-1f);
const _TpVec16F _vlog_p5_fp16 = v_setall_<_TpVec16F>(-1.6668057665E-1f);
const _TpVec16F _vlog_p6_fp16 = v_setall_<_TpVec16F>(2.0000714765E-1f);
const _TpVec16F _vlog_p7_fp16 = v_setall_<_TpVec16F>(-2.4999993993E-1f);
const _TpVec16F _vlog_p8_fp16 = v_setall_<_TpVec16F>(3.3333331174E-1f);
_TpVec16F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp;
_TpVec16S _vlog_ux, _vlog_emm0;
const _TpVec16S _vlog_inv_mant_mask_s16 = v_setall_<_TpVec16S>((short)~0x7c00);
_vlog_ux = v_reinterpret_as_s16(x);
_vlog_emm0 = v_shr(_vlog_ux, 10);
_vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s16);
_vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s16(v_setall_<_TpVec16F>(0.5f)));
_vlog_x = v_reinterpret_as_f16(_vlog_ux);
_vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec16S>((short)0xf));
_vlog_e = v_cvt_f16(_vlog_emm0);
_vlog_e = v_add(_vlog_e, _vlog_one_fp16);
_TpVec16F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp16);
_vlog_tmp = v_and(_vlog_x, _vlog_mask);
_vlog_x = v_sub(_vlog_x, _vlog_one_fp16);
_vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp16, _vlog_mask));
_vlog_x = v_add(_vlog_x, _vlog_tmp);
_vlog_z = v_mul(_vlog_x, _vlog_x);
_vlog_y = v_fma(_vlog_p0_fp16, _vlog_x, _vlog_p1_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp16);
_vlog_y = v_mul(_vlog_y, _vlog_x);
_vlog_y = v_mul(_vlog_y, _vlog_z);
_vlog_y = v_fma(_vlog_e, _vlog_q1_fp16, _vlog_y);
_vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, v_setall_<_TpVec16F>(0.5f)));
_vlog_x = v_add(_vlog_x, _vlog_y);
_vlog_x = v_fma(_vlog_e, _vlog_q2_fp16, _vlog_x);
// log(0) -> -INF
_TpVec16F mask_zero = v_eq(x, v_setzero_<_TpVec16F>());
_vlog_x = v_select(mask_zero, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0xfc00)), _vlog_x);
// log(NEG), log(NAN) -> NAN
_TpVec16F mask_not_nan = v_ge(x, v_setzero_<_TpVec16F>());
_vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00)));
// log(INF) -> INF
_TpVec16F mask_inf = v_eq(x, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7c00)));
_vlog_x = v_select(mask_inf, x, _vlog_x);
return _vlog_x;
}
template<typename _TpVec32F, typename _TpVec32S>
inline _TpVec32F v_log_default_32f(const _TpVec32F &x) {
const _TpVec32F _vlog_one_fp32 = v_setall_<_TpVec32F>(1.0f);
const _TpVec32F _vlog_SQRTHF_fp32 = v_setall_<_TpVec32F>(0.707106781186547524f);
const _TpVec32F _vlog_q1_fp32 = v_setall_<_TpVec32F>(-2.12194440E-4f);
const _TpVec32F _vlog_q2_fp32 = v_setall_<_TpVec32F>(0.693359375f);
const _TpVec32F _vlog_p0_fp32 = v_setall_<_TpVec32F>(7.0376836292E-2f);
const _TpVec32F _vlog_p1_fp32 = v_setall_<_TpVec32F>(-1.1514610310E-1f);
const _TpVec32F _vlog_p2_fp32 = v_setall_<_TpVec32F>(1.1676998740E-1f);
const _TpVec32F _vlog_p3_fp32 = v_setall_<_TpVec32F>(-1.2420140846E-1f);
const _TpVec32F _vlog_p4_fp32 = v_setall_<_TpVec32F>(1.4249322787E-1f);
const _TpVec32F _vlog_p5_fp32 = v_setall_<_TpVec32F>(-1.6668057665E-1f);
const _TpVec32F _vlog_p6_fp32 = v_setall_<_TpVec32F>(2.0000714765E-1f);
const _TpVec32F _vlog_p7_fp32 = v_setall_<_TpVec32F>(-2.4999993993E-1f);
const _TpVec32F _vlog_p8_fp32 = v_setall_<_TpVec32F>(3.3333331174E-1f);
_TpVec32F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp;
_TpVec32S _vlog_ux, _vlog_emm0;
const _TpVec32S _vlog_inv_mant_mask_s32 = v_setall_<_TpVec32S>((int)~0x7f800000);
_vlog_ux = v_reinterpret_as_s32(x);
_vlog_emm0 = v_shr(_vlog_ux, 23);
_vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s32);
_vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s32(v_setall_<_TpVec32F>(0.5f)));
_vlog_x = v_reinterpret_as_f32(_vlog_ux);
_vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec32S>((int)0x7f));
_vlog_e = v_cvt_f32(_vlog_emm0);
_vlog_e = v_add(_vlog_e, _vlog_one_fp32);
_TpVec32F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp32);
_vlog_tmp = v_and(_vlog_x, _vlog_mask);
_vlog_x = v_sub(_vlog_x, _vlog_one_fp32);
_vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp32, _vlog_mask));
_vlog_x = v_add(_vlog_x, _vlog_tmp);
_vlog_z = v_mul(_vlog_x, _vlog_x);
_vlog_y = v_fma(_vlog_p0_fp32, _vlog_x, _vlog_p1_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp32);
_vlog_y = v_mul(_vlog_y, _vlog_x);
_vlog_y = v_mul(_vlog_y, _vlog_z);
_vlog_y = v_fma(_vlog_e, _vlog_q1_fp32, _vlog_y);
_vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, v_setall_<_TpVec32F>(0.5f)));
_vlog_x = v_add(_vlog_x, _vlog_y);
_vlog_x = v_fma(_vlog_e, _vlog_q2_fp32, _vlog_x);
// log(0) -> -INF
_TpVec32F mask_zero = v_eq(x, v_setzero_<_TpVec32F>());
_vlog_x = v_select(mask_zero, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0xff800000)), _vlog_x);
// log(NEG), log(NAN) -> NAN
_TpVec32F mask_not_nan = v_ge(x, v_setzero_<_TpVec32F>());
_vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000)));
// log(INF) -> INF
_TpVec32F mask_inf = v_eq(x, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7f800000)));
_vlog_x = v_select(mask_inf, x, _vlog_x);
return _vlog_x;
}
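Again as a review aid for the deleted code, a scalar sketch of the float32 log path: split the input into a mantissa in [0.5, 1) and an exponent, fold the mantissa around sqrt(1/2), evaluate the degree-8 polynomial, then reassemble with the split ln 2 constants. log_ref is a hypothetical name; the 0/negative/inf/NaN v_select fixups are omitted.

#include <cmath>
#include <cstdint>
#include <cstring>

static float log_ref(float x) // scalar sketch of v_log_default_32f, x > 0 assumed
{
    int32_t ux;
    std::memcpy(&ux, &x, sizeof(ux));
    float ef = (float)((ux >> 23) - 0x7f) + 1.f;     // unbiased exponent + 1
    ux = (ux & ~0x7f800000) | 0x3f000000;            // mantissa scaled into [0.5, 1)
    float m;
    std::memcpy(&m, &ux, sizeof(m));
    if (m < 0.707106781186547524f) { ef -= 1.f; m = m + m - 1.f; } // SQRTHF fold
    else                           { m -= 1.f; }
    float z = m * m;
    float y = 7.0376836292E-2f;                      // degree-8 polynomial, p0..p8
    y = std::fma(y, m, -1.1514610310E-1f);
    y = std::fma(y, m,  1.1676998740E-1f);
    y = std::fma(y, m, -1.2420140846E-1f);
    y = std::fma(y, m,  1.4249322787E-1f);
    y = std::fma(y, m, -1.6668057665E-1f);
    y = std::fma(y, m,  2.0000714765E-1f);
    y = std::fma(y, m, -2.4999993993E-1f);
    y = std::fma(y, m,  3.3333331174E-1f);
    y = y * m * z;
    y = std::fma(ef, -2.12194440E-4f, y);            // q1 correction
    y -= 0.5f * z;
    return std::fma(ef, 0.693359375f, m + y);        // q2 reassembly
}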
template<typename _TpVec64F, typename _TpVec64S>
inline _TpVec64F v_log_default_64f(const _TpVec64F &x) {
const _TpVec64F _vlog_one_fp64 = v_setall_<_TpVec64F>(1.0);
const _TpVec64F _vlog_SQRTHF_fp64 = v_setall_<_TpVec64F>(0.7071067811865475244);
const _TpVec64F _vlog_p0_fp64 = v_setall_<_TpVec64F>(1.01875663804580931796E-4);
const _TpVec64F _vlog_p1_fp64 = v_setall_<_TpVec64F>(4.97494994976747001425E-1);
const _TpVec64F _vlog_p2_fp64 = v_setall_<_TpVec64F>(4.70579119878881725854);
const _TpVec64F _vlog_p3_fp64 = v_setall_<_TpVec64F>(1.44989225341610930846E1);
const _TpVec64F _vlog_p4_fp64 = v_setall_<_TpVec64F>(1.79368678507819816313E1);
const _TpVec64F _vlog_p5_fp64 = v_setall_<_TpVec64F>(7.70838733755885391666);
const _TpVec64F _vlog_q0_fp64 = v_setall_<_TpVec64F>(1.12873587189167450590E1);
const _TpVec64F _vlog_q1_fp64 = v_setall_<_TpVec64F>(4.52279145837532221105E1);
const _TpVec64F _vlog_q2_fp64 = v_setall_<_TpVec64F>(8.29875266912776603211E1);
const _TpVec64F _vlog_q3_fp64 = v_setall_<_TpVec64F>(7.11544750618563894466E1);
const _TpVec64F _vlog_q4_fp64 = v_setall_<_TpVec64F>(2.31251620126765340583E1);
const _TpVec64F _vlog_C0_fp64 = v_setall_<_TpVec64F>(2.121944400546905827679e-4);
const _TpVec64F _vlog_C1_fp64 = v_setall_<_TpVec64F>(0.693359375);
_TpVec64F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp, _vlog_xx;
_TpVec64S _vlog_ux, _vlog_emm0;
const _TpVec64S _vlog_inv_mant_mask_s64 = v_setall_<_TpVec64S>((int64)~0x7ff0000000000000);
_vlog_ux = v_reinterpret_as_s64(x);
_vlog_emm0 = v_shr(_vlog_ux, 52);
_vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s64);
_vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s64(v_setall_<_TpVec64F>(0.5)));
_vlog_x = v_reinterpret_as_f64(_vlog_ux);
_vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec64S>((int64)0x3ff));
_vlog_e = v_cvt_f64(_vlog_emm0);
_vlog_e = v_add(_vlog_e, _vlog_one_fp64);
_TpVec64F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp64);
_vlog_tmp = v_and(_vlog_x, _vlog_mask);
_vlog_x = v_sub(_vlog_x, _vlog_one_fp64);
_vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp64, _vlog_mask));
_vlog_x = v_add(_vlog_x, _vlog_tmp);
_vlog_xx = v_mul(_vlog_x, _vlog_x);
_vlog_y = v_fma(_vlog_p0_fp64, _vlog_x, _vlog_p1_fp64);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp64);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp64);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp64);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp64);
_vlog_y = v_mul(_vlog_y, _vlog_x);
_vlog_y = v_mul(_vlog_y, _vlog_xx);
_vlog_z = v_add(_vlog_x, _vlog_q0_fp64);
_vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q1_fp64);
_vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q2_fp64);
_vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q3_fp64);
_vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q4_fp64);
_vlog_z = v_div(_vlog_y, _vlog_z);
_vlog_z = v_sub(_vlog_z, v_mul(_vlog_e, _vlog_C0_fp64));
_vlog_z = v_sub(_vlog_z, v_mul(_vlog_xx, v_setall_<_TpVec64F>(0.5)));
_vlog_z = v_add(_vlog_z, _vlog_x);
_vlog_z = v_fma(_vlog_e, _vlog_C1_fp64, _vlog_z);
// log(0) -> -INF
_TpVec64F mask_zero = v_eq(x, v_setzero_<_TpVec64F>());
_vlog_z = v_select(mask_zero, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0xfff0000000000000)), _vlog_z);
// log(NEG), log(NAN) -> NAN
_TpVec64F mask_not_nan = v_ge(x, v_setzero_<_TpVec64F>());
_vlog_z = v_select(mask_not_nan, _vlog_z, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff8000000000000)));
// log(INF) -> INF
_TpVec64F mask_inf = v_eq(x, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff0000000000000)));
_vlog_z = v_select(mask_inf, x, _vlog_z);
return _vlog_z;
}
//! @}
//! @name Sine and Cosine
//! @{
template<typename _TpVec16F, typename _TpVec16S>
inline void v_sincos_default_16f(const _TpVec16F &x, _TpVec16F &ysin, _TpVec16F &ycos) {
const _TpVec16F v_cephes_FOPI = v_setall_<_TpVec16F>(hfloat(1.27323954473516f)); // 4 / M_PI
const _TpVec16F v_minus_DP1 = v_setall_<_TpVec16F>(hfloat(-0.78515625f));
const _TpVec16F v_minus_DP2 = v_setall_<_TpVec16F>(hfloat(-2.4187564849853515625E-4f));
const _TpVec16F v_minus_DP3 = v_setall_<_TpVec16F>(hfloat(-3.77489497744594108E-8f));
const _TpVec16F v_sincof_p0 = v_setall_<_TpVec16F>(hfloat(-1.9515295891E-4f));
const _TpVec16F v_sincof_p1 = v_setall_<_TpVec16F>(hfloat(8.3321608736E-3f));
const _TpVec16F v_sincof_p2 = v_setall_<_TpVec16F>(hfloat(-1.6666654611E-1f));
const _TpVec16F v_coscof_p0 = v_setall_<_TpVec16F>(hfloat(2.443315711809948E-5f));
const _TpVec16F v_coscof_p1 = v_setall_<_TpVec16F>(hfloat(-1.388731625493765E-3f));
const _TpVec16F v_coscof_p2 = v_setall_<_TpVec16F>(hfloat(4.166664568298827E-2f));
const _TpVec16F v_nan = v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00));
const _TpVec16F v_neg_zero = v_setall_<_TpVec16F>(hfloat(-0.f));
_TpVec16F _vx, _vy, sign_mask_sin, sign_mask_cos;
_TpVec16S emm2;
sign_mask_sin = v_lt(x, v_setzero_<_TpVec16F>());
_vx = v_abs(x);
_vy = v_mul(_vx, v_cephes_FOPI);
emm2 = v_trunc(_vy);
emm2 = v_add(emm2, v_setall_<_TpVec16S>((short)1));
emm2 = v_and(emm2, v_setall_<_TpVec16S>((short)~1));
_vy = v_cvt_f16(emm2);
_TpVec16F poly_mask = v_reinterpret_as_f16(v_eq(v_and(emm2, v_setall_<_TpVec16S>((short)2)), v_setall_<_TpVec16S>((short)0)));
_vx = v_fma(_vy, v_minus_DP1, _vx);
_vx = v_fma(_vy, v_minus_DP2, _vx);
_vx = v_fma(_vy, v_minus_DP3, _vx);
sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f16(v_eq(v_and(emm2, v_setall_<_TpVec16S>((short)4)), v_setall_<_TpVec16S>((short)0))));
sign_mask_cos = v_reinterpret_as_f16(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec16S>((short)2)), v_setall_<_TpVec16S>((short)4)), v_setall_<_TpVec16S>((short)0)));
_TpVec16F _vxx = v_mul(_vx, _vx);
_TpVec16F y1, y2;
y1 = v_fma(v_coscof_p0, _vxx, v_coscof_p1);
y1 = v_fma(y1, _vxx, v_coscof_p2);
y1 = v_fma(y1, _vxx, v_setall_<_TpVec16F>(hfloat(-0.5f)));
y1 = v_fma(y1, _vxx, v_setall_<_TpVec16F>(hfloat(1.f)));
y2 = v_fma(v_sincof_p0, _vxx, v_sincof_p1);
y2 = v_fma(y2, _vxx, v_sincof_p2);
y2 = v_mul(y2, _vxx);
y2 = v_fma(y2, _vx, _vx);
ysin = v_select(poly_mask, y2, y1);
ycos = v_select(poly_mask, y1, y2);
ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin));
ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos);
// sincos(NAN) -> NAN, sincos(±INF) -> NAN
_TpVec16F mask_inf = v_eq(_vx, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7c00)));
_TpVec16F mask_nan = v_or(mask_inf, v_ne(x, x));
ysin = v_select(mask_nan, v_nan, ysin);
ycos = v_select(mask_nan, v_nan, ycos);
}
template<typename _TpVec16F, typename _TpVec16S>
inline _TpVec16F v_sin_default_16f(const _TpVec16F &x) {
_TpVec16F ysin, ycos;
v_sincos_default_16f<_TpVec16F, _TpVec16S>(x, ysin, ycos);
return ysin;
}
template<typename _TpVec16F, typename _TpVec16S>
inline _TpVec16F v_cos_default_16f(const _TpVec16F &x) {
_TpVec16F ysin, ycos;
v_sincos_default_16f<_TpVec16F, _TpVec16S>(x, ysin, ycos);
return ycos;
}
template<typename _TpVec32F, typename _TpVec32S>
inline void v_sincos_default_32f(const _TpVec32F &x, _TpVec32F &ysin, _TpVec32F &ycos) {
const _TpVec32F v_cephes_FOPI = v_setall_<_TpVec32F>(1.27323954473516f); // 4 / M_PI
const _TpVec32F v_minus_DP1 = v_setall_<_TpVec32F>(-0.78515625f);
const _TpVec32F v_minus_DP2 = v_setall_<_TpVec32F>(-2.4187564849853515625E-4f);
const _TpVec32F v_minus_DP3 = v_setall_<_TpVec32F>(-3.77489497744594108E-8f);
const _TpVec32F v_sincof_p0 = v_setall_<_TpVec32F>(-1.9515295891E-4f);
const _TpVec32F v_sincof_p1 = v_setall_<_TpVec32F>(8.3321608736E-3f);
const _TpVec32F v_sincof_p2 = v_setall_<_TpVec32F>(-1.6666654611E-1f);
const _TpVec32F v_coscof_p0 = v_setall_<_TpVec32F>(2.443315711809948E-5f);
const _TpVec32F v_coscof_p1 = v_setall_<_TpVec32F>(-1.388731625493765E-3f);
const _TpVec32F v_coscof_p2 = v_setall_<_TpVec32F>(4.166664568298827E-2f);
const _TpVec32F v_nan = v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000));
const _TpVec32F v_neg_zero = v_setall_<_TpVec32F>(-0.f);
_TpVec32F _vx, _vy, sign_mask_sin, sign_mask_cos;
_TpVec32S emm2;
sign_mask_sin = v_lt(x, v_setzero_<_TpVec32F>());
_vx = v_abs(x);
_vy = v_mul(_vx, v_cephes_FOPI);
emm2 = v_trunc(_vy);
emm2 = v_add(emm2, v_setall_<_TpVec32S>(1));
emm2 = v_and(emm2, v_setall_<_TpVec32S>(~1));
_vy = v_cvt_f32(emm2);
_TpVec32F poly_mask = v_reinterpret_as_f32(v_eq(v_and(emm2, v_setall_<_TpVec32S>(2)), v_setall_<_TpVec32S>(0)));
_vx = v_fma(_vy, v_minus_DP1, _vx);
_vx = v_fma(_vy, v_minus_DP2, _vx);
_vx = v_fma(_vy, v_minus_DP3, _vx);
sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f32(v_eq(v_and(emm2, v_setall_<_TpVec32S>(4)), v_setall_<_TpVec32S>(0))));
sign_mask_cos = v_reinterpret_as_f32(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec32S>(2)), v_setall_<_TpVec32S>(4)), v_setall_<_TpVec32S>(0)));
_TpVec32F _vxx = v_mul(_vx, _vx);
_TpVec32F y1, y2;
y1 = v_fma(v_coscof_p0, _vxx, v_coscof_p1);
y1 = v_fma(y1, _vxx, v_coscof_p2);
y1 = v_fma(y1, _vxx, v_setall_<_TpVec32F>(-0.5f));
y1 = v_fma(y1, _vxx, v_setall_<_TpVec32F>(1.f));
y2 = v_fma(v_sincof_p0, _vxx, v_sincof_p1);
y2 = v_fma(y2, _vxx, v_sincof_p2);
y2 = v_mul(y2, _vxx);
y2 = v_fma(y2, _vx, _vx);
ysin = v_select(poly_mask, y2, y1);
ycos = v_select(poly_mask, y1, y2);
ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin));
ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos);
// sincos(NAN) -> NAN, sincos(±INF) -> NAN
_TpVec32F mask_inf = v_eq(_vx, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7f800000)));
_TpVec32F mask_nan = v_or(mask_inf, v_ne(x, x));
ysin = v_select(mask_nan, v_nan, ysin);
ycos = v_select(mask_nan, v_nan, ycos);
}
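The reduction logic above is compact but easy to misread, so here is a scalar sketch of the same float32 sincos: octant index j = trunc(|x|·4/π) rounded up to even, a three-part Cody-Waite subtraction of j·π/4, then the sin/cos polynomials swapped and sign-flipped per octant. sincos_ref is a hypothetical name; the NaN/inf tail is omitted and, as in the vector code, the integer reduction limits the valid |x| range.

#include <cmath>

static void sincos_ref(float x, float& s, float& c) // scalar sketch of v_sincos_default_32f
{
    bool sin_flip = (x < 0.f);
    float ax = std::fabs(x);
    int j = (int)(ax * 1.27323954473516f);   // trunc(ax / (pi/4))
    j = (j + 1) & ~1;                        // round up to an even octant
    float y = (float)j;
    bool poly_sel = ((j & 2) == 0);          // which polynomial holds sin
    ax = std::fma(y, -0.78515625f, ax);                // ax - j*pi/4, done in
    ax = std::fma(y, -2.4187564849853515625E-4f, ax);  // three parts (-DP1..-DP3)
    ax = std::fma(y, -3.77489497744594108E-8f, ax);
    sin_flip ^= ((j & 4) == 0);
    bool cos_keep = (((j - 2) & 4) == 0);
    float xx = ax * ax;
    float y1 = std::fma(2.443315711809948E-5f, xx, -1.388731625493765E-3f);
    y1 = std::fma(y1, xx, 4.166664568298827E-2f);      // cos polynomial
    y1 = std::fma(y1, xx, -0.5f);
    y1 = std::fma(y1, xx, 1.f);
    float y2 = std::fma(-1.9515295891E-4f, xx, 8.3321608736E-3f);
    y2 = std::fma(y2, xx, -1.6666654611E-1f);          // sin polynomial
    y2 = std::fma(y2 * xx, ax, ax);
    s = poly_sel ? y2 : y1;
    c = poly_sel ? y1 : y2;
    if (!sin_flip) s = -s;   // mirrors the v_select / v_xor sign handling
    if (cos_keep)  c = -c;
}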
template<typename _TpVec32F, typename _TpVec32S>
inline _TpVec32F v_sin_default_32f(const _TpVec32F &x) {
_TpVec32F ysin, ycos;
v_sincos_default_32f<_TpVec32F, _TpVec32S>(x, ysin, ycos);
return ysin;
}
template<typename _TpVec32F, typename _TpVec32S>
inline _TpVec32F v_cos_default_32f(const _TpVec32F &x) {
_TpVec32F ysin, ycos;
v_sincos_default_32f<_TpVec32F, _TpVec32S>(x, ysin, ycos);
return ycos;
}
template<typename _TpVec64F, typename _TpVec64S>
inline void v_sincos_default_64f(const _TpVec64F &x, _TpVec64F &ysin, _TpVec64F &ycos) {
const _TpVec64F v_cephes_FOPI = v_setall_<_TpVec64F>(1.2732395447351626861510701069801148); // 4 / M_PI
const _TpVec64F v_minus_DP1 = v_setall_<_TpVec64F>(-7.853981554508209228515625E-1);
const _TpVec64F v_minus_DP2 = v_setall_<_TpVec64F>(-7.94662735614792836714E-9);
const _TpVec64F v_minus_DP3 = v_setall_<_TpVec64F>(-3.06161699786838294307E-17);
const _TpVec64F v_sin_C1 = v_setall_<_TpVec64F>(1.58962301576546568060E-10);
const _TpVec64F v_sin_C2 = v_setall_<_TpVec64F>(-2.50507477628578072866E-8);
const _TpVec64F v_sin_C3 = v_setall_<_TpVec64F>(2.75573136213857245213E-6);
const _TpVec64F v_sin_C4 = v_setall_<_TpVec64F>(-1.98412698295895385996E-4);
const _TpVec64F v_sin_C5 = v_setall_<_TpVec64F>(8.33333333332211858878E-3);
const _TpVec64F v_sin_C6 = v_setall_<_TpVec64F>(-1.66666666666666307295E-1);
const _TpVec64F v_cos_C1 = v_setall_<_TpVec64F>(-1.13585365213876817300E-11);
const _TpVec64F v_cos_C2 = v_setall_<_TpVec64F>(2.08757008419747316778E-9);
const _TpVec64F v_cos_C3 = v_setall_<_TpVec64F>(-2.75573141792967388112E-7);
const _TpVec64F v_cos_C4 = v_setall_<_TpVec64F>(2.48015872888517045348E-5);
const _TpVec64F v_cos_C5 = v_setall_<_TpVec64F>(-1.38888888888730564116E-3);
const _TpVec64F v_cos_C6 = v_setall_<_TpVec64F>(4.16666666666665929218E-2);
const _TpVec64F v_nan = v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff8000000000000));
const _TpVec64F v_neg_zero = v_setall_<_TpVec64F>(-0.0);
_TpVec64F _vx, _vy, sign_mask_sin, sign_mask_cos;
_TpVec64S emm2;
sign_mask_sin = v_lt(x, v_setzero_<_TpVec64F>());
_vx = v_abs(x);
_vy = v_mul(_vx, v_cephes_FOPI);
emm2 = v_expand_low(v_trunc(_vy));
emm2 = v_add(emm2, v_setall_<_TpVec64S>((int64)1));
emm2 = v_and(emm2, v_setall_<_TpVec64S>((int64)~1));
_vy = v_cvt_f64(emm2);
_TpVec64F poly_mask = v_reinterpret_as_f64(v_eq(v_and(emm2, v_setall_<_TpVec64S>((int64)2)), v_setall_<_TpVec64S>((int64)0)));
_vx = v_fma(_vy, v_minus_DP1, _vx);
_vx = v_fma(_vy, v_minus_DP2, _vx);
_vx = v_fma(_vy, v_minus_DP3, _vx);
sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f64(v_eq(v_and(emm2, v_setall_<_TpVec64S>((int64)4)), v_setall_<_TpVec64S>((int64)0))));
sign_mask_cos = v_reinterpret_as_f64(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec64S>((int64)2)), v_setall_<_TpVec64S>((int64)4)), v_setall_<_TpVec64S>((int64)0)));
_TpVec64F _vxx = v_mul(_vx, _vx);
_TpVec64F y1, y2;
y1 = v_fma(v_cos_C1, _vxx, v_cos_C2);
y1 = v_fma(y1, _vxx, v_cos_C3);
y1 = v_fma(y1, _vxx, v_cos_C4);
y1 = v_fma(y1, _vxx, v_cos_C5);
y1 = v_fma(y1, _vxx, v_cos_C6);
y1 = v_fma(y1, _vxx, v_setall_<_TpVec64F>(-0.5));
y1 = v_fma(y1, _vxx, v_setall_<_TpVec64F>(1.0));
y2 = v_fma(v_sin_C1, _vxx, v_sin_C2);
y2 = v_fma(y2, _vxx, v_sin_C3);
y2 = v_fma(y2, _vxx, v_sin_C4);
y2 = v_fma(y2, _vxx, v_sin_C5);
y2 = v_fma(y2, _vxx, v_sin_C6);
y2 = v_mul(y2, _vxx);
y2 = v_fma(y2, _vx, _vx);
ysin = v_select(poly_mask, y2, y1);
ycos = v_select(poly_mask, y1, y2);
ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin));
ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos);
// sincos(NAN) -> NAN, sincos(±INF) -> NAN
_TpVec64F mask_inf = v_eq(_vx, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff0000000000000)));
_TpVec64F mask_nan = v_or(mask_inf, v_ne(x, x));
ysin = v_select(mask_nan, v_nan, ysin);
ycos = v_select(mask_nan, v_nan, ycos);
}
template<typename _TpVec64F, typename _TpVec64S>
inline _TpVec64F v_sin_default_64f(const _TpVec64F &x) {
_TpVec64F ysin, ycos;
v_sincos_default_64f<_TpVec64F, _TpVec64S>(x, ysin, ycos);
return ysin;
}
template<typename _TpVec64F, typename _TpVec64S>
inline _TpVec64F v_cos_default_64f(const _TpVec64F &x) {
_TpVec64F ysin, ycos;
v_sincos_default_64f<_TpVec64F, _TpVec64S>(x, ysin, ycos);
return ycos;
}
//! @}
/* This implementation is derived from the Error Function (Erf) approximation approach used in PyTorch
https://github.com/pytorch/pytorch/blob/9c50ecc84b9a6e699a7f058891b889aafbf976c7/aten/src/ATen/cpu/vec/vec512/vec512_float.h#L189-L220
*/
//! @name Error Function
//! @{
template<typename _TpVec32F, typename _TpVec32S>
inline _TpVec32F v_erf_default_32f(const _TpVec32F &v) {
const _TpVec32F coef0 = v_setall_<_TpVec32F>(0.3275911f),
coef1 = v_setall_<_TpVec32F>(1.061405429f),
coef2 = v_setall_<_TpVec32F>(-1.453152027f),
coef3 = v_setall_<_TpVec32F>(1.421413741f),
coef4 = v_setall_<_TpVec32F>(-0.284496736f),
coef5 = v_setall_<_TpVec32F>(0.254829592f),
ones = v_setall_<_TpVec32F>(1.0f),
neg_zeros = v_setall_<_TpVec32F>(-0.f);
_TpVec32F t = v_abs(v);
// sign(v)
_TpVec32F sign_mask = v_and(neg_zeros, v);
t = v_div(ones, v_fma(coef0, t, ones));
_TpVec32F r = v_fma(coef1, t, coef2);
r = v_fma(r, t, coef3);
r = v_fma(r, t, coef4);
r = v_fma(r, t, coef5);
// - v * v
_TpVec32F v2 = v_mul(v, v);
_TpVec32F mv2 = v_xor(neg_zeros, v2);
// - exp(- v * v)
_TpVec32F exp = v_exp_default_32f<_TpVec32F, _TpVec32S>(mv2);
_TpVec32F neg_exp = v_xor(neg_zeros, exp);
_TpVec32F res = v_mul(t, neg_exp);
res = v_fma(r, res, ones);
return v_xor(sign_mask, res);
}
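For completeness, a scalar sketch of the erf path above (Abramowitz & Stegun 7.1.26, with the same constants as the PyTorch code it cites): erf(x) ≈ sign(x)·(1 − P(t)·e^(−x²)) with t = 1/(1 + 0.3275911·|x|). erf_ref is a hypothetical name, and std::exp stands in for the v_exp_default_32f call.

#include <cmath>

static float erf_ref(float x) // scalar sketch of v_erf_default_32f
{
    float t = 1.f / std::fma(0.3275911f, std::fabs(x), 1.f);
    float r = std::fma(1.061405429f, t, -1.453152027f);  // degree-4 chain in t
    r = std::fma(r, t,  1.421413741f);
    r = std::fma(r, t, -0.284496736f);
    r = std::fma(r, t,  0.254829592f);
    float res = std::fma(r, t * -std::exp(-x * x), 1.f); // 1 - r*t*exp(-x^2)
    return std::copysign(res, x);
}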
//! @}
#endif // OPENCV_HAL_INTRIN_MATH_HPP

View File

@@ -235,8 +235,6 @@ struct v_float64x2
#define OPENCV_HAL_IMPL_MSA_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(msa_dupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(msa_dupq_n_##suffix(v)); } \
template <> inline v_##_Tpv v_setzero_() { return v_setzero_##suffix(); } \
template <> inline v_##_Tpv v_setall_(_Tp v) { return v_setall_##suffix(v); } \
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(MSA_TPV_REINTERPRET(v16u8, v.val)); } \
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(MSA_TPV_REINTERPRET(v16i8, v.val)); } \
inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(MSA_TPV_REINTERPRET(v8u16, v.val)); } \
@@ -347,46 +345,53 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
}
#define OPENCV_HAL_IMPL_MSA_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
a.val = intrin(a.val, b.val); \
return a; \
}
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint8x16, msa_qaddq_u8)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint8x16, msa_qsubq_u8)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int8x16, msa_qaddq_s8)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int8x16, msa_qsubq_s8)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint16x8, msa_qaddq_u16)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint16x8, msa_qsubq_u16)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int16x8, msa_qaddq_s16)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int16x8, msa_qsubq_s16)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int32x4, msa_addq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int32x4, msa_subq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_int32x4, msa_mulq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint32x4, msa_addq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint32x4, msa_subq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_uint32x4, msa_mulq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_float32x4, msa_addq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_float32x4, msa_subq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_float32x4, msa_mulq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int64x2, msa_addq_s64)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int64x2, msa_subq_s64)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint64x2, msa_addq_u64)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint64x2, msa_subq_u64)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_div, v_float32x4, msa_divq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_float64x2, msa_addq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_float64x2, msa_subq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_float64x2, msa_mulq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_div, v_float64x2, msa_divq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint8x16, msa_qaddq_u8)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint8x16, msa_qsubq_u8)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int8x16, msa_qaddq_s8)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int8x16, msa_qsubq_s8)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint16x8, msa_qaddq_u16)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint16x8, msa_qsubq_u16)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int16x8, msa_qaddq_s16)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int16x8, msa_qsubq_s16)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int32x4, msa_addq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int32x4, msa_subq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_int32x4, msa_mulq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint32x4, msa_addq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint32x4, msa_subq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_uint32x4, msa_mulq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float32x4, msa_addq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float32x4, msa_subq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float32x4, msa_mulq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int64x2, msa_addq_s64)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int64x2, msa_subq_s64)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint64x2, msa_addq_u64)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint64x2, msa_subq_u64)
OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float32x4, msa_divq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float64x2, msa_addq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float64x2, msa_subq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float64x2, msa_mulq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float64x2, msa_divq_f64)
// saturating multiply 8-bit, 16-bit
#define OPENCV_HAL_IMPL_MSA_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpwvec c, d; \
v_mul_expand(a, b, c, d); \
return v_pack(c, d); \
}
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{a = a * b; return a; }
OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int8x16, v_int16x8)
OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint8x16, v_uint16x8)
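// A minimal scalar model of the saturating multiply above (v_mul_expand to a
// wider type, then v_pack back down with saturation). Sketch only, not part of
// the patch; mul_sat_s8 is a hypothetical name.
#include <algorithm>
inline signed char mul_sat_s8(signed char a, signed char b)
{
    int p = (int)a * (int)b;                              // widening multiply
    return (signed char)std::min(std::max(p, -128), 127); // saturating narrow
}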
@@ -541,13 +546,13 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
return v_int64x2(msa_hadd_s64(prod, prod));
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
// 32 >> 64f
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
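// Scalar reference for the 16 >> 64 expansion above: each 64-bit output lane
// sums four adjacent 16-bit products. Illustrative sketch, not HAL API.
#include <cstdint>
inline void dotprod_expand_s16_ref(const int16_t a[8], const int16_t b[8], int64_t out[2])
{
    for (int i = 0; i < 2; i++)
    {
        int64_t s = 0;
        for (int j = 0; j < 4; j++)                // four products per output lane
            s += (int64_t)a[4*i + j] * b[4*i + j];
        out[i] = s;
    }
}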
//////// Fast Dot Product ////////
@@ -591,10 +596,10 @@ inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b,
{ return v_dotprod_expand(a, b, c); }
#define OPENCV_HAL_IMPL_MSA_LOGIC_OP(_Tpvec, _Tpv, suffix) \
OPENCV_HAL_IMPL_MSA_BIN_OP(v_and, _Tpvec, msa_andq_##suffix) \
OPENCV_HAL_IMPL_MSA_BIN_OP(v_or, _Tpvec, msa_orrq_##suffix) \
OPENCV_HAL_IMPL_MSA_BIN_OP(v_xor, _Tpvec, msa_eorq_##suffix) \
inline _Tpvec v_not(const _Tpvec& a) \
OPENCV_HAL_IMPL_MSA_BIN_OP(&, _Tpvec, msa_andq_##suffix) \
OPENCV_HAL_IMPL_MSA_BIN_OP(|, _Tpvec, msa_orrq_##suffix) \
OPENCV_HAL_IMPL_MSA_BIN_OP(^, _Tpvec, msa_eorq_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_u8(MSA_TPV_REINTERPRET(v16u8, a.val)))); \
}
@@ -609,16 +614,21 @@ OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint64x2, v2u64, u64)
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int64x2, v2i64, s64)
#define OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 bin_op(const v_float32x4& a, const v_float32x4& b) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
return v_float32x4(MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val)))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
a.val = MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val))); \
return a; \
}
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(v_and, msa_andq_s32)
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(v_or, msa_orrq_s32)
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(v_xor, msa_eorq_s32)
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(&, msa_andq_s32)
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(|, msa_orrq_s32)
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(^, msa_eorq_s32)
inline v_float32x4 v_not(const v_float32x4& a)
inline v_float32x4 operator ~ (const v_float32x4& a)
{
return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
}
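// The float bit-ops above are reinterpret-to-int, operate, reinterpret-back.
// Scalar sketch of the same idea (hypothetical name, not part of the patch):
#include <cstdint>
#include <cstring>
inline float f32_bit_and(float a, float b)
{
    std::uint32_t ia, ib;
    std::memcpy(&ia, &a, sizeof(ia));   // MSA_TPV_REINTERPRET(v4i32, ...) per lane
    std::memcpy(&ib, &b, sizeof(ib));
    ia &= ib;                           // msa_andq_s32 per lane
    float r;
    std::memcpy(&r, &ia, sizeof(r));    // reinterpret back to f32
    return r;
}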
@@ -649,16 +659,21 @@ OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_sqrt, msa_sqrtq_f64)
OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_invsqrt, msa_rsqrtq_f64)
#define OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(bin_op, intrin) \
inline v_float64x2 bin_op(const v_float64x2& a, const v_float64x2& b) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
{ \
return v_float64x2(MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val)))); \
} \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
{ \
a.val = MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val))); \
return a; \
}
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(v_and, msa_andq_s64)
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(v_or, msa_orrq_s64)
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(v_xor, msa_eorq_s64)
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(&, msa_andq_s64)
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(|, msa_orrq_s64)
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(^, msa_eorq_s64)
inline v_float64x2 v_not(const v_float64x2& a)
inline v_float64x2 operator ~ (const v_float64x2& a)
{
return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
}
@@ -689,17 +704,17 @@ OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_min, msa_minq_f64)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_max, msa_maxq_f64)
#define OPENCV_HAL_IMPL_MSA_INT_CMP_OP(_Tpvec, _Tpv, suffix, not_suffix) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ceqq_##suffix(a.val, b.val))); } \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_##not_suffix(msa_ceqq_##suffix(a.val, b.val)))); } \
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cltq_##suffix(a.val, b.val))); } \
inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgtq_##suffix(a.val, b.val))); } \
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cleq_##suffix(a.val, b.val))); } \
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgeq_##suffix(a.val, b.val))); }
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint8x16, v16u8, u8, u8)
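// Per-lane model of these comparisons: the destination lane is all ones when
// the predicate holds and all zeros otherwise (sketch, hypothetical name):
inline unsigned char cmp_eq_u8_lane(unsigned char a, unsigned char b)
{ return (unsigned char)(a == b ? 0xFF : 0x00); }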
@@ -806,9 +821,9 @@ inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_
// trade efficiency for convenience
#define OPENCV_HAL_IMPL_MSA_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
inline _Tpvec v_shl(const _Tpvec& a, int n) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ return _Tpvec(msa_shlq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
inline _Tpvec v_shr(const _Tpvec& a, int n) \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ return _Tpvec(msa_shrq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(msa_shlq_n_##suffix(a.val, n)); } \
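// "Trade efficiency for convenience": the run-time forms broadcast the scalar
// count into every lane (msa_dupq_n_*) and use the vector-vector shift, while
// the template forms map onto the immediate-count instruction, e.g. for the
// u16 instantiation (illustrative mapping, suffixes approximate):
//   v << 3      -> msa_shlq_u16(v.val, msa_dupq_n_s16(3))
//   v_shl<3>(v) -> msa_shlq_n_u16(v.val, 3)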
@@ -1863,20 +1878,6 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
inline void v_cleanup() {}
#include "intrin_math.hpp"
inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond


@@ -56,7 +56,7 @@ namespace cv
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#define CV_SIMD128 1
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
#if defined(__aarch64__) || defined(_M_ARM64)
#define CV_SIMD128_64F 1
#else
#define CV_SIMD128_64F 0
@@ -72,7 +72,7 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
//
// [1] https://developer.arm.com/documentation/101028/0012/13--Advanced-SIMD--Neon--intrinsics
// [2] https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64) || defined(_M_ARM64EC)
#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
#define CV_NEON_AARCH64 1
#else
#define CV_NEON_AARCH64 0
@@ -381,8 +381,6 @@ private:
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
template <> inline v_##_Tpv v_setzero_() { return v_setzero_##suffix(); } \
template <> inline v_##_Tpv v_setall_(_Tp v) { return v_setall_##suffix(v); } \
inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
@@ -888,10 +886,9 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{
int16x8_t p0 = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
int16x8_t p1 = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
int32x4_t s0 = vaddl_s16(vget_low_s16(p0), vget_low_s16(p1));
return v_int32x4(vaddq_s32(s0, vaddl_s16(vget_high_s16(p0), vget_high_s16(p1))));
int16x8_t prod = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
prod = vmlal_s8(prod, vget_high_s8(a.val), vget_high_s8(b.val));
return v_int32x4(vaddl_s16(vget_low_s16(prod), vget_high_s16(prod)));
}
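// Both bodies gather the same four products per 32-bit lane (k, k+4, k+8,
// k+12 by my reading); the rewrite folds the high half in with vmlal_s8,
// trading one widening add for 16-bit intermediate accumulation. Scalar
// reference for the per-lane sum (illustrative, not HAL API):
#include <cstdint>
inline void dotprod_expand_fast_s8_ref(const int8_t a[16], const int8_t b[16],
                                       int32_t out[4])
{
    for (int k = 0; k < 4; k++)
        out[k] = (int32_t)a[k     ]*b[k     ] + (int32_t)a[k +  4]*b[k +  4]
               + (int32_t)a[k +  8]*b[k +  8] + (int32_t)a[k + 12]*b[k + 12];
}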
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{
@@ -1081,7 +1078,7 @@ OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
#if defined(__aarch64__) || defined(_M_ARM64)
static inline uint64x2_t vmvnq_u64(uint64x2_t a)
{
uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
@@ -1823,7 +1820,7 @@ inline v_int32x4 v_load_expand_q(const schar* ptr)
return v_int32x4(vmovl_s16(v1));
}
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
#if defined(__aarch64__) || defined(_M_ARM64)
#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
@@ -2649,28 +2646,6 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
inline void v_cleanup() {}
#include "intrin_math.hpp"
#if defined(CV_SIMD_FP16) && CV_SIMD_FP16
inline v_float16x8 v_exp(const v_float16x8& x) { return v_exp_default_16f<v_float16x8, v_int16x8>(x); }
inline v_float16x8 v_log(const v_float16x8& x) { return v_log_default_16f<v_float16x8, v_int16x8>(x); }
inline void v_sincos(const v_float16x8& x, v_float16x8& s, v_float16x8& c) { v_sincos_default_16f<v_float16x8, v_int16x8>(x, s, c); }
inline v_float16x8 v_sin(const v_float16x8& x) { return v_sin_default_16f<v_float16x8, v_int16x8>(x); }
inline v_float16x8 v_cos(const v_float16x8& x) { return v_cos_default_16f<v_float16x8, v_int16x8>(x); }
#endif
inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
#if CV_SIMD128_64F
inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
#endif
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond

File diff suppressed because it is too large


@@ -355,12 +355,10 @@ inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& v) { return v_float64
#define OPENCV_HAL_IMPL_RISCVV_INIT_SET(__Tp, _Tp, suffix, len, num) \
inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num(vmv_v_x_##len##m1(0, num)); } \
inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); } \
template <> inline v_##_Tp##x##num v_setzero_() { return v_setzero_##suffix(); } \
template <> inline v_##_Tp##x##num v_setall_(__Tp v) { return v_setall_##suffix(v); }
inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); }
OPENCV_HAL_IMPL_RISCVV_INIT_SET(uchar, uint8, u8, u8, 16)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(schar, int8, s8, i8, 16)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(char, int8, s8, i8, 16)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(ushort, uint16, u16, u16, 8)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(short, int16, s16, i16, 8)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned int, uint32, u32, u32, 4)
@@ -373,57 +371,72 @@ inline v_float32x4 v_setall_f32(float v) { return v_float32x4(vfmv_v_f_f32m1(v,
inline v_float64x2 v_setzero_f64() { return v_float64x2(vfmv_v_f_f64m1(0, 2)); }
inline v_float64x2 v_setall_f64(double v) { return v_float64x2(vfmv_v_f_f64m1(v, 2)); }
template <> inline v_float32x4 v_setzero_() { return v_setzero_f32(); }
template <> inline v_float32x4 v_setall_(float v) { return v_setall_f32(v); }
template <> inline v_float64x2 v_setzero_() { return v_setzero_f64(); }
template <> inline v_float64x2 v_setall_(double v) { return v_setall_f64(v); }
#define OPENCV_HAL_IMPL_RISCVV_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
a.val = intrin(a.val, b.val); \
return a; \
}
#define OPENCV_HAL_IMPL_RISCVV_BIN_OPN(bin_op, _Tpvec, intrin, num) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec(intrin(a.val, b.val, num)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
a.val = intrin(a.val, b.val, num); \
return a; \
}
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint8x16, vsaddu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint8x16, vssubu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int8x16, vsadd_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int8x16, vssub_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint16x8, vsaddu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint16x8, vssubu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int16x8, vsadd_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int16x8, vssub_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int32x4, vadd_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int32x4, vsub_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_int32x4, vmul_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint32x4, vadd_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint32x4, vsub_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_uint32x4, vmul_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int64x2, vadd_vv_i64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int64x2, vsub_vv_i64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint64x2, vadd_vv_u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint64x2, vsub_vv_u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_float32x4, vfadd_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_float32x4, vfsub_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_float32x4, vfmul_vv_f32m1, 4)
inline v_float32x4 v_div(const v_float32x4& a, const v_float32x4& b)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint8x16, vsaddu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint8x16, vssubu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int8x16, vsadd_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int8x16, vssub_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint16x8, vsaddu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint16x8, vssubu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int16x8, vsadd_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int16x8, vssub_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int32x4, vadd_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int32x4, vsub_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_int32x4, vmul_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint32x4, vadd_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint32x4, vsub_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_uint32x4, vmul_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vadd_vv_i64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vsub_vv_i64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint64x2, vadd_vv_u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint64x2, vsub_vv_u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float32x4, vfadd_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float32x4, vfsub_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float32x4, vfmul_vv_f32m1, 4)
inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
{
return v_float32x4(vfdiv_vv_f32m1(a.val, b.val, 4));
}
inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
{
a.val = vfdiv_vv_f32m1(a.val, b.val, 4);
return a;
}
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_float64x2, vfadd_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_float64x2, vfsub_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_float64x2, vfmul_vv_f64m1, 2)
inline v_float64x2 v_div(const v_float64x2& a, const v_float64x2& b)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float64x2, vfadd_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float64x2, vfsub_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float64x2, vfmul_vv_f64m1, 2)
inline v_float64x2 operator / (const v_float64x2& a, const v_float64x2& b)
{
return v_float64x2(vfdiv_vv_f64m1(a.val, b.val, 2));
}
inline v_float64x2& operator /= (v_float64x2& a, const v_float64x2& b)
{
a.val = vfdiv_vv_f64m1(a.val, b.val, 2);
return a;
}
// TODO: exp, log, sin, cos
#define OPENCV_HAL_IMPL_RISCVV_BIN_FUNC(_Tpvec, func, intrin) \
@@ -549,10 +562,10 @@ inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_
}
#define OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(_Tpvec, suffix, num) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_and, _Tpvec, vand_vv_##suffix, num) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_or, _Tpvec, vor_vv_##suffix, num) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_xor, _Tpvec, vxor_vv_##suffix, num) \
inline _Tpvec v_not(const _Tpvec & a) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(&, _Tpvec, vand_vv_##suffix, num) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(|, _Tpvec, vor_vv_##suffix, num) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(^, _Tpvec, vxor_vv_##suffix, num) \
inline _Tpvec operator ~ (const _Tpvec & a) \
{ \
return _Tpvec(vnot_v_##suffix(a.val, num)); \
}
@@ -567,31 +580,41 @@ OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int32x4, i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int64x2, i64m1, 2)
#define OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 bin_op(const v_float32x4& a, const v_float32x4& b) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
a.val = vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4)); \
return a; \
}
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(v_and, vand_vv_i32m1)
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(v_or, vor_vv_i32m1)
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(v_xor, vxor_vv_i32m1)
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(&, vand_vv_i32m1)
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(|, vor_vv_i32m1)
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(^, vxor_vv_i32m1)
inline v_float32x4 v_not(const v_float32x4& a)
inline v_float32x4 operator ~ (const v_float32x4& a)
{
return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 4)));
}
#define OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(bin_op, intrin) \
inline v_float64x2 bin_op(const v_float64x2& a, const v_float64x2& b) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
{ \
return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2))); \
} \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
{ \
a.val = vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2)); \
return a; \
}
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(v_and, vand_vv_i64m1)
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(v_or, vor_vv_i64m1)
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(v_xor, vxor_vv_i64m1)
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(&, vand_vv_i64m1)
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(|, vor_vv_i64m1)
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(^, vxor_vv_i64m1)
inline v_float64x2 v_not(const v_float64x2& a)
inline v_float64x2 operator ~ (const v_float64x2& a)
{
return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a.val), 2)));
}
@@ -1151,32 +1174,32 @@ OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int32x4, v_uint32x4)
OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint32x4, v_uint32x4)
#define OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(_Tpvec, _Tp, _T, num, uv) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ \
vbool##_T##_t mask = vmseq_vv_##_Tp##_b##_T(a.val, b.val, num); \
return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
} \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ \
vbool##_T##_t mask = vmsne_vv_##_Tp##_b##_T(a.val, b.val, num); \
return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
} \
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ \
vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(a.val, b.val, num); \
return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
} \
inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ \
vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(b.val, a.val, num); \
return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
} \
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ \
vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(a.val, b.val, num); \
return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
} \
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ \
vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(b.val, a.val, num); \
return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
@@ -1192,37 +1215,37 @@ OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint32x4, u32m1, 32, 4, u_vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint64x2, u64m1, 64, 2, u_vv_)
//TODO: ==
inline v_float32x4 v_eq(const v_float32x4& a, const v_float32x4& b)
inline v_float32x4 operator == (const v_float32x4& a, const v_float32x4& b)
{
vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, b.val, 4);
vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
}
inline v_float32x4 v_ne(const v_float32x4& a, const v_float32x4& b)
inline v_float32x4 operator != (const v_float32x4& a, const v_float32x4& b)
{
vbool32_t mask = vmfne_vv_f32m1_b32(a.val, b.val, 4);
vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
}
inline v_float32x4 v_lt(const v_float32x4& a, const v_float32x4& b)
inline v_float32x4 operator < (const v_float32x4& a, const v_float32x4& b)
{
vbool32_t mask = vmflt_vv_f32m1_b32(a.val, b.val, 4);
vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
}
inline v_float32x4 v_le(const v_float32x4& a, const v_float32x4& b)
inline v_float32x4 operator <= (const v_float32x4& a, const v_float32x4& b)
{
vbool32_t mask = vmfle_vv_f32m1_b32(a.val, b.val, 4);
vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
}
inline v_float32x4 v_gt(const v_float32x4& a, const v_float32x4& b)
inline v_float32x4 operator > (const v_float32x4& a, const v_float32x4& b)
{
vbool32_t mask = vmfgt_vv_f32m1_b32(a.val, b.val, 4);
vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
}
inline v_float32x4 v_ge(const v_float32x4& a, const v_float32x4& b)
inline v_float32x4 operator >= (const v_float32x4& a, const v_float32x4& b)
{
vbool32_t mask = vmfge_vv_f32m1_b32(a.val, b.val, 4);
vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
@@ -1236,37 +1259,37 @@ inline v_float32x4 v_not_nan(const v_float32x4& a)
}
//TODO: ==
inline v_float64x2 v_eq(const v_float64x2& a, const v_float64x2& b)
inline v_float64x2 operator == (const v_float64x2& a, const v_float64x2& b)
{
vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, b.val, 2);
vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
}
inline v_float64x2 v_ne(const v_float64x2& a, const v_float64x2& b)
inline v_float64x2 operator != (const v_float64x2& a, const v_float64x2& b)
{
vbool64_t mask = vmfne_vv_f64m1_b64(a.val, b.val, 2);
vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
}
inline v_float64x2 v_lt(const v_float64x2& a, const v_float64x2& b)
inline v_float64x2 operator < (const v_float64x2& a, const v_float64x2& b)
{
vbool64_t mask = vmflt_vv_f64m1_b64(a.val, b.val, 2);
vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
}
inline v_float64x2 v_le(const v_float64x2& a, const v_float64x2& b)
inline v_float64x2 operator <= (const v_float64x2& a, const v_float64x2& b)
{
vbool64_t mask = vmfle_vv_f64m1_b64(a.val, b.val, 2);
vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
}
inline v_float64x2 v_gt(const v_float64x2& a, const v_float64x2& b)
inline v_float64x2 operator > (const v_float64x2& a, const v_float64x2& b)
{
vbool64_t mask = vmfgt_vv_f64m1_b64(a.val, b.val, 2);
vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
}
inline v_float64x2 v_ge(const v_float64x2& a, const v_float64x2& b)
inline v_float64x2 operator >= (const v_float64x2& a, const v_float64x2& b)
{
vbool64_t mask = vmfge_vv_f64m1_b64(a.val, b.val, 2);
vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
@@ -1308,13 +1331,13 @@ OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(float, f32)
#define OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(_Tpvec, suffix, _T, num) \
inline _Tpvec v_shl(const _Tpvec& a, int n) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); }
#define OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(_Tpvec, suffix, _T, num, intric) \
inline _Tpvec v_shr(const _Tpvec& a, int n) \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); } \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); }\
@@ -2014,11 +2037,13 @@ OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short)
// saturating multiply 8-bit, 16-bit
#define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, num, mul, cvt) \
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
auto res = mul(a.val, b.val, num); \
return _Tpvec(cvt(res, 0, num)); \
}
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }
OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16, 16, vwmul_vv_i16m2, vnclip_wx_i8m1)
OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, 16, vwmulu_vv_u16m2, vnclipu_wx_u8m1)
@@ -2820,7 +2845,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
const v_float64x2& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
{
vint64m2_t v1 = vwmul_vv_i64m2(a.val, b.val, 4);
@@ -2829,7 +2854,7 @@ inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
}
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ v_float64x2 res = v_dotprod_expand_fast(a, b);
return v_add(res, c); }
return res + c; }
#endif
////// FP16 support ///////
#if __riscv_v == 7000
@@ -2866,20 +2891,6 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
inline void v_cleanup() {}
#include "intrin_math.hpp"
inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond

File diff suppressed because it is too large


@@ -0,0 +1,768 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copied from
// https://github.com/riscv-non-isa/rvv-intrinsic-doc/tree/master/auto-generated/rvv-v0p10-compatible-headers
#ifndef __RVV_0P10_COMPATIBLE_HEADERS_OVERLOADED_NON_POLICY_H
#define __RVV_0P10_COMPATIBLE_HEADERS_OVERLOADED_NON_POLICY_H
// The maximum number of parameters is 20; this peak is set by segment load
// instructions with an NFIELD (NF) of 8. The 20 comes from 8 vector register
// pointers passed, 1 vector mask register, 8 passthrough registers for the
// undisturbed policy, and 3 for the address base, byte index, and vl.
#define _GET_OVERRIDE(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13,\
_14, _15, _16, _17, _18, _19, _20, NAME, ...) NAME
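// _GET_OVERRIDE is the classic count-the-arguments dispatcher: the trailing
// literals shift NAME onto the slot that matches the call's arity. A minimal
// self-contained sketch of the same trick (hypothetical names):
#define PICK_NAME(_1, _2, _3, NAME, ...) NAME
#define SUM2(a, b)    ((a) + (b))
#define SUM3(a, b, c) ((a) + (b) + (c))
#define SUM(...) PICK_NAME(__VA_ARGS__, SUM3, SUM2, 1)(__VA_ARGS__)
// SUM(1, 2)    -> SUM2(1, 2)    == 3
// SUM(1, 2, 3) -> SUM3(1, 2, 3) == 6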
#if __has_include ("riscv_vector.h")
#include <riscv_vector.h>
#endif
#ifndef __RISCV_VECTOR_H
#include_next <riscv_vector.h>
#endif
// masked functions
#define vmerge(mask, op1, op2, vl) __riscv_vmerge((op1), (op2), (mask), (vl))
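// The shim above only reorders operands: the 0.10 intrinsics took the mask
// first, the 1.0 __riscv_-prefixed ones take it after the sources, e.g.
//   vmerge(mask, vzero, -1, vl)  becomes  __riscv_vmerge(vzero, -1, mask, vl)
// (operand names are placeholders, illustrative only)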
// masked functions
#define vfmerge(mask, op1, op2, vl) __riscv_vfmerge((op1), (op2), (mask), (vl))
// masked functions
#define vcompress(mask, dest, src, vl) __riscv_vcompress_tu((dest), (src), (mask), (vl))
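// vcompress is mapped onto the tail-undisturbed (_tu) form so that lanes past
// the packed elements keep dest's old values, which matches the 0.10 variant
// where dest was an explicit operand. (illustrative note, by my reading)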
// Reinterpret between different type under the same SEW/LMUL
// Reinterpret between different SEW under the same LMUL
#define vse16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vse16, __riscv_vse16, 2, 1)(__VA_ARGS__)
#define vse32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vse32, __riscv_vse32, 2, 1)(__VA_ARGS__)
#define vse64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vse64, __riscv_vse64, 2, 1)(__VA_ARGS__)
#define vse8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vse8, __riscv_vse8, 2, 1)(__VA_ARGS__)
#define vsse16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsse16, __riscv_vsse16, 3, 2, 1)(__VA_ARGS__)
#define vsse32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsse32, __riscv_vsse32, 3, 2, 1)(__VA_ARGS__)
#define vsse64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsse64, __riscv_vsse64, 3, 2, 1)(__VA_ARGS__)
#define vsse8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsse8, __riscv_vsse8, 3, 2, 1)(__VA_ARGS__)
#define vloxei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vloxei8_tumu, 4, __riscv_vloxei8, 2, 1)(__VA_ARGS__)
#define vloxei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vloxei16_tumu, 4, __riscv_vloxei16, 2, 1)(__VA_ARGS__)
#define vloxei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vloxei32_tumu, 4, __riscv_vloxei32, 2, 1)(__VA_ARGS__)
#define vloxei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vloxei64_tumu, 4, __riscv_vloxei64, 2, 1)(__VA_ARGS__)
#define vluxei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vluxei8_tumu, 4, __riscv_vluxei8, 2, 1)(__VA_ARGS__)
#define vluxei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vluxei16_tumu, 4, __riscv_vluxei16, 2, 1)(__VA_ARGS__)
#define vluxei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vluxei32_tumu, 4, __riscv_vluxei32, 2, 1)(__VA_ARGS__)
#define vluxei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vluxei64_tumu, 4, __riscv_vluxei64, 2, 1)(__VA_ARGS__)
#define vsoxei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsoxei8, __riscv_vsoxei8, 3, 2, 1)(__VA_ARGS__)
#define vsoxei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsoxei16, __riscv_vsoxei16, 3, 2, 1)(__VA_ARGS__)
#define vsoxei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsoxei32, __riscv_vsoxei32, 3, 2, 1)(__VA_ARGS__)
#define vsoxei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsoxei64, __riscv_vsoxei64, 3, 2, 1)(__VA_ARGS__)
#define vsuxei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsuxei8, __riscv_vsuxei8, 3, 2, 1)(__VA_ARGS__)
#define vsuxei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsuxei16, __riscv_vsuxei16, 3, 2, 1)(__VA_ARGS__)
#define vsuxei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsuxei32, __riscv_vsuxei32, 3, 2, 1)(__VA_ARGS__)
#define vsuxei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsuxei64, __riscv_vsuxei64, 3, 2, 1)(__VA_ARGS__)
#define vsseg2e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsseg2e16, __riscv_vsseg2e16, 3, 2, 1)(__VA_ARGS__)
#define vsseg3e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsseg3e16, __riscv_vsseg3e16, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg4e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsseg4e16, __riscv_vsseg4e16, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg5e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsseg5e16, __riscv_vsseg5e16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg6e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsseg6e16, __riscv_vsseg6e16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg7e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsseg7e16, __riscv_vsseg7e16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg8e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsseg8e16, __riscv_vsseg8e16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg2e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsseg2e32, __riscv_vsseg2e32, 3, 2, 1)(__VA_ARGS__)
#define vsseg3e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsseg3e32, __riscv_vsseg3e32, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg4e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsseg4e32, __riscv_vsseg4e32, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg5e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsseg5e32, __riscv_vsseg5e32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg6e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsseg6e32, __riscv_vsseg6e32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg7e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsseg7e32, __riscv_vsseg7e32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg8e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsseg8e32, __riscv_vsseg8e32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg2e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsseg2e64, __riscv_vsseg2e64, 3, 2, 1)(__VA_ARGS__)
#define vsseg3e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsseg3e64, __riscv_vsseg3e64, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg4e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsseg4e64, __riscv_vsseg4e64, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg5e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsseg5e64, __riscv_vsseg5e64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg6e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsseg6e64, __riscv_vsseg6e64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg7e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsseg7e64, __riscv_vsseg7e64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg8e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsseg8e64, __riscv_vsseg8e64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg2e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsseg2e8, __riscv_vsseg2e8, 3, 2, 1)(__VA_ARGS__)
#define vsseg3e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsseg3e8, __riscv_vsseg3e8, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg4e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsseg4e8, __riscv_vsseg4e8, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg5e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsseg5e8, __riscv_vsseg5e8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg6e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsseg6e8, __riscv_vsseg6e8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg7e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsseg7e8, __riscv_vsseg7e8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg8e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsseg8e8, __riscv_vsseg8e8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg2e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vssseg2e16, __riscv_vssseg2e16, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg3e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vssseg3e16, __riscv_vssseg3e16, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg4e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vssseg4e16, __riscv_vssseg4e16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg5e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vssseg5e16, __riscv_vssseg5e16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg6e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vssseg6e16, __riscv_vssseg6e16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg7e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vssseg7e16, __riscv_vssseg7e16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg8e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vssseg8e16, __riscv_vssseg8e16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg2e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vssseg2e32, __riscv_vssseg2e32, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg3e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vssseg3e32, __riscv_vssseg3e32, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg4e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vssseg4e32, __riscv_vssseg4e32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg5e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vssseg5e32, __riscv_vssseg5e32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg6e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vssseg6e32, __riscv_vssseg6e32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg7e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vssseg7e32, __riscv_vssseg7e32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg8e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vssseg8e32, __riscv_vssseg8e32, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg2e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vssseg2e64, __riscv_vssseg2e64, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg3e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vssseg3e64, __riscv_vssseg3e64, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg4e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vssseg4e64, __riscv_vssseg4e64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg5e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vssseg5e64, __riscv_vssseg5e64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg6e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vssseg6e64, __riscv_vssseg6e64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg7e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vssseg7e64, __riscv_vssseg7e64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg8e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vssseg8e64, __riscv_vssseg8e64, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg2e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vssseg2e8, __riscv_vssseg2e8, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg3e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vssseg3e8, __riscv_vssseg3e8, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg4e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vssseg4e8, __riscv_vssseg4e8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg5e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vssseg5e8, __riscv_vssseg5e8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg6e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vssseg6e8, __riscv_vssseg6e8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg7e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vssseg7e8, __riscv_vssseg7e8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg8e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vssseg8e8, __riscv_vssseg8e8, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg2ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vloxseg2ei8_tumu, 7, 6, __riscv_vloxseg2ei8, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg3ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg3ei8_tumu, 9, 8, 7, __riscv_vloxseg3ei8, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg4ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vloxseg4ei8_tumu, 11, 10, 9, 8, __riscv_vloxseg4ei8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg5ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vloxseg5ei8_tumu, 13, 12, 11, 10, 9, __riscv_vloxseg5ei8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg6ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vloxseg6ei8_tumu, 15, 14, 13, 12, 11, 10, __riscv_vloxseg6ei8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg7ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vloxseg7ei8_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg7ei8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg8ei8(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vloxseg8ei8_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vloxseg8ei8, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg2ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vloxseg2ei16_tumu, 7, 6, __riscv_vloxseg2ei16, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg3ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg3ei16_tumu, 9, 8, 7, __riscv_vloxseg3ei16, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg4ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vloxseg4ei16_tumu, 11, 10, 9, 8, __riscv_vloxseg4ei16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg5ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vloxseg5ei16_tumu, 13, 12, 11, 10, 9, __riscv_vloxseg5ei16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg6ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vloxseg6ei16_tumu, 15, 14, 13, 12, 11, 10, __riscv_vloxseg6ei16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg7ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vloxseg7ei16_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg7ei16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg8ei16(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vloxseg8ei16_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vloxseg8ei16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg2ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vloxseg2ei32_tumu, 7, 6, __riscv_vloxseg2ei32, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg3ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg3ei32_tumu, 9, 8, 7, __riscv_vloxseg3ei32, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg4ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vloxseg4ei32_tumu, 11, 10, 9, 8, __riscv_vloxseg4ei32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg5ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vloxseg5ei32_tumu, 13, 12, 11, 10, 9, __riscv_vloxseg5ei32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg6ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vloxseg6ei32_tumu, 15, 14, 13, 12, 11, 10, __riscv_vloxseg6ei32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg7ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vloxseg7ei32_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg7ei32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg8ei32(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vloxseg8ei32_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vloxseg8ei32, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg2ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vloxseg2ei64_tumu, 7, 6, __riscv_vloxseg2ei64, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg3ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg3ei64_tumu, 9, 8, 7, __riscv_vloxseg3ei64, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg4ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vloxseg4ei64_tumu, 11, 10, 9, 8, __riscv_vloxseg4ei64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg5ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vloxseg5ei64_tumu, 13, 12, 11, 10, 9, __riscv_vloxseg5ei64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg6ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vloxseg6ei64_tumu, 15, 14, 13, 12, 11, 10, __riscv_vloxseg6ei64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg7ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vloxseg7ei64_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg7ei64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg8ei64(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vloxseg8ei64_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vloxseg8ei64, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg2ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vluxseg2ei8_tumu, 7, 6, __riscv_vluxseg2ei8, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg3ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg3ei8_tumu, 9, 8, 7, __riscv_vluxseg3ei8, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg4ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vluxseg4ei8_tumu, 11, 10, 9, 8, __riscv_vluxseg4ei8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg5ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vluxseg5ei8_tumu, 13, 12, 11, 10, 9, __riscv_vluxseg5ei8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg6ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vluxseg6ei8_tumu, 15, 14, 13, 12, 11, 10, __riscv_vluxseg6ei8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg7ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vluxseg7ei8_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg7ei8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg8ei8(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vluxseg8ei8_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vluxseg8ei8, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg2ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vluxseg2ei16_tumu, 7, 6, __riscv_vluxseg2ei16, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg3ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg3ei16_tumu, 9, 8, 7, __riscv_vluxseg3ei16, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg4ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vluxseg4ei16_tumu, 11, 10, 9, 8, __riscv_vluxseg4ei16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg5ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vluxseg5ei16_tumu, 13, 12, 11, 10, 9, __riscv_vluxseg5ei16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg6ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vluxseg6ei16_tumu, 15, 14, 13, 12, 11, 10, __riscv_vluxseg6ei16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg7ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vluxseg7ei16_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg7ei16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg8ei16(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vluxseg8ei16_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vluxseg8ei16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg2ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vluxseg2ei32_tumu, 7, 6, __riscv_vluxseg2ei32, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg3ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg3ei32_tumu, 9, 8, 7, __riscv_vluxseg3ei32, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg4ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vluxseg4ei32_tumu, 11, 10, 9, 8, __riscv_vluxseg4ei32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg5ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vluxseg5ei32_tumu, 13, 12, 11, 10, 9, __riscv_vluxseg5ei32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg6ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vluxseg6ei32_tumu, 15, 14, 13, 12, 11, 10, __riscv_vluxseg6ei32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg7ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vluxseg7ei32_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg7ei32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg8ei32(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vluxseg8ei32_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vluxseg8ei32, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg2ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vluxseg2ei64_tumu, 7, 6, __riscv_vluxseg2ei64, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg3ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg3ei64_tumu, 9, 8, 7, __riscv_vluxseg3ei64, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg4ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vluxseg4ei64_tumu, 11, 10, 9, 8, __riscv_vluxseg4ei64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg5ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vluxseg5ei64_tumu, 13, 12, 11, 10, 9, __riscv_vluxseg5ei64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg6ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vluxseg6ei64_tumu, 15, 14, 13, 12, 11, 10, __riscv_vluxseg6ei64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg7ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vluxseg7ei64_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg7ei64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg8ei64(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vluxseg8ei64_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vluxseg8ei64, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg2ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsoxseg2ei8, __riscv_vsoxseg2ei8, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg3ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsoxseg3ei8, __riscv_vsoxseg3ei8, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg4ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsoxseg4ei8, __riscv_vsoxseg4ei8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg5ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsoxseg5ei8, __riscv_vsoxseg5ei8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg6ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsoxseg6ei8, __riscv_vsoxseg6ei8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg7ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsoxseg7ei8, __riscv_vsoxseg7ei8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg8ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsoxseg8ei8, __riscv_vsoxseg8ei8, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg2ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsoxseg2ei16, __riscv_vsoxseg2ei16, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg3ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsoxseg3ei16, __riscv_vsoxseg3ei16, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg4ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsoxseg4ei16, __riscv_vsoxseg4ei16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg5ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsoxseg5ei16, __riscv_vsoxseg5ei16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg6ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsoxseg6ei16, __riscv_vsoxseg6ei16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg7ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsoxseg7ei16, __riscv_vsoxseg7ei16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg8ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsoxseg8ei16, __riscv_vsoxseg8ei16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg2ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsoxseg2ei32, __riscv_vsoxseg2ei32, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg3ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsoxseg3ei32, __riscv_vsoxseg3ei32, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg4ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsoxseg4ei32, __riscv_vsoxseg4ei32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg5ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsoxseg5ei32, __riscv_vsoxseg5ei32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg6ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsoxseg6ei32, __riscv_vsoxseg6ei32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg7ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsoxseg7ei32, __riscv_vsoxseg7ei32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg8ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsoxseg8ei32, __riscv_vsoxseg8ei32, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg2ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsoxseg2ei64, __riscv_vsoxseg2ei64, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg3ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsoxseg3ei64, __riscv_vsoxseg3ei64, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg4ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsoxseg4ei64, __riscv_vsoxseg4ei64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg5ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsoxseg5ei64, __riscv_vsoxseg5ei64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg6ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsoxseg6ei64, __riscv_vsoxseg6ei64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg7ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsoxseg7ei64, __riscv_vsoxseg7ei64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg8ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsoxseg8ei64, __riscv_vsoxseg8ei64, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg2ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsuxseg2ei8, __riscv_vsuxseg2ei8, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg3ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsuxseg3ei8, __riscv_vsuxseg3ei8, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg4ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsuxseg4ei8, __riscv_vsuxseg4ei8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg5ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsuxseg5ei8, __riscv_vsuxseg5ei8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg6ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsuxseg6ei8, __riscv_vsuxseg6ei8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg7ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsuxseg7ei8, __riscv_vsuxseg7ei8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg8ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsuxseg8ei8, __riscv_vsuxseg8ei8, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg2ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsuxseg2ei16, __riscv_vsuxseg2ei16, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg3ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsuxseg3ei16, __riscv_vsuxseg3ei16, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg4ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsuxseg4ei16, __riscv_vsuxseg4ei16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg5ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsuxseg5ei16, __riscv_vsuxseg5ei16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg6ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsuxseg6ei16, __riscv_vsuxseg6ei16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg7ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsuxseg7ei16, __riscv_vsuxseg7ei16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg8ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsuxseg8ei16, __riscv_vsuxseg8ei16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg2ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsuxseg2ei32, __riscv_vsuxseg2ei32, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg3ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsuxseg3ei32, __riscv_vsuxseg3ei32, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg4ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsuxseg4ei32, __riscv_vsuxseg4ei32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg5ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsuxseg5ei32, __riscv_vsuxseg5ei32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg6ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsuxseg6ei32, __riscv_vsuxseg6ei32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg7ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsuxseg7ei32, __riscv_vsuxseg7ei32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg8ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsuxseg8ei32, __riscv_vsuxseg8ei32, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg2ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsuxseg2ei64, __riscv_vsuxseg2ei64, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg3ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsuxseg3ei64, __riscv_vsuxseg3ei64, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg4ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsuxseg4ei64, __riscv_vsuxseg4ei64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg5ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsuxseg5ei64, __riscv_vsuxseg5ei64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg6ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsuxseg6ei64, __riscv_vsuxseg6ei64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg7ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsuxseg7ei64, __riscv_vsuxseg7ei64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg8ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsuxseg8ei64, __riscv_vsuxseg8ei64, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vadd_tumu, 4, __riscv_vadd, 2, 1)(__VA_ARGS__)
#define vsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsub_tumu, 4, __riscv_vsub, 2, 1)(__VA_ARGS__)
#define vrsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vrsub_tumu, 4, __riscv_vrsub, 2, 1)(__VA_ARGS__)
#define vneg(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vneg_tumu, 3, __riscv_vneg, 1)(__VA_ARGS__)
#define vwadd_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwadd_vv_tumu, 4, __riscv_vwadd_vv, 2, 1)(__VA_ARGS__)
#define vwadd_vx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwadd_vx_tumu, 4, __riscv_vwadd_vx, 2, 1)(__VA_ARGS__)
#define vwadd_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwadd_wv_tumu, 4, __riscv_vwadd_wv, 2, 1)(__VA_ARGS__)
#define vwadd_wx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwadd_wx_tumu, 4, __riscv_vwadd_wx, 2, 1)(__VA_ARGS__)
#define vwsub_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsub_vv_tumu, 4, __riscv_vwsub_vv, 2, 1)(__VA_ARGS__)
#define vwsub_vx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsub_vx_tumu, 4, __riscv_vwsub_vx, 2, 1)(__VA_ARGS__)
#define vwsub_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsub_wv_tumu, 4, __riscv_vwsub_wv, 2, 1)(__VA_ARGS__)
#define vwsub_wx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsub_wx_tumu, 4, __riscv_vwsub_wx, 2, 1)(__VA_ARGS__)
#define vwaddu_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwaddu_vv_tumu, 4, __riscv_vwaddu_vv, 2, 1)(__VA_ARGS__)
#define vwaddu_vx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwaddu_vx_tumu, 4, __riscv_vwaddu_vx, 2, 1)(__VA_ARGS__)
#define vwaddu_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwaddu_wv_tumu, 4, __riscv_vwaddu_wv, 2, 1)(__VA_ARGS__)
#define vwaddu_wx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwaddu_wx_tumu, 4, __riscv_vwaddu_wx, 2, 1)(__VA_ARGS__)
#define vwsubu_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsubu_vv_tumu, 4, __riscv_vwsubu_vv, 2, 1)(__VA_ARGS__)
#define vwsubu_vx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsubu_vx_tumu, 4, __riscv_vwsubu_vx, 2, 1)(__VA_ARGS__)
#define vwsubu_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsubu_wv_tumu, 4, __riscv_vwsubu_wv, 2, 1)(__VA_ARGS__)
#define vwsubu_wx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsubu_wx_tumu, 4, __riscv_vwsubu_wx, 2, 1)(__VA_ARGS__)
#define vsext_vf2(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vsext_vf2_tumu, 3, __riscv_vsext_vf2, 1)(__VA_ARGS__)
#define vsext_vf4(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vsext_vf4_tumu, 3, __riscv_vsext_vf4, 1)(__VA_ARGS__)
#define vsext_vf8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vsext_vf8_tumu, 3, __riscv_vsext_vf8, 1)(__VA_ARGS__)
#define vzext_vf2(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vzext_vf2_tumu, 3, __riscv_vzext_vf2, 1)(__VA_ARGS__)
#define vzext_vf4(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vzext_vf4_tumu, 3, __riscv_vzext_vf4, 1)(__VA_ARGS__)
#define vzext_vf8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vzext_vf8_tumu, 3, __riscv_vzext_vf8, 1)(__VA_ARGS__)
#define vadc(...) __riscv_vadc(__VA_ARGS__)
#define vsbc(...) __riscv_vsbc(__VA_ARGS__)
#define vmadc(...) __riscv_vmadc(__VA_ARGS__)
#define vmsbc(...) __riscv_vmsbc(__VA_ARGS__)
#define vand(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vand_tumu, 4, __riscv_vand, 2, 1)(__VA_ARGS__)
#define vor(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vor_tumu, 4, __riscv_vor, 2, 1)(__VA_ARGS__)
#define vxor(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vxor_tumu, 4, __riscv_vxor, 2, 1)(__VA_ARGS__)
#define vnot(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vnot_tumu, 3, __riscv_vnot, 1)(__VA_ARGS__)
#define vsll(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsll_tumu, 4, __riscv_vsll, 2, 1)(__VA_ARGS__)
#define vsra(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsra_tumu, 4, __riscv_vsra, 2, 1)(__VA_ARGS__)
#define vsrl(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsrl_tumu, 4, __riscv_vsrl, 2, 1)(__VA_ARGS__)
#define vnsra(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnsra_tumu, 4, __riscv_vnsra, 2, 1)(__VA_ARGS__)
#define vnsrl(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnsrl_tumu, 4, __riscv_vnsrl, 2, 1)(__VA_ARGS__)
#define vmseq(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmseq_mu, 4, __riscv_vmseq, 2, 1)(__VA_ARGS__)
#define vmsne(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsne_mu, 4, __riscv_vmsne, 2, 1)(__VA_ARGS__)
#define vmslt(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmslt_mu, 4, __riscv_vmslt, 2, 1)(__VA_ARGS__)
#define vmsle(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsle_mu, 4, __riscv_vmsle, 2, 1)(__VA_ARGS__)
#define vmsgt(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsgt_mu, 4, __riscv_vmsgt, 2, 1)(__VA_ARGS__)
#define vmsge(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsge_mu, 4, __riscv_vmsge, 2, 1)(__VA_ARGS__)
#define vmsltu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsltu_mu, 4, __riscv_vmsltu, 2, 1)(__VA_ARGS__)
#define vmsleu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsleu_mu, 4, __riscv_vmsleu, 2, 1)(__VA_ARGS__)
#define vmsgtu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsgtu_mu, 4, __riscv_vmsgtu, 2, 1)(__VA_ARGS__)
#define vmsgeu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsgeu_mu, 4, __riscv_vmsgeu, 2, 1)(__VA_ARGS__)
#define vmin(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmin_tumu, 4, __riscv_vmin, 2, 1)(__VA_ARGS__)
#define vmax(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmax_tumu, 4, __riscv_vmax, 2, 1)(__VA_ARGS__)
#define vminu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vminu_tumu, 4, __riscv_vminu, 2, 1)(__VA_ARGS__)
#define vmaxu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmaxu_tumu, 4, __riscv_vmaxu, 2, 1)(__VA_ARGS__)
#define vmul(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmul_tumu, 4, __riscv_vmul, 2, 1)(__VA_ARGS__)
#define vmulh(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmulh_tumu, 4, __riscv_vmulh, 2, 1)(__VA_ARGS__)
#define vmulhsu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmulhsu_tumu, 4, __riscv_vmulhsu, 2, 1)(__VA_ARGS__)
#define vmulhu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmulhu_tumu, 4, __riscv_vmulhu, 2, 1)(__VA_ARGS__)
#define vdiv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vdiv_tumu, 4, __riscv_vdiv, 2, 1)(__VA_ARGS__)
#define vrem(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vrem_tumu, 4, __riscv_vrem, 2, 1)(__VA_ARGS__)
#define vdivu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vdivu_tumu, 4, __riscv_vdivu, 2, 1)(__VA_ARGS__)
#define vremu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vremu_tumu, 4, __riscv_vremu, 2, 1)(__VA_ARGS__)
#define vwmul(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmul_tumu, 4, __riscv_vwmul, 2, 1)(__VA_ARGS__)
#define vwmulsu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmulsu_tumu, 4, __riscv_vwmulsu, 2, 1)(__VA_ARGS__)
#define vwmulu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmulu_tumu, 4, __riscv_vwmulu, 2, 1)(__VA_ARGS__)
#define vmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmacc_tumu, __riscv_vmacc_tu, 3, 2, 1)(__VA_ARGS__)
#define vnmsac(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnmsac_tumu, __riscv_vnmsac_tu, 3, 2, 1)(__VA_ARGS__)
#define vmadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmadd_tumu, __riscv_vmadd_tu, 3, 2, 1)(__VA_ARGS__)
#define vnmsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnmsub_tumu, __riscv_vnmsub_tu, 3, 2, 1)(__VA_ARGS__)
#define vwmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmacc_tumu, __riscv_vwmacc_tu, 3, 2, 1)(__VA_ARGS__)
#define vwmaccsu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmaccsu_tumu, __riscv_vwmaccsu_tu, 3, 2, 1)(__VA_ARGS__)
#define vwmaccus(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmaccus_tumu, __riscv_vwmaccus_tu, 3, 2, 1)(__VA_ARGS__)
#define vwmaccu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmaccu_tumu, __riscv_vwmaccu_tu, 3, 2, 1)(__VA_ARGS__)
#define vmv_v(...) __riscv_vmv_v(__VA_ARGS__)
#define vsadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsadd_tumu, 4, __riscv_vsadd, 2, 1)(__VA_ARGS__)
#define vssub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vssub_tumu, 4, __riscv_vssub, 2, 1)(__VA_ARGS__)
#define vsaddu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsaddu_tumu, 4, __riscv_vsaddu, 2, 1)(__VA_ARGS__)
#define vssubu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vssubu_tumu, 4, __riscv_vssubu, 2, 1)(__VA_ARGS__)
#define vaadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vaadd_tumu, 4, __riscv_vaadd, 2, 1)(__VA_ARGS__)
#define vasub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vasub_tumu, 4, __riscv_vasub, 2, 1)(__VA_ARGS__)
#define vaaddu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vaaddu_tumu, 4, __riscv_vaaddu, 2, 1)(__VA_ARGS__)
#define vasubu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vasubu_tumu, 4, __riscv_vasubu, 2, 1)(__VA_ARGS__)
#define vsmul(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsmul_mu, 4, __riscv_vsmul, 2, 1)(__VA_ARGS__)
#define vssra(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vssra_tumu, 4, __riscv_vssra, 2, 1)(__VA_ARGS__)
#define vssrl(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vssrl_tumu, 4, __riscv_vssrl, 2, 1)(__VA_ARGS__)
#define vnclip(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnclip_tumu, 4, __riscv_vnclip, 2, 1)(__VA_ARGS__)
#define vnclipu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnclipu_tumu, 4, __riscv_vnclipu, 2, 1)(__VA_ARGS__)
#define vfadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfadd_tumu, 4, __riscv_vfadd, 2, 1)(__VA_ARGS__)
#define vfsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfsub_tumu, 4, __riscv_vfsub, 2, 1)(__VA_ARGS__)
#define vfrsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfrsub_tumu, 4, __riscv_vfrsub, 2, 1)(__VA_ARGS__)
#define vfneg(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfneg_tumu, 3, __riscv_vfneg, 1)(__VA_ARGS__)
#define vfwadd_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwadd_vv_tumu, 4, __riscv_vfwadd_vv, 2, 1)(__VA_ARGS__)
#define vfwadd_vf(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwadd_vf_tumu, 4, __riscv_vfwadd_vf, 2, 1)(__VA_ARGS__)
#define vfwadd_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwadd_wv_tumu, 4, __riscv_vfwadd_wv, 2, 1)(__VA_ARGS__)
#define vfwadd_wf(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwadd_wf_tumu, 4, __riscv_vfwadd_wf, 2, 1)(__VA_ARGS__)
#define vfwsub_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwsub_vv_tumu, 4, __riscv_vfwsub_vv, 2, 1)(__VA_ARGS__)
#define vfwsub_vf(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwsub_vf_tumu, 4, __riscv_vfwsub_vf, 2, 1)(__VA_ARGS__)
#define vfwsub_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwsub_wv_tumu, 4, __riscv_vfwsub_wv, 2, 1)(__VA_ARGS__)
#define vfwsub_wf(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwsub_wf_tumu, 4, __riscv_vfwsub_wf, 2, 1)(__VA_ARGS__)
#define vfmul(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmul_tumu, 4, __riscv_vfmul, 2, 1)(__VA_ARGS__)
#define vfdiv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfdiv_tumu, 4, __riscv_vfdiv, 2, 1)(__VA_ARGS__)
#define vfrdiv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfrdiv_tumu, 4, __riscv_vfrdiv, 2, 1)(__VA_ARGS__)
#define vfwmul(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwmul_tumu, 4, __riscv_vfwmul, 2, 1)(__VA_ARGS__)
#define vfmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmacc_tumu, __riscv_vfmacc_tu, 3, 2, 1)(__VA_ARGS__)
#define vfnmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfnmacc_tumu, __riscv_vfnmacc_tu, 3, 2, 1)(__VA_ARGS__)
#define vfmsac(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmsac_tumu, __riscv_vfmsac_tu, 3, 2, 1)(__VA_ARGS__)
#define vfnmsac(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfnmsac_tumu, __riscv_vfnmsac_tu, 3, 2, 1)(__VA_ARGS__)
#define vfmadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmadd_tumu, __riscv_vfmadd_tu, 3, 2, 1)(__VA_ARGS__)
#define vfnmadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfnmadd_tumu, __riscv_vfnmadd_tu, 3, 2, 1)(__VA_ARGS__)
#define vfmsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmsub_tumu, __riscv_vfmsub_tu, 3, 2, 1)(__VA_ARGS__)
#define vfnmsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfnmsub_tumu, __riscv_vfnmsub_tu, 3, 2, 1)(__VA_ARGS__)
#define vfwmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwmacc_tumu, __riscv_vfwmacc_tu, 3, 2, 1)(__VA_ARGS__)
#define vfwnmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwnmacc_tumu, __riscv_vfwnmacc_tu, 3, 2, 1)(__VA_ARGS__)
#define vfwmsac(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwmsac_tumu, __riscv_vfwmsac_tu, 3, 2, 1)(__VA_ARGS__)
#define vfwnmsac(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwnmsac_tumu, __riscv_vfwnmsac_tu, 3, 2, 1)(__VA_ARGS__)
#define vfsqrt(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfsqrt_tumu, 3, __riscv_vfsqrt, 1)(__VA_ARGS__)
#define vfrsqrt7(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfrsqrt7_tumu, 3, __riscv_vfrsqrt7, 1)(__VA_ARGS__)
#define vfrec7(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfrec7_tumu, 3, __riscv_vfrec7, 1)(__VA_ARGS__)
#define vfmin(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmin_tumu, 4, __riscv_vfmin, 2, 1)(__VA_ARGS__)
#define vfmax(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmax_tumu, 4, __riscv_vfmax, 2, 1)(__VA_ARGS__)
#define vfsgnj(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfsgnj_tumu, 4, __riscv_vfsgnj, 2, 1)(__VA_ARGS__)
#define vfsgnjn(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfsgnjn_tumu, 4, __riscv_vfsgnjn, 2, 1)(__VA_ARGS__)
#define vfsgnjx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfsgnjx_tumu, 4, __riscv_vfsgnjx, 2, 1)(__VA_ARGS__)
#define vfabs(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfabs_tumu, 3, __riscv_vfabs, 1)(__VA_ARGS__)
#define vmfeq(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmfeq_mu, 4, __riscv_vmfeq, 2, 1)(__VA_ARGS__)
#define vmfne(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmfne_mu, 4, __riscv_vmfne, 2, 1)(__VA_ARGS__)
#define vmflt(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmflt_mu, 4, __riscv_vmflt, 2, 1)(__VA_ARGS__)
#define vmfle(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmfle_mu, 4, __riscv_vmfle, 2, 1)(__VA_ARGS__)
#define vmfgt(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmfgt_mu, 4, __riscv_vmfgt, 2, 1)(__VA_ARGS__)
#define vmfge(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmfge_mu, 4, __riscv_vmfge, 2, 1)(__VA_ARGS__)
#define vfclass(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfclass_tumu, 3, __riscv_vfclass, 1)(__VA_ARGS__)
#define vfcvt_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfcvt_x_tumu, 3, __riscv_vfcvt_x, 1)(__VA_ARGS__)
#define vfcvt_rtz_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfcvt_rtz_x_tumu, 3, __riscv_vfcvt_rtz_x, 1)(__VA_ARGS__)
#define vfcvt_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfcvt_xu_tumu, 3, __riscv_vfcvt_xu, 1)(__VA_ARGS__)
#define vfcvt_rtz_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfcvt_rtz_xu_tumu, 3, __riscv_vfcvt_rtz_xu, 1)(__VA_ARGS__)
#define vfcvt_f(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfcvt_f_tumu, 3, __riscv_vfcvt_f, 1)(__VA_ARGS__)
#define vwcvt_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vwcvt_x_tumu, 3, __riscv_vwcvt_x, 1)(__VA_ARGS__)
#define vwcvtu_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vwcvtu_x_tumu, 3, __riscv_vwcvtu_x, 1)(__VA_ARGS__)
#define vfwcvt_f(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfwcvt_f_tumu, 3, __riscv_vfwcvt_f, 1)(__VA_ARGS__)
#define vfwcvt_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfwcvt_x_tumu, 3, __riscv_vfwcvt_x, 1)(__VA_ARGS__)
#define vfwcvt_rtz_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfwcvt_rtz_x_tumu, 3, __riscv_vfwcvt_rtz_x, 1)(__VA_ARGS__)
#define vfwcvt_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfwcvt_xu_tumu, 3, __riscv_vfwcvt_xu, 1)(__VA_ARGS__)
#define vfwcvt_rtz_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfwcvt_rtz_xu_tumu, 3, __riscv_vfwcvt_rtz_xu, 1)(__VA_ARGS__)
#define vfncvt_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_x_tumu, 3, __riscv_vfncvt_x, 1)(__VA_ARGS__)
#define vfncvt_rtz_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_rtz_x_tumu, 3, __riscv_vfncvt_rtz_x, 1)(__VA_ARGS__)
#define vncvt_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vncvt_x_tumu, 3, __riscv_vncvt_x, 1)(__VA_ARGS__)
#define vfncvt_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_xu_tumu, 3, __riscv_vfncvt_xu, 1)(__VA_ARGS__)
#define vfncvt_rtz_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_rtz_xu_tumu, 3, __riscv_vfncvt_rtz_xu, 1)(__VA_ARGS__)
#define vfncvt_f(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_f_tumu, 3, __riscv_vfncvt_f, 1)(__VA_ARGS__)
#define vfncvt_rod_f(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_rod_f_tumu, 3, __riscv_vfncvt_rod_f, 1)(__VA_ARGS__)
#define vredsum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredsum_tum, __riscv_vredsum_tu, 3, 2, 1)(__VA_ARGS__)
#define vredmax(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredmax_tum, __riscv_vredmax_tu, 3, 2, 1)(__VA_ARGS__)
#define vredmin(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredmin_tum, __riscv_vredmin_tu, 3, 2, 1)(__VA_ARGS__)
#define vredand(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredand_tum, __riscv_vredand_tu, 3, 2, 1)(__VA_ARGS__)
#define vredor(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredor_tum, __riscv_vredor_tu, 3, 2, 1)(__VA_ARGS__)
#define vredxor(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredxor_tum, __riscv_vredxor_tu, 3, 2, 1)(__VA_ARGS__)
#define vredmaxu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredmaxu_tum, __riscv_vredmaxu_tu, 3, 2, 1)(__VA_ARGS__)
#define vredminu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredminu_tum, __riscv_vredminu_tu, 3, 2, 1)(__VA_ARGS__)
#define vwredsum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwredsum_tum, __riscv_vwredsum_tu, 3, 2, 1)(__VA_ARGS__)
#define vwredsumu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwredsumu_tum, __riscv_vwredsumu_tu, 3, 2, 1)(__VA_ARGS__)
#define vfredosum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfredosum_tum, __riscv_vfredosum_tu, 3, 2, 1)(__VA_ARGS__)
#define vfredusum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfredusum_tum, __riscv_vfredusum_tu, 3, 2, 1)(__VA_ARGS__)
#define vfredmax(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfredmax_tum, __riscv_vfredmax_tu, 3, 2, 1)(__VA_ARGS__)
#define vfredmin(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfredmin_tum, __riscv_vfredmin_tu, 3, 2, 1)(__VA_ARGS__)
#define vfwredosum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwredosum_tum, __riscv_vfwredosum_tu, 3, 2, 1)(__VA_ARGS__)
#define vfwredusum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwredusum_tum, __riscv_vfwredusum_tu, 3, 2, 1)(__VA_ARGS__)
#define vsm(...) __riscv_vsm(__VA_ARGS__)
#define vmand(...) __riscv_vmand(__VA_ARGS__)
#define vmnand(...) __riscv_vmnand(__VA_ARGS__)
#define vmandn(...) __riscv_vmandn(__VA_ARGS__)
#define vmxor(...) __riscv_vmxor(__VA_ARGS__)
#define vmor(...) __riscv_vmor(__VA_ARGS__)
#define vmnor(...) __riscv_vmnor(__VA_ARGS__)
#define vmorn(...) __riscv_vmorn(__VA_ARGS__)
#define vmxnor(...) __riscv_vmxnor(__VA_ARGS__)
#define vmmv(...) __riscv_vmmv(__VA_ARGS__)
#define vmnot(...) __riscv_vmnot(__VA_ARGS__)
#define vcpop(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, __riscv_vcpop, __riscv_vcpop, 1)(__VA_ARGS__)
#define vfirst(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, __riscv_vfirst, __riscv_vfirst, 1)(__VA_ARGS__)
#define vmsbf(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vmsbf_mu, 3, __riscv_vmsbf, 1)(__VA_ARGS__)
#define vmsif(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vmsif_mu, 3, __riscv_vmsif, 1)(__VA_ARGS__)
#define vmsof(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vmsof_mu, 3, __riscv_vmsof, 1)(__VA_ARGS__)
#define vfmv_f(...) __riscv_vfmv_f(__VA_ARGS__)
#define vfmv_s(...) __riscv_vfmv_s_tu(__VA_ARGS__)
#define vmv_x(...) __riscv_vmv_x(__VA_ARGS__)
#define vmv_s(...) __riscv_vmv_s_tu(__VA_ARGS__)
#define vslideup(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vslideup_tumu, __riscv_vslideup_tu, 3, 2, 1)(__VA_ARGS__)
#define vslidedown(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vslidedown_tumu, __riscv_vslidedown_tu, 3, 2, 1)(__VA_ARGS__)
#define vfslide1up(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfslide1up_tumu, 4, __riscv_vfslide1up, 2, 1)(__VA_ARGS__)
#define vfslide1down(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfslide1down_tumu, 4, __riscv_vfslide1down, 2, 1)(__VA_ARGS__)
#define vslide1up(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vslide1up_tumu, 4, __riscv_vslide1up, 2, 1)(__VA_ARGS__)
#define vslide1down(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vslide1down_tumu, 4, __riscv_vslide1down, 2, 1)(__VA_ARGS__)
#define vrgather(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vrgather_tumu, 4, __riscv_vrgather, 2, 1)(__VA_ARGS__)
#define vrgatherei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vrgatherei16_tumu, 4, __riscv_vrgatherei16, 2, 1)(__VA_ARGS__)
#define vreinterpret_u8mf8(...) __riscv_vreinterpret_u8mf8(__VA_ARGS__)
#define vreinterpret_u8mf4(...) __riscv_vreinterpret_u8mf4(__VA_ARGS__)
#define vreinterpret_u8mf2(...) __riscv_vreinterpret_u8mf2(__VA_ARGS__)
#define vreinterpret_u8m1(...) __riscv_vreinterpret_u8m1(__VA_ARGS__)
#define vreinterpret_u8m2(...) __riscv_vreinterpret_u8m2(__VA_ARGS__)
#define vreinterpret_u8m4(...) __riscv_vreinterpret_u8m4(__VA_ARGS__)
#define vreinterpret_u8m8(...) __riscv_vreinterpret_u8m8(__VA_ARGS__)
#define vreinterpret_i8mf8(...) __riscv_vreinterpret_i8mf8(__VA_ARGS__)
#define vreinterpret_i8mf4(...) __riscv_vreinterpret_i8mf4(__VA_ARGS__)
#define vreinterpret_i8mf2(...) __riscv_vreinterpret_i8mf2(__VA_ARGS__)
#define vreinterpret_i8m1(...) __riscv_vreinterpret_i8m1(__VA_ARGS__)
#define vreinterpret_i8m2(...) __riscv_vreinterpret_i8m2(__VA_ARGS__)
#define vreinterpret_i8m4(...) __riscv_vreinterpret_i8m4(__VA_ARGS__)
#define vreinterpret_i8m8(...) __riscv_vreinterpret_i8m8(__VA_ARGS__)
#define vreinterpret_f16mf4(...) __riscv_vreinterpret_f16mf4(__VA_ARGS__)
#define vreinterpret_f16mf2(...) __riscv_vreinterpret_f16mf2(__VA_ARGS__)
#define vreinterpret_f16m1(...) __riscv_vreinterpret_f16m1(__VA_ARGS__)
#define vreinterpret_f16m2(...) __riscv_vreinterpret_f16m2(__VA_ARGS__)
#define vreinterpret_f16m4(...) __riscv_vreinterpret_f16m4(__VA_ARGS__)
#define vreinterpret_f16m8(...) __riscv_vreinterpret_f16m8(__VA_ARGS__)
#define vreinterpret_u16mf4(...) __riscv_vreinterpret_u16mf4(__VA_ARGS__)
#define vreinterpret_u16mf2(...) __riscv_vreinterpret_u16mf2(__VA_ARGS__)
#define vreinterpret_u16m1(...) __riscv_vreinterpret_u16m1(__VA_ARGS__)
#define vreinterpret_u16m2(...) __riscv_vreinterpret_u16m2(__VA_ARGS__)
#define vreinterpret_u16m4(...) __riscv_vreinterpret_u16m4(__VA_ARGS__)
#define vreinterpret_u16m8(...) __riscv_vreinterpret_u16m8(__VA_ARGS__)
#define vreinterpret_i16mf4(...) __riscv_vreinterpret_i16mf4(__VA_ARGS__)
#define vreinterpret_i16mf2(...) __riscv_vreinterpret_i16mf2(__VA_ARGS__)
#define vreinterpret_i16m1(...) __riscv_vreinterpret_i16m1(__VA_ARGS__)
#define vreinterpret_i16m2(...) __riscv_vreinterpret_i16m2(__VA_ARGS__)
#define vreinterpret_i16m4(...) __riscv_vreinterpret_i16m4(__VA_ARGS__)
#define vreinterpret_i16m8(...) __riscv_vreinterpret_i16m8(__VA_ARGS__)
#define vreinterpret_f32mf2(...) __riscv_vreinterpret_f32mf2(__VA_ARGS__)
#define vreinterpret_f32m1(...) __riscv_vreinterpret_f32m1(__VA_ARGS__)
#define vreinterpret_f32m2(...) __riscv_vreinterpret_f32m2(__VA_ARGS__)
#define vreinterpret_f32m4(...) __riscv_vreinterpret_f32m4(__VA_ARGS__)
#define vreinterpret_f32m8(...) __riscv_vreinterpret_f32m8(__VA_ARGS__)
#define vreinterpret_u32mf2(...) __riscv_vreinterpret_u32mf2(__VA_ARGS__)
#define vreinterpret_u32m1(...) __riscv_vreinterpret_u32m1(__VA_ARGS__)
#define vreinterpret_u32m2(...) __riscv_vreinterpret_u32m2(__VA_ARGS__)
#define vreinterpret_u32m4(...) __riscv_vreinterpret_u32m4(__VA_ARGS__)
#define vreinterpret_u32m8(...) __riscv_vreinterpret_u32m8(__VA_ARGS__)
#define vreinterpret_i32mf2(...) __riscv_vreinterpret_i32mf2(__VA_ARGS__)
#define vreinterpret_i32m1(...) __riscv_vreinterpret_i32m1(__VA_ARGS__)
#define vreinterpret_i32m2(...) __riscv_vreinterpret_i32m2(__VA_ARGS__)
#define vreinterpret_i32m4(...) __riscv_vreinterpret_i32m4(__VA_ARGS__)
#define vreinterpret_i32m8(...) __riscv_vreinterpret_i32m8(__VA_ARGS__)
#define vreinterpret_f64m1(...) __riscv_vreinterpret_f64m1(__VA_ARGS__)
#define vreinterpret_f64m2(...) __riscv_vreinterpret_f64m2(__VA_ARGS__)
#define vreinterpret_f64m4(...) __riscv_vreinterpret_f64m4(__VA_ARGS__)
#define vreinterpret_f64m8(...) __riscv_vreinterpret_f64m8(__VA_ARGS__)
#define vreinterpret_u64m1(...) __riscv_vreinterpret_u64m1(__VA_ARGS__)
#define vreinterpret_u64m2(...) __riscv_vreinterpret_u64m2(__VA_ARGS__)
#define vreinterpret_u64m4(...) __riscv_vreinterpret_u64m4(__VA_ARGS__)
#define vreinterpret_u64m8(...) __riscv_vreinterpret_u64m8(__VA_ARGS__)
#define vreinterpret_i64m1(...) __riscv_vreinterpret_i64m1(__VA_ARGS__)
#define vreinterpret_i64m2(...) __riscv_vreinterpret_i64m2(__VA_ARGS__)
#define vreinterpret_i64m4(...) __riscv_vreinterpret_i64m4(__VA_ARGS__)
#define vreinterpret_i64m8(...) __riscv_vreinterpret_i64m8(__VA_ARGS__)
#define vlmul_ext_f16mf2(...) __riscv_vlmul_ext_f16mf2(__VA_ARGS__)
#define vlmul_ext_f16m1(...) __riscv_vlmul_ext_f16m1(__VA_ARGS__)
#define vlmul_ext_f16m2(...) __riscv_vlmul_ext_f16m2(__VA_ARGS__)
#define vlmul_ext_f16m4(...) __riscv_vlmul_ext_f16m4(__VA_ARGS__)
#define vlmul_ext_f16m8(...) __riscv_vlmul_ext_f16m8(__VA_ARGS__)
#define vlmul_ext_f32m1(...) __riscv_vlmul_ext_f32m1(__VA_ARGS__)
#define vlmul_ext_f32m2(...) __riscv_vlmul_ext_f32m2(__VA_ARGS__)
#define vlmul_ext_f32m4(...) __riscv_vlmul_ext_f32m4(__VA_ARGS__)
#define vlmul_ext_f32m8(...) __riscv_vlmul_ext_f32m8(__VA_ARGS__)
#define vlmul_ext_f64m2(...) __riscv_vlmul_ext_f64m2(__VA_ARGS__)
#define vlmul_ext_f64m4(...) __riscv_vlmul_ext_f64m4(__VA_ARGS__)
#define vlmul_ext_f64m8(...) __riscv_vlmul_ext_f64m8(__VA_ARGS__)
#define vlmul_ext_i8mf4(...) __riscv_vlmul_ext_i8mf4(__VA_ARGS__)
#define vlmul_ext_i8mf2(...) __riscv_vlmul_ext_i8mf2(__VA_ARGS__)
#define vlmul_ext_i8m1(...) __riscv_vlmul_ext_i8m1(__VA_ARGS__)
#define vlmul_ext_i8m2(...) __riscv_vlmul_ext_i8m2(__VA_ARGS__)
#define vlmul_ext_i8m4(...) __riscv_vlmul_ext_i8m4(__VA_ARGS__)
#define vlmul_ext_i8m8(...) __riscv_vlmul_ext_i8m8(__VA_ARGS__)
#define vlmul_ext_i16mf2(...) __riscv_vlmul_ext_i16mf2(__VA_ARGS__)
#define vlmul_ext_i16m1(...) __riscv_vlmul_ext_i16m1(__VA_ARGS__)
#define vlmul_ext_i16m2(...) __riscv_vlmul_ext_i16m2(__VA_ARGS__)
#define vlmul_ext_i16m4(...) __riscv_vlmul_ext_i16m4(__VA_ARGS__)
#define vlmul_ext_i16m8(...) __riscv_vlmul_ext_i16m8(__VA_ARGS__)
#define vlmul_ext_i32m1(...) __riscv_vlmul_ext_i32m1(__VA_ARGS__)
#define vlmul_ext_i32m2(...) __riscv_vlmul_ext_i32m2(__VA_ARGS__)
#define vlmul_ext_i32m4(...) __riscv_vlmul_ext_i32m4(__VA_ARGS__)
#define vlmul_ext_i32m8(...) __riscv_vlmul_ext_i32m8(__VA_ARGS__)
#define vlmul_ext_i64m2(...) __riscv_vlmul_ext_i64m2(__VA_ARGS__)
#define vlmul_ext_i64m4(...) __riscv_vlmul_ext_i64m4(__VA_ARGS__)
#define vlmul_ext_i64m8(...) __riscv_vlmul_ext_i64m8(__VA_ARGS__)
#define vlmul_ext_u8mf4(...) __riscv_vlmul_ext_u8mf4(__VA_ARGS__)
#define vlmul_ext_u8mf2(...) __riscv_vlmul_ext_u8mf2(__VA_ARGS__)
#define vlmul_ext_u8m1(...) __riscv_vlmul_ext_u8m1(__VA_ARGS__)
#define vlmul_ext_u8m2(...) __riscv_vlmul_ext_u8m2(__VA_ARGS__)
#define vlmul_ext_u8m4(...) __riscv_vlmul_ext_u8m4(__VA_ARGS__)
#define vlmul_ext_u8m8(...) __riscv_vlmul_ext_u8m8(__VA_ARGS__)
#define vlmul_ext_u16mf2(...) __riscv_vlmul_ext_u16mf2(__VA_ARGS__)
#define vlmul_ext_u16m1(...) __riscv_vlmul_ext_u16m1(__VA_ARGS__)
#define vlmul_ext_u16m2(...) __riscv_vlmul_ext_u16m2(__VA_ARGS__)
#define vlmul_ext_u16m4(...) __riscv_vlmul_ext_u16m4(__VA_ARGS__)
#define vlmul_ext_u16m8(...) __riscv_vlmul_ext_u16m8(__VA_ARGS__)
#define vlmul_ext_u32m1(...) __riscv_vlmul_ext_u32m1(__VA_ARGS__)
#define vlmul_ext_u32m2(...) __riscv_vlmul_ext_u32m2(__VA_ARGS__)
#define vlmul_ext_u32m4(...) __riscv_vlmul_ext_u32m4(__VA_ARGS__)
#define vlmul_ext_u32m8(...) __riscv_vlmul_ext_u32m8(__VA_ARGS__)
#define vlmul_ext_u64m2(...) __riscv_vlmul_ext_u64m2(__VA_ARGS__)
#define vlmul_ext_u64m4(...) __riscv_vlmul_ext_u64m4(__VA_ARGS__)
#define vlmul_ext_u64m8(...) __riscv_vlmul_ext_u64m8(__VA_ARGS__)
#define vlmul_trunc_f16mf4(...) __riscv_vlmul_trunc_f16mf4(__VA_ARGS__)
#define vlmul_trunc_f16mf2(...) __riscv_vlmul_trunc_f16mf2(__VA_ARGS__)
#define vlmul_trunc_f16m1(...) __riscv_vlmul_trunc_f16m1(__VA_ARGS__)
#define vlmul_trunc_f16m2(...) __riscv_vlmul_trunc_f16m2(__VA_ARGS__)
#define vlmul_trunc_f16m4(...) __riscv_vlmul_trunc_f16m4(__VA_ARGS__)
#define vlmul_trunc_f32mf2(...) __riscv_vlmul_trunc_f32mf2(__VA_ARGS__)
#define vlmul_trunc_f32m1(...) __riscv_vlmul_trunc_f32m1(__VA_ARGS__)
#define vlmul_trunc_f32m2(...) __riscv_vlmul_trunc_f32m2(__VA_ARGS__)
#define vlmul_trunc_f32m4(...) __riscv_vlmul_trunc_f32m4(__VA_ARGS__)
#define vlmul_trunc_f64m1(...) __riscv_vlmul_trunc_f64m1(__VA_ARGS__)
#define vlmul_trunc_f64m2(...) __riscv_vlmul_trunc_f64m2(__VA_ARGS__)
#define vlmul_trunc_f64m4(...) __riscv_vlmul_trunc_f64m4(__VA_ARGS__)
#define vlmul_trunc_i8mf8(...) __riscv_vlmul_trunc_i8mf8(__VA_ARGS__)
#define vlmul_trunc_i8mf4(...) __riscv_vlmul_trunc_i8mf4(__VA_ARGS__)
#define vlmul_trunc_i8mf2(...) __riscv_vlmul_trunc_i8mf2(__VA_ARGS__)
#define vlmul_trunc_i8m1(...) __riscv_vlmul_trunc_i8m1(__VA_ARGS__)
#define vlmul_trunc_i8m2(...) __riscv_vlmul_trunc_i8m2(__VA_ARGS__)
#define vlmul_trunc_i8m4(...) __riscv_vlmul_trunc_i8m4(__VA_ARGS__)
#define vlmul_trunc_i16mf4(...) __riscv_vlmul_trunc_i16mf4(__VA_ARGS__)
#define vlmul_trunc_i16mf2(...) __riscv_vlmul_trunc_i16mf2(__VA_ARGS__)
#define vlmul_trunc_i16m1(...) __riscv_vlmul_trunc_i16m1(__VA_ARGS__)
#define vlmul_trunc_i16m2(...) __riscv_vlmul_trunc_i16m2(__VA_ARGS__)
#define vlmul_trunc_i16m4(...) __riscv_vlmul_trunc_i16m4(__VA_ARGS__)
#define vlmul_trunc_i32mf2(...) __riscv_vlmul_trunc_i32mf2(__VA_ARGS__)
#define vlmul_trunc_i32m1(...) __riscv_vlmul_trunc_i32m1(__VA_ARGS__)
#define vlmul_trunc_i32m2(...) __riscv_vlmul_trunc_i32m2(__VA_ARGS__)
#define vlmul_trunc_i32m4(...) __riscv_vlmul_trunc_i32m4(__VA_ARGS__)
#define vlmul_trunc_i64m1(...) __riscv_vlmul_trunc_i64m1(__VA_ARGS__)
#define vlmul_trunc_i64m2(...) __riscv_vlmul_trunc_i64m2(__VA_ARGS__)
#define vlmul_trunc_i64m4(...) __riscv_vlmul_trunc_i64m4(__VA_ARGS__)
#define vlmul_trunc_u8mf8(...) __riscv_vlmul_trunc_u8mf8(__VA_ARGS__)
#define vlmul_trunc_u8mf4(...) __riscv_vlmul_trunc_u8mf4(__VA_ARGS__)
#define vlmul_trunc_u8mf2(...) __riscv_vlmul_trunc_u8mf2(__VA_ARGS__)
#define vlmul_trunc_u8m1(...) __riscv_vlmul_trunc_u8m1(__VA_ARGS__)
#define vlmul_trunc_u8m2(...) __riscv_vlmul_trunc_u8m2(__VA_ARGS__)
#define vlmul_trunc_u8m4(...) __riscv_vlmul_trunc_u8m4(__VA_ARGS__)
#define vlmul_trunc_u16mf4(...) __riscv_vlmul_trunc_u16mf4(__VA_ARGS__)
#define vlmul_trunc_u16mf2(...) __riscv_vlmul_trunc_u16mf2(__VA_ARGS__)
#define vlmul_trunc_u16m1(...) __riscv_vlmul_trunc_u16m1(__VA_ARGS__)
#define vlmul_trunc_u16m2(...) __riscv_vlmul_trunc_u16m2(__VA_ARGS__)
#define vlmul_trunc_u16m4(...) __riscv_vlmul_trunc_u16m4(__VA_ARGS__)
#define vlmul_trunc_u32mf2(...) __riscv_vlmul_trunc_u32mf2(__VA_ARGS__)
#define vlmul_trunc_u32m1(...) __riscv_vlmul_trunc_u32m1(__VA_ARGS__)
#define vlmul_trunc_u32m2(...) __riscv_vlmul_trunc_u32m2(__VA_ARGS__)
#define vlmul_trunc_u32m4(...) __riscv_vlmul_trunc_u32m4(__VA_ARGS__)
#define vlmul_trunc_u64m1(...) __riscv_vlmul_trunc_u64m1(__VA_ARGS__)
#define vlmul_trunc_u64m2(...) __riscv_vlmul_trunc_u64m2(__VA_ARGS__)
#define vlmul_trunc_u64m4(...) __riscv_vlmul_trunc_u64m4(__VA_ARGS__)
#define vset(...) __riscv_vset(__VA_ARGS__)
#define vget_f16m1(...) __riscv_vget_f16m1(__VA_ARGS__)
#define vget_f16m2(...) __riscv_vget_f16m2(__VA_ARGS__)
#define vget_f16m4(...) __riscv_vget_f16m4(__VA_ARGS__)
#define vget_f32m1(...) __riscv_vget_f32m1(__VA_ARGS__)
#define vget_f32m2(...) __riscv_vget_f32m2(__VA_ARGS__)
#define vget_f32m4(...) __riscv_vget_f32m4(__VA_ARGS__)
#define vget_f64m1(...) __riscv_vget_f64m1(__VA_ARGS__)
#define vget_f64m2(...) __riscv_vget_f64m2(__VA_ARGS__)
#define vget_f64m4(...) __riscv_vget_f64m4(__VA_ARGS__)
#define vget_i8m1(...) __riscv_vget_i8m1(__VA_ARGS__)
#define vget_i8m2(...) __riscv_vget_i8m2(__VA_ARGS__)
#define vget_i8m4(...) __riscv_vget_i8m4(__VA_ARGS__)
#define vget_i16m1(...) __riscv_vget_i16m1(__VA_ARGS__)
#define vget_i16m2(...) __riscv_vget_i16m2(__VA_ARGS__)
#define vget_i16m4(...) __riscv_vget_i16m4(__VA_ARGS__)
#define vget_i32m1(...) __riscv_vget_i32m1(__VA_ARGS__)
#define vget_i32m2(...) __riscv_vget_i32m2(__VA_ARGS__)
#define vget_i32m4(...) __riscv_vget_i32m4(__VA_ARGS__)
#define vget_i64m1(...) __riscv_vget_i64m1(__VA_ARGS__)
#define vget_i64m2(...) __riscv_vget_i64m2(__VA_ARGS__)
#define vget_i64m4(...) __riscv_vget_i64m4(__VA_ARGS__)
#define vget_u8m1(...) __riscv_vget_u8m1(__VA_ARGS__)
#define vget_u8m2(...) __riscv_vget_u8m2(__VA_ARGS__)
#define vget_u8m4(...) __riscv_vget_u8m4(__VA_ARGS__)
#define vget_u16m1(...) __riscv_vget_u16m1(__VA_ARGS__)
#define vget_u16m2(...) __riscv_vget_u16m2(__VA_ARGS__)
#define vget_u16m4(...) __riscv_vget_u16m4(__VA_ARGS__)
#define vget_u32m1(...) __riscv_vget_u32m1(__VA_ARGS__)
#define vget_u32m2(...) __riscv_vget_u32m2(__VA_ARGS__)
#define vget_u32m4(...) __riscv_vget_u32m4(__VA_ARGS__)
#define vget_u64m1(...) __riscv_vget_u64m1(__VA_ARGS__)
#define vget_u64m2(...) __riscv_vget_u64m2(__VA_ARGS__)
#define vget_u64m4(...) __riscv_vget_u64m4(__VA_ARGS__)
#define vle16(...) __riscv_vle16_tumu(__VA_ARGS__)
#define vle32(...) __riscv_vle32_tumu(__VA_ARGS__)
#define vle64(...) __riscv_vle64_tumu(__VA_ARGS__)
#define vle8(...) __riscv_vle8_tumu(__VA_ARGS__)
#define vlse16(...) __riscv_vlse16_tumu(__VA_ARGS__)
#define vlse32(...) __riscv_vlse32_tumu(__VA_ARGS__)
#define vlse64(...) __riscv_vlse64_tumu(__VA_ARGS__)
#define vlse8(...) __riscv_vlse8_tumu(__VA_ARGS__)
#define vle16ff(...) __riscv_vle16ff_tumu(__VA_ARGS__)
#define vle32ff(...) __riscv_vle32ff_tumu(__VA_ARGS__)
#define vle64ff(...) __riscv_vle64ff_tumu(__VA_ARGS__)
#define vle8ff(...) __riscv_vle8ff_tumu(__VA_ARGS__)
#define vlseg2e16(...) __riscv_vlseg2e16_tumu(__VA_ARGS__)
#define vlseg3e16(...) __riscv_vlseg3e16_tumu(__VA_ARGS__)
#define vlseg4e16(...) __riscv_vlseg4e16_tumu(__VA_ARGS__)
#define vlseg5e16(...) __riscv_vlseg5e16_tumu(__VA_ARGS__)
#define vlseg6e16(...) __riscv_vlseg6e16_tumu(__VA_ARGS__)
#define vlseg7e16(...) __riscv_vlseg7e16_tumu(__VA_ARGS__)
#define vlseg8e16(...) __riscv_vlseg8e16_tumu(__VA_ARGS__)
#define vlseg2e32(...) __riscv_vlseg2e32_tumu(__VA_ARGS__)
#define vlseg3e32(...) __riscv_vlseg3e32_tumu(__VA_ARGS__)
#define vlseg4e32(...) __riscv_vlseg4e32_tumu(__VA_ARGS__)
#define vlseg5e32(...) __riscv_vlseg5e32_tumu(__VA_ARGS__)
#define vlseg6e32(...) __riscv_vlseg6e32_tumu(__VA_ARGS__)
#define vlseg7e32(...) __riscv_vlseg7e32_tumu(__VA_ARGS__)
#define vlseg8e32(...) __riscv_vlseg8e32_tumu(__VA_ARGS__)
#define vlseg2e64(...) __riscv_vlseg2e64_tumu(__VA_ARGS__)
#define vlseg3e64(...) __riscv_vlseg3e64_tumu(__VA_ARGS__)
#define vlseg4e64(...) __riscv_vlseg4e64_tumu(__VA_ARGS__)
#define vlseg5e64(...) __riscv_vlseg5e64_tumu(__VA_ARGS__)
#define vlseg6e64(...) __riscv_vlseg6e64_tumu(__VA_ARGS__)
#define vlseg7e64(...) __riscv_vlseg7e64_tumu(__VA_ARGS__)
#define vlseg8e64(...) __riscv_vlseg8e64_tumu(__VA_ARGS__)
#define vlseg2e16ff(...) __riscv_vlseg2e16ff_tumu(__VA_ARGS__)
#define vlseg3e16ff(...) __riscv_vlseg3e16ff_tumu(__VA_ARGS__)
#define vlseg4e16ff(...) __riscv_vlseg4e16ff_tumu(__VA_ARGS__)
#define vlseg5e16ff(...) __riscv_vlseg5e16ff_tumu(__VA_ARGS__)
#define vlseg6e16ff(...) __riscv_vlseg6e16ff_tumu(__VA_ARGS__)
#define vlseg7e16ff(...) __riscv_vlseg7e16ff_tumu(__VA_ARGS__)
#define vlseg8e16ff(...) __riscv_vlseg8e16ff_tumu(__VA_ARGS__)
#define vlseg2e32ff(...) __riscv_vlseg2e32ff_tumu(__VA_ARGS__)
#define vlseg3e32ff(...) __riscv_vlseg3e32ff_tumu(__VA_ARGS__)
#define vlseg4e32ff(...) __riscv_vlseg4e32ff_tumu(__VA_ARGS__)
#define vlseg5e32ff(...) __riscv_vlseg5e32ff_tumu(__VA_ARGS__)
#define vlseg6e32ff(...) __riscv_vlseg6e32ff_tumu(__VA_ARGS__)
#define vlseg7e32ff(...) __riscv_vlseg7e32ff_tumu(__VA_ARGS__)
#define vlseg8e32ff(...) __riscv_vlseg8e32ff_tumu(__VA_ARGS__)
#define vlseg2e64ff(...) __riscv_vlseg2e64ff_tumu(__VA_ARGS__)
#define vlseg3e64ff(...) __riscv_vlseg3e64ff_tumu(__VA_ARGS__)
#define vlseg4e64ff(...) __riscv_vlseg4e64ff_tumu(__VA_ARGS__)
#define vlseg5e64ff(...) __riscv_vlseg5e64ff_tumu(__VA_ARGS__)
#define vlseg6e64ff(...) __riscv_vlseg6e64ff_tumu(__VA_ARGS__)
#define vlseg7e64ff(...) __riscv_vlseg7e64ff_tumu(__VA_ARGS__)
#define vlseg8e64ff(...) __riscv_vlseg8e64ff_tumu(__VA_ARGS__)
#define vlseg2e8(...) __riscv_vlseg2e8_tumu(__VA_ARGS__)
#define vlseg3e8(...) __riscv_vlseg3e8_tumu(__VA_ARGS__)
#define vlseg4e8(...) __riscv_vlseg4e8_tumu(__VA_ARGS__)
#define vlseg5e8(...) __riscv_vlseg5e8_tumu(__VA_ARGS__)
#define vlseg6e8(...) __riscv_vlseg6e8_tumu(__VA_ARGS__)
#define vlseg7e8(...) __riscv_vlseg7e8_tumu(__VA_ARGS__)
#define vlseg8e8(...) __riscv_vlseg8e8_tumu(__VA_ARGS__)
#define vlseg2e8ff(...) __riscv_vlseg2e8ff_tumu(__VA_ARGS__)
#define vlseg3e8ff(...) __riscv_vlseg3e8ff_tumu(__VA_ARGS__)
#define vlseg4e8ff(...) __riscv_vlseg4e8ff_tumu(__VA_ARGS__)
#define vlseg5e8ff(...) __riscv_vlseg5e8ff_tumu(__VA_ARGS__)
#define vlseg6e8ff(...) __riscv_vlseg6e8ff_tumu(__VA_ARGS__)
#define vlseg7e8ff(...) __riscv_vlseg7e8ff_tumu(__VA_ARGS__)
#define vlseg8e8ff(...) __riscv_vlseg8e8ff_tumu(__VA_ARGS__)
#define vlsseg2e16(...) __riscv_vlsseg2e16_tumu(__VA_ARGS__)
#define vlsseg3e16(...) __riscv_vlsseg3e16_tumu(__VA_ARGS__)
#define vlsseg4e16(...) __riscv_vlsseg4e16_tumu(__VA_ARGS__)
#define vlsseg5e16(...) __riscv_vlsseg5e16_tumu(__VA_ARGS__)
#define vlsseg6e16(...) __riscv_vlsseg6e16_tumu(__VA_ARGS__)
#define vlsseg7e16(...) __riscv_vlsseg7e16_tumu(__VA_ARGS__)
#define vlsseg8e16(...) __riscv_vlsseg8e16_tumu(__VA_ARGS__)
#define vlsseg2e32(...) __riscv_vlsseg2e32_tumu(__VA_ARGS__)
#define vlsseg3e32(...) __riscv_vlsseg3e32_tumu(__VA_ARGS__)
#define vlsseg4e32(...) __riscv_vlsseg4e32_tumu(__VA_ARGS__)
#define vlsseg5e32(...) __riscv_vlsseg5e32_tumu(__VA_ARGS__)
#define vlsseg6e32(...) __riscv_vlsseg6e32_tumu(__VA_ARGS__)
#define vlsseg7e32(...) __riscv_vlsseg7e32_tumu(__VA_ARGS__)
#define vlsseg8e32(...) __riscv_vlsseg8e32_tumu(__VA_ARGS__)
#define vlsseg2e64(...) __riscv_vlsseg2e64_tumu(__VA_ARGS__)
#define vlsseg3e64(...) __riscv_vlsseg3e64_tumu(__VA_ARGS__)
#define vlsseg4e64(...) __riscv_vlsseg4e64_tumu(__VA_ARGS__)
#define vlsseg5e64(...) __riscv_vlsseg5e64_tumu(__VA_ARGS__)
#define vlsseg6e64(...) __riscv_vlsseg6e64_tumu(__VA_ARGS__)
#define vlsseg7e64(...) __riscv_vlsseg7e64_tumu(__VA_ARGS__)
#define vlsseg8e64(...) __riscv_vlsseg8e64_tumu(__VA_ARGS__)
#define vlsseg2e8(...) __riscv_vlsseg2e8_tumu(__VA_ARGS__)
#define vlsseg3e8(...) __riscv_vlsseg3e8_tumu(__VA_ARGS__)
#define vlsseg4e8(...) __riscv_vlsseg4e8_tumu(__VA_ARGS__)
#define vlsseg5e8(...) __riscv_vlsseg5e8_tumu(__VA_ARGS__)
#define vlsseg6e8(...) __riscv_vlsseg6e8_tumu(__VA_ARGS__)
#define vlsseg7e8(...) __riscv_vlsseg7e8_tumu(__VA_ARGS__)
#define vlsseg8e8(...) __riscv_vlsseg8e8_tumu(__VA_ARGS__)
#define viota(...) __riscv_viota_tumu(__VA_ARGS__)
#define vid(...) __riscv_vid_tumu(__VA_ARGS__)
#endif
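// A minimal usage sketch (hypothetical names mask/dst/src/stride/vl), assuming
// the v0.12 intrinsics from <riscv_vector.h>: with the aliases above, a legacy
// 0.11-style masked call keeps compiling but now resolves to the explicit
// tail-undisturbed/mask-undisturbed (_tumu) overload.
static inline vint32m1_t rvv_compat_example_load(vbool32_t mask, vint32m1_t dst,
                                                 const int32_t* src, ptrdiff_t stride,
                                                 size_t vl)
{
    return vlse32(mask, dst, src, stride, vl); // -> __riscv_vlse32_tumu(mask, dst, src, stride, vl)
}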

View File

@@ -0,0 +1,33 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// 0.11 -> 0.12 compatibility
#ifndef _RVV_IMPLICIT_VXRM
#define _RVV_IMPLICIT_VXRM __RISCV_VXRM_RNU
#endif
// NOTE: the masked forms must be defined first to avoid an extra substitution (3 args -> 4 args -> 5 args)
// masked
#define __riscv_vaadd(_1, _2, _3, _4) __riscv_vaadd(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vasub(_1, _2, _3, _4) __riscv_vasub(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vaaddu(_1, _2, _3, _4) __riscv_vaaddu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vasubu(_1, _2, _3, _4) __riscv_vasubu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vsmul(_1, _2, _3, _4) __riscv_vsmul(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vssra(_1, _2, _3, _4) __riscv_vssra(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vssrl(_1, _2, _3, _4) __riscv_vssrl(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vnclip(_1, _2, _3, _4) __riscv_vnclip(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vnclipu(_1, _2, _3, _4) __riscv_vnclipu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
// unmasked
#define __riscv_vaadd(_1, _2, _3) __riscv_vaadd(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vasub(_1, _2, _3) __riscv_vasub(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vaaddu(_1, _2, _3) __riscv_vaaddu(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vasubu(_1, _2, _3) __riscv_vasubu(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vsmul(_1, _2, _3) __riscv_vsmul(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vssra(_1, _2, _3) __riscv_vssra(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vssrl(_1, _2, _3) __riscv_vssrl(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vnclip(_1, _2, _3) __riscv_vnclip(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vnclipu(_1, _2, _3) __riscv_vnclipu(_1, _2, _RVV_IMPLICIT_VXRM, _3)
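// Expansion sketch for the unmasked form (hypothetical operands a, b, vl): a
// 0.11-style call __riscv_vaadd(a, b, vl) is rewritten by the macro above into
// the 0.12 signature __riscv_vaadd(a, b, _RVV_IMPLICIT_VXRM, vl). The inner
// __riscv_vaadd is not expanded again (function-like macros do not recurse),
// so it resolves to the real overloaded intrinsic with an explicit fixed-point
// rounding mode.
static inline vint8m1_t rvv_compat_example_vaadd(vint8m1_t a, vint8m1_t b, size_t vl)
{
    return __riscv_vaadd(a, b, vl); // averaging add; rounding mode supplied by the macro
}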

View File

@@ -0,0 +1,213 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_HAL_INTRIN_RVV_COMPAT_OVERLOAD_HPP
#define OPENCV_HAL_INTRIN_RVV_COMPAT_OVERLOAD_HPP
// This file requires VTraits to be defined for vector types
#define OPENCV_HAL_IMPL_RVV_FUN_AND(REG, SUF) \
inline static REG vand(const REG & op1, const REG & op2, size_t vl) \
{ \
return vand_vv_##SUF(op1, op2, vl); \
}
OPENCV_HAL_IMPL_RVV_FUN_AND(vint8m1_t, i8m1)
OPENCV_HAL_IMPL_RVV_FUN_AND(vuint8m1_t, u8m1)
OPENCV_HAL_IMPL_RVV_FUN_AND(vint16m1_t, i16m1)
OPENCV_HAL_IMPL_RVV_FUN_AND(vuint16m1_t, u16m1)
OPENCV_HAL_IMPL_RVV_FUN_AND(vint32m1_t, i32m1)
OPENCV_HAL_IMPL_RVV_FUN_AND(vuint32m1_t, u32m1)
OPENCV_HAL_IMPL_RVV_FUN_AND(vint64m1_t, i64m1)
OPENCV_HAL_IMPL_RVV_FUN_AND(vuint64m1_t, u64m1)
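// Usage sketch (hypothetical helper): the overload set generated above lets
// type-generic HAL code call vand() uniformly across lane types, mimicking
// the overloaded API of newer intrinsics versions.
static inline vuint8m1_t rvv_compat_example_vand(vuint8m1_t v, vuint8m1_t m, size_t vl)
{
    return vand(v, m, vl); // resolves to vand_vv_u8m1(v, m, vl)
}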
#define OPENCV_HAL_IMPL_RVV_FUN_LOXEI(REG, SUF, INDX, ISUF) \
inline static REG vloxe##ISUF(const VTraits<REG>::lane_type *base, INDX bindex, size_t vl) \
{ \
return vloxe##ISUF##_v_##SUF(base, bindex, vl); \
}
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m1_t, i8m1, vuint8m1_t, i8)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m2_t, i8m2, vuint8m2_t, i8)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m4_t, i8m4, vuint8m4_t, i8)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m8_t, i8m8, vuint8m8_t, i8)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m1_t, i8m1, vuint32m4_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m2_t, i8m2, vuint32m8_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint16m1_t, i16m1, vuint32m2_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint32m1_t, i32m1, vuint32m1_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint32m2_t, i32m2, vuint32m2_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint32m4_t, i32m4, vuint32m4_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint32m8_t, i32m8, vuint32m8_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint64m1_t, i64m1, vuint32mf2_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m1_t, u8m1, vuint8m1_t, i8)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m2_t, u8m2, vuint8m2_t, i8)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m4_t, u8m4, vuint8m4_t, i8)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m8_t, u8m8, vuint8m8_t, i8)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vfloat32m1_t, f32m1, vuint32m1_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint32m1_t, u32m1, vuint32m1_t, i32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vfloat64m1_t, f64m1, vuint32mf2_t, i32)
#endif
#define OPENCV_HAL_IMPL_RVV_FUN_MUL(REG, SUF) \
inline static REG##m1_t vmul(const REG##m1_t & op1, const REG##m1_t & op2, size_t vl) \
{ \
return vmul_vv_##SUF##m1(op1, op2, vl); \
} \
inline static REG##m1_t vmul(const REG##m1_t & op1, VTraits<REG##m1_t>::lane_type op2, size_t vl) \
{ \
return vmul_vx_##SUF##m1(op1, op2, vl); \
} \
inline static REG##m2_t vmul(const REG##m2_t & op1, const REG##m2_t & op2, size_t vl) \
{ \
return vmul_vv_##SUF##m2(op1, op2, vl); \
} \
inline static REG##m2_t vmul(const REG##m2_t & op1, VTraits<REG##m2_t>::lane_type op2, size_t vl) \
{ \
return vmul_vx_##SUF##m2(op1, op2, vl); \
} \
inline static REG##m4_t vmul(const REG##m4_t & op1, const REG##m4_t & op2, size_t vl) \
{ \
return vmul_vv_##SUF##m4(op1, op2, vl); \
} \
inline static REG##m4_t vmul(const REG##m4_t & op1, VTraits<REG##m4_t>::lane_type op2, size_t vl) \
{ \
return vmul_vx_##SUF##m4(op1, op2, vl); \
} \
inline static REG##m8_t vmul(const REG##m8_t & op1, const REG##m8_t & op2, size_t vl) \
{ \
return vmul_vv_##SUF##m8(op1, op2, vl); \
} \
inline static REG##m8_t vmul(const REG##m8_t & op1, VTraits<REG##m8_t>::lane_type op2, size_t vl) \
{ \
return vmul_vx_##SUF##m8(op1, op2, vl); \
}
OPENCV_HAL_IMPL_RVV_FUN_MUL(vint8, i8)
OPENCV_HAL_IMPL_RVV_FUN_MUL(vuint8, u8)
OPENCV_HAL_IMPL_RVV_FUN_MUL(vint16, i16)
OPENCV_HAL_IMPL_RVV_FUN_MUL(vuint16, u16)
OPENCV_HAL_IMPL_RVV_FUN_MUL(vint32, i32)
OPENCV_HAL_IMPL_RVV_FUN_MUL(vuint32, u32)
#define OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(REG1, SUF1, REG2, SUF2) \
inline static REG1##m1_t vreinterpret_##SUF1##m1(const REG2##m1_t & src) \
{\
return vreinterpret_v_##SUF2##m1_##SUF1##m1(src); \
} \
inline static REG1##m2_t vreinterpret_##SUF1##m2(const REG2##m2_t & src) \
{\
return vreinterpret_v_##SUF2##m2_##SUF1##m2(src); \
} \
inline static REG1##m4_t vreinterpret_##SUF1##m4(const REG2##m4_t & src) \
{\
return vreinterpret_v_##SUF2##m4_##SUF1##m4(src); \
} \
inline static REG1##m8_t vreinterpret_##SUF1##m8(const REG2##m8_t & src) \
{\
return vreinterpret_v_##SUF2##m8_##SUF1##m8(src); \
}
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vint8, i8, vuint8, u8)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vint16, i16, vuint16, u16)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vint32, i32, vuint32, u32)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vfloat32, f32, vuint32, u32)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vfloat32, f32, vint32, i32)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint32, u32, vfloat32, f32)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vint32, i32, vfloat32, f32)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint8, u8, vint8, i8)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint8, u8, vuint16, u16)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint8, u8, vuint32, u32)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint8, u8, vuint64, u64)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint16, u16, vint16, i16)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint16, u16, vuint8, u8)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint16, u16, vuint32, u32)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint16, u16, vuint64, u64)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint32, u32, vint32, i32)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint32, u32, vuint8, u8)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint32, u32, vuint16, u16)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint32, u32, vuint64, u64)
#define OPENCV_HAL_IMPL_RVV_FUN_STORE(REG, SUF, SZ) \
inline static void vse##SZ(VTraits<REG>::lane_type *base, REG value, size_t vl) \
{ \
return vse##SZ##_v_##SUF##m1(base, value, vl); \
}
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_uint8, u8, 8)
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_int8, i8, 8)
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_uint16, u16, 16)
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_int16, i16, 16)
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_uint32, u32, 32)
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_int32, i32, 32)
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_uint64, u64, 64)
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_int64, i64, 64)
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_float32, f32, 32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_float64, f64, 64)
#endif
#define OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(REG, SUF) \
inline static VTraits<REG>::lane_type vmv_x(const REG & reg) \
{\
return vmv_x_s_##SUF##m1_##SUF(reg); \
}
#define OPENCV_HAL_IMPL_RVV_FUN_EXTRACT_F(REG, SUF) \
inline static VTraits<REG>::lane_type vfmv_f(const REG & reg) \
{\
return vfmv_f_s_##SUF##m1_##SUF(reg); \
}
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_uint8, u8)
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_int8, i8)
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_uint16, u16)
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_int16, i16)
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_uint32, u32)
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_int32, i32)
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_uint64, u64)
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_int64, i64)
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT_F(v_float32, f32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT_F(v_float64, f64)
#endif
#define OPENCV_HAL_IMPL_RVV_FUN_SLIDE(REG, SUF) \
inline static REG vslidedown(const REG & dst, const REG & src, size_t offset, size_t vl) \
{ \
return vslidedown_vx_##SUF##m1(dst, src, offset, vl); \
} \
inline static REG vslideup(const REG & dst, const REG & src, size_t offset, size_t vl) \
{ \
return vslideup_vx_##SUF##m1(dst, src, offset, vl); \
}
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_uint8, u8)
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_int8, i8)
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_uint16, u16)
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_int16, i16)
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_uint32, u32)
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_int32, i32)
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_float32, f32)
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_uint64, u64)
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_int64, i64)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_float64, f64)
#endif
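// Semantics sketch for the wrappers above (hypothetical lane view), assuming
// the 0.10-style non-policy intrinsics: vslidedown(dst, src, k, vl) yields
// src[k], src[k+1], ... in the low lanes, while vslideup(dst, src, k, vl)
// keeps dst's lanes below index k and places src[0], src[1], ... above them.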
inline static vuint32mf2_t vmul(const vuint32mf2_t & op1, uint32_t op2, size_t vl)
{
return vmul_vx_u32mf2(op1, op2, vl);
}
inline static vuint32mf2_t vreinterpret_u32mf2(const vint32mf2_t& val)
{
return vreinterpret_v_i32mf2_u32mf2(val);
}
inline static vuint32mf2_t vreinterpret_u32mf2(const vuint16mf2_t& val)
{
return vreinterpret_v_u16mf2_u32mf2(val);
}
#endif //OPENCV_HAL_INTRIN_RVV_COMPAT_OVERLOAD_HPP

File diff suppressed because it is too large

View File

@@ -347,8 +347,6 @@ namespace hal_sse_internal
#define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
template <> inline _Tpvec v_setzero_() { return v_setzero_##suffix(); } \
template <> inline _Tpvec v_setall_(_Tp v) { return v_setall_##suffix(v); } \
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
{ return _Tpvec(cast(a.val)); }
@@ -366,11 +364,6 @@ inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); }
inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); }
inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); }
template <> inline v_uint64x2 v_setzero_() { return v_setzero_u64(); }
template <> inline v_int64x2 v_setzero_() { return v_setzero_s64(); }
template <> inline v_uint64x2 v_setall_(uint64 val) { return v_setall_u64(val); }
template <> inline v_int64x2 v_setall_(int64 val) { return v_setall_s64(val); }
template<typename _Tpvec> inline
v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); }
template<typename _Tpvec> inline
@@ -742,46 +735,53 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
}
#define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
a.val = intrin(a.val, b.val); \
return a; \
}
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint8x16, _mm_adds_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint8x16, _mm_subs_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int8x16, _mm_adds_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int8x16, _mm_subs_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint16x8, _mm_adds_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint16x8, _mm_subs_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int16x8, _mm_adds_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int16x8, _mm_subs_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_uint32x4, _v128_mullo_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_int32x4, _v128_mullo_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_float32x4, _mm_add_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_float32x4, _mm_sub_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_float32x4, _mm_mul_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_div, v_float32x4, _mm_div_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_float64x2, _mm_add_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_float64x2, _mm_sub_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_float64x2, _mm_mul_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_div, v_float64x2, _mm_div_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint64x2, _mm_sub_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int64x2, _mm_sub_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint32x4, _v128_mullo_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int32x4, _v128_mullo_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)
// saturating multiply 8-bit, 16-bit
#define OPENCV_HAL_IMPL_SSE_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpwvec c, d; \
v_mul_expand(a, b, c, d); \
return v_pack(c, d); \
}
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }
OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint8x16, v_uint16x8)
OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int8x16, v_int16x8)
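// Scalar model of the scheme above (hypothetical helper): widen, multiply
// exactly, then pack with saturation -- the lane-wise effect of v_mul_expand
// followed by v_pack.
static inline schar mul_sat_s8_scalar(schar a, schar b)
{
    int p = (int)a * (int)b;                                // exact product in the wide type
    return (schar)(p < -128 ? -128 : (p > 127 ? 127 : p)); // saturating pack
}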
@@ -845,7 +845,7 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { return v_
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{ return v_int32x4(_mm_madd_epi16(a.val, b.val)); }
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_add(v_dotprod(a, b), c); }
{ return v_dotprod(a, b) + c; }
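// Scalar model of _mm_madd_epi16 (hypothetical 8-lane arrays a[], b[]): each
// 32-bit result lane i is the sum of two adjacent 16-bit products,
//     dst[i] = (int)a[2*i] * b[2*i] + (int)a[2*i+1] * b[2*i+1];
// which is exactly the 16 -> 32 dot product wrapped here.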
// 32 >> 64
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
@@ -872,7 +872,7 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
#endif
}
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{ return v_add(v_dotprod(a, b), c); }
{ return v_dotprod(a, b) + c; }
// 8 >> 32
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
@@ -886,7 +886,7 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
return v_uint32x4(_mm_add_epi32(p0, p1));
}
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
@@ -899,7 +899,7 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
return v_int32x4(_mm_add_epi32(p0, p1));
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
// 16 >> 64
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
@@ -911,14 +911,14 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
v_expand(c, c0, c1);
v_expand(d, d0, d1);
c0 = v_add(c0, c1); d0 = v_add(d0, d1);
c0 += c1; d0 += d1;
return v_uint64x2(_mm_add_epi64(
_mm_unpacklo_epi64(c0.val, d0.val),
_mm_unpackhi_epi64(c0.val, d0.val)
));
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
@@ -931,7 +931,7 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
));
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
// 32 >> 64f
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
@@ -939,8 +939,8 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
#if CV_SSE4_1
return v_cvt_f64(v_dotprod(a, b));
#else
v_float64x2 c = v_mul(v_cvt_f64(a), v_cvt_f64(b));
v_float64x2 d = v_mul(v_cvt_f64_high(a), v_cvt_f64_high(b));
v_float64x2 c = v_cvt_f64(a) * v_cvt_f64(b);
v_float64x2 d = v_cvt_f64_high(a) * v_cvt_f64_high(b);
return v_float64x2(_mm_add_pd(
_mm_unpacklo_pd(c.val, d.val),
@@ -949,7 +949,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
#endif
}
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
//////// Fast Dot Product ////////
@@ -957,13 +957,13 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, cons
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
{ return v_dotprod(a, b); }
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_add(v_dotprod(a, b), c); }
{ return v_dotprod(a, b) + c; }
// 32 >> 64
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_dotprod(a, b); }
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{ return v_add(v_dotprod_fast(a, b), c); }
{ return v_dotprod_fast(a, b) + c; }
// 8 >> 32
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
@@ -977,7 +977,7 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b
return v_uint32x4(_mm_add_epi32(p0, p1));
}
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{
@@ -994,7 +994,7 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
#endif
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
// 16 >> 64
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
@@ -1006,34 +1006,34 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b
v_expand(c, c0, c1);
v_expand(d, d0, d1);
c0 = v_add(c0, c1); d0 = v_add(d0, d1);
return v_add(c0, d0);
c0 += c1; d0 += d1;
return c0 + d0;
}
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{
v_int32x4 prod = v_dotprod(a, b);
v_int64x2 c, d;
v_expand(prod, c, d);
return v_add(c, d);
return c + d;
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
// 32 >> 64f
v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c);
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_mul(v_cvt_f64_high(a), v_cvt_f64_high(b))); }
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }
#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
OPENCV_HAL_IMPL_SSE_BIN_OP(v_and, _Tpvec, _mm_and_##suffix) \
OPENCV_HAL_IMPL_SSE_BIN_OP(v_or, _Tpvec, _mm_or_##suffix) \
OPENCV_HAL_IMPL_SSE_BIN_OP(v_xor, _Tpvec, _mm_xor_##suffix) \
inline _Tpvec v_not(const _Tpvec& a) \
OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \
}
@@ -1182,58 +1182,58 @@ inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b)
}
#define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \
inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
{ return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpuvec v_ne(const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
{ \
__m128i not_mask = _mm_set1_epi32(-1); \
return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpsvec v_ne(const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
{ \
__m128i not_mask = _mm_set1_epi32(-1); \
return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpuvec v_lt(const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
{ \
__m128i smask = _mm_set1_##suffix(sbit); \
return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \
} \
inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
{ \
__m128i smask = _mm_set1_##suffix(sbit); \
return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \
} \
inline _Tpuvec v_le(const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
{ \
__m128i smask = _mm_set1_##suffix(sbit); \
__m128i not_mask = _mm_set1_epi32(-1); \
__m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \
return _Tpuvec(_mm_xor_si128(res, not_mask)); \
} \
inline _Tpuvec v_ge(const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
{ \
__m128i smask = _mm_set1_##suffix(sbit); \
__m128i not_mask = _mm_set1_epi32(-1); \
__m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \
return _Tpuvec(_mm_xor_si128(res, not_mask)); \
} \
inline _Tpsvec v_lt(const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
{ \
return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \
} \
inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
{ \
return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
} \
inline _Tpsvec v_le(const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
{ \
__m128i not_mask = _mm_set1_epi32(-1); \
return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpsvec v_ge(const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
{ \
__m128i not_mask = _mm_set1_epi32(-1); \
return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \
@@ -1244,17 +1244,17 @@ OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000)
#define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }
OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
@@ -1262,17 +1262,17 @@ OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)
#if CV_SSE4_1
#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
inline _Tpvec v_eq (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpeq_epi64(a.val, b.val)); } \
inline _Tpvec v_ne (const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_eq(a, b)); }
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); }
#else
#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ __m128i cmp = _mm_cmpeq_epi32(a.val, b.val); \
return _Tpvec(_mm_and_si128(cmp, _mm_shuffle_epi32(cmp, _MM_SHUFFLE(2, 3, 0, 1)))); } \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_eq(a, b)); }
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); }
#endif
OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2)
@@ -1311,17 +1311,17 @@ inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
/** Absolute difference **/
inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
{ return v_add_wrap(a - b, b - a); }
inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
{ return v_add_wrap(a - b, b - a); }
inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
{ return v_sub(v_max(a, b), v_min(a, b)); }
{ return v_max(a, b) - v_min(a, b); }
inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
{
v_int8x16 d = v_sub_wrap(a, b);
v_int8x16 m = v_lt(a, b);
return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m));
v_int8x16 m = a < b;
return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
}
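// Why (d ^ m) - m yields |d| here (scalar sketch, hypothetical lane values):
// the comparison mask m is 0 or -1 (all ones) per lane; (x ^ 0) - 0 == x, and
// (x ^ -1) - (-1) == ~x + 1 == -x in two's complement, so the expression
// negates d exactly in the lanes where a < b.
static inline int negate_if_scalar(int x, int m /* 0 or -1 */)
{
    return (x ^ m) - m;
}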
inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
{
@@ -1329,25 +1329,25 @@ inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
}
inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{
v_int32x4 d = v_sub(a, b);
v_int32x4 m = v_lt(a, b);
return v_reinterpret_as_u32(v_sub(v_xor(d, m), m));
v_int32x4 d = a - b;
v_int32x4 m = a < b;
return v_reinterpret_as_u32((d ^ m) - m);
}
/** Saturating absolute difference **/
inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
{
v_int8x16 d = v_sub(a, b);
v_int8x16 m = v_lt(a, b);
return v_sub(v_xor(d, m), m);
v_int8x16 d = a - b;
v_int8x16 m = a < b;
return (d ^ m) - m;
}
inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
{ return v_sub(v_max(a, b), v_min(a, b)); }
{ return v_max(a, b) - v_min(a, b); }
inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
return v_add(v_mul(a, b), c);
return a * b + c;
}
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
@@ -1381,12 +1381,12 @@ inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
} \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpvec res = v_fma(a, a, v_mul(b, b)); \
_Tpvec res = v_fma(a, a, b*b); \
return _Tpvec(_mm_sqrt_##suffix(res.val)); \
} \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
return v_fma(a, a, v_mul(b, b)); \
return v_fma(a, a, b*b); \
} \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ \
@@ -1397,19 +1397,19 @@ OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((
OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1))
#define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
{ \
return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
} \
inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
{ \
return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
} \
inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
{ \
return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
} \
inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
{ \
return _Tpsvec(srai(a.val, imm)); \
} \
@@ -1711,9 +1711,9 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_N
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
inline int v_reduce_sum(const v_int16x8& a)
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline unsigned v_reduce_sum(const v_uint16x8& a)
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline uint64 v_reduce_sum(const v_uint64x2& a)
{
@@ -1770,13 +1770,13 @@ inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
{
v_uint32x4 l, h;
v_expand(v_absdiff(a, b), l, h);
return v_reduce_sum(v_add(l, h));
return v_reduce_sum(l + h);
}
inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
{
v_uint32x4 l, h;
v_expand(v_absdiff(a, b), l, h);
return v_reduce_sum(v_add(l, h));
return v_reduce_sum(l + h);
}
inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
{
@@ -1805,15 +1805,15 @@ inline v_uint8x16 v_popcount(const v_uint8x16& a)
inline v_uint16x8 v_popcount(const v_uint16x8& a)
{
v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
p = v_add(p, v_rotate_right<1>(p));
return v_and(v_reinterpret_as_u16(p), v_setall_u16(0x00ff));
p += v_rotate_right<1>(p);
return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff);
}
inline v_uint32x4 v_popcount(const v_uint32x4& a)
{
v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
p = v_add(p, v_rotate_right<1>(p));
p = v_add(p, v_rotate_right<2>(p));
return v_and(v_reinterpret_as_u32(p), v_setall_u32(0x000000ff));
p += v_rotate_right<1>(p);
p += v_rotate_right<2>(p);
return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff);
}
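// Scalar sketch of the fold above (hypothetical 32-bit lane holding four byte
// counts): shifts stand in for the byte rotations, summing the per-byte
// popcounts into the low byte of the lane before masking.
static inline unsigned sum_byte_counts_scalar(unsigned p)
{
    p += p >> 8;     // analogue of v_rotate_right<1> within one lane
    p += p >> 16;    // analogue of v_rotate_right<2>
    return p & 0xff; // keep only the accumulated low byte
}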
inline v_uint64x2 v_popcount(const v_uint64x2& a)
{
@@ -3459,21 +3459,6 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
inline void v_cleanup() {}
#include "intrin_math.hpp"
inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond

View File

@@ -261,8 +261,6 @@ OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_float64x2, double)
#define OPENCV_HAL_IMPL_VSX_INITVEC(_Tpvec, _Tp, suffix, cast) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(vec_splats((_Tp)0)); } \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(vec_splats((_Tp)v));} \
template <> inline _Tpvec v_setzero_() { return v_setzero_##suffix(); } \
template <> inline _Tpvec v_setall_(_Tp v) { return v_setall_##suffix(v); } \
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0 &a) \
{ return _Tpvec((cast)a.val); }
@@ -515,44 +513,48 @@ inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d)
/* Element-wise binary and unary operations */
/** Arithmetics **/
#define OPENCV_HAL_IMPL_VSX_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); } \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ a.val = intrin(a.val, b.val); return a; }
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint8x16, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint8x16, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int8x16, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int8x16, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint16x8, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int16x8, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_uint32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_int32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_float32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_float32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_float32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_div, v_float32x4, vec_div)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_float64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_float64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_float64x2, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_div, v_float64x2, vec_div)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint8x16, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint8x16, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int8x16, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int8x16, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint16x8, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int16x8, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float32x4, vec_div)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float64x2, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float64x2, vec_div)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int64x2, vec_sub)
// saturating multiply
#define OPENCV_HAL_IMPL_VSX_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpwvec c, d; \
v_mul_expand(a, b, c, d); \
return v_pack(c, d); \
}
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }
OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int8x16, v_int16x8)
OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint8x16, v_uint16x8)
@@ -594,9 +596,9 @@ OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_mul_wrap, vec_mul)
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \
inline _Tpvec v_shl(const _Tpvec& a, int imm) \
inline _Tpvec operator << (const _Tpvec& a, int imm) \
{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
inline _Tpvec v_shr(const _Tpvec& a, int imm) \
inline _Tpvec operator >> (const _Tpvec& a, int imm) \
{ return _Tpvec(shr(a.val, splfunc(imm))); } \
template<int imm> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
@@ -615,10 +617,10 @@ OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_sra, vec_udword2_sp)
/** Bitwise logic **/
#define OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec) \
OPENCV_HAL_IMPL_VSX_BIN_OP(v_and, _Tpvec, vec_and) \
OPENCV_HAL_IMPL_VSX_BIN_OP(v_or, _Tpvec, vec_or) \
OPENCV_HAL_IMPL_VSX_BIN_OP(v_xor, _Tpvec, vec_xor) \
inline _Tpvec v_not(const _Tpvec& a) \
OPENCV_HAL_IMPL_VSX_BIN_OP(&, _Tpvec, vec_and) \
OPENCV_HAL_IMPL_VSX_BIN_OP(|, _Tpvec, vec_or) \
OPENCV_HAL_IMPL_VSX_BIN_OP(^, _Tpvec, vec_xor) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ return _Tpvec(vec_not(a.val)); }
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint8x16)
@@ -648,17 +650,17 @@ OPENCV_HAL_IMPL_VSX_SELECT(v_float64x2, vec_bdword2_c)
/** Comparison **/
#define OPENCV_HAL_IMPL_VSX_INT_CMP_OP(_Tpvec) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpeq(a.val, b.val)); } \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpne(a.val, b.val)); } \
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmplt(a.val, b.val)); } \
inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpgt(a.val, b.val)); } \
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmple(a.val, b.val)); } \
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpge(a.val, b.val)); }
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint8x16)
@@ -1058,7 +1060,7 @@ OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4)
OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2)
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{ return v_add(v_mul(a, b), c); }
{ return a * b + c; }
// TODO: exp, log, sin, cos
@@ -1087,12 +1089,12 @@ inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{ return v_reinterpret_as_u32(v_sub(v_max(a, b), v_min(a, b))); }
{ return v_reinterpret_as_u32(v_max(a, b) - v_min(a, b)); }
inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
{ return v_abs(v_sub(a, b)); }
{ return v_abs(a - b); }
inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
{ return v_abs(v_sub(a, b)); }
{ return v_abs(a - b); }
/** Absolute difference for signed integers **/
inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
@@ -1440,7 +1442,7 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
return v_int64x2(vec_add(even, odd));
}
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{ return v_add(v_dotprod(a, b), c); }
{ return v_dotprod(a, b) + c; }
// 8 >> 32
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
@@ -1483,7 +1485,7 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
return v_uint64x2(vec_add(s0, s1));
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
@@ -1493,13 +1495,13 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
return v_int64x2(vec_add(vec_mergeh(c.val, d.val), vec_mergel(c.val, d.val)));
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
// 32 >> 64f
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
//////// Fast Dot Product ////////
@@ -1507,7 +1509,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, cons
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
{ return v_dotprod(a, b); }
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_add(v_int32x4(vec_msum(a.val, b.val, vec_int4_z)), c); }
{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)) + c; }
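// Scalar model of vec_msum over 16-bit lanes (hypothetical arrays a[], b[]
// and accumulator acc[]): each 32-bit lane i gathers two adjacent products,
//     dst[i] = (int)a[2*i] * b[2*i] + (int)a[2*i+1] * b[2*i+1] + acc[i];
// here it is seeded with the zero vector and c is added afterwards.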
// 32 >> 64
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_dotprod(a, b); }
@@ -1518,7 +1520,7 @@ inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
{ return v_dotprod_expand(a, b); }
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_add(v_uint32x4(vec_msum(a.val, b.val, vec_uint4_z)), c); }
{ return v_uint32x4(vec_msum(a.val, b.val, vec_uint4_z)) + c; }
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{
@@ -1529,7 +1531,7 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, vec_int4_z)));
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
// 16 >> 64
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
@@ -1542,10 +1544,10 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
v_int32x4 prod = v_dotprod(a, b);
v_int64x2 c, d;
v_expand(prod, c, d);
return v_add(c, d);
return c + d;
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
// 32 >> 64f
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
@@ -1596,19 +1598,6 @@ template<int i, typename Tvec>
inline Tvec v_broadcast_element(const Tvec& v)
{ return Tvec(vec_splat(v.val, i)); }
#include "intrin_math.hpp"
inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

File diff suppressed because it is too large