fix: make zerocolor edge

lennlouisgeek
2026-02-04 03:18:20 +08:00
parent ca3545b8b0
commit 001685b633
157 changed files with 31832 additions and 32681 deletions

View File

@@ -64,7 +64,7 @@
namespace {
inline unsigned int trailingZeros32(unsigned int value) {
#if defined(_MSC_VER)
#if (_MSC_VER < 1700) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC)
#if (_MSC_VER < 1700) || defined(_M_ARM) || defined(_M_ARM64)
unsigned long index = 0;
_BitScanForward(&index, value);
return (unsigned int)index;
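// Illustration (not part of the diff): a hedged, portable sketch of what
// trailingZeros32 computes for value != 0, namely the index of the lowest
// set bit, which the MSVC branch above obtains from _BitScanForward.
static unsigned trailing_zeros32_portable(unsigned value) {
    unsigned n = 0;
    while ((value & 1u) == 0u) { value >>= 1; ++n; } // shift until the low bit is set
    return n;
}
// trailing_zeros32_portable(0x28) == 3 (0x28 is binary 101000).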
@@ -191,19 +191,6 @@ CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(double, int64, uint64, double, void, double)
#endif // CV_CPU_OPTIMIZATION_HAL_NAMESPACE
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
template <typename _VecTp> inline _VecTp v_setzero_();
template <typename _VecTp> inline _VecTp v_setall_(uchar);
template <typename _VecTp> inline _VecTp v_setall_(schar);
template <typename _VecTp> inline _VecTp v_setall_(ushort);
template <typename _VecTp> inline _VecTp v_setall_(short);
template <typename _VecTp> inline _VecTp v_setall_(unsigned);
template <typename _VecTp> inline _VecTp v_setall_(int);
template <typename _VecTp> inline _VecTp v_setall_(uint64);
template <typename _VecTp> inline _VecTp v_setall_(int64);
template <typename _VecTp> inline _VecTp v_setall_(float);
template <typename _VecTp> inline _VecTp v_setall_(double);
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
#endif
@@ -249,7 +236,11 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
#include "opencv2/core/hal/intrin_wasm.hpp"
#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP)
#if defined(CV_RVV_SCALABLE)
#include "opencv2/core/hal/intrin_rvv_scalable.hpp"
#else
#include "opencv2/core/hal/intrin_rvv.hpp"
#endif
#elif CV_LSX && !defined(CV_FORCE_SIMD128_CPP)
@@ -730,15 +721,314 @@ namespace CV__SIMD_NAMESPACE {
/** @brief SIMD processing state cleanup call */
inline void vx_cleanup() { VXPREFIX(_cleanup)(); }
#if !CV_SIMD_SCALABLE
#if !CV_SIMD_SCALABLE && !(CV_NEON && !defined(CV_FORCE_SIMD128_CPP))
// Compatibility layer
#if !(CV_NEON && !defined(CV_FORCE_SIMD128_CPP))
template<typename T> struct VTraits {
static inline int vlanes() { return T::nlanes; }
enum { nlanes = T::nlanes, max_nlanes = T::nlanes };
using lane_type = typename T::lane_type;
};
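// Illustration (not part of the diff): a minimal stand-in showing what the
// VTraits shim above does: it forwards a fixed-width type's compile-time
// lane count through the same query that scalable backends answer at run
// time. FakeVec/FakeVTraits are hypothetical names, not OpenCV API.
struct FakeVec { enum { nlanes = 4 }; using lane_type = float; };
template <typename T> struct FakeVTraits {
    static inline int vlanes() { return T::nlanes; }     // run-time query
    enum { nlanes = T::nlanes, max_nlanes = T::nlanes }; // compile-time constants
    using lane_type = typename T::lane_type;
};
// FakeVTraits<FakeVec>::vlanes() == 4; lane_type is float.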
#define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \
inline _Tpvec v_add(const _Tpvec& a, const _Tpvec& b) \
{ \
return a + b; \
} \
inline _Tpvec v_sub(const _Tpvec& a, const _Tpvec& b) \
{ \
return a - b; \
} \
template<typename... Args> \
inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
return v_add(f1 + f2, vf...); \
}
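// Illustration (not part of the diff): the variadic v_add generated above
// folds its arguments left to right. A hedged scalar sketch of the same
// recursion, with v_add_sketch standing in for the wrapper:
inline int v_add_sketch(int a, int b) { return a + b; }
template <typename... Args>
inline int v_add_sketch(int f1, int f2, const Args&... vf) {
    return v_add_sketch(f1 + f2, vf...); // same shape as the macro body
}
// v_add_sketch(1, 2, 3, 4) == 10: ((1 + 2) + 3) + 4.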
#define OPENCV_HAL_WRAP_SHIFT_OP(_Tpvec) \
inline _Tpvec v_shr(const _Tpvec& a, int n) \
{ \
return a >> n; \
} \
inline _Tpvec v_shl(const _Tpvec& a, int n) \
{ \
return a << n; \
}
OPENCV_HAL_WRAP_SHIFT_OP(v_uint16)
OPENCV_HAL_WRAP_SHIFT_OP(v_uint32)
OPENCV_HAL_WRAP_SHIFT_OP(v_uint64)
OPENCV_HAL_WRAP_SHIFT_OP(v_int16)
OPENCV_HAL_WRAP_SHIFT_OP(v_int32)
OPENCV_HAL_WRAP_SHIFT_OP(v_int64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
// when we use CV_SIMD128 with 256/512 bit SIMD (e.g. AVX2 or AVX512)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x2)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4)
OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x8)
OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x4)
OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x2)
OPENCV_HAL_WRAP_SHIFT_OP(v_int16x8)
OPENCV_HAL_WRAP_SHIFT_OP(v_int32x4)
OPENCV_HAL_WRAP_SHIFT_OP(v_int64x2)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
// when we use CV_SIMD256 with 512 bit SIMD (e.g. AVX512)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8)
OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x16)
OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x8)
OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x4)
OPENCV_HAL_WRAP_SHIFT_OP(v_int16x16)
OPENCV_HAL_WRAP_SHIFT_OP(v_int32x8)
OPENCV_HAL_WRAP_SHIFT_OP(v_int64x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_BIN_OP_LOGIC(_Tpvec) \
inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) \
{ \
return a & b; \
} \
inline _Tpvec v_or(const _Tpvec& a, const _Tpvec& b) \
{ \
return a | b; \
} \
inline _Tpvec v_xor(const _Tpvec& a, const _Tpvec& b) \
{ \
return a ^ b; \
}
#define OPENCV_HAL_WRAP_NOT_OP(_Tpvec) \
inline _Tpvec v_not(const _Tpvec& a) \
{ \
return ~a; \
}
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32)
OPENCV_HAL_WRAP_NOT_OP(v_uint8)
OPENCV_HAL_WRAP_NOT_OP(v_uint16)
OPENCV_HAL_WRAP_NOT_OP(v_uint32)
OPENCV_HAL_WRAP_NOT_OP(v_uint64)
OPENCV_HAL_WRAP_NOT_OP(v_int8)
OPENCV_HAL_WRAP_NOT_OP(v_int16)
OPENCV_HAL_WRAP_NOT_OP(v_int32)
OPENCV_HAL_WRAP_NOT_OP(v_int64)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x2)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x2)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x4)
OPENCV_HAL_WRAP_NOT_OP(v_uint8x16)
OPENCV_HAL_WRAP_NOT_OP(v_uint16x8)
OPENCV_HAL_WRAP_NOT_OP(v_uint32x4)
OPENCV_HAL_WRAP_NOT_OP(v_uint64x2)
OPENCV_HAL_WRAP_NOT_OP(v_int8x16)
OPENCV_HAL_WRAP_NOT_OP(v_int16x8)
OPENCV_HAL_WRAP_NOT_OP(v_int32x4)
OPENCV_HAL_WRAP_NOT_OP(v_int64x2)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x4)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x4)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x8)
OPENCV_HAL_WRAP_NOT_OP(v_uint8x32)
OPENCV_HAL_WRAP_NOT_OP(v_uint16x16)
OPENCV_HAL_WRAP_NOT_OP(v_uint32x8)
OPENCV_HAL_WRAP_NOT_OP(v_uint64x4)
OPENCV_HAL_WRAP_NOT_OP(v_int8x32)
OPENCV_HAL_WRAP_NOT_OP(v_int16x16)
OPENCV_HAL_WRAP_NOT_OP(v_int32x8)
OPENCV_HAL_WRAP_NOT_OP(v_int64x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
{ \
return a * b; \
} \
template<typename... Args> \
inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
return v_mul(f1 * f2, vf...); \
}
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_BIN_OP_DIV(_Tpvec) \
inline _Tpvec v_div(const _Tpvec& a, const _Tpvec& b) \
{ \
return a / b; \
}
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_CMP_OP(_Tpvec, intrin, op) \
inline _Tpvec v_##intrin(const _Tpvec& a, const _Tpvec& b) \
{ \
return a op b; \
}
#define OPENCV_HAL_WRAP_EQ_OP(_Tpvec) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
{ \
return a == b; \
} \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ \
return a != b; \
}
#define OPENCV_HAL_WRAP_CMP(_Tpvec) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, eq, ==) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ne, !=) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, lt, <) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, gt, >) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, le, <=) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ge, >=)
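// Illustration (not part of the diff): each wrapped comparison returns a
// per-lane mask, all bits set for "true" and all clear for "false"; the
// 64-bit types below get only v_eq/v_ne (OPENCV_HAL_WRAP_EQ_OP), since
// ordered 64-bit integer compares are missing on some wrapped baselines.
// A hedged scalar sketch of the mask convention:
inline int lane_lt_sketch(int a, int b) {
    return a < b ? -1 : 0; // -1 == all bits set, composable with & | ^
}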
OPENCV_HAL_WRAP_CMP(v_uint8)
OPENCV_HAL_WRAP_CMP(v_uint16)
OPENCV_HAL_WRAP_CMP(v_uint32)
OPENCV_HAL_WRAP_EQ_OP(v_uint64)
OPENCV_HAL_WRAP_CMP(v_int8)
OPENCV_HAL_WRAP_CMP(v_int16)
OPENCV_HAL_WRAP_CMP(v_int32)
OPENCV_HAL_WRAP_EQ_OP(v_int64)
OPENCV_HAL_WRAP_CMP(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_CMP(v_uint8x16)
OPENCV_HAL_WRAP_CMP(v_uint16x8)
OPENCV_HAL_WRAP_CMP(v_uint32x4)
OPENCV_HAL_WRAP_EQ_OP(v_uint64x2)
OPENCV_HAL_WRAP_CMP(v_int8x16)
OPENCV_HAL_WRAP_CMP(v_int16x8)
OPENCV_HAL_WRAP_CMP(v_int32x4)
OPENCV_HAL_WRAP_EQ_OP(v_int64x2)
OPENCV_HAL_WRAP_CMP(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_CMP(v_uint8x32)
OPENCV_HAL_WRAP_CMP(v_uint16x16)
OPENCV_HAL_WRAP_CMP(v_uint32x8)
OPENCV_HAL_WRAP_EQ_OP(v_uint64x4)
OPENCV_HAL_WRAP_CMP(v_int8x32)
OPENCV_HAL_WRAP_CMP(v_int16x16)
OPENCV_HAL_WRAP_CMP(v_int32x8)
OPENCV_HAL_WRAP_EQ_OP(v_int64x4)
OPENCV_HAL_WRAP_CMP(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64x4)
#endif
#endif
//////////// get0 ////////////
#define OPENCV_HAL_WRAP_GRT0(_Tpvec) \
inline typename VTraits<_Tpvec>::lane_type v_get0(const _Tpvec& v) \
@@ -786,96 +1076,6 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_GRT0(v_float64x4)
#endif
#endif
#endif
#define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \
template<typename... Args> \
inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const _Tpvec& f3, const Args&... vf) { \
return v_add(v_add(f1, f2), f3, vf...); \
}
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
// when we use CV_SIMD128 with 256/512 bit SIMD (e.g. AVX2 or AVX512)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x2)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
// when we use CV_SIMD256 with 512 bit SIMD (e.g. AVX512)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
template<typename... Args> \
inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const _Tpvec& f3, const Args&... vf) { \
return v_mul(v_mul(f1, f2), f3, vf...); \
}
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \
inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \
@@ -946,6 +1146,74 @@ namespace CV__SIMD_NAMESPACE {
#endif //!CV_SIMD_SCALABLE
#if (CV_NEON /* || CV_others */) && !defined(CV_FORCE_SIMD128_CPP)
// Compatibility layer for backends that have already removed the operator overloads.
#define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \
template<typename... Args> \
inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
return v_add(v_add(f1, f2), vf...); \
}
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
#endif
#define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
template<typename... Args> \
inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
return v_mul(v_mul(f1, f2), vf...); \
}
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
#endif
#define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \
inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \
{ \
return v_extract_n<VTraits<_Tpvec>::nlanes-1>(v); \
}
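// Illustration (not part of the diff): v_extract_highest reads the last
// lane. A hedged array sketch of the same index computation:
template <typename T, int N>
inline T extract_highest_sketch(const T (&v)[N]) { return v[N - 1]; }
// int lanes[4] = {1, 2, 3, 4}; extract_highest_sketch(lanes) == 4.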
OPENCV_HAL_WRAP_EXTRACT(v_uint8)
OPENCV_HAL_WRAP_EXTRACT(v_int8)
OPENCV_HAL_WRAP_EXTRACT(v_uint16)
OPENCV_HAL_WRAP_EXTRACT(v_int16)
OPENCV_HAL_WRAP_EXTRACT(v_uint32)
OPENCV_HAL_WRAP_EXTRACT(v_int32)
OPENCV_HAL_WRAP_EXTRACT(v_uint64)
OPENCV_HAL_WRAP_EXTRACT(v_int64)
OPENCV_HAL_WRAP_EXTRACT(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_EXTRACT(v_float64)
#endif
#define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \
inline _Tpvec v_broadcast_highest(const _Tpvec& v) \
{ \
return v_broadcast_element<VTraits<_Tpvec>::nlanes-1>(v); \
}
OPENCV_HAL_WRAP_BROADCAST(v_uint32)
OPENCV_HAL_WRAP_BROADCAST(v_int32)
OPENCV_HAL_WRAP_BROADCAST(v_float32)
#endif //CV_NEON
//! @cond IGNORED
// backward compatibility

View File

@@ -447,10 +447,6 @@ OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float64x4, double, pd, __m128d)
{ return _Tpvec(_mm256_setzero_si256()); } \
inline _Tpvec v256_setall_##suffix(_Tp v) \
{ return _Tpvec(_mm256_set1_##ssuffix((ctype_s)v)); } \
template <> inline _Tpvec v_setzero_() \
{ return v256_setzero_##suffix(); } \
template <> inline _Tpvec v_setall_(_Tp v) \
{ return v256_setall_##suffix(v); } \
OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, OPENCV_HAL_NOP) \
OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, OPENCV_HAL_NOP) \
OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \
@@ -476,10 +472,6 @@ OPENCV_HAL_IMPL_AVX_INIT(v_int64x4, int64, s64, epi64x, int64)
{ return _Tpvec(_mm256_setzero_##zsuffix()); } \
inline _Tpvec v256_setall_##suffix(_Tp v) \
{ return _Tpvec(_mm256_set1_##zsuffix(v)); } \
template <> inline _Tpvec v_setzero_() \
{ return v256_setzero_##suffix(); } \
template <> inline _Tpvec v_setall_(_Tp v) \
{ return v256_setall_##suffix(v); } \
OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, cast) \
OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, cast) \
OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, cast) \
@@ -681,51 +673,53 @@ OPENCV_HAL_IMPL_AVX_ZIP(v_float64x4)
/** Arithmetics **/
#define OPENCV_HAL_IMPL_AVX_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); } \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ a.val = intrin(a.val, b.val); return a; }
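// Illustration (not part of the diff): a hedged sketch of what one
// instantiation of the macro above expands to, with bin_op = '+' and
// intrin = _mm256_adds_epu8:
//   inline v_uint8x32 operator+(const v_uint8x32& a, const v_uint8x32& b)
//   { return v_uint8x32(_mm256_adds_epu8(a.val, b.val)); }
//   inline v_uint8x32& operator+=(v_uint8x32& a, const v_uint8x32& b)
//   { a.val = _mm256_adds_epu8(a.val, b.val); return a; }
// i.e. the operator form wraps the saturating AVX2 intrinsic directly.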
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint8x32, _mm256_adds_epu8)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint8x32, _mm256_subs_epu8)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int8x32, _mm256_adds_epi8)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int8x32, _mm256_subs_epi8)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint16x16, _mm256_adds_epu16)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint16x16, _mm256_subs_epu16)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int16x16, _mm256_adds_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int16x16, _mm256_subs_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint32x8, _mm256_add_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint32x8, _mm256_sub_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_uint32x8, _mm256_mullo_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int32x8, _mm256_add_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int32x8, _mm256_sub_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_int32x8, _mm256_mullo_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint64x4, _mm256_add_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint64x4, _mm256_sub_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int64x4, _mm256_add_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int64x4, _mm256_sub_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint8x32, _mm256_adds_epu8)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint8x32, _mm256_subs_epu8)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32, _mm256_adds_epi8)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32, _mm256_subs_epi8)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16, _mm256_adds_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16, _mm256_subs_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8, _mm256_add_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8, _mm256_sub_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8, _mm256_mullo_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int32x8, _mm256_add_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int32x8, _mm256_sub_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int32x8, _mm256_mullo_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint64x4, _mm256_add_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint64x4, _mm256_sub_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int64x4, _mm256_add_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int64x4, _mm256_sub_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_float32x8, _mm256_add_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_float32x8, _mm256_sub_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_float32x8, _mm256_mul_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_div, v_float32x8, _mm256_div_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_float64x4, _mm256_add_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_float64x4, _mm256_sub_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_float64x4, _mm256_mul_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(v_div, v_float64x4, _mm256_div_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float32x8, _mm256_add_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float32x8, _mm256_sub_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float32x8, _mm256_mul_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float32x8, _mm256_div_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float64x4, _mm256_add_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd)
// saturating multiply 8-bit, 16-bit
inline v_uint8x32 v_mul(const v_uint8x32& a, const v_uint8x32& b)
inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
{
v_uint16x16 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
inline v_int8x32 v_mul(const v_int8x32& a, const v_int8x32& b)
inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
{
v_int16x16 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
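// Illustration (not part of the diff): 8-bit lanes have no native AVX2
// multiply, so the code above widens, multiplies, and saturating-packs.
// A hedged scalar sketch of the same scheme for one unsigned lane:
#include <cstdint>
inline uint8_t sat_mul_u8_sketch(uint8_t a, uint8_t b) {
    uint16_t wide = uint16_t(a) * uint16_t(b);        // the v_mul_expand step
    return wide > 255 ? uint8_t(255) : uint8_t(wide); // the v_pack saturation
}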
inline v_uint16x16 v_mul(const v_uint16x16& a, const v_uint16x16& b)
inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
{
__m256i pl = _mm256_mullo_epi16(a.val, b.val);
__m256i ph = _mm256_mulhi_epu16(a.val, b.val);
@@ -733,7 +727,7 @@ inline v_uint16x16 v_mul(const v_uint16x16& a, const v_uint16x16& b)
__m256i p1 = _mm256_unpackhi_epi16(pl, ph);
return v_uint16x16(_v256_packs_epu32(p0, p1));
}
inline v_int16x16 v_mul(const v_int16x16& a, const v_int16x16& b)
inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
{
__m256i pl = _mm256_mullo_epi16(a.val, b.val);
__m256i ph = _mm256_mulhi_epi16(a.val, b.val);
@@ -741,6 +735,14 @@ inline v_int16x16 v_mul(const v_int16x16& a, const v_int16x16& b)
__m256i p1 = _mm256_unpackhi_epi16(pl, ph);
return v_int16x16(_mm256_packs_epi32(p0, p1));
}
inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
{ a = a * b; return a; }
inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
{ a = a * b; return a; }
inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
{ a = a * b; return a; }
inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
{ a = a * b; return a; }
/** Non-saturating arithmetics **/
#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
@@ -831,13 +833,13 @@ inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
{ return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \
inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
{ return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \
inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
{ return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \
inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
{ return _Tpsvec(srai(a.val, imm)); } \
template<int imm> \
inline _Tpuvec v_shl(const _Tpuvec& a) \
@@ -865,11 +867,11 @@ OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint64x4, v_int64x4, epi64, _mm256_srai_epi64xx
/** Bitwise logic **/
#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \
OPENCV_HAL_IMPL_AVX_BIN_OP(v_and, _Tpvec, _mm256_and_##suffix) \
OPENCV_HAL_IMPL_AVX_BIN_OP(v_or, _Tpvec, _mm256_or_##suffix) \
OPENCV_HAL_IMPL_AVX_BIN_OP(v_xor, _Tpvec, _mm256_xor_##suffix) \
inline _Tpvec v_not(const _Tpvec& a) \
#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \
OPENCV_HAL_IMPL_AVX_BIN_OP(&, _Tpvec, _mm256_and_##suffix) \
OPENCV_HAL_IMPL_AVX_BIN_OP(|, _Tpvec, _mm256_or_##suffix) \
OPENCV_HAL_IMPL_AVX_BIN_OP(^, _Tpvec, _mm256_xor_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ return _Tpvec(_mm256_xor_##suffix(a.val, not_const)); }
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint8x32, si256, _mm256_set1_epi32(-1))
@@ -898,29 +900,29 @@ OPENCV_HAL_IMPL_AVX_SELECT(v_float32x8, ps)
OPENCV_HAL_IMPL_AVX_SELECT(v_float64x4, pd)
/** Comparison **/
#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_eq(a, b)); } \
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
{ return v_gt(b, a); } \
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_lt(a, b)); } \
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
{ return v_ge(b, a); }
#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return b > a; } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a < b); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return b >= a; }
#define OPENCV_HAL_IMPL_AVX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, sbit) \
inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
{ return _Tpuvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
{ \
__m256i smask = _mm256_set1_##suffix(sbit); \
return _Tpuvec(_mm256_cmpgt_##suffix( \
_mm256_xor_si256(a.val, smask), \
_mm256_xor_si256(b.val, smask))); \
} \
inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(_mm256_cmpgt_##suffix(a.val, b.val)); } \
OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpuvec) \
OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpsvec)
@@ -930,25 +932,25 @@ OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint16x16, v_int16x16, epi16, (short)-32768)
OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint32x8, v_int32x8, epi32, (int)0x80000000)
#define OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(_Tpvec) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm256_cmpeq_epi64(a.val, b.val)); } \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_eq(a, b)); }
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); }
OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_uint64x4)
OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4)
#define OPENCV_HAL_IMPL_AVX_CMP_FLT(bin_op, imm8, _Tpvec, suffix) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm256_cmp_##suffix(a.val, b.val, imm8)); }
#define OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(_Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(v_eq, _CMP_EQ_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(v_ne, _CMP_NEQ_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(v_lt, _CMP_LT_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(v_gt, _CMP_GT_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(v_le, _CMP_LE_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(v_ge, _CMP_GE_OQ, _Tpvec, suffix)
OPENCV_HAL_IMPL_AVX_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, suffix) \
OPENCV_HAL_IMPL_AVX_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, suffix)
OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps)
OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd)
@@ -1214,9 +1216,9 @@ inline unsigned v_reduce_sum(const v_uint32x8& a)
{ return v_reduce_sum(v_reinterpret_as_s32(a)); }
inline int v_reduce_sum(const v_int16x16& a)
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline unsigned v_reduce_sum(const v_uint16x16& a)
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline float v_reduce_sum(const v_float32x8& a)
{
@@ -1271,27 +1273,27 @@ inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
{
v_uint32x8 l, h;
v_expand(v_add_wrap(v_sub(a, b), v_sub(b, a)), l, h);
return v_reduce_sum(v_add(l, h));
v_expand(v_add_wrap(a - b, b - a), l, h);
return v_reduce_sum(l + h);
}
inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
{
v_uint32x8 l, h;
v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h);
return v_reduce_sum(v_add(l, h));
return v_reduce_sum(l + h);
}
inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
{
return v_reduce_sum(v_sub(v_max(a, b), v_min(a, b)));
return v_reduce_sum(v_max(a, b) - v_min(a, b));
}
inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
{
v_int32x8 m = v_lt(a, b);
return v_reduce_sum(v_reinterpret_as_u32(v_sub(v_xor(v_sub(a, b), m), m)));
v_int32x8 m = a < b;
return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m));
}
inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
{
return v_reduce_sum(v_and(v_sub(a, b), v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)))));
return v_reduce_sum((a - b) & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
}
/** Popcount **/
@@ -1306,15 +1308,15 @@ inline v_uint8x32 v_popcount(const v_uint8x32& a)
inline v_uint16x16 v_popcount(const v_uint16x16& a)
{
v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
p = v_add(p, v_rotate_right<1>(p));
return v_and(v_reinterpret_as_u16(p), v256_setall_u16(0x00ff));
p += v_rotate_right<1>(p);
return v_reinterpret_as_u16(p) & v256_setall_u16(0x00ff);
}
inline v_uint32x8 v_popcount(const v_uint32x8& a)
{
v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
p = v_add(p, v_rotate_right<1>(p));
p = v_add(p, v_rotate_right<2>(p));
return v_and(v_reinterpret_as_u32(p), v256_setall_u32(0x000000ff));
p += v_rotate_right<1>(p);
p += v_rotate_right<2>(p);
return v_reinterpret_as_u32(p) & v256_setall_u32(0x000000ff);
}
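// Illustration (not part of the diff): the rotate-and-add ladder above sums
// four per-byte counts into each 32-bit lane. A hedged scalar sketch:
#include <cstdint>
inline uint32_t sum_bytes_sketch(uint32_t p) { // p holds four 8-bit popcounts
    p += p >> 8;      // like v_rotate_right<1> over bytes
    p += p >> 16;     // like v_rotate_right<2> over bytes
    return p & 0xFFu; // keep the accumulated low byte, like the final mask
}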
inline v_uint64x4 v_popcount(const v_uint64x4& a)
{
@@ -1406,9 +1408,9 @@ OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_int16x16)
inline _Tpvec v_sqrt(const _Tpvec& x) \
{ return _Tpvec(_mm256_sqrt_##suffix(x.val)); } \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_fma(a, a, v_mul(b, b)); } \
{ return v_fma(a, a, b * b); } \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_sqrt(v_fma(a, a, v_mul(b, b))); }
{ return v_sqrt(v_fma(a, a, b*b)); }
OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps)
OPENCV_HAL_IMPL_AVX_MULADD(v_float64x4, pd)
@@ -1417,7 +1419,7 @@ OPENCV_HAL_IMPL_AVX_MISC(v_float64x4, pd)
inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
{
return v_add(v_mul(a, b), c);
return a * b + c;
}
inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
@@ -1427,16 +1429,16 @@ inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x
inline v_float32x8 v_invsqrt(const v_float32x8& x)
{
v_float32x8 half = v_mul(x, v256_setall_f32(0.5));
v_float32x8 half = x * v256_setall_f32(0.5);
v_float32x8 t = v_float32x8(_mm256_rsqrt_ps(x.val));
// todo: _mm256_fnmsub_ps
t = v_mul(t, v_sub(v256_setall_f32(1.5), v_mul(v_mul(t, t), half)));
t *= v256_setall_f32(1.5) - ((t * t) * half);
return t;
}
inline v_float64x4 v_invsqrt(const v_float64x4& x)
{
return v_div(v256_setall_f64(1.), v_sqrt(x));
return v256_setall_f64(1.) / v_sqrt(x);
}
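// Illustration (not part of the diff): the float path above refines the
// hardware rsqrt estimate with one Newton-Raphson step,
// t' = t * (1.5 - 0.5 * x * t * t). A hedged scalar sketch:
#include <cmath>
inline float invsqrt_sketch(float x) {
    float t = 1.0f / std::sqrt(x);          // stand-in for the _mm256_rsqrt_ps estimate
    return t * (1.5f - (x * 0.5f) * t * t); // one Newton-Raphson iteration
}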
/** Absolute values **/
@@ -1449,23 +1451,23 @@ OPENCV_HAL_IMPL_AVX_ABS(int16x16, epi16)
OPENCV_HAL_IMPL_AVX_ABS(int32x8, epi32)
inline v_float32x8 v_abs(const v_float32x8& x)
{ return v_and(x, v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)))); }
{ return x & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); }
inline v_float64x4 v_abs(const v_float64x4& x)
{ return v_and(x, v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1)))); }
{ return x & v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1))); }
/** Absolute difference **/
inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b)
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
{ return v_add_wrap(a - b, b - a); }
inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b)
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
{ return v_add_wrap(a - b, b - a); }
inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b)
{ return v_sub(v_max(a, b), v_min(a, b)); }
{ return v_max(a, b) - v_min(a, b); }
inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b)
{
v_int8x32 d = v_sub_wrap(a, b);
v_int8x32 m = v_lt(a, b);
return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m));
v_int8x32 m = a < b;
return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
}
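// Illustration (not part of the diff): (d ^ m) - m is a branch-free
// conditional negate: m is all ones exactly when a < b, and
// (d ^ -1) - (-1) equals -d. A hedged scalar sketch (wraparound at the
// extremes is ignored here; the vector code uses v_sub_wrap for that):
#include <cstdint>
inline uint32_t absdiff_s32_sketch(int32_t a, int32_t b) {
    int32_t d = a - b;
    int32_t m = -(int32_t)(a < b);  // 0 or -1, like the compare mask
    return (uint32_t)((d ^ m) - m); // negates d when m == -1
}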
inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
@@ -1473,26 +1475,26 @@ inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
{
v_int32x8 d = v_sub(a, b);
v_int32x8 m = v_lt(a, b);
return v_reinterpret_as_u32(v_sub(v_xor(d, m), m));
v_int32x8 d = a - b;
v_int32x8 m = a < b;
return v_reinterpret_as_u32((d ^ m) - m);
}
inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
{ return v_abs(v_sub(a, b)); }
{ return v_abs(a - b); }
inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
{ return v_abs(v_sub(a, b)); }
{ return v_abs(a - b); }
/** Saturating absolute difference **/
inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b)
{
v_int8x32 d = v_sub(a, b);
v_int8x32 m = v_lt(a, b);
return v_sub(v_xor(d, m), m);
v_int8x32 d = a - b;
v_int8x32 m = a < b;
return (d ^ m) - m;
}
inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b)
{ return v_sub(v_max(a, b), v_min(a, b)); }
{ return v_max(a, b) - v_min(a, b); }
////////// Conversions /////////
@@ -1787,7 +1789,7 @@ inline v_float32x8 v_pack_triplets(const v_float32x8& vec)
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
{ return v_int32x8(_mm256_madd_epi16(a.val, b.val)); }
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
{ return v_add(v_dotprod(a, b), c); }
{ return v_dotprod(a, b) + c; }
// 32 >> 64
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
@@ -1797,7 +1799,7 @@ inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
return v_int64x4(_mm256_add_epi64(even, odd));
}
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
{ return v_add(v_dotprod(a, b), c); }
{ return v_dotprod(a, b) + c; }
// 8 >> 32
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
@@ -1814,7 +1816,7 @@ inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
return v_uint32x8(_mm256_add_epi32(prod0, prod1));
}
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
{
@@ -1829,7 +1831,7 @@ inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
return v_int32x8(_mm256_add_epi32(prod0, prod1));
}
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
// 16 >> 64
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
@@ -1853,7 +1855,7 @@ inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
));
}
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
{
@@ -1869,13 +1871,13 @@ inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
));
}
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
// 32 >> 64f
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
//////// Fast Dot Product ////////
@@ -1921,7 +1923,7 @@ inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16&
return v_uint64x4(_mm256_add_epi64(p15_, p9d_));
}
inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
{
@@ -1932,7 +1934,7 @@ inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
return v_int64x4(_mm256_add_epi64(lo, hi));
}
inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
// 32 >> 64f
inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b)
@@ -1951,7 +1953,7 @@ inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0,
v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
v_float32x8 v37 = OPENCV_HAL_AVX_SPLAT2_PS(v, 3);
return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v_mul(v37, m3))));
return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
}
inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0,
@@ -2056,43 +2058,43 @@ v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b)
{
// we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
return v_pack_u(v_reinterpret_as_s16(v_shr(v_add(a, delta), n)),
v_reinterpret_as_s16(v_shr(v_add(b, delta), n)));
return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
v_reinterpret_as_s16((b + delta) >> n));
}
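// Illustration (not part of the diff): delta implements round-to-nearest
// before the shift, (a + (1 << (n-1))) >> n. A hedged scalar sketch,
// assuming n > 0 as the comment above states:
#include <cstdint>
template <int n>
inline uint16_t rshr_round_sketch(uint16_t a) {
    return uint16_t((a + (1u << (n - 1))) >> n); // add half the divisor, then truncate
}
// rshr_round_sketch<4>(24) == 2 (24/16 = 1.5 rounds up); rshr_round_sketch<4>(23) == 1.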
template<int n> inline
void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
{
v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
v_pack_u_store(ptr, v_reinterpret_as_s16(v_shr(v_add(a, delta), n)));
v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
}
template<int n> inline
v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b)
{
v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
return v_pack_u((a + delta) >> n, (b + delta) >> n);
}
template<int n> inline
void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
{
v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
v_pack_u_store(ptr, v_shr(v_add(a, delta), n));
v_pack_u_store(ptr, (a + delta) >> n);
}
template<int n> inline
v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b)
{
v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
return v_pack((a + delta) >> n, (b + delta) >> n);
}
template<int n> inline
void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
{
v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
v_pack_store(ptr, v_shr(v_add(a, delta), n));
v_pack_store(ptr, (a + delta) >> n);
}
// 32
@@ -2125,43 +2127,43 @@ v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b)
{
// we assume that n > 0, and so the shifted 32-bit values can be treated as signed numbers.
v_uint32x8 delta = v256_setall_u32(1 << (n-1));
return v_pack_u(v_reinterpret_as_s32(v_shr(v_add(a, delta), n)),
v_reinterpret_as_s32(v_shr(v_add(b, delta), n)));
return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
v_reinterpret_as_s32((b + delta) >> n));
}
template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
{
v_uint32x8 delta = v256_setall_u32(1 << (n-1));
v_pack_u_store(ptr, v_reinterpret_as_s32(v_shr(v_add(a, delta), n)));
v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
}
template<int n> inline
v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b)
{
v_int32x8 delta = v256_setall_s32(1 << (n-1));
return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
return v_pack_u((a + delta) >> n, (b + delta) >> n);
}
template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
{
v_int32x8 delta = v256_setall_s32(1 << (n-1));
v_pack_u_store(ptr, v_shr(v_add(a, delta), n));
v_pack_u_store(ptr, (a + delta) >> n);
}
template<int n> inline
v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b)
{
v_int32x8 delta = v256_setall_s32(1 << (n-1));
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
return v_pack((a + delta) >> n, (b + delta) >> n);
}
template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x8& a)
{
v_int32x8 delta = v256_setall_s32(1 << (n-1));
v_pack_store(ptr, v_shr(v_add(a, delta), n));
v_pack_store(ptr, (a + delta) >> n);
}
// 64
@@ -2190,28 +2192,28 @@ template<int n> inline
v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)
{
v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
return v_pack((a + delta) >> n, (b + delta) >> n);
}
template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
{
v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
v_pack_store(ptr, v_shr(v_add(a, delta), n));
v_pack_store(ptr, (a + delta) >> n);
}
template<int n> inline
v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
{
v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
return v_pack((a + delta) >> n, (b + delta) >> n);
}
template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x4& a)
{
v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
v_pack_store(ptr, v_shr(v_add(a, delta), n));
v_pack_store(ptr, (a + delta) >> n);
}
// pack boolean
@@ -3166,20 +3168,6 @@ inline void v_pack_store(hfloat* ptr, const v_float32x8& a)
inline void v256_cleanup() { _mm256_zeroall(); }
#include "intrin_math.hpp"
inline v_float32x8 v_exp(const v_float32x8& x) { return v_exp_default_32f<v_float32x8, v_int32x8>(x); }
inline v_float32x8 v_log(const v_float32x8& x) { return v_log_default_32f<v_float32x8, v_int32x8>(x); }
inline void v_sincos(const v_float32x8& x, v_float32x8& s, v_float32x8& c) { v_sincos_default_32f<v_float32x8, v_int32x8>(x, s, c); }
inline v_float32x8 v_sin(const v_float32x8& x) { return v_sin_default_32f<v_float32x8, v_int32x8>(x); }
inline v_float32x8 v_cos(const v_float32x8& x) { return v_cos_default_32f<v_float32x8, v_int32x8>(x); }
inline v_float32x8 v_erf(const v_float32x8& x) { return v_erf_default_32f<v_float32x8, v_int32x8>(x); }
inline v_float64x4 v_exp(const v_float64x4& x) { return v_exp_default_64f<v_float64x4, v_int64x4>(x); }
inline v_float64x4 v_log(const v_float64x4& x) { return v_log_default_64f<v_float64x4, v_int64x4>(x); }
inline void v_sincos(const v_float64x4& x, v_float64x4& s, v_float64x4& c) { v_sincos_default_64f<v_float64x4, v_int64x4>(x, s, c); }
inline v_float64x4 v_sin(const v_float64x4& x) { return v_sin_default_64f<v_float64x4, v_int64x4>(x); }
inline v_float64x4 v_cos(const v_float64x4& x) { return v_cos_default_64f<v_float64x4, v_int64x4>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond

View File

@@ -458,10 +458,6 @@ OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float64x8, double, pd, __m256d)
{ return _Tpvec(_mm512_setzero_si512()); } \
inline _Tpvec v512_setall_##suffix(_Tp v) \
{ return _Tpvec(_mm512_set1_##ssuffix((ctype_s)v)); } \
template <> inline _Tpvec v_setzero_() \
{ return v512_setzero_##suffix(); } \
template <> inline _Tpvec v_setall_(_Tp v) \
{ return v512_setall_##suffix(v); } \
OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, OPENCV_HAL_NOP) \
OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, OPENCV_HAL_NOP) \
OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, OPENCV_HAL_NOP) \
@@ -487,10 +483,6 @@ OPENCV_HAL_IMPL_AVX512_INIT(v_int64x8, int64, s64, epi64, int64)
{ return _Tpvec(_mm512_setzero_##zsuffix()); } \
inline _Tpvec v512_setall_##suffix(_Tp v) \
{ return _Tpvec(_mm512_set1_##zsuffix(v)); } \
template <> inline _Tpvec v_setzero_() \
{ return v512_setzero_##suffix(); } \
template <> inline _Tpvec v_setall_(_Tp v) \
{ return v512_setall_##suffix(v); } \
OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, cast) \
OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, cast) \
OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, cast) \
@@ -671,56 +663,58 @@ inline v_int8x64 v_mul_wrap(const v_int8x64& a, const v_int8x64& b)
}
#define OPENCV_HAL_IMPL_AVX512_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); } \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ a.val = intrin(a.val, b.val); return a; }
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint32x16, _mm512_add_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint32x16, _mm512_sub_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int32x16, _mm512_add_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int32x16, _mm512_sub_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint64x8, _mm512_add_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint64x8, _mm512_sub_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int64x8, _mm512_add_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int64x8, _mm512_sub_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint32x16, _mm512_add_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint32x16, _mm512_sub_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int32x16, _mm512_add_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int32x16, _mm512_sub_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint64x8, _mm512_add_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint64x8, _mm512_sub_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int64x8, _mm512_add_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int64x8, _mm512_sub_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_uint32x16, _mm512_mullo_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_int32x16, _mm512_mullo_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_uint64x8, _mm512_mullo_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_int64x8, _mm512_mullo_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint32x16, _mm512_mullo_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int32x16, _mm512_mullo_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint64x8, _mm512_mullo_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int64x8, _mm512_mullo_epi64)
/** Saturating arithmetics **/
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint8x64, _mm512_adds_epu8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint8x64, _mm512_subs_epu8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int8x64, _mm512_adds_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int8x64, _mm512_subs_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint16x32, _mm512_adds_epu16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint16x32, _mm512_subs_epu16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int16x32, _mm512_adds_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int16x32, _mm512_subs_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint8x64, _mm512_adds_epu8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint8x64, _mm512_subs_epu8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int8x64, _mm512_adds_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int8x64, _mm512_subs_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint16x32, _mm512_adds_epu16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint16x32, _mm512_subs_epu16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int16x32, _mm512_adds_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int16x32, _mm512_subs_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_float32x16, _mm512_add_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_float32x16, _mm512_sub_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_float32x16, _mm512_mul_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_div, v_float32x16, _mm512_div_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_float64x8, _mm512_add_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_float64x8, _mm512_sub_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_float64x8, _mm512_mul_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_div, v_float64x8, _mm512_div_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float32x16, _mm512_add_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float32x16, _mm512_sub_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float32x16, _mm512_mul_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float32x16, _mm512_div_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float64x8, _mm512_add_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float64x8, _mm512_sub_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float64x8, _mm512_mul_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float64x8, _mm512_div_pd)
// saturating multiply
inline v_uint8x64 v_mul(const v_uint8x64& a, const v_uint8x64& b)
inline v_uint8x64 operator * (const v_uint8x64& a, const v_uint8x64& b)
{
v_uint16x32 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
inline v_int8x64 v_mul(const v_int8x64& a, const v_int8x64& b)
inline v_int8x64 operator * (const v_int8x64& a, const v_int8x64& b)
{
v_int16x32 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
inline v_uint16x32 v_mul(const v_uint16x32& a, const v_uint16x32& b)
inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b)
{
__m512i pl = _mm512_mullo_epi16(a.val, b.val);
__m512i ph = _mm512_mulhi_epu16(a.val, b.val);
@@ -730,7 +724,7 @@ inline v_uint16x32 v_mul(const v_uint16x32& a, const v_uint16x32& b)
const __m512i m = _mm512_set1_epi32(65535);
return v_uint16x32(_mm512_packus_epi32(_mm512_min_epu32(p0, m), _mm512_min_epu32(p1, m)));
}
inline v_int16x32 v_mul(const v_int16x32& a, const v_int16x32& b)
inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b)
{
__m512i pl = _mm512_mullo_epi16(a.val, b.val);
__m512i ph = _mm512_mulhi_epi16(a.val, b.val);
@@ -739,6 +733,15 @@ inline v_int16x32 v_mul(const v_int16x32& a, const v_int16x32& b)
return v_int16x32(_mm512_packs_epi32(p0, p1));
}
inline v_uint8x64& operator *= (v_uint8x64& a, const v_uint8x64& b)
{ a = a * b; return a; }
inline v_int8x64& operator *= (v_int8x64& a, const v_int8x64& b)
{ a = a * b; return a; }
inline v_uint16x32& operator *= (v_uint16x32& a, const v_uint16x32& b)
{ a = a * b; return a; }
inline v_int16x32& operator *= (v_int16x32& a, const v_int16x32& b)
{ a = a * b; return a; }
inline v_int16x32 v_mul_hi(const v_int16x32& a, const v_int16x32& b) { return v_int16x32(_mm512_mulhi_epi16(a.val, b.val)); }
inline v_uint16x32 v_mul_hi(const v_uint16x32& a, const v_uint16x32& b) { return v_uint16x32(_mm512_mulhi_epu16(a.val, b.val)); }
@@ -799,13 +802,13 @@ inline void v_mul_expand(const v_int32x16& a, const v_int32x16& b,
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_AVX512_SHIFT_OP(_Tpuvec, _Tpsvec, suffix) \
inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
{ return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \
inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
{ return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \
inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
{ return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \
inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
{ return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); } \
template<int imm> \
inline _Tpuvec v_shl(const _Tpuvec& a) \
@@ -827,10 +830,10 @@ OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint64x8, v_int64x8, epi64)
/** Bitwise logic **/
#define OPENCV_HAL_IMPL_AVX512_LOGIC_OP(_Tpvec, suffix, not_const) \
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_and, _Tpvec, _mm512_and_##suffix) \
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_or, _Tpvec, _mm512_or_##suffix) \
OPENCV_HAL_IMPL_AVX512_BIN_OP(v_xor, _Tpvec, _mm512_xor_##suffix) \
inline _Tpvec v_not(const _Tpvec& a) \
OPENCV_HAL_IMPL_AVX512_BIN_OP(&, _Tpvec, _mm512_and_##suffix) \
OPENCV_HAL_IMPL_AVX512_BIN_OP(|, _Tpvec, _mm512_or_##suffix) \
OPENCV_HAL_IMPL_AVX512_BIN_OP(^, _Tpvec, _mm512_xor_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ return _Tpvec(_mm512_xor_##suffix(a.val, not_const)); }
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint8x64, si512, _mm512_set1_epi32(-1))
@@ -862,16 +865,16 @@ OPENCV_HAL_IMPL_AVX512_SELECT(v_float64x8, pd, pd)
/** Comparison **/
#define OPENCV_HAL_IMPL_AVX512_CMP_INT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval)); }
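// Illustration (not part of the diff): AVX-512 compares produce a k-mask
// (one bit per lane), so the macro above rebuilds the all-ones/all-zeros
// lane mask that the other backends return, via maskz_set1. A hedged
// scalar sketch of that expansion:
#include <cstdint>
inline void kmask_to_lanes_sketch(uint16_t k, int32_t out[16]) {
    for (int i = 0; i < 16; ++i)
        out[i] = ((k >> i) & 1) ? -1 : 0; // set1(-1) under the mask, else zero
}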
#define OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(_Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(v_eq, _MM_CMPINT_EQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(v_ne, _MM_CMPINT_NE, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(v_lt, _MM_CMPINT_LT, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(v_gt, _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(v_le, _MM_CMPINT_LE, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(v_ge, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval)
OPENCV_HAL_IMPL_AVX512_CMP_INT(==, _MM_CMPINT_EQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(!=, _MM_CMPINT_NE, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(<, _MM_CMPINT_LT, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(>, _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(<=, _MM_CMPINT_LE, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_INT(>=, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint8x64, epu8, epi8, (char)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int8x64, epi8, epi8, (char)-1)
@@ -883,16 +886,16 @@ OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint64x8, epu64, epi64, (int64)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int64x8, epi64, epi64, (int64)-1)
#define OPENCV_HAL_IMPL_AVX512_CMP_FLT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm512_castsi512_##sufcmp(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval))); }
#define OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(_Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_eq, _CMP_EQ_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_ne, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_lt, _CMP_LT_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_gt, _CMP_GT_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_le, _CMP_LE_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_ge, _CMP_GE_OQ, _Tpvec, sufcmp, sufset, tval)
OPENCV_HAL_IMPL_AVX512_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, sufcmp, sufset, tval) \
OPENCV_HAL_IMPL_AVX512_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, sufcmp, sufset, tval)
OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float32x16, ps, epi32, (int)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float64x8, pd, epi64, (int64)-1)
@@ -1247,9 +1250,9 @@ OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, min, v_int16x32, min_epi16)
OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, max, v_int16x32, max_epi16)
inline int v_reduce_sum(const v_int16x32& a)
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline uint v_reduce_sum(const v_uint16x32& a)
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
#define OPENCV_HAL_IMPL_AVX512_REDUCE_64(sctype, func, _Tpvec, ifunc) \
inline sctype v_reduce_##func(const _Tpvec& a) \
@@ -1303,17 +1306,17 @@ inline unsigned v_reduce_sad(const v_int8x64& a, const v_int8x64& b)
return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
}
inline unsigned v_reduce_sad(const v_uint16x32& a, const v_uint16x32& b)
{ return v_reduce_sum(v_add_wrap(v_sub(a, b), v_sub(b, a))); }
{ return v_reduce_sum(v_add_wrap(a - b, b - a)); }
inline unsigned v_reduce_sad(const v_int16x32& a, const v_int16x32& b)
{ return v_reduce_sum(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)))); }
inline unsigned v_reduce_sad(const v_uint32x16& a, const v_uint32x16& b)
{ return v_reduce_sum(v_sub(v_max(a, b), v_min(a, b))); }
{ return v_reduce_sum(v_max(a, b) - v_min(a, b)); }
inline unsigned v_reduce_sad(const v_int32x16& a, const v_int32x16& b)
{ return v_reduce_sum(v_reinterpret_as_u32(v_sub(v_max(a, b), v_min(a, b)))); }
{ return v_reduce_sum(v_reinterpret_as_u32(v_max(a, b) - v_min(a, b))); }
inline float v_reduce_sad(const v_float32x16& a, const v_float32x16& b)
{ return v_reduce_sum(v_and(v_sub(a, b), v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff))))); }
{ return v_reduce_sum((a - b) & v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)))); }
inline double v_reduce_sad(const v_float64x8& a, const v_float64x8& b)
{ return v_reduce_sum(v_and(v_sub(a, b), v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff))))); }
{ return v_reduce_sum((a - b) & v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff)))); }
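The float variants above take the absolute value by clearing the IEEE-754 sign bit with a constant mask.
A scalar sketch of that trick (illustrative, not the library code):

#include <cstdint>
#include <cstring>

// Scalar counterpart of the 0x7fffffff masking above: clearing the sign bit
// of the IEEE-754 encoding yields |x|.
static float abs_by_mask(float x)
{
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits)); // bit-exact reinterpretation
    bits &= 0x7fffffffu;                  // drop the sign bit
    std::memcpy(&x, &bits, sizeof(x));
    return x;
}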
/** Popcount **/
inline v_uint8x64 v_popcount(const v_int8x64& a)
@@ -1348,8 +1351,8 @@ inline v_uint16x32 v_popcount(const v_int16x32& a)
_mm512_popcnt_epi32(_mm512_unpackhi_epi16(a.val, zero))));
#else
v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a));
p = v_add(p, v_rotate_right<1>(p));
return v_and(v_reinterpret_as_u16(p), v512_setall_u16(0x00ff));
p += v_rotate_right<1>(p);
return v_reinterpret_as_u16(p) & v512_setall_u16(0x00ff);
#endif
}
inline v_uint32x16 v_popcount(const v_int32x16& a)
@@ -1358,9 +1361,9 @@ inline v_uint32x16 v_popcount(const v_int32x16& a)
return v_uint32x16(_mm512_popcnt_epi32(a.val));
#else
v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a));
p = v_add(p, v_rotate_right<1>(p));
p = v_add(p, v_rotate_right<2>(p));
return v_and(v_reinterpret_as_u32(p), v512_setall_u32(0x000000ff));
p += v_rotate_right<1>(p);
p += v_rotate_right<2>(p);
return v_reinterpret_as_u32(p) & v512_setall_u32(0x000000ff);
#endif
}
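In the fallback branches above, per-byte popcounts are widened by summing neighbouring lanes and masking
off everything except each lane's own total. A scalar sketch of the same widening for one 32-bit lane:

#include <cstdint>

// Widen four per-byte popcounts into a single count for a 32-bit lane.
static uint32_t popcount32_sketch(uint32_t x)
{
    uint32_t p = 0;
    for (int b = 0; b < 4; b++)        // portable per-byte popcount
    {
        uint32_t byte = (x >> (8 * b)) & 0xffu, c = 0;
        while (byte) { c += byte & 1u; byte >>= 1; }
        p |= c << (8 * b);
    }
    p += p >> 8;                       // counterpart of v_rotate_right<1>
    p += p >> 16;                      // counterpart of v_rotate_right<2>
    return p & 0x000000ffu;            // keep only this lane's total
}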
inline v_uint64x8 v_popcount(const v_int64x8& a)
@@ -1400,9 +1403,9 @@ inline v_uint64x8 v_popcount(const v_uint64x8& a) { return v_popcount(v_reinte
inline _Tpvec v_sqrt(const _Tpvec& x) \
{ return _Tpvec(_mm512_sqrt_##suffix(x.val)); } \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_fma(a, a, v_mul(b, b)); } \
{ return v_fma(a, a, b * b); } \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_sqrt(v_fma(a, a, v_mul(b, b))); }
{ return v_sqrt(v_fma(a, a, b * b)); }
OPENCV_HAL_IMPL_AVX512_MULADD(v_float32x16, ps)
OPENCV_HAL_IMPL_AVX512_MULADD(v_float64x8, pd)
@@ -1410,7 +1413,7 @@ OPENCV_HAL_IMPL_AVX512_MISC(v_float32x16, ps)
OPENCV_HAL_IMPL_AVX512_MISC(v_float64x8, pd)
inline v_int32x16 v_fma(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
{ return v_add(v_mul(a, b), c); }
{ return a * b + c; }
inline v_int32x16 v_muladd(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
{ return v_fma(a, b, c); }
@@ -1419,9 +1422,9 @@ inline v_float32x16 v_invsqrt(const v_float32x16& x)
#if CV_AVX_512ER
return v_float32x16(_mm512_rsqrt28_ps(x.val));
#else
v_float32x16 half = v_mul(x, v512_setall_f32(0.5));
v_float32x16 half = x * v512_setall_f32(0.5);
v_float32x16 t = v_float32x16(_mm512_rsqrt14_ps(x.val));
t = v_mul(t, v_sub(v512_setall_f32(1.5), v_mul(v_mul(t, t), half)));
t *= v512_setall_f32(1.5) - ((t * t) * half);
return t;
#endif
}
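The fallback branch refines the hardware estimate with one Newton-Raphson step for 1/sqrt(x); in scalar
form (a sketch, assuming finite positive x):

// One Newton-Raphson step: t' = t * (1.5 - 0.5*x*t*t) converges to 1/sqrt(x),
// roughly doubling the number of correct bits per step.
static float rsqrt_refine(float x, float t)
{
    return t * (1.5f - (x * 0.5f) * (t * t));
}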
@@ -1431,7 +1434,7 @@ inline v_float64x8 v_invsqrt(const v_float64x8& x)
#if CV_AVX_512ER
return v_float64x8(_mm512_rsqrt28_pd(x.val));
#else
return v_div(v512_setall_f64(1.), v_sqrt(x));
return v512_setall_f64(1.) / v_sqrt(x);
// v_float64x8 half = x * v512_setall_f64(0.5);
// v_float64x8 t = v_float64x8(_mm512_rsqrt14_pd(x.val));
// t *= v512_setall_f64(1.5) - ((t * t) * half);
@@ -1479,17 +1482,17 @@ inline v_float64x8 v_abs(const v_float64x8& x)
/** Absolute difference **/
inline v_uint8x64 v_absdiff(const v_uint8x64& a, const v_uint8x64& b)
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
{ return v_add_wrap(a - b, b - a); }
inline v_uint16x32 v_absdiff(const v_uint16x32& a, const v_uint16x32& b)
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
{ return v_add_wrap(a - b, b - a); }
inline v_uint32x16 v_absdiff(const v_uint32x16& a, const v_uint32x16& b)
{ return v_sub(v_max(a, b), v_min(a, b)); }
{ return v_max(a, b) - v_min(a, b); }
inline v_uint8x64 v_absdiff(const v_int8x64& a, const v_int8x64& b)
{
v_int8x64 d = v_sub_wrap(a, b);
v_int8x64 m = v_lt(a, b);
return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m));
v_int8x64 m = a < b;
return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
}
inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b)
@@ -1497,26 +1500,26 @@ inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b)
inline v_uint32x16 v_absdiff(const v_int32x16& a, const v_int32x16& b)
{
v_int32x16 d = v_sub(a, b);
v_int32x16 m = v_lt(a, b);
return v_reinterpret_as_u32(v_sub(v_xor(d, m), m));
v_int32x16 d = a - b;
v_int32x16 m = a < b;
return v_reinterpret_as_u32((d ^ m) - m);
}
inline v_float32x16 v_absdiff(const v_float32x16& a, const v_float32x16& b)
{ return v_abs(v_sub(a, b)); }
{ return v_abs(a - b); }
inline v_float64x8 v_absdiff(const v_float64x8& a, const v_float64x8& b)
{ return v_abs(v_sub(a, b)); }
{ return v_abs(a - b); }
/** Saturating absolute difference **/
inline v_int8x64 v_absdiffs(const v_int8x64& a, const v_int8x64& b)
{
v_int8x64 d = v_sub(a, b);
v_int8x64 m = v_lt(a, b);
return v_sub(v_xor(d, m), m);
v_int8x64 d = a - b;
v_int8x64 m = a < b;
return (d ^ m) - m;
}
inline v_int16x32 v_absdiffs(const v_int16x32& a, const v_int16x32& b)
{ return v_sub(v_max(a, b), v_min(a, b)); }
{ return v_max(a, b) - v_min(a, b); }
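The (d ^ m) - m pattern above is branch-free conditional negation: when the comparison mask m is all
ones, the XOR flips every bit and subtracting -1 adds one, i.e. two's-complement negation. A scalar sketch:

#include <cstdint>

// Branch-free |a - b| for signed inputs, mirroring (d ^ m) - m above.
static int32_t absdiff_sketch(int32_t a, int32_t b)
{
    int32_t d = (int32_t)((uint32_t)a - (uint32_t)b); // wrapping subtract, as in the vector code
    int32_t m = (a < b) ? -1 : 0;                     // all-ones mask, like the vector compare
    return (d ^ m) - m;                               // negates d exactly when the mask is set
}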
////////// Conversions /////////
@@ -1815,7 +1818,7 @@ inline v_float32x16 v_pack_triplets(const v_float32x16& vec)
inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b)
{ return v_int32x16(_mm512_madd_epi16(a.val, b.val)); }
inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c)
{ return v_add(v_dotprod(a, b), c); }
{ return v_dotprod(a, b) + c; }
// 32 >> 64
inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b)
@@ -1825,7 +1828,7 @@ inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b)
return v_int64x8(_mm512_add_epi64(even, odd));
}
inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
{ return v_add(v_dotprod(a, b), c); }
{ return v_dotprod(a, b) + c; }
// 8 >> 32
inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b)
@@ -1841,7 +1844,7 @@ inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b)
return v_uint32x16(_mm512_add_epi32(prod0, prod1));
}
inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b)
{
@@ -1856,7 +1859,7 @@ inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b)
return v_int32x16(_mm512_add_epi32(prod0, prod1));
}
inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
// 16 >> 64
inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b)
@@ -1880,7 +1883,7 @@ inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b)
));
}
inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b)
{
@@ -1890,13 +1893,13 @@ inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b)
return v_int64x8(_mm512_add_epi64(even, odd));
}
inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
// 32 >> 64f
inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
//////// Fast Dot Product ////////
@@ -1941,7 +1944,7 @@ inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32&
return v_uint64x8(_mm512_add_epi64(p15_, p9d_));
}
inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b)
{ return v_dotprod_expand(a, b); }
@@ -1952,7 +1955,7 @@ inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b,
inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b)
{ return v_dotprod_expand(a, b); }
inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
#define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \
@@ -1966,7 +1969,7 @@ inline v_float32x16 v_matmul(const v_float32x16& v,
v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1);
v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2);
v_float32x16 v37 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 3);
return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v_mul(v37, m3))));
return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
}
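v_matmul broadcasts each component of v and chains FMAs, treating the matrix as four column vectors.
The scalar equivalent (a sketch):

// dst = m0*v[0] + m1*v[1] + m2*v[2] + m3*v[3], i.e. a 4x4 matrix stored as
// four columns times a 4-component vector.
static void matmul4_sketch(const float v[4], const float m0[4], const float m1[4],
                           const float m2[4], const float m3[4], float dst[4])
{
    for (int i = 0; i < 4; i++)
        dst[i] = m0[i] * v[0] + m1[i] * v[1] + m2[i] * v[2] + m3[i] * v[3];
}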
inline v_float32x16 v_matmuladd(const v_float32x16& v,
@@ -2067,43 +2070,43 @@ v_uint8x64 v_rshr_pack(const v_uint16x32& a, const v_uint16x32& b)
{
// we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
return v_pack_u(v_reinterpret_as_s16(v_shr(v_add(a, delta), n)),
v_reinterpret_as_s16(v_shr(v_add(b, delta), n)));
return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
v_reinterpret_as_s16((b + delta) >> n));
}
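Adding delta = 1 << (n-1) before the shift makes the pack round to nearest instead of truncating; the
scalar equivalent (assuming n > 0, as the comment above notes):

// Rounding right shift behind v_rshr_pack: add half of 2^n, then shift.
static unsigned rshr_round(unsigned v, int n)
{
    return (v + (1u << (n - 1))) >> n;
}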
template<int n> inline
void v_rshr_pack_store(uchar* ptr, const v_uint16x32& a)
{
v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
v_pack_u_store(ptr, v_reinterpret_as_s16(v_shr(v_add(a, delta), n)));
v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
}
template<int n> inline
v_uint8x64 v_rshr_pack_u(const v_int16x32& a, const v_int16x32& b)
{
v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
return v_pack_u((a + delta) >> n, (b + delta) >> n);
}
template<int n> inline
void v_rshr_pack_u_store(uchar* ptr, const v_int16x32& a)
{
v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
v_pack_u_store(ptr, v_shr(v_add(a, delta), n));
v_pack_u_store(ptr, (a + delta) >> n);
}
template<int n> inline
v_int8x64 v_rshr_pack(const v_int16x32& a, const v_int16x32& b)
{
v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
return v_pack((a + delta) >> n, (b + delta) >> n);
}
template<int n> inline
void v_rshr_pack_store(schar* ptr, const v_int16x32& a)
{
v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
v_pack_store(ptr, v_shr(v_add(a, delta), n));
v_pack_store(ptr, (a + delta) >> n);
}
// 32
@@ -2136,43 +2139,43 @@ template<int n> inline
v_uint16x32 v_rshr_pack(const v_uint32x16& a, const v_uint32x16& b)
{
v_uint32x16 delta = v512_setall_u32(1 << (n-1));
return v_pack_u(v_reinterpret_as_s32(v_shr(v_add(a, delta), n)),
v_reinterpret_as_s32(v_shr(v_add(b, delta), n)));
return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
v_reinterpret_as_s32((b + delta) >> n));
}
template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x16& a)
{
v_uint32x16 delta = v512_setall_u32(1 << (n-1));
v_pack_u_store(ptr, v_reinterpret_as_s32(v_shr(v_add(a, delta), n)));
v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
}
template<int n> inline
v_uint16x32 v_rshr_pack_u(const v_int32x16& a, const v_int32x16& b)
{
v_int32x16 delta = v512_setall_s32(1 << (n-1));
return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
return v_pack_u((a + delta) >> n, (b + delta) >> n);
}
template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x16& a)
{
v_int32x16 delta = v512_setall_s32(1 << (n-1));
v_pack_u_store(ptr, v_shr(v_add(a, delta), n));
v_pack_u_store(ptr, (a + delta) >> n);
}
template<int n> inline
v_int16x32 v_rshr_pack(const v_int32x16& a, const v_int32x16& b)
{
v_int32x16 delta = v512_setall_s32(1 << (n-1));
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
return v_pack((a + delta) >> n, (b + delta) >> n);
}
template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x16& a)
{
v_int32x16 delta = v512_setall_s32(1 << (n-1));
v_pack_store(ptr, v_shr(v_add(a, delta), n));
v_pack_store(ptr, (a + delta) >> n);
}
// 64
@@ -2193,28 +2196,28 @@ template<int n> inline
v_uint32x16 v_rshr_pack(const v_uint64x8& a, const v_uint64x8& b)
{
v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
return v_pack((a + delta) >> n, (b + delta) >> n);
}
template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x8& a)
{
v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
v_pack_store(ptr, v_shr(v_add(a, delta), n));
v_pack_store(ptr, (a + delta) >> n);
}
template<int n> inline
v_int32x16 v_rshr_pack(const v_int64x8& a, const v_int64x8& b)
{
v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n));
return v_pack((a + delta) >> n, (b + delta) >> n);
}
template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x8& a)
{
v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
v_pack_store(ptr, v_shr(v_add(a, delta), n));
v_pack_store(ptr, (a + delta) >> n);
}
// pack boolean
@@ -3078,20 +3081,6 @@ inline int v_scan_forward(const v_float64x8& a) { return trailingZeros32(v_signm
inline void v512_cleanup() { _mm256_zeroall(); }
#include "intrin_math.hpp"
inline v_float32x16 v_exp(const v_float32x16& x) { return v_exp_default_32f<v_float32x16, v_int32x16>(x); }
inline v_float32x16 v_log(const v_float32x16& x) { return v_log_default_32f<v_float32x16, v_int32x16>(x); }
inline void v_sincos(const v_float32x16& x, v_float32x16& s, v_float32x16& c) { v_sincos_default_32f<v_float32x16, v_int32x16>(x, s, c); }
inline v_float32x16 v_sin(const v_float32x16& x) { return v_sin_default_32f<v_float32x16, v_int32x16>(x); }
inline v_float32x16 v_cos(const v_float32x16& x) { return v_cos_default_32f<v_float32x16, v_int32x16>(x); }
inline v_float32x16 v_erf(const v_float32x16& x) { return v_erf_default_32f<v_float32x16, v_int32x16>(x); }
inline v_float64x8 v_exp(const v_float64x8& x) { return v_exp_default_64f<v_float64x8, v_int64x8>(x); }
inline v_float64x8 v_log(const v_float64x8& x) { return v_log_default_64f<v_float64x8, v_int64x8>(x); }
inline void v_sincos(const v_float64x8& x, v_float64x8& s, v_float64x8& c) { v_sincos_default_64f<v_float64x8, v_int64x8>(x, s, c); }
inline v_float64x8 v_sin(const v_float64x8& x) { return v_sin_default_64f<v_float64x8, v_int64x8>(x); }
inline v_float64x8 v_cos(const v_float64x8& x) { return v_cos_default_64f<v_float64x8, v_int64x8>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond

View File

@@ -81,26 +81,9 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
"Universal intrinsics" is a types and functions set intended to simplify vectorization of code on
different platforms. Currently a few different SIMD extensions on different architectures are supported.
OpenCV Universal Intrinsics support the following instruction sets:
- *128 bit* registers of various types support is implemented for a wide range of architectures including
- x86(SSE/SSE2/SSE4.2),
- ARM(NEON): 64-bit float (64F) requires AArch64,
- PowerPC(VSX),
- MIPS(MSA),
- LoongArch(LSX),
- RISC-V(RVV 0.7.1): Fixed-length implementation,
- WASM: 64-bit float (64F) is not supported,
- *256 bit* registers are supported on
- x86(AVX2),
- LoongArch (LASX),
- *512 bit* registers are supported on
- x86(AVX512),
- *Vector Length Agnostic (VLA)* registers are supported on
- RISC-V(RVV 1.0)
- ARM(SVE/SVE2): Powered by Arm KleidiCV integration (OpenCV 4.11+),
Support for 128-bit registers of various types is implemented for a wide range of architectures,
including x86(__SSE/SSE2/SSE4.2__), ARM(__NEON__), PowerPC(__VSX__) and MIPS(__MSA__).
256-bit registers are supported on x86(__AVX2__), and 512-bit registers are supported on x86(__AVX512__).
If no SIMD extension is available during compilation, the fallback C++ implementation of the intrinsics
is chosen, and the code works as expected, although it may be slower.
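A minimal usage sketch (assuming `len` is a multiple of the lane count; with no SIMD extension the same
code runs through the scalar fallback):

@code
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

// Element-wise sum of two float arrays with universal intrinsics.
void add_arrays(const float* a, const float* b, float* dst, int len)
{
    const int step = v_float32::nlanes;   // 4/8/16 lanes, depending on the ISA
    for (int i = 0; i < len; i += step)
    {
        v_float32 x = vx_load(a + i);
        v_float32 y = vx_load(b + i);
        v_store(dst + i, x + y);
    }
}
@endcode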
@@ -242,30 +225,32 @@ These operations allow to reorder or recombine elements in one or multiple vecto
Element-wise binary and unary operations.
- Arithmetics:
@ref v_add,
@ref v_sub,
@ref v_mul,
@ref v_div,
@ref operator +(const v_reg &a, const v_reg &b) "+",
@ref operator -(const v_reg &a, const v_reg &b) "-",
@ref operator *(const v_reg &a, const v_reg &b) "*",
@ref operator /(const v_reg &a, const v_reg &b) "/",
@ref v_mul_expand
- Non-saturating arithmetics: @ref v_add_wrap, @ref v_sub_wrap
- Bitwise shifts:
@ref operator <<(const v_reg &a, int s) "<<",
@ref operator >>(const v_reg &a, int s) ">>",
@ref v_shl, @ref v_shr
- Bitwise logic:
@ref v_and,
@ref v_or,
@ref v_xor,
@ref v_not
@ref operator &(const v_reg &a, const v_reg &b) "&",
@ref operator |(const v_reg &a, const v_reg &b) "|",
@ref operator ^(const v_reg &a, const v_reg &b) "^",
@ref operator ~(const v_reg &a) "~"
- Comparison:
@ref v_gt,
@ref v_ge,
@ref v_lt,
@ref v_le,
@ref v_eq,
@ref v_ne
@ref operator >(const v_reg &a, const v_reg &b) ">",
@ref operator >=(const v_reg &a, const v_reg &b) ">=",
@ref operator <(const v_reg &a, const v_reg &b) "<",
@ref operator <=(const v_reg &a, const v_reg &b) "<=",
@ref operator ==(const v_reg &a, const v_reg &b) "==",
@ref operator !=(const v_reg &a, const v_reg &b) "!="
- min/max: @ref v_min, @ref v_max
@@ -278,8 +263,7 @@ Most of these operations return only one value.
### Other math
- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude, @ref v_exp, @ref v_log,
@ref v_erf, @ref v_sin, @ref v_cos
- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude
- Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs
### Conversions
@@ -379,9 +363,6 @@ Floating point:
|reverse | x | x |
|extract_n | x | x |
|broadcast_element | x | |
|exp | x | x |
|log | x | x |
|sin, cos | x | x |
@{ */
@@ -589,43 +570,50 @@ enum {
/** @brief Add values
For all types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_add(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator+(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator+=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
/** @brief Subtract values
For all types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_sub(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator-(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator-=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
/** @brief Multiply values
For 16- and 32-bit integer types and floating types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_mul(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator*(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator*=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
/** @brief Divide values
For floating types only. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_div(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator/(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator/=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
/** @brief Bitwise AND
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_and(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator&(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator&=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_or(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
/** @brief Bitwise XOR
Only for integer types.*/
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_xor(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator^(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator^=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
/** @brief Bitwise NOT
Only for integer types.*/
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_not(const v_reg<_Tp, n>& a);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator~(const v_reg<_Tp, n>& a);
#ifndef CV_DOXYGEN
@@ -648,26 +636,33 @@ __CV_EXPAND(macro_name(double, __VA_ARGS__)) \
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, __VA_ARGS__) \
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, __VA_ARGS__) \
#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op, func) \
#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op) \
template<int n> inline \
v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
v_reg<_Tp, n> operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
v_reg<_Tp, n> c; \
for( int i = 0; i < n; i++ ) \
c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
return c; \
} \
template<int n> inline \
v_reg<_Tp, n>& operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
for( int i = 0; i < n; i++ ) \
a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
return a; \
}
#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op, func) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op, func)
#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op)
CV__HAL_INTRIN_IMPL_BIN_OP(+, v_add)
CV__HAL_INTRIN_IMPL_BIN_OP(-, v_sub)
CV__HAL_INTRIN_IMPL_BIN_OP(*, v_mul)
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /, v_div)
CV__HAL_INTRIN_IMPL_BIN_OP(+)
CV__HAL_INTRIN_IMPL_BIN_OP(-)
CV__HAL_INTRIN_IMPL_BIN_OP(*)
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /)
#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op, func) \
#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op) \
template<int n> CV_INLINE \
v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
v_reg<_Tp, n> c; \
typedef typename V_TypeTraits<_Tp>::int_type itype; \
@@ -675,20 +670,29 @@ v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
return c; \
} \
template<int n> CV_INLINE \
v_reg<_Tp, n>& operator bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
typedef typename V_TypeTraits<_Tp>::int_type itype; \
for( int i = 0; i < n; i++ ) \
a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
return a; \
}
#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op, func) \
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op, func) \
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op, func) /* TODO: FIXIT remove this after masks refactoring */
#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op) \
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) \
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) /* TODO: FIXIT remove this after masks refactoring */
CV__HAL_INTRIN_IMPL_BIT_OP(&, v_and)
CV__HAL_INTRIN_IMPL_BIT_OP(|, v_or)
CV__HAL_INTRIN_IMPL_BIT_OP(^, v_xor)
CV__HAL_INTRIN_IMPL_BIT_OP(&)
CV__HAL_INTRIN_IMPL_BIT_OP(|)
CV__HAL_INTRIN_IMPL_BIT_OP(^)
#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy, dummy2) \
#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy) \
template<int n> CV_INLINE \
v_reg<_Tp, n> v_not(const v_reg<_Tp, n>& a) \
v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \
{ \
v_reg<_Tp, n> c; \
for( int i = 0; i < n; i++ ) \
@@ -696,7 +700,7 @@ v_reg<_Tp, n> v_not(const v_reg<_Tp, n>& a) \
return c; \
} \
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~, v_not)
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~)
#endif // !CV_DOXYGEN
@@ -717,85 +721,12 @@ template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a)
Only for floating point types.*/
OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
/**
* @brief Exponential \f$ e^x \f$ of elements
*
* Only for floating point types. Core implementation steps:
* 1. Decompose Input: Rewrite the input as \f$ e^x = 2^{x \cdot \log_2 e} \f$ and split the exponent into integer and fractional parts:
* \f$ x \cdot \log_2 e = n + f \f$, where \f$ n \f$ is the integer part and \f$ f \f$ is the fractional part.
* 2. Compute \f$ 2^n \f$: Calculated by shifting the bits.
* 3. Adjust Fractional Part: Compute \f$ f \cdot \ln2 \f$ to convert the fractional part to base \f$ e \f$.
* \f$ C1 \f$ and \f$ C2 \f$ are used to adjust the fractional part.
* 4. Polynomial Approximation for \f$ e^{f \cdot \ln2} \f$: The closer the fractional part is to 0, the more accurate the result.
* - For float16 and float32, use a Taylor series with 6 terms.
* - For float64, use a Padé approximation with 4 terms.
* 5. Combine Results: Multiply the two parts together to get the final result:
* \f$ e^x = 2^n \cdot e^{f \cdot \ln2} \f$.
*
* @note The precision of the calculation depends on the implementation and the data type of the input vector.
*/
OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
#define OPENCV_HAL_MATH_HAVE_EXP 1
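As a scalar illustration of the decomposition described above (the constants and the polynomial here are
illustrative, not the exact ones used by the implementation):

#include <cmath>

// e^x = 2^n * e^(f*ln2), where x*log2(e) = n + f with n integral and f in [0, 1).
static double exp_sketch(double x)
{
    double t = x * 1.4426950408889634;        // x * log2(e)
    double n = std::floor(t);                 // integer part
    double g = (t - n) * 0.6931471805599453;  // fractional part converted back to base e
    // short Taylor polynomial for e^g; accurate because g is small
    double p = 1 + g * (1 + g * (0.5 + g * (1.0 / 6 + g * (1.0 / 24 + g / 120))));
    return std::ldexp(p, (int)n);             // multiply by 2^n via the exponent field
}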
/**
* @brief Natural logarithm \f$ \log(x) \f$ of elements
*
* Only for floating point types. Core implementation steps:
* 1. Decompose Input: Use the binary representation to decompose the input into a mantissa part \f$ m \f$ and an exponent part \f$ e \f$, such that \f$ \log(x) = \log(m \cdot 2^e) = \log(m) + e \cdot \ln(2) \f$.
* 2. Adjust Mantissa and Exponent Parts: If the mantissa is less than \f$ \sqrt{0.5} \f$, adjust the exponent and mantissa to ensure the mantissa is in the range \f$ (\sqrt{0.5}, \sqrt{2}) \f$ for better approximation.
* 3. Polynomial Approximation for \f$ \log(m) \f$: The closer \f$ m \f$ is to 1, the more accurate the result.
* - For float16 and float32, use a Taylor series with 9 terms.
* - For float64, use a Padé approximation with 6 terms.
* 4. Combine Results: Add the two parts together to get the final result.
*
* @note The precision of the calculation depends on the implementation and the data type of the input.
*
* @note Similar to the behavior of std::log(), \f$ \ln(0) = -\infty \f$.
*/
OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
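A matching scalar sketch of the logarithm decomposition (std::log1p stands in for the polynomial step):

#include <cmath>

// log(x) = log(m) + e*ln2 for x = m * 2^e, with m re-centered near 1.
static double log_sketch(double x)
{
    int e;
    double m = std::frexp(x, &e);                 // x = m * 2^e, m in [0.5, 1)
    if (m < 0.7071067811865476) { m *= 2; e--; }  // keep m in (sqrt(0.5), sqrt(2))
    return std::log1p(m - 1) + e * 0.6931471805599453;
}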
/**
* @brief Error function.
*
* @note Only FP32 precision is supported for now.
*/
OPENCV_HAL_IMPL_MATH_FUNC(v_erf, std::erf, _Tp)
/**
* @brief Compute sine \f$ sin(x) \f$ and cosine \f$ cos(x) \f$ of elements at the same time
*
* Only for floating point types. Core implementation steps:
* 1. Input Normalization: Scale the periodicity from 2π to 4 and reduce the angle to the range \f$ [0, \frac{\pi}{4}] \f$ using periodicity and trigonometric identities.
* 2. Polynomial Approximation for \f$ sin(x) \f$ and \f$ cos(x) \f$:
* - For float16 and float32, use a Taylor series with 4 terms for sine and 5 terms for cosine.
* - For float64, use a Taylor series with 7 terms for sine and 8 terms for cosine.
* 3. Select Results: select and convert the final sine and cosine values for the original input angle.
*
* @note The precision of the calculation depends on the implementation and the data type of the input vector.
*/
template<typename _Tp, int n>
inline void v_sincos(const v_reg<_Tp, n>& x, v_reg<_Tp, n>& s, v_reg<_Tp, n>& c)
{
for( int i = 0; i < n; i++ )
{
s.s[i] = std::sin(x.s[i]);
c.s[i] = std::cos(x.s[i]);
}
}
/**
* @brief Sine \f$ sin(x) \f$ of elements
*
* Only for floating point types. Core implementation is the same as @ref v_sincos.
*/
//! @cond IGNORED
OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
/**
* @brief Cosine \f$ cos(x) \f$ of elements
*
* Only for floating point types. Core implementation is the same as @ref v_sincos.
*/
OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
//! @endcond
/** @brief Absolute value of elements
@@ -918,9 +849,9 @@ inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_CMP_OP(cmp_op, func) \
#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
template<typename _Tp, int n> \
inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
typedef typename V_TypeTraits<_Tp>::int_type itype; \
v_reg<_Tp, n> c; \
@@ -932,28 +863,28 @@ inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
/** @brief Less-than comparison
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(<, v_lt)
OPENCV_HAL_IMPL_CMP_OP(<)
/** @brief Greater-than comparison
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(>, v_gt)
OPENCV_HAL_IMPL_CMP_OP(>)
/** @brief Less-than or equal comparison
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(<=, v_le)
OPENCV_HAL_IMPL_CMP_OP(<=)
/** @brief Greater-than or equal comparison
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(>=, v_ge)
OPENCV_HAL_IMPL_CMP_OP(>=)
/** @brief Equal comparison */
OPENCV_HAL_IMPL_CMP_OP(==, v_eq)
OPENCV_HAL_IMPL_CMP_OP(==)
/** @brief Not equal comparison */
OPENCV_HAL_IMPL_CMP_OP(!=, v_ne)
OPENCV_HAL_IMPL_CMP_OP(!=)
template<int n>
inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a)
@@ -1322,8 +1253,8 @@ template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op, func) \
template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, int imm) \
#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
{ \
v_reg<_Tp, n> c; \
for( int i = 0; i < n; i++ ) \
@@ -1334,12 +1265,12 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a,
/** @brief Bitwise shift left
For 16-, 32- and 64-bit integer values. */
OPENCV_HAL_IMPL_SHIFT_OP(<<, v_shl)
OPENCV_HAL_IMPL_SHIFT_OP(<< )
/** @brief Bitwise shift right
For 16-, 32- and 64-bit integer values. */
OPENCV_HAL_IMPL_SHIFT_OP(>>, v_shr)
OPENCV_HAL_IMPL_SHIFT_OP(>> )
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
@@ -2848,8 +2779,7 @@ inline void v_transpose4x4( v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, prefix, suffix) \
inline _Tpvec prefix##_setzero_##suffix() { return _Tpvec::zero(); } \
template <> inline _Tpvec v_setzero_() { return _Tpvec::zero(); }
inline _Tpvec prefix##_setzero_##suffix() { return _Tpvec::zero(); }
//! @name Init with zero
//! @{
@@ -2895,8 +2825,7 @@ OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x8, v512, s64)
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, prefix, suffix) \
inline _Tpvec prefix##_setall_##suffix(_Tp val) { return _Tpvec::all(val); } \
template <> inline _Tpvec v_setall_(_Tp val) { return _Tpvec::all(val); }
inline _Tpvec prefix##_setall_##suffix(_Tp val) { return _Tpvec::all(val); }
//! @name Init with value
//! @{
@@ -2965,7 +2894,7 @@ OPENCV_HAL_IMPL_C_REINTERPRET(int64, s64)
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_C_SHIFTL(_Tp) \
template<int shift, int n> inline v_reg<_Tp, n> v_shl(const v_reg<_Tp, n>& a) \
{ return v_shl(a, shift); }
{ return a << shift; }
//! @name Left shift
//! @{
@@ -2982,7 +2911,7 @@ OPENCV_HAL_IMPL_C_SHIFTL(int64)
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_C_SHIFTR(_Tp) \
template<int shift, int n> inline v_reg<_Tp, n> v_shr(const v_reg<_Tp, n>& a) \
{ return v_shr(a, shift); }
{ return a >> shift; }
//! @name Right shift
//! @{
@@ -3308,7 +3237,7 @@ inline v_reg<float, n> v_matmuladd(const v_reg<float, n>& v,
template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b)
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_mul(v_cvt_f64_high(a), v_cvt_f64_high(b))); }
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b,
const v_reg<double, n/2>& c)
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }

View File

@@ -557,10 +557,6 @@ inline __m256i _lasx_256_castpd_si256(const __m256d& v)
{ return _Tpvec(__lasx_xvreplgr2vr_d(0)); } \
inline _Tpvec v256_setall_##suffix(_Tp v) \
{ return _Tpvec(__lasx_xvreplgr2vr_##ssuffix((ctype_s)v)); } \
template <> inline _Tpvec v_setzero_() \
{ return v256_setzero_##suffix(); } \
template <> inline _Tpvec v_setall_(_Tp v) \
{ return v256_setall_##suffix(v); } \
OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32, suffix, OPENCV_HAL_NOP) \
OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32, suffix, OPENCV_HAL_NOP) \
OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \
@@ -592,11 +588,7 @@ inline __m256d _lasx_256_castsi256_pd(const __m256i &v)
inline _Tpvec v256_setzero_##suffix() \
{ return _Tpvec(__lasx_xvreplgr2vr_d(0)); } \
inline _Tpvec v256_setall_##suffix(_Tp v) \
{ return _Tpvec(_v256_setall_##zsuffix(v)); } \
template <> inline _Tpvec v_setzero_() \
{ return v256_setzero_##suffix(); } \
template <> inline _Tpvec v_setall_(_Tp v) \
{ return v256_setall_##suffix(v); } \
{ return _Tpvec(_v256_setall_##zsuffix(v)); } \
OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32, suffix, cast) \
OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32, suffix, cast) \
OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, cast) \
@@ -658,18 +650,16 @@ inline v_float32x8 v256_shuffle(const v_float32x8 &a)
template<int m>
inline v_float64x4 v256_shuffle(const v_float64x4 &a)
{
const int m1 = m & 0b1;
const int m2 = m & 0b10;
const int m3 = m & 0b100;
const int m4 = m & 0b1000;
const int m5 = m2 << 1;
const int m6 = m3 << 2;
const int m7 = m4 << 3;
const int m8 = m1 & m5 & m6 & m7;
int imm8 = m & 0b0001; //0 or 1
if (m & 0x0b0010) imm8 |= 0b0100;
//else imm8 |= 0b0000;
if (m & 0x0b0100) imm8 |= 0b110000; //2 or 3
else imm8 |= 0b100000;
if (m & 0x0b1000) imm8 |= 0b11000000;
else imm8 |= 0b10000000;
return v_float64x4(__lasx_xvshuf4i_d(*((__m256i*)&a.val), *((__m256i*)&a.val), m8));
return v_float64x4(__lasx_xvpermi_d(*((__m256i*)&a.val), imm8));
}
template<typename _Tpvec>
inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1)
{
@@ -754,51 +744,53 @@ OPENCV_HAL_IMPL_LASX_ZIP(v_float64x4)
/** Arithmetics **/
#define OPENCV_HAL_IMPL_LASX_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); } \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ a.val = intrin(a.val, b.val); return a; }
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint8x32, __lasx_xvsadd_bu)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint8x32, __lasx_xvssub_bu)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int8x32, __lasx_xvsadd_b)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int8x32, __lasx_xvssub_b)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint16x16, __lasx_xvsadd_hu)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint16x16, __lasx_xvssub_hu)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int16x16, __lasx_xvsadd_h)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int16x16, __lasx_xvssub_h)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint32x8, __lasx_xvadd_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint32x8, __lasx_xvsub_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_uint32x8, __lasx_xvmul_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int32x8, __lasx_xvadd_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int32x8, __lasx_xvsub_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_int32x8, __lasx_xvmul_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint64x4, __lasx_xvadd_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint64x4, __lasx_xvsub_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int64x4, __lasx_xvadd_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int64x4, __lasx_xvsub_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint8x32, __lasx_xvsadd_bu)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint8x32, __lasx_xvssub_bu)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int8x32, __lasx_xvsadd_b)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int8x32, __lasx_xvssub_b)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint16x16, __lasx_xvsadd_hu)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint16x16, __lasx_xvssub_hu)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int16x16, __lasx_xvsadd_h)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int16x16, __lasx_xvssub_h)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint32x8, __lasx_xvadd_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint32x8, __lasx_xvsub_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_uint32x8, __lasx_xvmul_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int32x8, __lasx_xvadd_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int32x8, __lasx_xvsub_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_int32x8, __lasx_xvmul_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint64x4, __lasx_xvadd_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint64x4, __lasx_xvsub_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int64x4, __lasx_xvadd_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int64x4, __lasx_xvsub_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_float32x8, __lasx_xvfadd_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_float32x8, __lasx_xvfsub_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_float32x8, __lasx_xvfmul_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_div, v_float32x8, __lasx_xvfdiv_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_float64x4, __lasx_xvfadd_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_float64x4, __lasx_xvfsub_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_float64x4, __lasx_xvfmul_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(v_div, v_float64x4, __lasx_xvfdiv_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float32x8, __lasx_xvfadd_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float32x8, __lasx_xvfsub_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float32x8, __lasx_xvfmul_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float32x8, __lasx_xvfdiv_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float64x4, __lasx_xvfadd_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float64x4, __lasx_xvfsub_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float64x4, __lasx_xvfmul_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float64x4, __lasx_xvfdiv_d)
// saturating multiply 8-bit, 16-bit
inline v_uint8x32 v_mul(const v_uint8x32& a, const v_uint8x32& b)
inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
{
v_uint16x16 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
inline v_int8x32 v_mul(const v_int8x32& a, const v_int8x32& b)
inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
{
v_int16x16 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
inline v_uint16x16 v_mul(const v_uint16x16& a, const v_uint16x16& b)
inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
{
__m256i pl = __lasx_xvmul_h(a.val, b.val);
__m256i ph = __lasx_xvmuh_hu(a.val, b.val);
@@ -806,7 +798,7 @@ inline v_uint16x16 v_mul(const v_uint16x16& a, const v_uint16x16& b)
__m256i p1 = __lasx_xvilvh_h(ph, pl);
return v_uint16x16(_v256_packs_epu32(p0, p1));
}
inline v_int16x16 v_mul(const v_int16x16& a, const v_int16x16& b)
inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
{
__m256i pl = __lasx_xvmul_h(a.val, b.val);
__m256i ph = __lasx_xvmuh_h(a.val, b.val);
@@ -814,6 +806,14 @@ inline v_int16x16 v_mul(const v_int16x16& a, const v_int16x16& b)
__m256i p1 = __lasx_xvilvh_h(ph, pl);
return v_int16x16(_lasx_packs_w(p0, p1));
}
inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
{ a = a * b; return a; }
inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
{ a = a * b; return a; }
inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
{ a = a * b; return a; }
inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
{ a = a * b; return a; }
/** Non-saturating arithmetics **/
@@ -902,13 +902,13 @@ inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_LASX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
{ return _Tpuvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
{ return _Tpsvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
{ return _Tpuvec(__lasx_xvsrl_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
{ return _Tpsvec(srai(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
template<int imm> \
inline _Tpuvec v_shl(const _Tpuvec& a) \
@@ -930,10 +930,10 @@ OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4, v_int64x4, d, __lasx_xvsra_d)
/** Bitwise logic **/
#define OPENCV_HAL_IMPL_LASX_LOGIC_OP(_Tpvec, suffix, not_const) \
OPENCV_HAL_IMPL_LASX_BIN_OP(v_and, _Tpvec, __lasx_xvand_##suffix) \
OPENCV_HAL_IMPL_LASX_BIN_OP(v_or, _Tpvec, __lasx_xvor_##suffix) \
OPENCV_HAL_IMPL_LASX_BIN_OP(v_xor, _Tpvec, __lasx_xvxor_##suffix) \
inline _Tpvec v_not(const _Tpvec& a) \
OPENCV_HAL_IMPL_LASX_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix) \
OPENCV_HAL_IMPL_LASX_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix) \
OPENCV_HAL_IMPL_LASX_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ return _Tpvec(__lasx_xvnori_b(a.val, 0)); }
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint8x32, v, __lasx_xvreplgr2vr_w(-1))
@@ -946,14 +946,16 @@ OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint64x4, v, __lasx_xvreplgr2vr_d(-1))
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int64x4, v, __lasx_xvreplgr2vr_d(-1))
#define OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val)))); }
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val)))); } \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ __m256i c = intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val))); a.val = cast(c); return a; }
#define OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(_Tpvec, suffix, not_const, cast) \
OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(v_and, _Tpvec, __lasx_xvand_##suffix, cast) \
OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(v_or, _Tpvec, __lasx_xvor_##suffix, cast) \
OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(v_xor, _Tpvec, __lasx_xvxor_##suffix, cast) \
inline _Tpvec v_not(const _Tpvec& a) \
OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix, cast) \
OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix, cast) \
OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix, cast) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ return _Tpvec(__lasx_xvxor_##suffix(*((__m256i*)(&a.val)), not_const)); }
OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(v_float32x8, v, __lasx_xvreplgr2vr_w(-1), _lasx_256_castsi256_ps)
@@ -979,25 +981,25 @@ inline v_float64x4 v_select(const v_float64x4 &mask, const v_float64x4 &a, const
/** Comparison **/
#define OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpvec) \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_eq(a, b)); } \
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
{ return v_gt(b, a); } \
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_lt(a, b)); } \
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
{ return v_ge(b, a); }
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return b > a; } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a < b); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return b >= a; }
#define OPENCV_HAL_IMPL_LASX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) \
inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
{ return _Tpuvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
{ \
return _Tpuvec(__lasx_xvslt_##usuffix(b.val, a.val)); \
} \
inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(__lasx_xvslt_##suffix(b.val, a.val)); } \
OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpuvec) \
OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpsvec)
@@ -1007,37 +1009,37 @@ OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint16x16, v_int16x16, h, hu)
OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint32x8, v_int32x8, w, wu)
#define OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(_Tpvec, suffix) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_eq(a, b)); }
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); }
OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_uint64x4, d)
OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_int64x4, d)
#define OPENCV_HAL_IMPL_LASX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(__lasx_##suffix##_##ssuffix(a.val, b.val)); }
#define OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(_Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LASX_CMP_FLT(v_eq, xvfcmp_ceq, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LASX_CMP_FLT(v_ne, xvfcmp_cne, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LASX_CMP_FLT(v_lt, xvfcmp_clt, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LASX_CMP_FLT(v_le, xvfcmp_cle, _Tpvec, ssuffix)
OPENCV_HAL_IMPL_LASX_CMP_FLT(==, xvfcmp_ceq, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LASX_CMP_FLT(!=, xvfcmp_cne, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LASX_CMP_FLT(<, xvfcmp_clt, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LASX_CMP_FLT(<=, xvfcmp_cle, _Tpvec, ssuffix)
OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float32x8, s)
OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float64x4, d)
inline v_float32x8 v_gt(const v_float32x8 &a, const v_float32x8 &b)
inline v_float32x8 operator > (const v_float32x8 &a, const v_float32x8 &b)
{ return v_float32x8(__lasx_xvfcmp_clt_s(b.val, a.val)); }
inline v_float32x8 v_ge(const v_float32x8 &a, const v_float32x8 &b)
inline v_float32x8 operator >= (const v_float32x8 &a, const v_float32x8 &b)
{ return v_float32x8(__lasx_xvfcmp_cle_s(b.val, a.val)); }
inline v_float64x4 v_gt(const v_float64x4 &a, const v_float64x4 &b)
inline v_float64x4 operator > (const v_float64x4 &a, const v_float64x4 &b)
{ return v_float64x4(__lasx_xvfcmp_clt_d(b.val, a.val)); }
inline v_float64x4 v_ge(const v_float64x4 &a, const v_float64x4 &b)
inline v_float64x4 operator >= (const v_float64x4 &a, const v_float64x4 &b)
{ return v_float64x4(__lasx_xvfcmp_cle_d(b.val, a.val)); }
inline v_float32x8 v_not_nan(const v_float32x8& a)
@@ -1098,7 +1100,7 @@ inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b)
template<int imm>
inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
{
enum {IMM_L = ((imm - 16) & 0xFF) > 31 ? 31 : ((imm - 16) & 0xFF)};
enum {IMM_L = (imm - 16) & 0xFF};
enum {IMM_R = (16 - imm) & 0xFF};
if (imm == 0) return a;
@@ -1115,7 +1117,7 @@ inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
template<int imm>
inline v_uint8x32 v_rotate_right(const v_uint8x32& a)
{
enum {IMM_L = ((imm - 16) & 0xFF) > 31 ? 31 : ((imm - 16) & 0xFF)};
enum {IMM_L = (imm - 16) & 0xFF};
if (imm == 0) return a;
if (imm > 32) return v_uint8x32();
@@ -1305,9 +1307,9 @@ inline unsigned v_reduce_sum(const v_uint32x8& a)
{ return v_reduce_sum(v_reinterpret_as_s32(a)); }
inline int v_reduce_sum(const v_int16x16& a)
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline unsigned v_reduce_sum(const v_uint16x16& a)
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline float v_reduce_sum(const v_float32x8& a)
{
@@ -1375,27 +1377,27 @@ inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
{
v_uint32x8 l, h;
v_expand(v_add_wrap(v_sub(a, b), v_sub(b, a)), l, h);
return v_reduce_sum(v_add(l, h));
v_expand(v_add_wrap(a - b, b - a), l, h);
return v_reduce_sum(l + h);
}
inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
{
v_uint32x8 l, h;
v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h);
return v_reduce_sum(v_add(l, h));
return v_reduce_sum(l + h);
}
inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
{
return v_reduce_sum(v_sub(v_max(a, b), v_min(a, b)));
return v_reduce_sum(v_max(a, b) - v_min(a, b));
}
inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
{
v_int32x8 m = v_lt(a, b);
return v_reduce_sum(v_reinterpret_as_u32(v_sub(v_xor(v_sub(a, b), m), m)));
v_int32x8 m = a < b;
return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m));
}
inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
{
v_float32x8 a_b = v_sub(a, b);
v_float32x8 a_b = a - b;
return v_reduce_sum(v_float32x8(*((__m256i*)&a_b.val) & __lasx_xvreplgr2vr_w(0x7fffffff)));
}
@@ -1499,9 +1501,9 @@ OPENCV_HAL_IMPL_LASX_CHECK_SHORT(v_int16x16)
inline _Tpvec v_sqrt(const _Tpvec& x) \
{ return _Tpvec(__lasx_xvfsqrt_##suffix(x.val)); } \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_fma(a, a, v_mul(b, b)); } \
{ return v_fma(a, a, b * b); } \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_sqrt(v_fma(a, a, v_mul(b, b))); }
{ return v_sqrt(v_fma(a, a, b*b)); }
OPENCV_HAL_IMPL_LASX_MULADD(v_float32x8, s)
OPENCV_HAL_IMPL_LASX_MULADD(v_float64x4, d)
@@ -1552,20 +1554,20 @@ inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
{ return (v_uint32x8)__lasx_xvabsd_w(a.val, b.val); }
inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
{ return v_abs(v_sub(a, b)); }
{ return v_abs(a - b); }
inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
{ return v_abs(v_sub(a, b)); }
{ return v_abs(a - b); }
/** Saturating absolute difference **/
inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b)
{
v_int8x32 d = v_sub(a, b);
v_int8x32 m = v_lt(a, b);
return v_sub(v_xor(d, m), m);
v_int8x32 d = a - b;
v_int8x32 m = a < b;
return (d ^ m) - m;
}
inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b)
{ return v_sub(v_max(a, b), v_min(a, b)); }
{ return v_max(a, b) - v_min(a, b); }
////////// Conversions /////////
@@ -1887,7 +1889,7 @@ inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
{ return v_int32x8(__lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val))); }
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
{ return v_add(v_dotprod(a, b), c); }
{ return v_dotprod(a, b) + c; }
// 32 >> 64
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
@@ -1911,7 +1913,7 @@ inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
return v_uint32x8(__lasx_xvadd_w(prod0, prod1));
}
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
{
@@ -1922,7 +1924,7 @@ inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
return v_int32x8(__lasx_xvadd_w(prod0, prod1));
}
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
// 16 >> 64
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
@@ -1934,7 +1936,7 @@ inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
return v_uint64x4(__lasx_xvadd_d(prod0, prod1));
}
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
{
@@ -1946,13 +1948,13 @@ inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
}
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
// 32 >> 64f
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
//////// Fast Dot Product ////////
@@ -1989,7 +1991,7 @@ inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16&
return v_uint64x4(__lasx_xvadd_d(__lasx_xvilvl_d(prod1, prod0), __lasx_xvilvh_d(prod1, prod0)));
}
inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
{
@@ -2000,7 +2002,7 @@ inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
return v_int64x4(__lasx_xvadd_d(lo, hi));
}
inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
// 32 >> 64f
inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b)
@@ -2020,7 +2022,7 @@ inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0,
v_float32x8 v15 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0x55);
v_float32x8 v26 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xAA);
v_float32x8 v37 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xFF);
return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v_mul(v37, m3))));
return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
}
inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0,
@@ -3013,20 +3015,6 @@ inline void v_pack_store(hfloat* ptr, const v_float32x8& a)
inline void v256_cleanup() {}
#include "intrin_math.hpp"
inline v_float32x8 v_exp(const v_float32x8& x) { return v_exp_default_32f<v_float32x8, v_int32x8>(x); }
inline v_float32x8 v_log(const v_float32x8& x) { return v_log_default_32f<v_float32x8, v_int32x8>(x); }
inline void v_sincos(const v_float32x8& x, v_float32x8& s, v_float32x8& c) { v_sincos_default_32f<v_float32x8, v_int32x8>(x, s, c); }
inline v_float32x8 v_sin(const v_float32x8& x) { return v_sin_default_32f<v_float32x8, v_int32x8>(x); }
inline v_float32x8 v_cos(const v_float32x8& x) { return v_cos_default_32f<v_float32x8, v_int32x8>(x); }
inline v_float32x8 v_erf(const v_float32x8& x) { return v_erf_default_32f<v_float32x8, v_int32x8>(x); }
inline v_float64x4 v_exp(const v_float64x4& x) { return v_exp_default_64f<v_float64x4, v_int64x4>(x); }
inline v_float64x4 v_log(const v_float64x4& x) { return v_log_default_64f<v_float64x4, v_int64x4>(x); }
inline void v_sincos(const v_float64x4& x, v_float64x4& s, v_float64x4& c) { v_sincos_default_64f<v_float64x4, v_int64x4>(x, s, c); }
inline v_float64x4 v_sin(const v_float64x4& x) { return v_sin_default_64f<v_float64x4, v_int64x4>(x); }
inline v_float64x4 v_cos(const v_float64x4& x) { return v_cos_default_64f<v_float64x4, v_int64x4>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond

View File

@@ -1,111 +0,0 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
// This file has been created for compatibility with older versions of Universal Intrinsics
// Binary operators for vector types have been removed since version 4.11
// Include this file manually after OpenCV headers if you need these operators
#ifndef OPENCV_HAL_INTRIN_LEGACY_OPS_HPP
#define OPENCV_HAL_INTRIN_LEGACY_OPS_HPP
#ifdef __OPENCV_BUILD
#error "Universal Intrinsics operators are deprecated and should not be used in OpenCV library"
#endif
#ifdef __riscv
#warning "Operators might conflict with built-in functions on RISC-V platform"
#endif
#if defined(CV_VERSION) && CV_VERSION_MAJOR == 4 && CV_VERSION_MINOR < 9
#warning "Older versions of OpenCV (<4.9) already have Universal Intrinscs operators"
#endif
namespace cv { namespace hal {
#define BIN_OP(OP, FUN) \
template <typename R> R operator OP (const R & lhs, const R & rhs) { return FUN(lhs, rhs); }
#define BIN_A_OP(OP, FUN) \
template <typename R> R & operator OP (R & res, const R & val) { res = FUN(res, val); return res; }
#define UN_OP(OP, FUN) \
template <typename R> R operator OP (const R & val) { return FUN(val); }
BIN_OP(+, v_add)
BIN_OP(-, v_sub)
BIN_OP(*, v_mul)
BIN_OP(/, v_div)
BIN_OP(&, v_and)
BIN_OP(|, v_or)
BIN_OP(^, v_xor)
BIN_OP(==, v_eq)
BIN_OP(!=, v_ne)
BIN_OP(<, v_lt)
BIN_OP(>, v_gt)
BIN_OP(<=, v_le)
BIN_OP(>=, v_ge)
BIN_A_OP(+=, v_add)
BIN_A_OP(-=, v_sub)
BIN_A_OP(*=, v_mul)
BIN_A_OP(/=, v_div)
BIN_A_OP(&=, v_and)
BIN_A_OP(|=, v_or)
BIN_A_OP(^=, v_xor)
UN_OP(~, v_not)
// TODO: shift operators?
}} // cv::hal::
//==============================================================================
#ifdef OPENCV_ENABLE_INLINE_INTRIN_OPERATOR_TEST
namespace cv { namespace hal {
inline static void opencv_operator_compile_test()
{
using namespace cv;
v_float32 a, b, c;
uint8_t shift = 1;
a = b + c;
a = b - c;
a = b * c;
a = b / c;
a = b & c;
a = b | c;
a = b ^ c;
// a = b >> shift;
// a = b << shift;
a = (b == c);
a = (b != c);
a = (b < c);
a = (b > c);
a = (b <= c);
a = (b >= c);
a += b;
a -= b;
a *= b;
a /= b;
a &= b;
a |= b;
a ^= b;
// a <<= shift;
// a >>= shift;
a = ~b;
}
}} // cv::hal::
#endif
#endif // OPENCV_HAL_INTRIN_LEGACY_OPS_HPP
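If a downstream project still needed the removed infix operators, the intended use of this (now deleted) shim would look roughly like the following. This is a sketch, not part of the commit: it assumes the header is installed under opencv2/core/hal/ as before, that v_float32 is visible via `using namespace cv`, and `axpy` is a hypothetical helper name.

// hypothetical consumer code, built against OpenCV >= 4.11
#include <opencv2/core/hal/intrin.hpp>            // vector types first
#include <opencv2/core/hal/intrin_legacy_ops.hpp> // then the operator shims
using namespace cv;

inline v_float32 axpy(const v_float32& a, const v_float32& x, const v_float32& y)
{
    return a * x + y; // resolves to v_mul / v_add through the BIN_OP templates
}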

View File

@@ -417,10 +417,6 @@ inline __m128i _lsx_128_castpd_si128(const __m128d& v)
{ return _Tpvec(__lsx_vldi(0)); } \
inline _Tpvec v_setall_##suffix(_Tp v) \
{ return _Tpvec(__lsx_vreplgr2vr_##ssuffix((ctype_s)v)); } \
template <> inline _Tpvec v_setzero_() \
{ return v_setzero_##suffix(); } \
template <> inline _Tpvec v_setall_(_Tp v) \
{ return v_setall_##suffix(v); } \
OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16, suffix, OPENCV_HAL_NOP) \
OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16, suffix, OPENCV_HAL_NOP) \
OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8, suffix, OPENCV_HAL_NOP) \
@@ -452,10 +448,6 @@ inline __m128d _lsx_128_castsi128_pd(const __m128i &v)
{ return _Tpvec(__lsx_vldi(0)); } \
inline _Tpvec v_setall_##suffix(_Tp v) \
{ return _Tpvec(_v128_setall_##zsuffix(v)); } \
template <> inline _Tpvec v_setzero_() \
{ return v_setzero_##suffix(); } \
template <> inline _Tpvec v_setall_(_Tp v) \
{ return v_setall_##suffix(v); } \
OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16, suffix, cast) \
OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16, suffix, cast) \
OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8, suffix, cast) \
@@ -533,51 +525,53 @@ OPENCV_HAL_IMPL_LSX_ZIP(v_float64x2)
/** Arithmetics **/
#define OPENCV_HAL_IMPL_LSX_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); } \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ a.val = intrin(a.val, b.val); return a; }
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint8x16, __lsx_vsadd_bu)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint8x16, __lsx_vssub_bu)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int8x16, __lsx_vsadd_b)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int8x16, __lsx_vssub_b)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint16x8, __lsx_vsadd_hu)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint16x8, __lsx_vssub_hu)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int16x8, __lsx_vsadd_h)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int16x8, __lsx_vssub_h)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint32x4, __lsx_vadd_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint32x4, __lsx_vsub_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_uint32x4, __lsx_vmul_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int32x4, __lsx_vadd_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int32x4, __lsx_vsub_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_int32x4, __lsx_vmul_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint64x2, __lsx_vadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint64x2, __lsx_vsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int64x2, __lsx_vadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int64x2, __lsx_vsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint8x16, __lsx_vsadd_bu)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint8x16, __lsx_vssub_bu)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int8x16, __lsx_vsadd_b)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int8x16, __lsx_vssub_b)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint16x8, __lsx_vsadd_hu)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint16x8, __lsx_vssub_hu)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int16x8, __lsx_vsadd_h)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int16x8, __lsx_vssub_h)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint32x4, __lsx_vadd_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint32x4, __lsx_vsub_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_uint32x4, __lsx_vmul_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int32x4, __lsx_vadd_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int32x4, __lsx_vsub_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_int32x4, __lsx_vmul_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint64x2, __lsx_vadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint64x2, __lsx_vsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int64x2, __lsx_vadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int64x2, __lsx_vsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_float32x4, __lsx_vfadd_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_float32x4, __lsx_vfsub_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_float32x4, __lsx_vfmul_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_div, v_float32x4, __lsx_vfdiv_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_float64x2, __lsx_vfadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_float64x2, __lsx_vfsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_float64x2, __lsx_vfmul_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(v_div, v_float64x2, __lsx_vfdiv_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float32x4, __lsx_vfadd_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float32x4, __lsx_vfsub_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float32x4, __lsx_vfmul_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float32x4, __lsx_vfdiv_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float64x2, __lsx_vfadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float64x2, __lsx_vfsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float64x2, __lsx_vfmul_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float64x2, __lsx_vfdiv_d)
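For reviewers of the hunk above: after this change an instantiation such as OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float32x4, __lsx_vfadd_s) expands to approximately the pair below (a sketch of the token pasting, not compiler output). Note the compound-assignment form, which the named v_add/v_sub entry points never provided.

inline v_float32x4 operator + (const v_float32x4& a, const v_float32x4& b)
{ return v_float32x4(__lsx_vfadd_s(a.val, b.val)); }
inline v_float32x4& operator += (v_float32x4& a, const v_float32x4& b)
{ a.val = __lsx_vfadd_s(a.val, b.val); return a; }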
// saturating multiply 8-bit, 16-bit
inline v_uint8x16 v_mul(const v_uint8x16& a, const v_uint8x16& b)
inline v_uint8x16 operator * (const v_uint8x16& a, const v_uint8x16& b)
{
v_uint16x8 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
inline v_int8x16 v_mul(const v_int8x16& a, const v_int8x16& b)
inline v_int8x16 operator * (const v_int8x16& a, const v_int8x16& b)
{
v_int16x8 c, d;
v_mul_expand(a, b, c, d);
return v_pack(c, d);
}
inline v_uint16x8 v_mul(const v_uint16x8& a, const v_uint16x8& b)
inline v_uint16x8 operator * (const v_uint16x8& a, const v_uint16x8& b)
{
__m128i a0 = a.val, b0 = b.val;
__m128i pev = __lsx_vmulwev_w_hu(a0, b0);
@@ -586,7 +580,7 @@ inline v_uint16x8 v_mul(const v_uint16x8& a, const v_uint16x8& b)
__m128i ph = __lsx_vilvh_w(pod, pev);
return (v_uint16x8)__lsx_vssrlrni_hu_w(ph, pl, 0);
}
inline v_int16x8 v_mul(const v_int16x8& a, const v_int16x8& b)
inline v_int16x8 operator * (const v_int16x8& a, const v_int16x8& b)
{
__m128i a0 = a.val, b0 = b.val;
__m128i pev = __lsx_vmulwev_w_h(a0, b0);
@@ -595,6 +589,14 @@ inline v_int16x8 v_mul(const v_int16x8& a, const v_int16x8& b)
__m128i ph = __lsx_vilvh_w(pod, pev);
return (v_int16x8)__lsx_vssrarni_h_w(ph, pl, 0);
}
inline v_uint8x16& operator *= (v_uint8x16& a, const v_uint8x16& b)
{ a = a * b; return a; }
inline v_int8x16& operator *= (v_int8x16& a, const v_int8x16& b)
{ a = a * b; return a; }
inline v_uint16x8& operator *= (v_uint16x8& a, const v_uint16x8& b)
{ a = a * b; return a; }
inline v_int16x8& operator *= (v_int16x8& a, const v_int16x8& b)
{ a = a * b; return a; }
/** Non-saturating arithmetics **/
@@ -679,13 +681,13 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_LSX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
{ return _Tpuvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
{ return _Tpsvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
{ return _Tpuvec(__lsx_vsrl_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
{ return _Tpsvec(srai(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
template<int imm> \
inline _Tpuvec v_shl(const _Tpuvec& a) \
@@ -706,10 +708,10 @@ OPENCV_HAL_IMPL_LSX_SHIFT_OP(v_uint64x2, v_int64x2, d, __lsx_vsra_d)
/** Bitwise logic **/
#define OPENCV_HAL_IMPL_LSX_LOGIC_OP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_LSX_BIN_OP(v_and, _Tpvec, __lsx_vand_##suffix) \
OPENCV_HAL_IMPL_LSX_BIN_OP(v_or, _Tpvec, __lsx_vor_##suffix) \
OPENCV_HAL_IMPL_LSX_BIN_OP(v_xor, _Tpvec, __lsx_vxor_##suffix) \
inline _Tpvec v_not(const _Tpvec& a) \
OPENCV_HAL_IMPL_LSX_BIN_OP(&, _Tpvec, __lsx_vand_##suffix) \
OPENCV_HAL_IMPL_LSX_BIN_OP(|, _Tpvec, __lsx_vor_##suffix) \
OPENCV_HAL_IMPL_LSX_BIN_OP(^, _Tpvec, __lsx_vxor_##suffix) \
inline _Tpvec operator ~(const _Tpvec& a) \
{ return _Tpvec(__lsx_vnori_b(a.val, 0)); } \
OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint8x16, v)
@@ -722,14 +724,18 @@ OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint64x2, v)
OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int64x2, v)
#define OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin((__m128i)(a.val), (__m128i)(b.val))); }
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin((__m128i)(a.val), (__m128i)(b.val))); } \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ __m128i c = intrin((__m128i)(a.val), (__m128i)b.val); \
a.val = cast(c); \
return a;}
#define OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(_Tpvec, cast) \
OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(v_and, _Tpvec, __lsx_vand_v, cast) \
OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(v_or, _Tpvec, __lsx_vor_v, cast) \
OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(v_xor, _Tpvec, __lsx_vxor_v, cast) \
inline _Tpvec v_not(const _Tpvec& a) \
OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(&, _Tpvec, __lsx_vand_v, cast) \
OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(|, _Tpvec, __lsx_vor_v, cast) \
OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(^, _Tpvec, __lsx_vxor_v, cast) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ return _Tpvec(__lsx_vnori_b((__m128i)(a.val), 0)); } \
OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(v_float32x4, _lsx_128_castsi128_ps)
@@ -754,23 +760,23 @@ inline v_float64x2 v_select(const v_float64x2 &mask, const v_float64x2 &a, const
/** Comparison **/
#define OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpvec) \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_eq(a, b)); } \
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
{ return v_gt(b, a); } \
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_lt(a, b)); } \
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
{ return v_ge(b, a); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~( a == b ); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return b > a ; } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a < b); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return b >= a; } \
#define OPENCV_HAL_IMPL_LSX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) \
inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
{ return _Tpuvec(__lsx_vseq_##suffix(a.val, b.val)); } \
inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
{ return _Tpuvec(__lsx_vslt_##usuffix(b.val, a.val)); } \
inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(__lsx_vseq_##suffix(a.val, b.val)); } \
inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(__lsx_vslt_##suffix(b.val, a.val)); } \
OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpuvec) \
OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpsvec)
@@ -780,37 +786,37 @@ OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint16x8, v_int16x8, h, hu)
OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint32x4, v_int32x4, w, wu)
#define OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(_Tpvec, suffix) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(__lsx_vseq_##suffix(a.val, b.val)); } \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_eq(a, b)); }
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); }
OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_uint64x2, d)
OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_int64x2, d)
#define OPENCV_HAL_IMPL_LSX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(__lsx_##suffix##_##ssuffix(a.val, b.val)); } \
#define OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(_Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_FLT(v_eq, vfcmp_ceq, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_FLT(v_ne, vfcmp_cne, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_FLT(v_lt, vfcmp_clt, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_FLT(v_le, vfcmp_cle, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_FLT(==, vfcmp_ceq, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_FLT(!=, vfcmp_cne, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_FLT(<, vfcmp_clt, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_FLT(<=, vfcmp_cle, _Tpvec, ssuffix) \
OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float32x4, s)
OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float64x2, d)
inline v_float32x4 v_gt(const v_float32x4 &a, const v_float32x4 &b)
inline v_float32x4 operator > (const v_float32x4 &a, const v_float32x4 &b)
{ return v_float32x4(__lsx_vfcmp_clt_s(b.val, a.val)); }
inline v_float32x4 v_ge(const v_float32x4 &a, const v_float32x4 &b)
inline v_float32x4 operator >= (const v_float32x4 &a, const v_float32x4 &b)
{ return v_float32x4(__lsx_vfcmp_cle_s(b.val, a.val)); }
inline v_float64x2 v_gt(const v_float64x2 &a, const v_float64x2 &b)
inline v_float64x2 operator > (const v_float64x2 &a, const v_float64x2 &b)
{ return v_float64x2(__lsx_vfcmp_clt_d(b.val, a.val)); }
inline v_float64x2 v_ge(const v_float64x2 &a, const v_float64x2 &b)
inline v_float64x2 operator >= (const v_float64x2 &a, const v_float64x2 &b)
{ return v_float64x2(__lsx_vfcmp_cle_d(b.val, a.val)); }
inline v_float32x4 v_not_nan(const v_float32x4& a)
@@ -1182,7 +1188,7 @@ inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
{
v_float32x4 a_b = v_sub(a, b);
v_float32x4 a_b = a - b;
return v_reduce_sum(v_float32x4((__m128i)a_b.val & __lsx_vreplgr2vr_w(0x7fffffff)));
}
@@ -1289,9 +1295,9 @@ OPENCV_HAL_IMPL_LSX_CHECK(v_float64x2, 3)
inline _Tpvec v_sqrt(const _Tpvec& x) \
{ return _Tpvec(__lsx_vfsqrt_##suffix(x.val)); } \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_fma(a, a, v_mul(b, b)); } \
{ return v_fma(a, a, b * b); } \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_sqrt(v_fma(a, a, v_mul(b, b))); }
{ return v_sqrt(v_fma(a, a, b * b)); }
OPENCV_HAL_IMPL_LSX_MULADD(v_float32x4, s)
OPENCV_HAL_IMPL_LSX_MULADD(v_float64x2, d)
@@ -1343,20 +1349,20 @@ inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{ return (v_uint32x4)__lsx_vabsd_w(a.val, b.val); }
inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
{ return v_abs(v_sub(a, b)); }
{ return v_abs(a - b); }
inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
{ return v_abs(v_sub(a, b)); }
{ return v_abs(a - b); }
/** Saturating absolute difference **/
inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
{
v_int8x16 d = v_sub(a, b);
v_int8x16 m = v_lt(a, b);
return v_sub(v_xor(d, m), m);
v_int8x16 d = a - b;
v_int8x16 m = a < b;
return (d ^ m) - m;
}
inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
{ return v_sub(v_max(a, b), v_min(a, b)); }
{ return v_max(a, b) - v_min(a, b); }
///////// Conversions /////////
@@ -1667,7 +1673,7 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
}
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_add(v_dotprod_expand(a, b), c) ;}
{ return v_dotprod_expand(a, b) + c ;}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
@@ -1679,7 +1685,7 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
return v_int32x4(__lsx_vadd_w(prod0, prod1));
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
// 16 >> 64
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
@@ -1692,7 +1698,7 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
return v_uint64x2(__lsx_vadd_d(prod0, prod1));
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
@@ -1704,13 +1710,13 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
return v_int64x2(__lsx_vadd_d(prod0, prod1));
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
//32 >> 64f
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
///////// Fast Dot Product //////
@@ -1749,7 +1755,7 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b
return v_uint64x2(__lsx_vilvl_d(__lsx_vhaddw_qu_du(prod0, prod0), __lsx_vhaddw_qu_du(prod1, prod1)));
}
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{
@@ -1761,7 +1767,7 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
return v_int64x2(__lsx_vadd_d(lo, hi));
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
// 32 >> 64f
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
@@ -2523,20 +2529,6 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& a)
inline void v_cleanup() {}
#include "intrin_math.hpp"
inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond

View File

@@ -1,687 +0,0 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
/* Universal Intrinsics implementation of sin, cos, exp and log
Inspired by Intel Approximate Math library, and based on the
corresponding algorithms of the cephes math library
*/
/* Copyright (C) 2010,2011 RJVB - extensions */
/* Copyright (C) 2011 Julien Pommier
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
(this is the zlib license)
*/
#ifndef OPENCV_HAL_INTRIN_MATH_HPP
#define OPENCV_HAL_INTRIN_MATH_HPP
//! @name Exponential
//! @{
// Implementation mirrors the float32 vector version below.
template<typename _TpVec16F, typename _TpVec16S>
inline _TpVec16F v_exp_default_16f(const _TpVec16F &x) {
const _TpVec16F _vexp_lo_f16 = v_setall_<_TpVec16F>(-10.7421875f);
const _TpVec16F _vexp_hi_f16 = v_setall_<_TpVec16F>(11.f);
const _TpVec16F _vexp_half_fp16 = v_setall_<_TpVec16F>(0.5f);
const _TpVec16F _vexp_one_fp16 = v_setall_<_TpVec16F>(1.f);
const _TpVec16F _vexp_LOG2EF_f16 = v_setall_<_TpVec16F>(1.44269504088896341f);
const _TpVec16F _vexp_C1_f16 = v_setall_<_TpVec16F>(-6.93359375E-1f);
const _TpVec16F _vexp_C2_f16 = v_setall_<_TpVec16F>(2.12194440E-4f);
const _TpVec16F _vexp_p0_f16 = v_setall_<_TpVec16F>(1.9875691500E-4f);
const _TpVec16F _vexp_p1_f16 = v_setall_<_TpVec16F>(1.3981999507E-3f);
const _TpVec16F _vexp_p2_f16 = v_setall_<_TpVec16F>(8.3334519073E-3f);
const _TpVec16F _vexp_p3_f16 = v_setall_<_TpVec16F>(4.1665795894E-2f);
const _TpVec16F _vexp_p4_f16 = v_setall_<_TpVec16F>(1.6666665459E-1f);
const _TpVec16F _vexp_p5_f16 = v_setall_<_TpVec16F>(5.0000001201E-1f);
_TpVec16F _vexp_, _vexp_x, _vexp_y, _vexp_xx;
_TpVec16S _vexp_mm;
const _TpVec16S _vexp_bias_s16 = v_setall_<_TpVec16S>((short)0xf);
// compute exponential of x
_vexp_x = v_max(x, _vexp_lo_f16);
_vexp_x = v_min(_vexp_x, _vexp_hi_f16);
_vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f16, _vexp_half_fp16);
_vexp_mm = v_floor(_vexp_);
_vexp_ = v_cvt_f16(_vexp_mm);
_vexp_mm = v_add(_vexp_mm, _vexp_bias_s16);
_vexp_mm = v_shl(_vexp_mm, 10);
_vexp_x = v_fma(_vexp_, _vexp_C1_f16, _vexp_x);
_vexp_x = v_fma(_vexp_, _vexp_C2_f16, _vexp_x);
_vexp_xx = v_mul(_vexp_x, _vexp_x);
_vexp_y = v_fma(_vexp_x, _vexp_p0_f16, _vexp_p1_f16);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f16);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f16);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f16);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f16);
_vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x);
_vexp_y = v_add(_vexp_y, _vexp_one_fp16);
_vexp_y = v_mul(_vexp_y, v_reinterpret_as_f16(_vexp_mm));
// exp(NAN) -> NAN
_TpVec16F mask_not_nan = v_not_nan(x);
return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00)));
}
template<typename _TpVec32F, typename _TpVec32S>
inline _TpVec32F v_exp_default_32f(const _TpVec32F &x) {
const _TpVec32F _vexp_lo_f32 = v_setall_<_TpVec32F>(-88.3762626647949f);
const _TpVec32F _vexp_hi_f32 = v_setall_<_TpVec32F>(89.f);
const _TpVec32F _vexp_half_fp32 = v_setall_<_TpVec32F>(0.5f);
const _TpVec32F _vexp_one_fp32 = v_setall_<_TpVec32F>(1.f);
const _TpVec32F _vexp_LOG2EF_f32 = v_setall_<_TpVec32F>(1.44269504088896341f);
const _TpVec32F _vexp_C1_f32 = v_setall_<_TpVec32F>(-6.93359375E-1f);
const _TpVec32F _vexp_C2_f32 = v_setall_<_TpVec32F>(2.12194440E-4f);
const _TpVec32F _vexp_p0_f32 = v_setall_<_TpVec32F>(1.9875691500E-4f);
const _TpVec32F _vexp_p1_f32 = v_setall_<_TpVec32F>(1.3981999507E-3f);
const _TpVec32F _vexp_p2_f32 = v_setall_<_TpVec32F>(8.3334519073E-3f);
const _TpVec32F _vexp_p3_f32 = v_setall_<_TpVec32F>(4.1665795894E-2f);
const _TpVec32F _vexp_p4_f32 = v_setall_<_TpVec32F>(1.6666665459E-1f);
const _TpVec32F _vexp_p5_f32 = v_setall_<_TpVec32F>(5.0000001201E-1f);
_TpVec32F _vexp_, _vexp_x, _vexp_y, _vexp_xx;
_TpVec32S _vexp_mm;
const _TpVec32S _vexp_bias_s32 = v_setall_<_TpVec32S>((int)0x7f);
// compute exponential of x
_vexp_x = v_max(x, _vexp_lo_f32);
_vexp_x = v_min(_vexp_x, _vexp_hi_f32);
_vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f32, _vexp_half_fp32);
_vexp_mm = v_floor(_vexp_);
_vexp_ = v_cvt_f32(_vexp_mm);
_vexp_mm = v_add(_vexp_mm, _vexp_bias_s32);
_vexp_mm = v_shl(_vexp_mm, 23);
_vexp_x = v_fma(_vexp_, _vexp_C1_f32, _vexp_x);
_vexp_x = v_fma(_vexp_, _vexp_C2_f32, _vexp_x);
_vexp_xx = v_mul(_vexp_x, _vexp_x);
_vexp_y = v_fma(_vexp_x, _vexp_p0_f32, _vexp_p1_f32);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f32);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f32);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f32);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f32);
_vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x);
_vexp_y = v_add(_vexp_y, _vexp_one_fp32);
_vexp_y = v_mul(_vexp_y, v_reinterpret_as_f32(_vexp_mm));
// exp(NAN) -> NAN
_TpVec32F mask_not_nan = v_not_nan(x);
return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000)));
}
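Because this file is removed by the commit, a scalar restatement of the float32 path may help check what the vector code computed: clamp, n = round(x·log2 e), subtract n·ln 2 in two parts, a degree-5 polynomial, then scale by 2^n through the exponent bits. exp_ref is a hypothetical name, and the NaN fixup from the v_select tail is omitted.

#include <cmath>
#include <cstdint>
#include <cstring>

static float exp_ref(float x) // scalar sketch of v_exp_default_32f
{
    x = std::fmin(std::fmax(x, -88.3762626647949f), 89.f);         // _vexp_lo / _vexp_hi clamp
    float n = std::floor(std::fma(x, 1.44269504088896341f, 0.5f)); // round(x / ln 2)
    x = std::fma(n, -6.93359375E-1f, x);  // x -= n*ln2, high part (C1)
    x = std::fma(n,  2.12194440E-4f, x);  // x -= n*ln2, low part (C2)
    float y = 1.9875691500E-4f;           // degree-5 polynomial, p0..p5
    y = std::fma(y, x, 1.3981999507E-3f);
    y = std::fma(y, x, 8.3334519073E-3f);
    y = std::fma(y, x, 4.1665795894E-2f);
    y = std::fma(y, x, 1.6666665459E-1f);
    y = std::fma(y, x, 5.0000001201E-1f);
    y = std::fma(y, x * x, x) + 1.f;
    int32_t bits = ((int32_t)n + 0x7f) << 23; // 2^n via the exponent field
    float scale;
    std::memcpy(&scale, &bits, sizeof(scale));
    return y * scale;
}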
template<typename _TpVec64F, typename _TpVec64S>
inline _TpVec64F v_exp_default_64f(const _TpVec64F &x) {
const _TpVec64F _vexp_lo_f64 = v_setall_<_TpVec64F>(-709.43613930310391424428);
const _TpVec64F _vexp_hi_f64 = v_setall_<_TpVec64F>(710.);
const _TpVec64F _vexp_half_f64 = v_setall_<_TpVec64F>(0.5);
const _TpVec64F _vexp_one_f64 = v_setall_<_TpVec64F>(1.0);
const _TpVec64F _vexp_two_f64 = v_setall_<_TpVec64F>(2.0);
const _TpVec64F _vexp_LOG2EF_f64 = v_setall_<_TpVec64F>(1.44269504088896340736);
const _TpVec64F _vexp_C1_f64 = v_setall_<_TpVec64F>(-6.93145751953125E-1);
const _TpVec64F _vexp_C2_f64 = v_setall_<_TpVec64F>(-1.42860682030941723212E-6);
const _TpVec64F _vexp_p0_f64 = v_setall_<_TpVec64F>(1.26177193074810590878E-4);
const _TpVec64F _vexp_p1_f64 = v_setall_<_TpVec64F>(3.02994407707441961300E-2);
const _TpVec64F _vexp_p2_f64 = v_setall_<_TpVec64F>(9.99999999999999999910E-1);
const _TpVec64F _vexp_q0_f64 = v_setall_<_TpVec64F>(3.00198505138664455042E-6);
const _TpVec64F _vexp_q1_f64 = v_setall_<_TpVec64F>(2.52448340349684104192E-3);
const _TpVec64F _vexp_q2_f64 = v_setall_<_TpVec64F>(2.27265548208155028766E-1);
const _TpVec64F _vexp_q3_f64 = v_setall_<_TpVec64F>(2.00000000000000000009E0);
_TpVec64F _vexp_, _vexp_x, _vexp_y, _vexp_z, _vexp_xx;
_TpVec64S _vexp_mm;
const _TpVec64S _vexp_bias_s64 = v_setall_<_TpVec64S>((int64)0x3ff);
// compute exponential of x
_vexp_x = v_max(x, _vexp_lo_f64);
_vexp_x = v_min(_vexp_x, _vexp_hi_f64);
_vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f64, _vexp_half_f64);
_vexp_mm = v_expand_low(v_floor(_vexp_));
_vexp_ = v_cvt_f64(_vexp_mm);
_vexp_mm = v_add(_vexp_mm, _vexp_bias_s64);
_vexp_mm = v_shl(_vexp_mm, 52);
_vexp_x = v_fma(_vexp_, _vexp_C1_f64, _vexp_x);
_vexp_x = v_fma(_vexp_, _vexp_C2_f64, _vexp_x);
_vexp_xx = v_mul(_vexp_x, _vexp_x);
_vexp_y = v_fma(_vexp_xx, _vexp_p0_f64, _vexp_p1_f64);
_vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_p2_f64);
_vexp_y = v_mul(_vexp_y, _vexp_x);
_vexp_z = v_fma(_vexp_xx, _vexp_q0_f64, _vexp_q1_f64);
_vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q2_f64);
_vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q3_f64);
_vexp_z = v_div(_vexp_y, v_sub(_vexp_z, _vexp_y));
_vexp_z = v_fma(_vexp_two_f64, _vexp_z, _vexp_one_f64);
_vexp_z = v_mul(_vexp_z, v_reinterpret_as_f64(_vexp_mm));
// exp(NAN) -> NAN
_TpVec64F mask_not_nan = v_not_nan(x);
return v_select(mask_not_nan, _vexp_z, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7FF8000000000000)));
}
//! @}
//! @name Natural Logarithm
//! @{
template<typename _TpVec16F, typename _TpVec16S>
inline _TpVec16F v_log_default_16f(const _TpVec16F &x) {
const _TpVec16F _vlog_one_fp16 = v_setall_<_TpVec16F>(1.0f);
const _TpVec16F _vlog_SQRTHF_fp16 = v_setall_<_TpVec16F>(0.707106781186547524f);
const _TpVec16F _vlog_q1_fp16 = v_setall_<_TpVec16F>(-2.12194440E-4f);
const _TpVec16F _vlog_q2_fp16 = v_setall_<_TpVec16F>(0.693359375f);
const _TpVec16F _vlog_p0_fp16 = v_setall_<_TpVec16F>(7.0376836292E-2f);
const _TpVec16F _vlog_p1_fp16 = v_setall_<_TpVec16F>(-1.1514610310E-1f);
const _TpVec16F _vlog_p2_fp16 = v_setall_<_TpVec16F>(1.1676998740E-1f);
const _TpVec16F _vlog_p3_fp16 = v_setall_<_TpVec16F>(-1.2420140846E-1f);
const _TpVec16F _vlog_p4_fp16 = v_setall_<_TpVec16F>(1.4249322787E-1f);
const _TpVec16F _vlog_p5_fp16 = v_setall_<_TpVec16F>(-1.6668057665E-1f);
const _TpVec16F _vlog_p6_fp16 = v_setall_<_TpVec16F>(2.0000714765E-1f);
const _TpVec16F _vlog_p7_fp16 = v_setall_<_TpVec16F>(-2.4999993993E-1f);
const _TpVec16F _vlog_p8_fp16 = v_setall_<_TpVec16F>(3.3333331174E-1f);
_TpVec16F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp;
_TpVec16S _vlog_ux, _vlog_emm0;
const _TpVec16S _vlog_inv_mant_mask_s16 = v_setall_<_TpVec16S>((short)~0x7c00);
_vlog_ux = v_reinterpret_as_s16(x);
_vlog_emm0 = v_shr(_vlog_ux, 10);
_vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s16);
_vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s16(v_setall_<_TpVec16F>(0.5f)));
_vlog_x = v_reinterpret_as_f16(_vlog_ux);
_vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec16S>((short)0xf));
_vlog_e = v_cvt_f16(_vlog_emm0);
_vlog_e = v_add(_vlog_e, _vlog_one_fp16);
_TpVec16F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp16);
_vlog_tmp = v_and(_vlog_x, _vlog_mask);
_vlog_x = v_sub(_vlog_x, _vlog_one_fp16);
_vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp16, _vlog_mask));
_vlog_x = v_add(_vlog_x, _vlog_tmp);
_vlog_z = v_mul(_vlog_x, _vlog_x);
_vlog_y = v_fma(_vlog_p0_fp16, _vlog_x, _vlog_p1_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp16);
_vlog_y = v_mul(_vlog_y, _vlog_x);
_vlog_y = v_mul(_vlog_y, _vlog_z);
_vlog_y = v_fma(_vlog_e, _vlog_q1_fp16, _vlog_y);
_vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, v_setall_<_TpVec16F>(0.5f)));
_vlog_x = v_add(_vlog_x, _vlog_y);
_vlog_x = v_fma(_vlog_e, _vlog_q2_fp16, _vlog_x);
// log(0) -> -INF
_TpVec16F mask_zero = v_eq(x, v_setzero_<_TpVec16F>());
_vlog_x = v_select(mask_zero, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0xfc00)), _vlog_x);
// log(NEG), log(NAN) -> NAN
_TpVec16F mask_not_nan = v_ge(x, v_setzero_<_TpVec16F>());
_vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00)));
// log(INF) -> INF
_TpVec16F mask_inf = v_eq(x, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7c00)));
_vlog_x = v_select(mask_inf, x, _vlog_x);
return _vlog_x;
}
template<typename _TpVec32F, typename _TpVec32S>
inline _TpVec32F v_log_default_32f(const _TpVec32F &x) {
const _TpVec32F _vlog_one_fp32 = v_setall_<_TpVec32F>(1.0f);
const _TpVec32F _vlog_SQRTHF_fp32 = v_setall_<_TpVec32F>(0.707106781186547524f);
const _TpVec32F _vlog_q1_fp32 = v_setall_<_TpVec32F>(-2.12194440E-4f);
const _TpVec32F _vlog_q2_fp32 = v_setall_<_TpVec32F>(0.693359375f);
const _TpVec32F _vlog_p0_fp32 = v_setall_<_TpVec32F>(7.0376836292E-2f);
const _TpVec32F _vlog_p1_fp32 = v_setall_<_TpVec32F>(-1.1514610310E-1f);
const _TpVec32F _vlog_p2_fp32 = v_setall_<_TpVec32F>(1.1676998740E-1f);
const _TpVec32F _vlog_p3_fp32 = v_setall_<_TpVec32F>(-1.2420140846E-1f);
const _TpVec32F _vlog_p4_fp32 = v_setall_<_TpVec32F>(1.4249322787E-1f);
const _TpVec32F _vlog_p5_fp32 = v_setall_<_TpVec32F>(-1.6668057665E-1f);
const _TpVec32F _vlog_p6_fp32 = v_setall_<_TpVec32F>(2.0000714765E-1f);
const _TpVec32F _vlog_p7_fp32 = v_setall_<_TpVec32F>(-2.4999993993E-1f);
const _TpVec32F _vlog_p8_fp32 = v_setall_<_TpVec32F>(3.3333331174E-1f);
_TpVec32F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp;
_TpVec32S _vlog_ux, _vlog_emm0;
const _TpVec32S _vlog_inv_mant_mask_s32 = v_setall_<_TpVec32S>((int)~0x7f800000);
_vlog_ux = v_reinterpret_as_s32(x);
_vlog_emm0 = v_shr(_vlog_ux, 23);
_vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s32);
_vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s32(v_setall_<_TpVec32F>(0.5f)));
_vlog_x = v_reinterpret_as_f32(_vlog_ux);
_vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec32S>((int)0x7f));
_vlog_e = v_cvt_f32(_vlog_emm0);
_vlog_e = v_add(_vlog_e, _vlog_one_fp32);
_TpVec32F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp32);
_vlog_tmp = v_and(_vlog_x, _vlog_mask);
_vlog_x = v_sub(_vlog_x, _vlog_one_fp32);
_vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp32, _vlog_mask));
_vlog_x = v_add(_vlog_x, _vlog_tmp);
_vlog_z = v_mul(_vlog_x, _vlog_x);
_vlog_y = v_fma(_vlog_p0_fp32, _vlog_x, _vlog_p1_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp32);
_vlog_y = v_mul(_vlog_y, _vlog_x);
_vlog_y = v_mul(_vlog_y, _vlog_z);
_vlog_y = v_fma(_vlog_e, _vlog_q1_fp32, _vlog_y);
_vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, v_setall_<_TpVec32F>(0.5f)));
_vlog_x = v_add(_vlog_x, _vlog_y);
_vlog_x = v_fma(_vlog_e, _vlog_q2_fp32, _vlog_x);
// log(0) -> -INF
_TpVec32F mask_zero = v_eq(x, v_setzero_<_TpVec32F>());
_vlog_x = v_select(mask_zero, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0xff800000)), _vlog_x);
// log(NEG), log(NAN) -> NAN
_TpVec32F mask_not_nan = v_ge(x, v_setzero_<_TpVec32F>());
_vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000)));
// log(INF) -> INF
_TpVec32F mask_inf = v_eq(x, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7f800000)));
_vlog_x = v_select(mask_inf, x, _vlog_x);
return _vlog_x;
}
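Again as a review aid for the deleted code, a scalar sketch of the float32 log path: split the input into a mantissa in [0.5, 1) and an exponent, fold the mantissa around sqrt(1/2), evaluate the degree-8 polynomial, then reassemble with the split ln 2 constants. log_ref is a hypothetical name; the 0/negative/inf/NaN v_select fixups are omitted.

#include <cmath>
#include <cstdint>
#include <cstring>

static float log_ref(float x) // scalar sketch of v_log_default_32f, x > 0 assumed
{
    int32_t ux;
    std::memcpy(&ux, &x, sizeof(ux));
    float ef = (float)((ux >> 23) - 0x7f) + 1.f;     // unbiased exponent + 1
    ux = (ux & ~0x7f800000) | 0x3f000000;            // mantissa scaled into [0.5, 1)
    float m;
    std::memcpy(&m, &ux, sizeof(m));
    if (m < 0.707106781186547524f) { ef -= 1.f; m = m + m - 1.f; } // SQRTHF fold
    else                           { m -= 1.f; }
    float z = m * m;
    float y = 7.0376836292E-2f;                      // degree-8 polynomial, p0..p8
    y = std::fma(y, m, -1.1514610310E-1f);
    y = std::fma(y, m,  1.1676998740E-1f);
    y = std::fma(y, m, -1.2420140846E-1f);
    y = std::fma(y, m,  1.4249322787E-1f);
    y = std::fma(y, m, -1.6668057665E-1f);
    y = std::fma(y, m,  2.0000714765E-1f);
    y = std::fma(y, m, -2.4999993993E-1f);
    y = std::fma(y, m,  3.3333331174E-1f);
    y = y * m * z;
    y = std::fma(ef, -2.12194440E-4f, y);            // q1 correction
    y -= 0.5f * z;
    return std::fma(ef, 0.693359375f, m + y);        // q2 reassembly
}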
template<typename _TpVec64F, typename _TpVec64S>
inline _TpVec64F v_log_default_64f(const _TpVec64F &x) {
const _TpVec64F _vlog_one_fp64 = v_setall_<_TpVec64F>(1.0);
const _TpVec64F _vlog_SQRTHF_fp64 = v_setall_<_TpVec64F>(0.7071067811865475244);
const _TpVec64F _vlog_p0_fp64 = v_setall_<_TpVec64F>(1.01875663804580931796E-4);
const _TpVec64F _vlog_p1_fp64 = v_setall_<_TpVec64F>(4.97494994976747001425E-1);
const _TpVec64F _vlog_p2_fp64 = v_setall_<_TpVec64F>(4.70579119878881725854);
const _TpVec64F _vlog_p3_fp64 = v_setall_<_TpVec64F>(1.44989225341610930846E1);
const _TpVec64F _vlog_p4_fp64 = v_setall_<_TpVec64F>(1.79368678507819816313E1);
const _TpVec64F _vlog_p5_fp64 = v_setall_<_TpVec64F>(7.70838733755885391666);
const _TpVec64F _vlog_q0_fp64 = v_setall_<_TpVec64F>(1.12873587189167450590E1);
const _TpVec64F _vlog_q1_fp64 = v_setall_<_TpVec64F>(4.52279145837532221105E1);
const _TpVec64F _vlog_q2_fp64 = v_setall_<_TpVec64F>(8.29875266912776603211E1);
const _TpVec64F _vlog_q3_fp64 = v_setall_<_TpVec64F>(7.11544750618563894466E1);
const _TpVec64F _vlog_q4_fp64 = v_setall_<_TpVec64F>(2.31251620126765340583E1);
const _TpVec64F _vlog_C0_fp64 = v_setall_<_TpVec64F>(2.121944400546905827679e-4);
const _TpVec64F _vlog_C1_fp64 = v_setall_<_TpVec64F>(0.693359375);
_TpVec64F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp, _vlog_xx;
_TpVec64S _vlog_ux, _vlog_emm0;
const _TpVec64S _vlog_inv_mant_mask_s64 = v_setall_<_TpVec64S>((int64)~0x7ff0000000000000);
_vlog_ux = v_reinterpret_as_s64(x);
_vlog_emm0 = v_shr(_vlog_ux, 52);
_vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s64);
_vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s64(v_setall_<_TpVec64F>(0.5)));
_vlog_x = v_reinterpret_as_f64(_vlog_ux);
_vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec64S>((int64)0x3ff));
_vlog_e = v_cvt_f64(_vlog_emm0);
_vlog_e = v_add(_vlog_e, _vlog_one_fp64);
_TpVec64F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp64);
_vlog_tmp = v_and(_vlog_x, _vlog_mask);
_vlog_x = v_sub(_vlog_x, _vlog_one_fp64);
_vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp64, _vlog_mask));
_vlog_x = v_add(_vlog_x, _vlog_tmp);
_vlog_xx = v_mul(_vlog_x, _vlog_x);
_vlog_y = v_fma(_vlog_p0_fp64, _vlog_x, _vlog_p1_fp64);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp64);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp64);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp64);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp64);
_vlog_y = v_mul(_vlog_y, _vlog_x);
_vlog_y = v_mul(_vlog_y, _vlog_xx);
_vlog_z = v_add(_vlog_x, _vlog_q0_fp64);
_vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q1_fp64);
_vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q2_fp64);
_vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q3_fp64);
_vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q4_fp64);
_vlog_z = v_div(_vlog_y, _vlog_z);
_vlog_z = v_sub(_vlog_z, v_mul(_vlog_e, _vlog_C0_fp64));
_vlog_z = v_sub(_vlog_z, v_mul(_vlog_xx, v_setall_<_TpVec64F>(0.5)));
_vlog_z = v_add(_vlog_z, _vlog_x);
_vlog_z = v_fma(_vlog_e, _vlog_C1_fp64, _vlog_z);
// log(0) -> -INF
_TpVec64F mask_zero = v_eq(x, v_setzero_<_TpVec64F>());
_vlog_z = v_select(mask_zero, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0xfff0000000000000)), _vlog_z);
// log(NEG), log(NAN) -> NAN
_TpVec64F mask_not_nan = v_ge(x, v_setzero_<_TpVec64F>());
_vlog_z = v_select(mask_not_nan, _vlog_z, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff8000000000000)));
// log(INF) -> INF
_TpVec64F mask_inf = v_eq(x, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff0000000000000)));
_vlog_z = v_select(mask_inf, x, _vlog_z);
return _vlog_z;
}
//! @}
//! @name Sine and Cosine
//! @{
template<typename _TpVec16F, typename _TpVec16S>
inline void v_sincos_default_16f(const _TpVec16F &x, _TpVec16F &ysin, _TpVec16F &ycos) {
const _TpVec16F v_cephes_FOPI = v_setall_<_TpVec16F>(hfloat(1.27323954473516f)); // 4 / M_PI
const _TpVec16F v_minus_DP1 = v_setall_<_TpVec16F>(hfloat(-0.78515625f));
const _TpVec16F v_minus_DP2 = v_setall_<_TpVec16F>(hfloat(-2.4187564849853515625E-4f));
const _TpVec16F v_minus_DP3 = v_setall_<_TpVec16F>(hfloat(-3.77489497744594108E-8f));
const _TpVec16F v_sincof_p0 = v_setall_<_TpVec16F>(hfloat(-1.9515295891E-4f));
const _TpVec16F v_sincof_p1 = v_setall_<_TpVec16F>(hfloat(8.3321608736E-3f));
const _TpVec16F v_sincof_p2 = v_setall_<_TpVec16F>(hfloat(-1.6666654611E-1f));
const _TpVec16F v_coscof_p0 = v_setall_<_TpVec16F>(hfloat(2.443315711809948E-5f));
const _TpVec16F v_coscof_p1 = v_setall_<_TpVec16F>(hfloat(-1.388731625493765E-3f));
const _TpVec16F v_coscof_p2 = v_setall_<_TpVec16F>(hfloat(4.166664568298827E-2f));
const _TpVec16F v_nan = v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00));
const _TpVec16F v_neg_zero = v_setall_<_TpVec16F>(hfloat(-0.f));
_TpVec16F _vx, _vy, sign_mask_sin, sign_mask_cos;
_TpVec16S emm2;
sign_mask_sin = v_lt(x, v_setzero_<_TpVec16F>());
_vx = v_abs(x);
_vy = v_mul(_vx, v_cephes_FOPI);
emm2 = v_trunc(_vy);
emm2 = v_add(emm2, v_setall_<_TpVec16S>((short)1));
emm2 = v_and(emm2, v_setall_<_TpVec16S>((short)~1));
_vy = v_cvt_f16(emm2);
_TpVec16F poly_mask = v_reinterpret_as_f16(v_eq(v_and(emm2, v_setall_<_TpVec16S>((short)2)), v_setall_<_TpVec16S>((short)0)));
_vx = v_fma(_vy, v_minus_DP1, _vx);
_vx = v_fma(_vy, v_minus_DP2, _vx);
_vx = v_fma(_vy, v_minus_DP3, _vx);
sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f16(v_eq(v_and(emm2, v_setall_<_TpVec16S>((short)4)), v_setall_<_TpVec16S>((short)0))));
sign_mask_cos = v_reinterpret_as_f16(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec16S>((short)2)), v_setall_<_TpVec16S>((short)4)), v_setall_<_TpVec16S>((short)0)));
_TpVec16F _vxx = v_mul(_vx, _vx);
_TpVec16F y1, y2;
y1 = v_fma(v_coscof_p0, _vxx, v_coscof_p1);
y1 = v_fma(y1, _vxx, v_coscof_p2);
y1 = v_fma(y1, _vxx, v_setall_<_TpVec16F>(hfloat(-0.5f)));
y1 = v_fma(y1, _vxx, v_setall_<_TpVec16F>(hfloat(1.f)));
y2 = v_fma(v_sincof_p0, _vxx, v_sincof_p1);
y2 = v_fma(y2, _vxx, v_sincof_p2);
y2 = v_mul(y2, _vxx);
y2 = v_fma(y2, _vx, _vx);
ysin = v_select(poly_mask, y2, y1);
ycos = v_select(poly_mask, y1, y2);
ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin));
ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos);
// sincos(NAN) -> NAN, sincos(±INF) -> NAN
_TpVec16F mask_inf = v_eq(_vx, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7c00)));
_TpVec16F mask_nan = v_or(mask_inf, v_ne(x, x));
ysin = v_select(mask_nan, v_nan, ysin);
ycos = v_select(mask_nan, v_nan, ycos);
}
template<typename _TpVec16F, typename _TpVec16S>
inline _TpVec16F v_sin_default_16f(const _TpVec16F &x) {
_TpVec16F ysin, ycos;
v_sincos_default_16f<_TpVec16F, _TpVec16S>(x, ysin, ycos);
return ysin;
}
template<typename _TpVec16F, typename _TpVec16S>
inline _TpVec16F v_cos_default_16f(const _TpVec16F &x) {
_TpVec16F ysin, ycos;
v_sincos_default_16f<_TpVec16F, _TpVec16S>(x, ysin, ycos);
return ycos;
}
template<typename _TpVec32F, typename _TpVec32S>
inline void v_sincos_default_32f(const _TpVec32F &x, _TpVec32F &ysin, _TpVec32F &ycos) {
const _TpVec32F v_cephes_FOPI = v_setall_<_TpVec32F>(1.27323954473516f); // 4 / M_PI
const _TpVec32F v_minus_DP1 = v_setall_<_TpVec32F>(-0.78515625f);
const _TpVec32F v_minus_DP2 = v_setall_<_TpVec32F>(-2.4187564849853515625E-4f);
const _TpVec32F v_minus_DP3 = v_setall_<_TpVec32F>(-3.77489497744594108E-8f);
const _TpVec32F v_sincof_p0 = v_setall_<_TpVec32F>(-1.9515295891E-4f);
const _TpVec32F v_sincof_p1 = v_setall_<_TpVec32F>(8.3321608736E-3f);
const _TpVec32F v_sincof_p2 = v_setall_<_TpVec32F>(-1.6666654611E-1f);
const _TpVec32F v_coscof_p0 = v_setall_<_TpVec32F>(2.443315711809948E-5f);
const _TpVec32F v_coscof_p1 = v_setall_<_TpVec32F>(-1.388731625493765E-3f);
const _TpVec32F v_coscof_p2 = v_setall_<_TpVec32F>(4.166664568298827E-2f);
const _TpVec32F v_nan = v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000));
const _TpVec32F v_neg_zero = v_setall_<_TpVec32F>(-0.f);
_TpVec32F _vx, _vy, sign_mask_sin, sign_mask_cos;
_TpVec32S emm2;
sign_mask_sin = v_lt(x, v_setzero_<_TpVec32F>());
_vx = v_abs(x);
_vy = v_mul(_vx, v_cephes_FOPI);
emm2 = v_trunc(_vy);
emm2 = v_add(emm2, v_setall_<_TpVec32S>(1));
emm2 = v_and(emm2, v_setall_<_TpVec32S>(~1));
_vy = v_cvt_f32(emm2);
_TpVec32F poly_mask = v_reinterpret_as_f32(v_eq(v_and(emm2, v_setall_<_TpVec32S>(2)), v_setall_<_TpVec32S>(0)));
_vx = v_fma(_vy, v_minus_DP1, _vx);
_vx = v_fma(_vy, v_minus_DP2, _vx);
_vx = v_fma(_vy, v_minus_DP3, _vx);
sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f32(v_eq(v_and(emm2, v_setall_<_TpVec32S>(4)), v_setall_<_TpVec32S>(0))));
sign_mask_cos = v_reinterpret_as_f32(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec32S>(2)), v_setall_<_TpVec32S>(4)), v_setall_<_TpVec32S>(0)));
_TpVec32F _vxx = v_mul(_vx, _vx);
_TpVec32F y1, y2;
y1 = v_fma(v_coscof_p0, _vxx, v_coscof_p1);
y1 = v_fma(y1, _vxx, v_coscof_p2);
y1 = v_fma(y1, _vxx, v_setall_<_TpVec32F>(-0.5f));
y1 = v_fma(y1, _vxx, v_setall_<_TpVec32F>(1.f));
y2 = v_fma(v_sincof_p0, _vxx, v_sincof_p1);
y2 = v_fma(y2, _vxx, v_sincof_p2);
y2 = v_mul(y2, _vxx);
y2 = v_fma(y2, _vx, _vx);
ysin = v_select(poly_mask, y2, y1);
ycos = v_select(poly_mask, y1, y2);
ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin));
ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos);
// sincos(NAN) -> NAN, sincos(±INF) -> NAN
_TpVec32F mask_inf = v_eq(_vx, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7f800000)));
_TpVec32F mask_nan = v_or(mask_inf, v_ne(x, x));
ysin = v_select(mask_nan, v_nan, ysin);
ycos = v_select(mask_nan, v_nan, ycos);
}
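The reduction logic above is compact but easy to misread, so here is a scalar sketch of the same float32 sincos: octant index j = trunc(|x|·4/π) rounded up to even, a three-part Cody-Waite subtraction of j·π/4, then the sin/cos polynomials swapped and sign-flipped per octant. sincos_ref is a hypothetical name; the NaN/inf tail is omitted and, as in the vector code, the integer reduction limits the valid |x| range.

#include <cmath>

static void sincos_ref(float x, float& s, float& c) // scalar sketch of v_sincos_default_32f
{
    bool sin_flip = (x < 0.f);
    float ax = std::fabs(x);
    int j = (int)(ax * 1.27323954473516f);   // trunc(ax / (pi/4))
    j = (j + 1) & ~1;                        // round up to an even octant
    float y = (float)j;
    bool poly_sel = ((j & 2) == 0);          // which polynomial holds sin
    ax = std::fma(y, -0.78515625f, ax);                // ax - j*pi/4, done in
    ax = std::fma(y, -2.4187564849853515625E-4f, ax);  // three parts (-DP1..-DP3)
    ax = std::fma(y, -3.77489497744594108E-8f, ax);
    sin_flip ^= ((j & 4) == 0);
    bool cos_keep = (((j - 2) & 4) == 0);
    float xx = ax * ax;
    float y1 = std::fma(2.443315711809948E-5f, xx, -1.388731625493765E-3f);
    y1 = std::fma(y1, xx, 4.166664568298827E-2f);      // cos polynomial
    y1 = std::fma(y1, xx, -0.5f);
    y1 = std::fma(y1, xx, 1.f);
    float y2 = std::fma(-1.9515295891E-4f, xx, 8.3321608736E-3f);
    y2 = std::fma(y2, xx, -1.6666654611E-1f);          // sin polynomial
    y2 = std::fma(y2 * xx, ax, ax);
    s = poly_sel ? y2 : y1;
    c = poly_sel ? y1 : y2;
    if (!sin_flip) s = -s;   // mirrors the v_select / v_xor sign handling
    if (cos_keep)  c = -c;
}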
template<typename _TpVec32F, typename _TpVec32S>
inline _TpVec32F v_sin_default_32f(const _TpVec32F &x) {
_TpVec32F ysin, ycos;
v_sincos_default_32f<_TpVec32F, _TpVec32S>(x, ysin, ycos);
return ysin;
}
template<typename _TpVec32F, typename _TpVec32S>
inline _TpVec32F v_cos_default_32f(const _TpVec32F &x) {
_TpVec32F ysin, ycos;
v_sincos_default_32f<_TpVec32F, _TpVec32S>(x, ysin, ycos);
return ycos;
}
template<typename _TpVec64F, typename _TpVec64S>
inline void v_sincos_default_64f(const _TpVec64F &x, _TpVec64F &ysin, _TpVec64F &ycos) {
const _TpVec64F v_cephes_FOPI = v_setall_<_TpVec64F>(1.2732395447351626861510701069801148); // 4 / M_PI
const _TpVec64F v_minus_DP1 = v_setall_<_TpVec64F>(-7.853981554508209228515625E-1);
const _TpVec64F v_minus_DP2 = v_setall_<_TpVec64F>(-7.94662735614792836714E-9);
const _TpVec64F v_minus_DP3 = v_setall_<_TpVec64F>(-3.06161699786838294307E-17);
const _TpVec64F v_sin_C1 = v_setall_<_TpVec64F>(1.58962301576546568060E-10);
const _TpVec64F v_sin_C2 = v_setall_<_TpVec64F>(-2.50507477628578072866E-8);
const _TpVec64F v_sin_C3 = v_setall_<_TpVec64F>(2.75573136213857245213E-6);
const _TpVec64F v_sin_C4 = v_setall_<_TpVec64F>(-1.98412698295895385996E-4);
const _TpVec64F v_sin_C5 = v_setall_<_TpVec64F>(8.33333333332211858878E-3);
const _TpVec64F v_sin_C6 = v_setall_<_TpVec64F>(-1.66666666666666307295E-1);
const _TpVec64F v_cos_C1 = v_setall_<_TpVec64F>(-1.13585365213876817300E-11);
const _TpVec64F v_cos_C2 = v_setall_<_TpVec64F>(2.08757008419747316778E-9);
const _TpVec64F v_cos_C3 = v_setall_<_TpVec64F>(-2.75573141792967388112E-7);
const _TpVec64F v_cos_C4 = v_setall_<_TpVec64F>(2.48015872888517045348E-5);
const _TpVec64F v_cos_C5 = v_setall_<_TpVec64F>(-1.38888888888730564116E-3);
const _TpVec64F v_cos_C6 = v_setall_<_TpVec64F>(4.16666666666665929218E-2);
const _TpVec64F v_nan = v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff8000000000000));
const _TpVec64F v_neg_zero = v_setall_<_TpVec64F>(-0.0);
_TpVec64F _vx, _vy, sign_mask_sin, sign_mask_cos;
_TpVec64S emm2;
sign_mask_sin = v_lt(x, v_setzero_<_TpVec64F>());
_vx = v_abs(x);
_vy = v_mul(_vx, v_cephes_FOPI);
emm2 = v_expand_low(v_trunc(_vy));
emm2 = v_add(emm2, v_setall_<_TpVec64S>((int64)1));
emm2 = v_and(emm2, v_setall_<_TpVec64S>((int64)~1));
_vy = v_cvt_f64(emm2);
_TpVec64F poly_mask = v_reinterpret_as_f64(v_eq(v_and(emm2, v_setall_<_TpVec64S>((int64)2)), v_setall_<_TpVec64S>((int64)0)));
_vx = v_fma(_vy, v_minus_DP1, _vx);
_vx = v_fma(_vy, v_minus_DP2, _vx);
_vx = v_fma(_vy, v_minus_DP3, _vx);
sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f64(v_eq(v_and(emm2, v_setall_<_TpVec64S>((int64)4)), v_setall_<_TpVec64S>((int64)0))));
sign_mask_cos = v_reinterpret_as_f64(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec64S>((int64)2)), v_setall_<_TpVec64S>((int64)4)), v_setall_<_TpVec64S>((int64)0)));
_TpVec64F _vxx = v_mul(_vx, _vx);
_TpVec64F y1, y2;
y1 = v_fma(v_cos_C1, _vxx, v_cos_C2);
y1 = v_fma(y1, _vxx, v_cos_C3);
y1 = v_fma(y1, _vxx, v_cos_C4);
y1 = v_fma(y1, _vxx, v_cos_C5);
y1 = v_fma(y1, _vxx, v_cos_C6);
y1 = v_fma(y1, _vxx, v_setall_<_TpVec64F>(-0.5));
y1 = v_fma(y1, _vxx, v_setall_<_TpVec64F>(1.0));
y2 = v_fma(v_sin_C1, _vxx, v_sin_C2);
y2 = v_fma(y2, _vxx, v_sin_C3);
y2 = v_fma(y2, _vxx, v_sin_C4);
y2 = v_fma(y2, _vxx, v_sin_C5);
y2 = v_fma(y2, _vxx, v_sin_C6);
y2 = v_mul(y2, _vxx);
y2 = v_fma(y2, _vx, _vx);
ysin = v_select(poly_mask, y2, y1);
ycos = v_select(poly_mask, y1, y2);
ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin));
ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos);
// sincos(NAN) -> NAN, sincos(±INF) -> NAN
_TpVec64F mask_inf = v_eq(_vx, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff0000000000000)));
_TpVec64F mask_nan = v_or(mask_inf, v_ne(x, x));
ysin = v_select(mask_nan, v_nan, ysin);
ycos = v_select(mask_nan, v_nan, ycos);
}
template<typename _TpVec64F, typename _TpVec64S>
inline _TpVec64F v_sin_default_64f(const _TpVec64F &x) {
_TpVec64F ysin, ycos;
v_sincos_default_64f<_TpVec64F, _TpVec64S>(x, ysin, ycos);
return ysin;
}
template<typename _TpVec64F, typename _TpVec64S>
inline _TpVec64F v_cos_default_64f(const _TpVec64F &x) {
_TpVec64F ysin, ycos;
v_sincos_default_64f<_TpVec64F, _TpVec64S>(x, ysin, ycos);
return ycos;
}
//! @}
/* This implementation is derived from the Error Function (Erf) approximation approach used in PyTorch
https://github.com/pytorch/pytorch/blob/9c50ecc84b9a6e699a7f058891b889aafbf976c7/aten/src/ATen/cpu/vec/vec512/vec512_float.h#L189-L220
*/
//! @name Error Function
//! @{
template<typename _TpVec32F, typename _TpVec32S>
inline _TpVec32F v_erf_default_32f(const _TpVec32F &v) {
const _TpVec32F coef0 = v_setall_<_TpVec32F>(0.3275911f),
coef1 = v_setall_<_TpVec32F>(1.061405429f),
coef2 = v_setall_<_TpVec32F>(-1.453152027f),
coef3 = v_setall_<_TpVec32F>(1.421413741f),
coef4 = v_setall_<_TpVec32F>(-0.284496736f),
coef5 = v_setall_<_TpVec32F>(0.254829592f),
ones = v_setall_<_TpVec32F>(1.0f),
neg_zeros = v_setall_<_TpVec32F>(-0.f);
_TpVec32F t = v_abs(v);
// sign(v)
_TpVec32F sign_mask = v_and(neg_zeros, v);
t = v_div(ones, v_fma(coef0, t, ones));
_TpVec32F r = v_fma(coef1, t, coef2);
r = v_fma(r, t, coef3);
r = v_fma(r, t, coef4);
r = v_fma(r, t, coef5);
// - v * v
_TpVec32F v2 = v_mul(v, v);
_TpVec32F mv2 = v_xor(neg_zeros, v2);
// - exp(- v * v)
_TpVec32F exp = v_exp_default_32f<_TpVec32F, _TpVec32S>(mv2);
_TpVec32F neg_exp = v_xor(neg_zeros, exp);
_TpVec32F res = v_mul(t, neg_exp);
res = v_fma(r, res, ones);
return v_xor(sign_mask, res);
}
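For completeness, a scalar sketch of the erf path above (Abramowitz & Stegun 7.1.26, with the same constants as the PyTorch code it cites): erf(x) ≈ sign(x)·(1 − P(t)·e^(−x²)) with t = 1/(1 + 0.3275911·|x|). erf_ref is a hypothetical name, and std::exp stands in for the v_exp_default_32f call.

#include <cmath>

static float erf_ref(float x) // scalar sketch of v_erf_default_32f
{
    float t = 1.f / std::fma(0.3275911f, std::fabs(x), 1.f);
    float r = std::fma(1.061405429f, t, -1.453152027f);  // degree-4 chain in t
    r = std::fma(r, t,  1.421413741f);
    r = std::fma(r, t, -0.284496736f);
    r = std::fma(r, t,  0.254829592f);
    float res = std::fma(r, t * -std::exp(-x * x), 1.f); // 1 - r*t*exp(-x^2)
    return std::copysign(res, x);
}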
//! @}
#endif // OPENCV_HAL_INTRIN_MATH_HPP

View File

@@ -235,8 +235,6 @@ struct v_float64x2
#define OPENCV_HAL_IMPL_MSA_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(msa_dupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(msa_dupq_n_##suffix(v)); } \
template <> inline v_##_Tpv v_setzero_() { return v_setzero_##suffix(); } \
template <> inline v_##_Tpv v_setall_(_Tp v) { return v_setall_##suffix(v); } \
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(MSA_TPV_REINTERPRET(v16u8, v.val)); } \
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(MSA_TPV_REINTERPRET(v16i8, v.val)); } \
inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(MSA_TPV_REINTERPRET(v8u16, v.val)); } \
@@ -347,46 +345,53 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
}
#define OPENCV_HAL_IMPL_MSA_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
a.val = intrin(a.val, b.val); \
return a; \
}
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint8x16, msa_qaddq_u8)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint8x16, msa_qsubq_u8)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int8x16, msa_qaddq_s8)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int8x16, msa_qsubq_s8)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint16x8, msa_qaddq_u16)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint16x8, msa_qsubq_u16)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int16x8, msa_qaddq_s16)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int16x8, msa_qsubq_s16)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int32x4, msa_addq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int32x4, msa_subq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_int32x4, msa_mulq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint32x4, msa_addq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint32x4, msa_subq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_uint32x4, msa_mulq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_float32x4, msa_addq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_float32x4, msa_subq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_float32x4, msa_mulq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int64x2, msa_addq_s64)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int64x2, msa_subq_s64)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint64x2, msa_addq_u64)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint64x2, msa_subq_u64)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_div, v_float32x4, msa_divq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_float64x2, msa_addq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_float64x2, msa_subq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_float64x2, msa_mulq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(v_div, v_float64x2, msa_divq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint8x16, msa_qaddq_u8)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint8x16, msa_qsubq_u8)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int8x16, msa_qaddq_s8)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int8x16, msa_qsubq_s8)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint16x8, msa_qaddq_u16)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint16x8, msa_qsubq_u16)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int16x8, msa_qaddq_s16)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int16x8, msa_qsubq_s16)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int32x4, msa_addq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int32x4, msa_subq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_int32x4, msa_mulq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint32x4, msa_addq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint32x4, msa_subq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_uint32x4, msa_mulq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float32x4, msa_addq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float32x4, msa_subq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float32x4, msa_mulq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int64x2, msa_addq_s64)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int64x2, msa_subq_s64)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint64x2, msa_addq_u64)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint64x2, msa_subq_u64)
OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float32x4, msa_divq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float64x2, msa_addq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float64x2, msa_subq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float64x2, msa_mulq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float64x2, msa_divq_f64)
// saturating multiply 8-bit, 16-bit
#define OPENCV_HAL_IMPL_MSA_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpwvec c, d; \
v_mul_expand(a, b, c, d); \
return v_pack(c, d); \
}
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{a = a * b; return a; }
OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int8x16, v_int16x8)
OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint8x16, v_uint16x8)
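// A minimal scalar model of the saturating multiply above (v_mul_expand to a
// wider type, then v_pack back down with saturation). Sketch only, not part of
// the patch; mul_sat_s8 is a hypothetical name.
#include <algorithm>
inline signed char mul_sat_s8(signed char a, signed char b)
{
    int p = (int)a * (int)b;                              // widening multiply
    return (signed char)std::min(std::max(p, -128), 127); // saturating narrow
}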
@@ -541,13 +546,13 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
return v_int64x2(msa_hadd_s64(prod, prod));
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
// 32 >> 64f
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
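// Scalar reference for the 16 >> 64 expansion above: each 64-bit output lane
// sums four adjacent 16-bit products. Illustrative sketch, not HAL API.
#include <cstdint>
inline void dotprod_expand_s16_ref(const int16_t a[8], const int16_t b[8], int64_t out[2])
{
    for (int i = 0; i < 2; i++)
    {
        int64_t s = 0;
        for (int j = 0; j < 4; j++)                // four products per output lane
            s += (int64_t)a[4*i + j] * b[4*i + j];
        out[i] = s;
    }
}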
//////// Fast Dot Product ////////
@@ -591,10 +596,10 @@ inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b,
{ return v_dotprod_expand(a, b, c); }
#define OPENCV_HAL_IMPL_MSA_LOGIC_OP(_Tpvec, _Tpv, suffix) \
OPENCV_HAL_IMPL_MSA_BIN_OP(v_and, _Tpvec, msa_andq_##suffix) \
OPENCV_HAL_IMPL_MSA_BIN_OP(v_or, _Tpvec, msa_orrq_##suffix) \
OPENCV_HAL_IMPL_MSA_BIN_OP(v_xor, _Tpvec, msa_eorq_##suffix) \
inline _Tpvec v_not(const _Tpvec& a) \
OPENCV_HAL_IMPL_MSA_BIN_OP(&, _Tpvec, msa_andq_##suffix) \
OPENCV_HAL_IMPL_MSA_BIN_OP(|, _Tpvec, msa_orrq_##suffix) \
OPENCV_HAL_IMPL_MSA_BIN_OP(^, _Tpvec, msa_eorq_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_u8(MSA_TPV_REINTERPRET(v16u8, a.val)))); \
}
@@ -609,16 +614,21 @@ OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint64x2, v2u64, u64)
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int64x2, v2i64, s64)
#define OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 bin_op(const v_float32x4& a, const v_float32x4& b) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
return v_float32x4(MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val)))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
a.val = MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val))); \
return a; \
}
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(v_and, msa_andq_s32)
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(v_or, msa_orrq_s32)
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(v_xor, msa_eorq_s32)
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(&, msa_andq_s32)
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(|, msa_orrq_s32)
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(^, msa_eorq_s32)
inline v_float32x4 v_not(const v_float32x4& a)
inline v_float32x4 operator ~ (const v_float32x4& a)
{
return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
}
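// The float bit-ops above are reinterpret-to-int, operate, reinterpret-back.
// Scalar sketch of the same idea (hypothetical name, not part of the patch):
#include <cstdint>
#include <cstring>
inline float f32_bit_and(float a, float b)
{
    std::uint32_t ia, ib;
    std::memcpy(&ia, &a, sizeof(ia));   // MSA_TPV_REINTERPRET(v4i32, ...) per lane
    std::memcpy(&ib, &b, sizeof(ib));
    ia &= ib;                           // msa_andq_s32 per lane
    float r;
    std::memcpy(&r, &ia, sizeof(r));    // reinterpret back to f32
    return r;
}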
@@ -649,16 +659,21 @@ OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_sqrt, msa_sqrtq_f64)
OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_invsqrt, msa_rsqrtq_f64)
#define OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(bin_op, intrin) \
inline v_float64x2 bin_op(const v_float64x2& a, const v_float64x2& b) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
{ \
return v_float64x2(MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val)))); \
} \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
{ \
a.val = MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val))); \
return a; \
}
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(v_and, msa_andq_s64)
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(v_or, msa_orrq_s64)
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(v_xor, msa_eorq_s64)
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(&, msa_andq_s64)
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(|, msa_orrq_s64)
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(^, msa_eorq_s64)
inline v_float64x2 v_not(const v_float64x2& a)
inline v_float64x2 operator ~ (const v_float64x2& a)
{
return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
}
@@ -689,17 +704,17 @@ OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_min, msa_minq_f64)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_max, msa_maxq_f64)
#define OPENCV_HAL_IMPL_MSA_INT_CMP_OP(_Tpvec, _Tpv, suffix, not_suffix) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ceqq_##suffix(a.val, b.val))); } \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_##not_suffix(msa_ceqq_##suffix(a.val, b.val)))); } \
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cltq_##suffix(a.val, b.val))); } \
inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgtq_##suffix(a.val, b.val))); } \
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cleq_##suffix(a.val, b.val))); } \
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgeq_##suffix(a.val, b.val))); }
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint8x16, v16u8, u8, u8)
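// Per-lane model of these comparisons: the destination lane is all ones when
// the predicate holds and all zeros otherwise (sketch, hypothetical name):
inline unsigned char cmp_eq_u8_lane(unsigned char a, unsigned char b)
{ return (unsigned char)(a == b ? 0xFF : 0x00); }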
@@ -806,9 +821,9 @@ inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_
// trade efficiency for convenience
#define OPENCV_HAL_IMPL_MSA_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
inline _Tpvec v_shl(const _Tpvec& a, int n) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ return _Tpvec(msa_shlq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
inline _Tpvec v_shr(const _Tpvec& a, int n) \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ return _Tpvec(msa_shrq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(msa_shlq_n_##suffix(a.val, n)); } \
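// "Trade efficiency for convenience": the run-time forms broadcast the scalar
// count into every lane (msa_dupq_n_*) and use the vector-vector shift, while
// the template forms map onto the immediate-count instruction, e.g. for the
// u16 instantiation (illustrative mapping, suffixes approximate):
//   v << 3      -> msa_shlq_u16(v.val, msa_dupq_n_s16(3))
//   v_shl<3>(v) -> msa_shlq_n_u16(v.val, 3)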
@@ -1863,20 +1878,6 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
inline void v_cleanup() {}
#include "intrin_math.hpp"
inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond


@@ -56,7 +56,7 @@ namespace cv
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#define CV_SIMD128 1
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
#if defined(__aarch64__) || defined(_M_ARM64)
#define CV_SIMD128_64F 1
#else
#define CV_SIMD128_64F 0
@@ -72,7 +72,7 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
//
// [1] https://developer.arm.com/documentation/101028/0012/13--Advanced-SIMD--Neon--intrinsics
// [2] https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64) || defined(_M_ARM64EC)
#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
#define CV_NEON_AARCH64 1
#else
#define CV_NEON_AARCH64 0
@@ -381,8 +381,6 @@ private:
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
template <> inline v_##_Tpv v_setzero_() { return v_setzero_##suffix(); } \
template <> inline v_##_Tpv v_setall_(_Tp v) { return v_setall_##suffix(v); } \
inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
@@ -888,10 +886,9 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{
int16x8_t p0 = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
int16x8_t p1 = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
int32x4_t s0 = vaddl_s16(vget_low_s16(p0), vget_low_s16(p1));
return v_int32x4(vaddq_s32(s0, vaddl_s16(vget_high_s16(p0), vget_high_s16(p1))));
int16x8_t prod = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
prod = vmlal_s8(prod, vget_high_s8(a.val), vget_high_s8(b.val));
return v_int32x4(vaddl_s16(vget_low_s16(prod), vget_high_s16(prod)));
}
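// Both bodies gather the same four products per 32-bit lane (k, k+4, k+8,
// k+12 by my reading); the rewrite folds the high half in with vmlal_s8,
// trading one widening add for 16-bit intermediate accumulation. Scalar
// reference for the per-lane sum (illustrative, not HAL API):
#include <cstdint>
inline void dotprod_expand_fast_s8_ref(const int8_t a[16], const int8_t b[16],
                                       int32_t out[4])
{
    for (int k = 0; k < 4; k++)
        out[k] = (int32_t)a[k     ]*b[k     ] + (int32_t)a[k +  4]*b[k +  4]
               + (int32_t)a[k +  8]*b[k +  8] + (int32_t)a[k + 12]*b[k + 12];
}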
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{
@@ -1081,7 +1078,7 @@ OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
#if defined(__aarch64__) || defined(_M_ARM64)
static inline uint64x2_t vmvnq_u64(uint64x2_t a)
{
uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
@@ -1823,7 +1820,7 @@ inline v_int32x4 v_load_expand_q(const schar* ptr)
return v_int32x4(vmovl_s16(v1));
}
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
#if defined(__aarch64__) || defined(_M_ARM64)
#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
@@ -2649,28 +2646,6 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
inline void v_cleanup() {}
#include "intrin_math.hpp"
#if defined(CV_SIMD_FP16) && CV_SIMD_FP16
inline v_float16x8 v_exp(const v_float16x8& x) { return v_exp_default_16f<v_float16x8, v_int16x8>(x); }
inline v_float16x8 v_log(const v_float16x8& x) { return v_log_default_16f<v_float16x8, v_int16x8>(x); }
inline void v_sincos(const v_float16x8& x, v_float16x8& s, v_float16x8& c) { v_sincos_default_16f<v_float16x8, v_int16x8>(x, s, c); }
inline v_float16x8 v_sin(const v_float16x8& x) { return v_sin_default_16f<v_float16x8, v_int16x8>(x); }
inline v_float16x8 v_cos(const v_float16x8& x) { return v_cos_default_16f<v_float16x8, v_int16x8>(x); }
#endif
inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
#if CV_SIMD128_64F
inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
#endif
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond

File diff suppressed because it is too large


@@ -355,12 +355,10 @@ inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& v) { return v_float64
#define OPENCV_HAL_IMPL_RISCVV_INIT_SET(__Tp, _Tp, suffix, len, num) \
inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num(vmv_v_x_##len##m1(0, num)); } \
inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); } \
template <> inline v_##_Tp##x##num v_setzero_() { return v_setzero_##suffix(); } \
template <> inline v_##_Tp##x##num v_setall_(__Tp v) { return v_setall_##suffix(v); }
inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); }
OPENCV_HAL_IMPL_RISCVV_INIT_SET(uchar, uint8, u8, u8, 16)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(schar, int8, s8, i8, 16)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(char, int8, s8, i8, 16)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(ushort, uint16, u16, u16, 8)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(short, int16, s16, i16, 8)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned int, uint32, u32, u32, 4)
@@ -373,57 +371,72 @@ inline v_float32x4 v_setall_f32(float v) { return v_float32x4(vfmv_v_f_f32m1(v,
inline v_float64x2 v_setzero_f64() { return v_float64x2(vfmv_v_f_f64m1(0, 2)); }
inline v_float64x2 v_setall_f64(double v) { return v_float64x2(vfmv_v_f_f64m1(v, 2)); }
template <> inline v_float32x4 v_setzero_() { return v_setzero_f32(); }
template <> inline v_float32x4 v_setall_(float v) { return v_setall_f32(v); }
template <> inline v_float64x2 v_setzero_() { return v_setzero_f64(); }
template <> inline v_float64x2 v_setall_(double v) { return v_setall_f64(v); }
#define OPENCV_HAL_IMPL_RISCVV_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
a.val = intrin(a.val, b.val); \
return a; \
}
#define OPENCV_HAL_IMPL_RISCVV_BIN_OPN(bin_op, _Tpvec, intrin, num) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec(intrin(a.val, b.val, num)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
a.val = intrin(a.val, b.val, num); \
return a; \
}
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint8x16, vsaddu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint8x16, vssubu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int8x16, vsadd_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int8x16, vssub_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint16x8, vsaddu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint16x8, vssubu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int16x8, vsadd_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int16x8, vssub_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int32x4, vadd_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int32x4, vsub_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_int32x4, vmul_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint32x4, vadd_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint32x4, vsub_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_uint32x4, vmul_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int64x2, vadd_vv_i64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int64x2, vsub_vv_i64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint64x2, vadd_vv_u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint64x2, vsub_vv_u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_float32x4, vfadd_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_float32x4, vfsub_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_float32x4, vfmul_vv_f32m1, 4)
inline v_float32x4 v_div(const v_float32x4& a, const v_float32x4& b)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint8x16, vsaddu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint8x16, vssubu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int8x16, vsadd_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int8x16, vssub_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint16x8, vsaddu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint16x8, vssubu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int16x8, vsadd_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int16x8, vssub_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int32x4, vadd_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int32x4, vsub_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_int32x4, vmul_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint32x4, vadd_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint32x4, vsub_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_uint32x4, vmul_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vadd_vv_i64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vsub_vv_i64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint64x2, vadd_vv_u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint64x2, vsub_vv_u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float32x4, vfadd_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float32x4, vfsub_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float32x4, vfmul_vv_f32m1, 4)
inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
{
return v_float32x4(vfdiv_vv_f32m1(a.val, b.val, 4));
}
inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
{
a.val = vfdiv_vv_f32m1(a.val, b.val, 4);
return a;
}
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_float64x2, vfadd_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_float64x2, vfsub_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_float64x2, vfmul_vv_f64m1, 2)
inline v_float64x2 v_div(const v_float64x2& a, const v_float64x2& b)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float64x2, vfadd_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float64x2, vfsub_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float64x2, vfmul_vv_f64m1, 2)
inline v_float64x2 operator / (const v_float64x2& a, const v_float64x2& b)
{
return v_float64x2(vfdiv_vv_f64m1(a.val, b.val, 2));
}
inline v_float64x2& operator /= (v_float64x2& a, const v_float64x2& b)
{
a.val = vfdiv_vv_f64m1(a.val, b.val, 2);
return a;
}
// TODO: exp, log, sin, cos
#define OPENCV_HAL_IMPL_RISCVV_BIN_FUNC(_Tpvec, func, intrin) \
@@ -549,10 +562,10 @@ inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_
}
#define OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(_Tpvec, suffix, num) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_and, _Tpvec, vand_vv_##suffix, num) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_or, _Tpvec, vor_vv_##suffix, num) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_xor, _Tpvec, vxor_vv_##suffix, num) \
inline _Tpvec v_not(const _Tpvec & a) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(&, _Tpvec, vand_vv_##suffix, num) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(|, _Tpvec, vor_vv_##suffix, num) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(^, _Tpvec, vxor_vv_##suffix, num) \
inline _Tpvec operator ~ (const _Tpvec & a) \
{ \
return _Tpvec(vnot_v_##suffix(a.val, num)); \
}
@@ -567,31 +580,41 @@ OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int32x4, i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int64x2, i64m1, 2)
#define OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 bin_op(const v_float32x4& a, const v_float32x4& b) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
a.val = vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4)); \
return a; \
}
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(v_and, vand_vv_i32m1)
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(v_or, vor_vv_i32m1)
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(v_xor, vxor_vv_i32m1)
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(&, vand_vv_i32m1)
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(|, vor_vv_i32m1)
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(^, vxor_vv_i32m1)
inline v_float32x4 v_not(const v_float32x4& a)
inline v_float32x4 operator ~ (const v_float32x4& a)
{
return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 4)));
}
#define OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(bin_op, intrin) \
inline v_float64x2 bin_op(const v_float64x2& a, const v_float64x2& b) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
{ \
return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2))); \
} \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
{ \
a.val = vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2)); \
return a; \
}
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(v_and, vand_vv_i64m1)
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(v_or, vor_vv_i64m1)
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(v_xor, vxor_vv_i64m1)
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(&, vand_vv_i64m1)
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(|, vor_vv_i64m1)
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(^, vxor_vv_i64m1)
inline v_float64x2 v_not(const v_float64x2& a)
inline v_float64x2 operator ~ (const v_float64x2& a)
{
return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a.val), 2)));
}
@@ -1151,32 +1174,32 @@ OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int32x4, v_uint32x4)
OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint32x4, v_uint32x4)
#define OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(_Tpvec, _Tp, _T, num, uv) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ \
vbool##_T##_t mask = vmseq_vv_##_Tp##_b##_T(a.val, b.val, num); \
return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
} \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ \
vbool##_T##_t mask = vmsne_vv_##_Tp##_b##_T(a.val, b.val, num); \
return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
} \
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ \
vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(a.val, b.val, num); \
return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
} \
inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ \
vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(b.val, a.val, num); \
return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
} \
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ \
vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(a.val, b.val, num); \
return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
} \
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ \
vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(b.val, a.val, num); \
return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
@@ -1192,37 +1215,37 @@ OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint32x4, u32m1, 32, 4, u_vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint64x2, u64m1, 64, 2, u_vv_)
//TODO: ==
inline v_float32x4 v_eq(const v_float32x4& a, const v_float32x4& b)
inline v_float32x4 operator == (const v_float32x4& a, const v_float32x4& b)
{
vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, b.val, 4);
vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
}
inline v_float32x4 v_ne(const v_float32x4& a, const v_float32x4& b)
inline v_float32x4 operator != (const v_float32x4& a, const v_float32x4& b)
{
vbool32_t mask = vmfne_vv_f32m1_b32(a.val, b.val, 4);
vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
}
inline v_float32x4 v_lt(const v_float32x4& a, const v_float32x4& b)
inline v_float32x4 operator < (const v_float32x4& a, const v_float32x4& b)
{
vbool32_t mask = vmflt_vv_f32m1_b32(a.val, b.val, 4);
vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
}
inline v_float32x4 v_le(const v_float32x4& a, const v_float32x4& b)
inline v_float32x4 operator <= (const v_float32x4& a, const v_float32x4& b)
{
vbool32_t mask = vmfle_vv_f32m1_b32(a.val, b.val, 4);
vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
}
inline v_float32x4 v_gt(const v_float32x4& a, const v_float32x4& b)
inline v_float32x4 operator > (const v_float32x4& a, const v_float32x4& b)
{
vbool32_t mask = vmfgt_vv_f32m1_b32(a.val, b.val, 4);
vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
}
inline v_float32x4 v_ge(const v_float32x4& a, const v_float32x4& b)
inline v_float32x4 operator >= (const v_float32x4& a, const v_float32x4& b)
{
vbool32_t mask = vmfge_vv_f32m1_b32(a.val, b.val, 4);
vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
@@ -1236,37 +1259,37 @@ inline v_float32x4 v_not_nan(const v_float32x4& a)
}
//TODO: ==
inline v_float64x2 v_eq(const v_float64x2& a, const v_float64x2& b)
inline v_float64x2 operator == (const v_float64x2& a, const v_float64x2& b)
{
vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, b.val, 2);
vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
}
inline v_float64x2 v_ne(const v_float64x2& a, const v_float64x2& b)
inline v_float64x2 operator != (const v_float64x2& a, const v_float64x2& b)
{
vbool64_t mask = vmfne_vv_f64m1_b64(a.val, b.val, 2);
vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
}
inline v_float64x2 v_lt(const v_float64x2& a, const v_float64x2& b)
inline v_float64x2 operator < (const v_float64x2& a, const v_float64x2& b)
{
vbool64_t mask = vmflt_vv_f64m1_b64(a.val, b.val, 2);
vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
}
inline v_float64x2 v_le(const v_float64x2& a, const v_float64x2& b)
inline v_float64x2 operator <= (const v_float64x2& a, const v_float64x2& b)
{
vbool64_t mask = vmfle_vv_f64m1_b64(a.val, b.val, 2);
vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
}
inline v_float64x2 v_gt(const v_float64x2& a, const v_float64x2& b)
inline v_float64x2 operator > (const v_float64x2& a, const v_float64x2& b)
{
vbool64_t mask = vmfgt_vv_f64m1_b64(a.val, b.val, 2);
vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
}
inline v_float64x2 v_ge(const v_float64x2& a, const v_float64x2& b)
inline v_float64x2 operator >= (const v_float64x2& a, const v_float64x2& b)
{
vbool64_t mask = vmfge_vv_f64m1_b64(a.val, b.val, 2);
vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
@@ -1308,13 +1331,13 @@ OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(float, f32)
#define OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(_Tpvec, suffix, _T, num) \
inline _Tpvec v_shl(const _Tpvec& a, int n) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); }
#define OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(_Tpvec, suffix, _T, num, intric) \
inline _Tpvec v_shr(const _Tpvec& a, int n) \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); } \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); }\
@@ -2014,11 +2037,13 @@ OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short)
// saturating multiply 8-bit, 16-bit
#define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, num, mul, cvt) \
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
auto res = mul(a.val, b.val, num); \
return _Tpvec(cvt(res, 0, num)); \
}
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }
OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16, 16, vwmul_vv_i16m2, vnclip_wx_i8m1)
OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, 16, vwmulu_vv_u16m2, vnclipu_wx_u8m1)
@@ -2820,7 +2845,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
const v_float64x2& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
{
vint64m2_t v1 = vwmul_vv_i64m2(a.val, b.val, 4);
@@ -2829,7 +2854,7 @@ inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
}
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ v_float64x2 res = v_dotprod_expand_fast(a, b);
return v_add(res, c); }
return res + c; }
#endif
////// FP16 support ///////
#if __riscv_v == 7000
@@ -2866,20 +2891,6 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
inline void v_cleanup() {}
#include "intrin_math.hpp"
inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond

File diff suppressed because it is too large


@@ -0,0 +1,768 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copied from
// https://github.com/riscv-non-isa/rvv-intrinsic-doc/tree/master/auto-generated/rvv-v0p10-compatible-headers
#ifndef __RVV_0P10_COMPATIBLE_HEADERS_OVERLOADED_NON_POLICY_H
#define __RVV_0P10_COMPATIBLE_HEADERS_OVERLOADED_NON_POLICY_H
// The maximum number of parameters is 20; this peak is set by segment load
// instructions with an NFIELD (NF) of 8. The 20 comes from 8 vector register
// pointers passed, 1 vector mask register, 8 passthrough registers for the
// undisturbed policy, and 3 for the address base, byte index, and vl.
#define _GET_OVERRIDE(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13,\
_14, _15, _16, _17, _18, _19, _20, NAME, ...) NAME
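// _GET_OVERRIDE is the classic count-the-arguments dispatcher: the trailing
// literals shift NAME onto the slot that matches the call's arity. A minimal
// self-contained sketch of the same trick (hypothetical names):
#define PICK_NAME(_1, _2, _3, NAME, ...) NAME
#define SUM2(a, b)    ((a) + (b))
#define SUM3(a, b, c) ((a) + (b) + (c))
#define SUM(...) PICK_NAME(__VA_ARGS__, SUM3, SUM2, 1)(__VA_ARGS__)
// SUM(1, 2)    -> SUM2(1, 2)    == 3
// SUM(1, 2, 3) -> SUM3(1, 2, 3) == 6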
#if __has_include ("riscv_vector.h")
#include <riscv_vector.h>
#endif
#ifndef __RISCV_VECTOR_H
#include_next <riscv_vector.h>
#endif
// masked functions
#define vmerge(mask, op1, op2, vl) __riscv_vmerge((op1), (op2), (mask), (vl))
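// The shim above only reorders operands: the 0.10 intrinsics took the mask
// first, the 1.0 __riscv_-prefixed ones take it after the sources, e.g.
//   vmerge(mask, vzero, -1, vl)  becomes  __riscv_vmerge(vzero, -1, mask, vl)
// (operand names are placeholders, illustrative only)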
// masked functions
#define vfmerge(mask, op1, op2, vl) __riscv_vfmerge((op1), (op2), (mask), (vl))
// masked functions
#define vcompress(mask, dest, src, vl) __riscv_vcompress_tu((dest), (src), (mask), (vl))
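// vcompress is mapped onto the tail-undisturbed (_tu) form so that lanes past
// the packed elements keep dest's old values, which matches the 0.10 variant
// where dest was an explicit operand. (illustrative note, by my reading)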
// Reinterpret between different type under the same SEW/LMUL
// Reinterpret between different SEW under the same LMUL
#define vse16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vse16, __riscv_vse16, 2, 1)(__VA_ARGS__)
#define vse32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vse32, __riscv_vse32, 2, 1)(__VA_ARGS__)
#define vse64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vse64, __riscv_vse64, 2, 1)(__VA_ARGS__)
#define vse8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vse8, __riscv_vse8, 2, 1)(__VA_ARGS__)
#define vsse16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsse16, __riscv_vsse16, 3, 2, 1)(__VA_ARGS__)
#define vsse32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsse32, __riscv_vsse32, 3, 2, 1)(__VA_ARGS__)
#define vsse64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsse64, __riscv_vsse64, 3, 2, 1)(__VA_ARGS__)
#define vsse8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsse8, __riscv_vsse8, 3, 2, 1)(__VA_ARGS__)
#define vloxei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vloxei8_tumu, 4, __riscv_vloxei8, 2, 1)(__VA_ARGS__)
#define vloxei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vloxei16_tumu, 4, __riscv_vloxei16, 2, 1)(__VA_ARGS__)
#define vloxei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vloxei32_tumu, 4, __riscv_vloxei32, 2, 1)(__VA_ARGS__)
#define vloxei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vloxei64_tumu, 4, __riscv_vloxei64, 2, 1)(__VA_ARGS__)
#define vluxei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vluxei8_tumu, 4, __riscv_vluxei8, 2, 1)(__VA_ARGS__)
#define vluxei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vluxei16_tumu, 4, __riscv_vluxei16, 2, 1)(__VA_ARGS__)
#define vluxei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vluxei32_tumu, 4, __riscv_vluxei32, 2, 1)(__VA_ARGS__)
#define vluxei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vluxei64_tumu, 4, __riscv_vluxei64, 2, 1)(__VA_ARGS__)
#define vsoxei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsoxei8, __riscv_vsoxei8, 3, 2, 1)(__VA_ARGS__)
#define vsoxei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsoxei16, __riscv_vsoxei16, 3, 2, 1)(__VA_ARGS__)
#define vsoxei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsoxei32, __riscv_vsoxei32, 3, 2, 1)(__VA_ARGS__)
#define vsoxei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsoxei64, __riscv_vsoxei64, 3, 2, 1)(__VA_ARGS__)
#define vsuxei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsuxei8, __riscv_vsuxei8, 3, 2, 1)(__VA_ARGS__)
#define vsuxei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsuxei16, __riscv_vsuxei16, 3, 2, 1)(__VA_ARGS__)
#define vsuxei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsuxei32, __riscv_vsuxei32, 3, 2, 1)(__VA_ARGS__)
#define vsuxei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsuxei64, __riscv_vsuxei64, 3, 2, 1)(__VA_ARGS__)
#define vsseg2e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsseg2e16, __riscv_vsseg2e16, 3, 2, 1)(__VA_ARGS__)
#define vsseg3e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsseg3e16, __riscv_vsseg3e16, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg4e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsseg4e16, __riscv_vsseg4e16, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg5e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsseg5e16, __riscv_vsseg5e16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg6e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsseg6e16, __riscv_vsseg6e16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg7e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsseg7e16, __riscv_vsseg7e16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg8e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsseg8e16, __riscv_vsseg8e16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg2e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsseg2e32, __riscv_vsseg2e32, 3, 2, 1)(__VA_ARGS__)
#define vsseg3e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsseg3e32, __riscv_vsseg3e32, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg4e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsseg4e32, __riscv_vsseg4e32, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg5e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsseg5e32, __riscv_vsseg5e32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg6e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsseg6e32, __riscv_vsseg6e32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg7e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsseg7e32, __riscv_vsseg7e32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg8e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsseg8e32, __riscv_vsseg8e32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg2e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsseg2e64, __riscv_vsseg2e64, 3, 2, 1)(__VA_ARGS__)
#define vsseg3e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsseg3e64, __riscv_vsseg3e64, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg4e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsseg4e64, __riscv_vsseg4e64, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg5e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsseg5e64, __riscv_vsseg5e64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg6e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsseg6e64, __riscv_vsseg6e64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg7e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsseg7e64, __riscv_vsseg7e64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg8e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsseg8e64, __riscv_vsseg8e64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg2e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsseg2e8, __riscv_vsseg2e8, 3, 2, 1)(__VA_ARGS__)
#define vsseg3e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsseg3e8, __riscv_vsseg3e8, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg4e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsseg4e8, __riscv_vsseg4e8, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg5e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsseg5e8, __riscv_vsseg5e8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg6e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsseg6e8, __riscv_vsseg6e8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg7e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsseg7e8, __riscv_vsseg7e8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg8e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsseg8e8, __riscv_vsseg8e8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg2e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vssseg2e16, __riscv_vssseg2e16, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg3e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vssseg3e16, __riscv_vssseg3e16, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg4e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vssseg4e16, __riscv_vssseg4e16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg5e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vssseg5e16, __riscv_vssseg5e16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg6e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vssseg6e16, __riscv_vssseg6e16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg7e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vssseg7e16, __riscv_vssseg7e16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg8e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vssseg8e16, __riscv_vssseg8e16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg2e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vssseg2e32, __riscv_vssseg2e32, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg3e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vssseg3e32, __riscv_vssseg3e32, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg4e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vssseg4e32, __riscv_vssseg4e32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg5e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vssseg5e32, __riscv_vssseg5e32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg6e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vssseg6e32, __riscv_vssseg6e32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg7e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vssseg7e32, __riscv_vssseg7e32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg8e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vssseg8e32, __riscv_vssseg8e32, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg2e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vssseg2e64, __riscv_vssseg2e64, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg3e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vssseg3e64, __riscv_vssseg3e64, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg4e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vssseg4e64, __riscv_vssseg4e64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg5e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vssseg5e64, __riscv_vssseg5e64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg6e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vssseg6e64, __riscv_vssseg6e64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg7e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vssseg7e64, __riscv_vssseg7e64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg8e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vssseg8e64, __riscv_vssseg8e64, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg2e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vssseg2e8, __riscv_vssseg2e8, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg3e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vssseg3e8, __riscv_vssseg3e8, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg4e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vssseg4e8, __riscv_vssseg4e8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg5e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vssseg5e8, __riscv_vssseg5e8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg6e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vssseg6e8, __riscv_vssseg6e8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg7e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vssseg7e8, __riscv_vssseg7e8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg8e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vssseg8e8, __riscv_vssseg8e8, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg2ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vloxseg2ei8_tumu, 7, 6, __riscv_vloxseg2ei8, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg3ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg3ei8_tumu, 9, 8, 7, __riscv_vloxseg3ei8, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg4ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vloxseg4ei8_tumu, 11, 10, 9, 8, __riscv_vloxseg4ei8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg5ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vloxseg5ei8_tumu, 13, 12, 11, 10, 9, __riscv_vloxseg5ei8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg6ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vloxseg6ei8_tumu, 15, 14, 13, 12, 11, 10, __riscv_vloxseg6ei8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg7ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vloxseg7ei8_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg7ei8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg8ei8(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vloxseg8ei8_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vloxseg8ei8, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg2ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vloxseg2ei16_tumu, 7, 6, __riscv_vloxseg2ei16, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg3ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg3ei16_tumu, 9, 8, 7, __riscv_vloxseg3ei16, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg4ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vloxseg4ei16_tumu, 11, 10, 9, 8, __riscv_vloxseg4ei16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg5ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vloxseg5ei16_tumu, 13, 12, 11, 10, 9, __riscv_vloxseg5ei16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg6ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vloxseg6ei16_tumu, 15, 14, 13, 12, 11, 10, __riscv_vloxseg6ei16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg7ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vloxseg7ei16_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg7ei16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg8ei16(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vloxseg8ei16_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vloxseg8ei16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg2ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vloxseg2ei32_tumu, 7, 6, __riscv_vloxseg2ei32, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg3ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg3ei32_tumu, 9, 8, 7, __riscv_vloxseg3ei32, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg4ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vloxseg4ei32_tumu, 11, 10, 9, 8, __riscv_vloxseg4ei32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg5ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vloxseg5ei32_tumu, 13, 12, 11, 10, 9, __riscv_vloxseg5ei32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg6ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vloxseg6ei32_tumu, 15, 14, 13, 12, 11, 10, __riscv_vloxseg6ei32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg7ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vloxseg7ei32_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg7ei32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg8ei32(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vloxseg8ei32_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vloxseg8ei32, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg2ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vloxseg2ei64_tumu, 7, 6, __riscv_vloxseg2ei64, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg3ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg3ei64_tumu, 9, 8, 7, __riscv_vloxseg3ei64, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg4ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vloxseg4ei64_tumu, 11, 10, 9, 8, __riscv_vloxseg4ei64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg5ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vloxseg5ei64_tumu, 13, 12, 11, 10, 9, __riscv_vloxseg5ei64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg6ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vloxseg6ei64_tumu, 15, 14, 13, 12, 11, 10, __riscv_vloxseg6ei64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg7ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vloxseg7ei64_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg7ei64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg8ei64(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vloxseg8ei64_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vloxseg8ei64, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg2ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vluxseg2ei8_tumu, 7, 6, __riscv_vluxseg2ei8, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg3ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg3ei8_tumu, 9, 8, 7, __riscv_vluxseg3ei8, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg4ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vluxseg4ei8_tumu, 11, 10, 9, 8, __riscv_vluxseg4ei8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg5ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vluxseg5ei8_tumu, 13, 12, 11, 10, 9, __riscv_vluxseg5ei8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg6ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vluxseg6ei8_tumu, 15, 14, 13, 12, 11, 10, __riscv_vluxseg6ei8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg7ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vluxseg7ei8_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg7ei8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg8ei8(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vluxseg8ei8_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vluxseg8ei8, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg2ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vluxseg2ei16_tumu, 7, 6, __riscv_vluxseg2ei16, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg3ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg3ei16_tumu, 9, 8, 7, __riscv_vluxseg3ei16, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg4ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vluxseg4ei16_tumu, 11, 10, 9, 8, __riscv_vluxseg4ei16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg5ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vluxseg5ei16_tumu, 13, 12, 11, 10, 9, __riscv_vluxseg5ei16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg6ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vluxseg6ei16_tumu, 15, 14, 13, 12, 11, 10, __riscv_vluxseg6ei16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg7ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vluxseg7ei16_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg7ei16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg8ei16(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vluxseg8ei16_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vluxseg8ei16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg2ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vluxseg2ei32_tumu, 7, 6, __riscv_vluxseg2ei32, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg3ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg3ei32_tumu, 9, 8, 7, __riscv_vluxseg3ei32, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg4ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vluxseg4ei32_tumu, 11, 10, 9, 8, __riscv_vluxseg4ei32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg5ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vluxseg5ei32_tumu, 13, 12, 11, 10, 9, __riscv_vluxseg5ei32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg6ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vluxseg6ei32_tumu, 15, 14, 13, 12, 11, 10, __riscv_vluxseg6ei32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg7ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vluxseg7ei32_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg7ei32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg8ei32(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vluxseg8ei32_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vluxseg8ei32, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg2ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vluxseg2ei64_tumu, 7, 6, __riscv_vluxseg2ei64, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg3ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg3ei64_tumu, 9, 8, 7, __riscv_vluxseg3ei64, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg4ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vluxseg4ei64_tumu, 11, 10, 9, 8, __riscv_vluxseg4ei64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg5ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vluxseg5ei64_tumu, 13, 12, 11, 10, 9, __riscv_vluxseg5ei64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg6ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vluxseg6ei64_tumu, 15, 14, 13, 12, 11, 10, __riscv_vluxseg6ei64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg7ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vluxseg7ei64_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg7ei64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg8ei64(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vluxseg8ei64_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vluxseg8ei64, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg2ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsoxseg2ei8, __riscv_vsoxseg2ei8, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg3ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsoxseg3ei8, __riscv_vsoxseg3ei8, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg4ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsoxseg4ei8, __riscv_vsoxseg4ei8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg5ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsoxseg5ei8, __riscv_vsoxseg5ei8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg6ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsoxseg6ei8, __riscv_vsoxseg6ei8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg7ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsoxseg7ei8, __riscv_vsoxseg7ei8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg8ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsoxseg8ei8, __riscv_vsoxseg8ei8, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg2ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsoxseg2ei16, __riscv_vsoxseg2ei16, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg3ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsoxseg3ei16, __riscv_vsoxseg3ei16, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg4ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsoxseg4ei16, __riscv_vsoxseg4ei16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg5ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsoxseg5ei16, __riscv_vsoxseg5ei16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg6ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsoxseg6ei16, __riscv_vsoxseg6ei16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg7ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsoxseg7ei16, __riscv_vsoxseg7ei16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg8ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsoxseg8ei16, __riscv_vsoxseg8ei16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg2ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsoxseg2ei32, __riscv_vsoxseg2ei32, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg3ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsoxseg3ei32, __riscv_vsoxseg3ei32, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg4ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsoxseg4ei32, __riscv_vsoxseg4ei32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg5ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsoxseg5ei32, __riscv_vsoxseg5ei32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg6ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsoxseg6ei32, __riscv_vsoxseg6ei32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg7ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsoxseg7ei32, __riscv_vsoxseg7ei32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg8ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsoxseg8ei32, __riscv_vsoxseg8ei32, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg2ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsoxseg2ei64, __riscv_vsoxseg2ei64, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg3ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsoxseg3ei64, __riscv_vsoxseg3ei64, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg4ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsoxseg4ei64, __riscv_vsoxseg4ei64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg5ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsoxseg5ei64, __riscv_vsoxseg5ei64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg6ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsoxseg6ei64, __riscv_vsoxseg6ei64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg7ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsoxseg7ei64, __riscv_vsoxseg7ei64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg8ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsoxseg8ei64, __riscv_vsoxseg8ei64, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg2ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsuxseg2ei8, __riscv_vsuxseg2ei8, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg3ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsuxseg3ei8, __riscv_vsuxseg3ei8, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg4ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsuxseg4ei8, __riscv_vsuxseg4ei8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg5ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsuxseg5ei8, __riscv_vsuxseg5ei8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg6ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsuxseg6ei8, __riscv_vsuxseg6ei8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg7ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsuxseg7ei8, __riscv_vsuxseg7ei8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg8ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsuxseg8ei8, __riscv_vsuxseg8ei8, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg2ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsuxseg2ei16, __riscv_vsuxseg2ei16, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg3ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsuxseg3ei16, __riscv_vsuxseg3ei16, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg4ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsuxseg4ei16, __riscv_vsuxseg4ei16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg5ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsuxseg5ei16, __riscv_vsuxseg5ei16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg6ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsuxseg6ei16, __riscv_vsuxseg6ei16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg7ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsuxseg7ei16, __riscv_vsuxseg7ei16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg8ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsuxseg8ei16, __riscv_vsuxseg8ei16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg2ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsuxseg2ei32, __riscv_vsuxseg2ei32, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg3ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsuxseg3ei32, __riscv_vsuxseg3ei32, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg4ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsuxseg4ei32, __riscv_vsuxseg4ei32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg5ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsuxseg5ei32, __riscv_vsuxseg5ei32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg6ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsuxseg6ei32, __riscv_vsuxseg6ei32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg7ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsuxseg7ei32, __riscv_vsuxseg7ei32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg8ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsuxseg8ei32, __riscv_vsuxseg8ei32, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg2ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsuxseg2ei64, __riscv_vsuxseg2ei64, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg3ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsuxseg3ei64, __riscv_vsuxseg3ei64, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg4ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsuxseg4ei64, __riscv_vsuxseg4ei64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg5ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsuxseg5ei64, __riscv_vsuxseg5ei64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg6ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsuxseg6ei64, __riscv_vsuxseg6ei64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg7ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsuxseg7ei64, __riscv_vsuxseg7ei64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg8ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsuxseg8ei64, __riscv_vsuxseg8ei64, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vadd_tumu, 4, __riscv_vadd, 2, 1)(__VA_ARGS__)
#define vsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsub_tumu, 4, __riscv_vsub, 2, 1)(__VA_ARGS__)
#define vrsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vrsub_tumu, 4, __riscv_vrsub, 2, 1)(__VA_ARGS__)
#define vneg(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vneg_tumu, 3, __riscv_vneg, 1)(__VA_ARGS__)
#define vwadd_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwadd_vv_tumu, 4, __riscv_vwadd_vv, 2, 1)(__VA_ARGS__)
#define vwadd_vx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwadd_vx_tumu, 4, __riscv_vwadd_vx, 2, 1)(__VA_ARGS__)
#define vwadd_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwadd_wv_tumu, 4, __riscv_vwadd_wv, 2, 1)(__VA_ARGS__)
#define vwadd_wx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwadd_wx_tumu, 4, __riscv_vwadd_wx, 2, 1)(__VA_ARGS__)
#define vwsub_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsub_vv_tumu, 4, __riscv_vwsub_vv, 2, 1)(__VA_ARGS__)
#define vwsub_vx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsub_vx_tumu, 4, __riscv_vwsub_vx, 2, 1)(__VA_ARGS__)
#define vwsub_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsub_wv_tumu, 4, __riscv_vwsub_wv, 2, 1)(__VA_ARGS__)
#define vwsub_wx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsub_wx_tumu, 4, __riscv_vwsub_wx, 2, 1)(__VA_ARGS__)
#define vwaddu_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwaddu_vv_tumu, 4, __riscv_vwaddu_vv, 2, 1)(__VA_ARGS__)
#define vwaddu_vx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwaddu_vx_tumu, 4, __riscv_vwaddu_vx, 2, 1)(__VA_ARGS__)
#define vwaddu_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwaddu_wv_tumu, 4, __riscv_vwaddu_wv, 2, 1)(__VA_ARGS__)
#define vwaddu_wx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwaddu_wx_tumu, 4, __riscv_vwaddu_wx, 2, 1)(__VA_ARGS__)
#define vwsubu_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsubu_vv_tumu, 4, __riscv_vwsubu_vv, 2, 1)(__VA_ARGS__)
#define vwsubu_vx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsubu_vx_tumu, 4, __riscv_vwsubu_vx, 2, 1)(__VA_ARGS__)
#define vwsubu_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsubu_wv_tumu, 4, __riscv_vwsubu_wv, 2, 1)(__VA_ARGS__)
#define vwsubu_wx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsubu_wx_tumu, 4, __riscv_vwsubu_wx, 2, 1)(__VA_ARGS__)
#define vsext_vf2(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vsext_vf2_tumu, 3, __riscv_vsext_vf2, 1)(__VA_ARGS__)
#define vsext_vf4(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vsext_vf4_tumu, 3, __riscv_vsext_vf4, 1)(__VA_ARGS__)
#define vsext_vf8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vsext_vf8_tumu, 3, __riscv_vsext_vf8, 1)(__VA_ARGS__)
#define vzext_vf2(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vzext_vf2_tumu, 3, __riscv_vzext_vf2, 1)(__VA_ARGS__)
#define vzext_vf4(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vzext_vf4_tumu, 3, __riscv_vzext_vf4, 1)(__VA_ARGS__)
#define vzext_vf8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vzext_vf8_tumu, 3, __riscv_vzext_vf8, 1)(__VA_ARGS__)
#define vadc(...) __riscv_vadc(__VA_ARGS__)
#define vsbc(...) __riscv_vsbc(__VA_ARGS__)
#define vmadc(...) __riscv_vmadc(__VA_ARGS__)
#define vmsbc(...) __riscv_vmsbc(__VA_ARGS__)
#define vand(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vand_tumu, 4, __riscv_vand, 2, 1)(__VA_ARGS__)
#define vor(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vor_tumu, 4, __riscv_vor, 2, 1)(__VA_ARGS__)
#define vxor(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vxor_tumu, 4, __riscv_vxor, 2, 1)(__VA_ARGS__)
#define vnot(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vnot_tumu, 3, __riscv_vnot, 1)(__VA_ARGS__)
#define vsll(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsll_tumu, 4, __riscv_vsll, 2, 1)(__VA_ARGS__)
#define vsra(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsra_tumu, 4, __riscv_vsra, 2, 1)(__VA_ARGS__)
#define vsrl(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsrl_tumu, 4, __riscv_vsrl, 2, 1)(__VA_ARGS__)
#define vnsra(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnsra_tumu, 4, __riscv_vnsra, 2, 1)(__VA_ARGS__)
#define vnsrl(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnsrl_tumu, 4, __riscv_vnsrl, 2, 1)(__VA_ARGS__)
#define vmseq(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmseq_mu, 4, __riscv_vmseq, 2, 1)(__VA_ARGS__)
#define vmsne(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsne_mu, 4, __riscv_vmsne, 2, 1)(__VA_ARGS__)
#define vmslt(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmslt_mu, 4, __riscv_vmslt, 2, 1)(__VA_ARGS__)
#define vmsle(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsle_mu, 4, __riscv_vmsle, 2, 1)(__VA_ARGS__)
#define vmsgt(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsgt_mu, 4, __riscv_vmsgt, 2, 1)(__VA_ARGS__)
#define vmsge(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsge_mu, 4, __riscv_vmsge, 2, 1)(__VA_ARGS__)
#define vmsltu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsltu_mu, 4, __riscv_vmsltu, 2, 1)(__VA_ARGS__)
#define vmsleu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsleu_mu, 4, __riscv_vmsleu, 2, 1)(__VA_ARGS__)
#define vmsgtu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsgtu_mu, 4, __riscv_vmsgtu, 2, 1)(__VA_ARGS__)
#define vmsgeu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsgeu_mu, 4, __riscv_vmsgeu, 2, 1)(__VA_ARGS__)
#define vmin(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmin_tumu, 4, __riscv_vmin, 2, 1)(__VA_ARGS__)
#define vmax(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmax_tumu, 4, __riscv_vmax, 2, 1)(__VA_ARGS__)
#define vminu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vminu_tumu, 4, __riscv_vminu, 2, 1)(__VA_ARGS__)
#define vmaxu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmaxu_tumu, 4, __riscv_vmaxu, 2, 1)(__VA_ARGS__)
#define vmul(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmul_tumu, 4, __riscv_vmul, 2, 1)(__VA_ARGS__)
#define vmulh(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmulh_tumu, 4, __riscv_vmulh, 2, 1)(__VA_ARGS__)
#define vmulhsu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmulhsu_tumu, 4, __riscv_vmulhsu, 2, 1)(__VA_ARGS__)
#define vmulhu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmulhu_tumu, 4, __riscv_vmulhu, 2, 1)(__VA_ARGS__)
#define vdiv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vdiv_tumu, 4, __riscv_vdiv, 2, 1)(__VA_ARGS__)
#define vrem(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vrem_tumu, 4, __riscv_vrem, 2, 1)(__VA_ARGS__)
#define vdivu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vdivu_tumu, 4, __riscv_vdivu, 2, 1)(__VA_ARGS__)
#define vremu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vremu_tumu, 4, __riscv_vremu, 2, 1)(__VA_ARGS__)
#define vwmul(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmul_tumu, 4, __riscv_vwmul, 2, 1)(__VA_ARGS__)
#define vwmulsu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmulsu_tumu, 4, __riscv_vwmulsu, 2, 1)(__VA_ARGS__)
#define vwmulu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmulu_tumu, 4, __riscv_vwmulu, 2, 1)(__VA_ARGS__)
#define vmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmacc_tumu, __riscv_vmacc_tu, 3, 2, 1)(__VA_ARGS__)
#define vnmsac(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnmsac_tumu, __riscv_vnmsac_tu, 3, 2, 1)(__VA_ARGS__)
#define vmadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmadd_tumu, __riscv_vmadd_tu, 3, 2, 1)(__VA_ARGS__)
#define vnmsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnmsub_tumu, __riscv_vnmsub_tu, 3, 2, 1)(__VA_ARGS__)
#define vwmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmacc_tumu, __riscv_vwmacc_tu, 3, 2, 1)(__VA_ARGS__)
#define vwmaccsu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmaccsu_tumu, __riscv_vwmaccsu_tu, 3, 2, 1)(__VA_ARGS__)
#define vwmaccus(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmaccus_tumu, __riscv_vwmaccus_tu, 3, 2, 1)(__VA_ARGS__)
#define vwmaccu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmaccu_tumu, __riscv_vwmaccu_tu, 3, 2, 1)(__VA_ARGS__)
#define vmv_v(...) __riscv_vmv_v(__VA_ARGS__)
#define vsadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsadd_tumu, 4, __riscv_vsadd, 2, 1)(__VA_ARGS__)
#define vssub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vssub_tumu, 4, __riscv_vssub, 2, 1)(__VA_ARGS__)
#define vsaddu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsaddu_tumu, 4, __riscv_vsaddu, 2, 1)(__VA_ARGS__)
#define vssubu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vssubu_tumu, 4, __riscv_vssubu, 2, 1)(__VA_ARGS__)
#define vaadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vaadd_tumu, 4, __riscv_vaadd, 2, 1)(__VA_ARGS__)
#define vasub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vasub_tumu, 4, __riscv_vasub, 2, 1)(__VA_ARGS__)
#define vaaddu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vaaddu_tumu, 4, __riscv_vaaddu, 2, 1)(__VA_ARGS__)
#define vasubu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vasubu_tumu, 4, __riscv_vasubu, 2, 1)(__VA_ARGS__)
#define vsmul(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsmul_mu, 4, __riscv_vsmul, 2, 1)(__VA_ARGS__)
#define vssra(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vssra_tumu, 4, __riscv_vssra, 2, 1)(__VA_ARGS__)
#define vssrl(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vssrl_tumu, 4, __riscv_vssrl, 2, 1)(__VA_ARGS__)
#define vnclip(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnclip_tumu, 4, __riscv_vnclip, 2, 1)(__VA_ARGS__)
#define vnclipu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnclipu_tumu, 4, __riscv_vnclipu, 2, 1)(__VA_ARGS__)
#define vfadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfadd_tumu, 4, __riscv_vfadd, 2, 1)(__VA_ARGS__)
#define vfsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfsub_tumu, 4, __riscv_vfsub, 2, 1)(__VA_ARGS__)
#define vfrsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfrsub_tumu, 4, __riscv_vfrsub, 2, 1)(__VA_ARGS__)
#define vfneg(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfneg_tumu, 3, __riscv_vfneg, 1)(__VA_ARGS__)
#define vfwadd_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwadd_vv_tumu, 4, __riscv_vfwadd_vv, 2, 1)(__VA_ARGS__)
#define vfwadd_vf(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwadd_vf_tumu, 4, __riscv_vfwadd_vf, 2, 1)(__VA_ARGS__)
#define vfwadd_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwadd_wv_tumu, 4, __riscv_vfwadd_wv, 2, 1)(__VA_ARGS__)
#define vfwadd_wf(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwadd_wf_tumu, 4, __riscv_vfwadd_wf, 2, 1)(__VA_ARGS__)
#define vfwsub_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwsub_vv_tumu, 4, __riscv_vfwsub_vv, 2, 1)(__VA_ARGS__)
#define vfwsub_vf(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwsub_vf_tumu, 4, __riscv_vfwsub_vf, 2, 1)(__VA_ARGS__)
#define vfwsub_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwsub_wv_tumu, 4, __riscv_vfwsub_wv, 2, 1)(__VA_ARGS__)
#define vfwsub_wf(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwsub_wf_tumu, 4, __riscv_vfwsub_wf, 2, 1)(__VA_ARGS__)
#define vfmul(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmul_tumu, 4, __riscv_vfmul, 2, 1)(__VA_ARGS__)
#define vfdiv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfdiv_tumu, 4, __riscv_vfdiv, 2, 1)(__VA_ARGS__)
#define vfrdiv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfrdiv_tumu, 4, __riscv_vfrdiv, 2, 1)(__VA_ARGS__)
#define vfwmul(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwmul_tumu, 4, __riscv_vfwmul, 2, 1)(__VA_ARGS__)
#define vfmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmacc_tumu, __riscv_vfmacc_tu, 3, 2, 1)(__VA_ARGS__)
#define vfnmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfnmacc_tumu, __riscv_vfnmacc_tu, 3, 2, 1)(__VA_ARGS__)
#define vfmsac(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmsac_tumu, __riscv_vfmsac_tu, 3, 2, 1)(__VA_ARGS__)
#define vfnmsac(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfnmsac_tumu, __riscv_vfnmsac_tu, 3, 2, 1)(__VA_ARGS__)
#define vfmadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmadd_tumu, __riscv_vfmadd_tu, 3, 2, 1)(__VA_ARGS__)
#define vfnmadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfnmadd_tumu, __riscv_vfnmadd_tu, 3, 2, 1)(__VA_ARGS__)
#define vfmsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmsub_tumu, __riscv_vfmsub_tu, 3, 2, 1)(__VA_ARGS__)
#define vfnmsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfnmsub_tumu, __riscv_vfnmsub_tu, 3, 2, 1)(__VA_ARGS__)
#define vfwmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwmacc_tumu, __riscv_vfwmacc_tu, 3, 2, 1)(__VA_ARGS__)
#define vfwnmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwnmacc_tumu, __riscv_vfwnmacc_tu, 3, 2, 1)(__VA_ARGS__)
#define vfwmsac(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwmsac_tumu, __riscv_vfwmsac_tu, 3, 2, 1)(__VA_ARGS__)
#define vfwnmsac(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwnmsac_tumu, __riscv_vfwnmsac_tu, 3, 2, 1)(__VA_ARGS__)
#define vfsqrt(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfsqrt_tumu, 3, __riscv_vfsqrt, 1)(__VA_ARGS__)
#define vfrsqrt7(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfrsqrt7_tumu, 3, __riscv_vfrsqrt7, 1)(__VA_ARGS__)
#define vfrec7(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfrec7_tumu, 3, __riscv_vfrec7, 1)(__VA_ARGS__)
#define vfmin(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmin_tumu, 4, __riscv_vfmin, 2, 1)(__VA_ARGS__)
#define vfmax(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmax_tumu, 4, __riscv_vfmax, 2, 1)(__VA_ARGS__)
#define vfsgnj(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfsgnj_tumu, 4, __riscv_vfsgnj, 2, 1)(__VA_ARGS__)
#define vfsgnjn(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfsgnjn_tumu, 4, __riscv_vfsgnjn, 2, 1)(__VA_ARGS__)
#define vfsgnjx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfsgnjx_tumu, 4, __riscv_vfsgnjx, 2, 1)(__VA_ARGS__)
#define vfabs(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfabs_tumu, 3, __riscv_vfabs, 1)(__VA_ARGS__)
#define vmfeq(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmfeq_mu, 4, __riscv_vmfeq, 2, 1)(__VA_ARGS__)
#define vmfne(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmfne_mu, 4, __riscv_vmfne, 2, 1)(__VA_ARGS__)
#define vmflt(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmflt_mu, 4, __riscv_vmflt, 2, 1)(__VA_ARGS__)
#define vmfle(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmfle_mu, 4, __riscv_vmfle, 2, 1)(__VA_ARGS__)
#define vmfgt(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmfgt_mu, 4, __riscv_vmfgt, 2, 1)(__VA_ARGS__)
#define vmfge(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmfge_mu, 4, __riscv_vmfge, 2, 1)(__VA_ARGS__)
#define vfclass(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfclass_tumu, 3, __riscv_vfclass, 1)(__VA_ARGS__)
#define vfcvt_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfcvt_x_tumu, 3, __riscv_vfcvt_x, 1)(__VA_ARGS__)
#define vfcvt_rtz_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfcvt_rtz_x_tumu, 3, __riscv_vfcvt_rtz_x, 1)(__VA_ARGS__)
#define vfcvt_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfcvt_xu_tumu, 3, __riscv_vfcvt_xu, 1)(__VA_ARGS__)
#define vfcvt_rtz_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfcvt_rtz_xu_tumu, 3, __riscv_vfcvt_rtz_xu, 1)(__VA_ARGS__)
#define vfcvt_f(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfcvt_f_tumu, 3, __riscv_vfcvt_f, 1)(__VA_ARGS__)
#define vwcvt_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vwcvt_x_tumu, 3, __riscv_vwcvt_x, 1)(__VA_ARGS__)
#define vwcvtu_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vwcvtu_x_tumu, 3, __riscv_vwcvtu_x, 1)(__VA_ARGS__)
#define vfwcvt_f(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfwcvt_f_tumu, 3, __riscv_vfwcvt_f, 1)(__VA_ARGS__)
#define vfwcvt_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfwcvt_x_tumu, 3, __riscv_vfwcvt_x, 1)(__VA_ARGS__)
#define vfwcvt_rtz_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfwcvt_rtz_x_tumu, 3, __riscv_vfwcvt_rtz_x, 1)(__VA_ARGS__)
#define vfwcvt_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfwcvt_xu_tumu, 3, __riscv_vfwcvt_xu, 1)(__VA_ARGS__)
#define vfwcvt_rtz_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfwcvt_rtz_xu_tumu, 3, __riscv_vfwcvt_rtz_xu, 1)(__VA_ARGS__)
#define vfncvt_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_x_tumu, 3, __riscv_vfncvt_x, 1)(__VA_ARGS__)
#define vfncvt_rtz_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_rtz_x_tumu, 3, __riscv_vfncvt_rtz_x, 1)(__VA_ARGS__)
#define vncvt_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vncvt_x_tumu, 3, __riscv_vncvt_x, 1)(__VA_ARGS__)
#define vfncvt_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_xu_tumu, 3, __riscv_vfncvt_xu, 1)(__VA_ARGS__)
#define vfncvt_rtz_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_rtz_xu_tumu, 3, __riscv_vfncvt_rtz_xu, 1)(__VA_ARGS__)
#define vfncvt_f(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_f_tumu, 3, __riscv_vfncvt_f, 1)(__VA_ARGS__)
#define vfncvt_rod_f(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_rod_f_tumu, 3, __riscv_vfncvt_rod_f, 1)(__VA_ARGS__)
#define vredsum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredsum_tum, __riscv_vredsum_tu, 3, 2, 1)(__VA_ARGS__)
#define vredmax(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredmax_tum, __riscv_vredmax_tu, 3, 2, 1)(__VA_ARGS__)
#define vredmin(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredmin_tum, __riscv_vredmin_tu, 3, 2, 1)(__VA_ARGS__)
#define vredand(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredand_tum, __riscv_vredand_tu, 3, 2, 1)(__VA_ARGS__)
#define vredor(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredor_tum, __riscv_vredor_tu, 3, 2, 1)(__VA_ARGS__)
#define vredxor(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredxor_tum, __riscv_vredxor_tu, 3, 2, 1)(__VA_ARGS__)
#define vredmaxu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredmaxu_tum, __riscv_vredmaxu_tu, 3, 2, 1)(__VA_ARGS__)
#define vredminu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredminu_tum, __riscv_vredminu_tu, 3, 2, 1)(__VA_ARGS__)
#define vwredsum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwredsum_tum, __riscv_vwredsum_tu, 3, 2, 1)(__VA_ARGS__)
#define vwredsumu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwredsumu_tum, __riscv_vwredsumu_tu, 3, 2, 1)(__VA_ARGS__)
#define vfredosum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfredosum_tum, __riscv_vfredosum_tu, 3, 2, 1)(__VA_ARGS__)
#define vfredusum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfredusum_tum, __riscv_vfredusum_tu, 3, 2, 1)(__VA_ARGS__)
#define vfredmax(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfredmax_tum, __riscv_vfredmax_tu, 3, 2, 1)(__VA_ARGS__)
#define vfredmin(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfredmin_tum, __riscv_vfredmin_tu, 3, 2, 1)(__VA_ARGS__)
#define vfwredosum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwredosum_tum, __riscv_vfwredosum_tu, 3, 2, 1)(__VA_ARGS__)
#define vfwredusum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwredusum_tum, __riscv_vfwredusum_tu, 3, 2, 1)(__VA_ARGS__)
#define vsm(...) __riscv_vsm(__VA_ARGS__)
#define vmand(...) __riscv_vmand(__VA_ARGS__)
#define vmnand(...) __riscv_vmnand(__VA_ARGS__)
#define vmandn(...) __riscv_vmandn(__VA_ARGS__)
#define vmxor(...) __riscv_vmxor(__VA_ARGS__)
#define vmor(...) __riscv_vmor(__VA_ARGS__)
#define vmnor(...) __riscv_vmnor(__VA_ARGS__)
#define vmorn(...) __riscv_vmorn(__VA_ARGS__)
#define vmxnor(...) __riscv_vmxnor(__VA_ARGS__)
#define vmmv(...) __riscv_vmmv(__VA_ARGS__)
#define vmnot(...) __riscv_vmnot(__VA_ARGS__)
#define vcpop(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, __riscv_vcpop, __riscv_vcpop, 1)(__VA_ARGS__)
#define vfirst(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, __riscv_vfirst, __riscv_vfirst, 1)(__VA_ARGS__)
#define vmsbf(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vmsbf_mu, 3, __riscv_vmsbf, 1)(__VA_ARGS__)
#define vmsif(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vmsif_mu, 3, __riscv_vmsif, 1)(__VA_ARGS__)
#define vmsof(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vmsof_mu, 3, __riscv_vmsof, 1)(__VA_ARGS__)
#define vfmv_f(...) __riscv_vfmv_f(__VA_ARGS__)
#define vfmv_s(...) __riscv_vfmv_s_tu(__VA_ARGS__)
#define vmv_x(...) __riscv_vmv_x(__VA_ARGS__)
#define vmv_s(...) __riscv_vmv_s_tu(__VA_ARGS__)
#define vslideup(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vslideup_tumu, __riscv_vslideup_tu, 3, 2, 1)(__VA_ARGS__)
#define vslidedown(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vslidedown_tumu, __riscv_vslidedown_tu, 3, 2, 1)(__VA_ARGS__)
#define vfslide1up(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfslide1up_tumu, 4, __riscv_vfslide1up, 2, 1)(__VA_ARGS__)
#define vfslide1down(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfslide1down_tumu, 4, __riscv_vfslide1down, 2, 1)(__VA_ARGS__)
#define vslide1up(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vslide1up_tumu, 4, __riscv_vslide1up, 2, 1)(__VA_ARGS__)
#define vslide1down(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vslide1down_tumu, 4, __riscv_vslide1down, 2, 1)(__VA_ARGS__)
#define vrgather(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vrgather_tumu, 4, __riscv_vrgather, 2, 1)(__VA_ARGS__)
#define vrgatherei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vrgatherei16_tumu, 4, __riscv_vrgatherei16, 2, 1)(__VA_ARGS__)
#define vreinterpret_u8mf8(...) __riscv_vreinterpret_u8mf8(__VA_ARGS__)
#define vreinterpret_u8mf4(...) __riscv_vreinterpret_u8mf4(__VA_ARGS__)
#define vreinterpret_u8mf2(...) __riscv_vreinterpret_u8mf2(__VA_ARGS__)
#define vreinterpret_u8m1(...) __riscv_vreinterpret_u8m1(__VA_ARGS__)
#define vreinterpret_u8m2(...) __riscv_vreinterpret_u8m2(__VA_ARGS__)
#define vreinterpret_u8m4(...) __riscv_vreinterpret_u8m4(__VA_ARGS__)
#define vreinterpret_u8m8(...) __riscv_vreinterpret_u8m8(__VA_ARGS__)
#define vreinterpret_i8mf8(...) __riscv_vreinterpret_i8mf8(__VA_ARGS__)
#define vreinterpret_i8mf4(...) __riscv_vreinterpret_i8mf4(__VA_ARGS__)
#define vreinterpret_i8mf2(...) __riscv_vreinterpret_i8mf2(__VA_ARGS__)
#define vreinterpret_i8m1(...) __riscv_vreinterpret_i8m1(__VA_ARGS__)
#define vreinterpret_i8m2(...) __riscv_vreinterpret_i8m2(__VA_ARGS__)
#define vreinterpret_i8m4(...) __riscv_vreinterpret_i8m4(__VA_ARGS__)
#define vreinterpret_i8m8(...) __riscv_vreinterpret_i8m8(__VA_ARGS__)
#define vreinterpret_f16mf4(...) __riscv_vreinterpret_f16mf4(__VA_ARGS__)
#define vreinterpret_f16mf2(...) __riscv_vreinterpret_f16mf2(__VA_ARGS__)
#define vreinterpret_f16m1(...) __riscv_vreinterpret_f16m1(__VA_ARGS__)
#define vreinterpret_f16m2(...) __riscv_vreinterpret_f16m2(__VA_ARGS__)
#define vreinterpret_f16m4(...) __riscv_vreinterpret_f16m4(__VA_ARGS__)
#define vreinterpret_f16m8(...) __riscv_vreinterpret_f16m8(__VA_ARGS__)
#define vreinterpret_u16mf4(...) __riscv_vreinterpret_u16mf4(__VA_ARGS__)
#define vreinterpret_u16mf2(...) __riscv_vreinterpret_u16mf2(__VA_ARGS__)
#define vreinterpret_u16m1(...) __riscv_vreinterpret_u16m1(__VA_ARGS__)
#define vreinterpret_u16m2(...) __riscv_vreinterpret_u16m2(__VA_ARGS__)
#define vreinterpret_u16m4(...) __riscv_vreinterpret_u16m4(__VA_ARGS__)
#define vreinterpret_u16m8(...) __riscv_vreinterpret_u16m8(__VA_ARGS__)
#define vreinterpret_i16mf4(...) __riscv_vreinterpret_i16mf4(__VA_ARGS__)
#define vreinterpret_i16mf2(...) __riscv_vreinterpret_i16mf2(__VA_ARGS__)
#define vreinterpret_i16m1(...) __riscv_vreinterpret_i16m1(__VA_ARGS__)
#define vreinterpret_i16m2(...) __riscv_vreinterpret_i16m2(__VA_ARGS__)
#define vreinterpret_i16m4(...) __riscv_vreinterpret_i16m4(__VA_ARGS__)
#define vreinterpret_i16m8(...) __riscv_vreinterpret_i16m8(__VA_ARGS__)
#define vreinterpret_f32mf2(...) __riscv_vreinterpret_f32mf2(__VA_ARGS__)
#define vreinterpret_f32m1(...) __riscv_vreinterpret_f32m1(__VA_ARGS__)
#define vreinterpret_f32m2(...) __riscv_vreinterpret_f32m2(__VA_ARGS__)
#define vreinterpret_f32m4(...) __riscv_vreinterpret_f32m4(__VA_ARGS__)
#define vreinterpret_f32m8(...) __riscv_vreinterpret_f32m8(__VA_ARGS__)
#define vreinterpret_u32mf2(...) __riscv_vreinterpret_u32mf2(__VA_ARGS__)
#define vreinterpret_u32m1(...) __riscv_vreinterpret_u32m1(__VA_ARGS__)
#define vreinterpret_u32m2(...) __riscv_vreinterpret_u32m2(__VA_ARGS__)
#define vreinterpret_u32m4(...) __riscv_vreinterpret_u32m4(__VA_ARGS__)
#define vreinterpret_u32m8(...) __riscv_vreinterpret_u32m8(__VA_ARGS__)
#define vreinterpret_i32mf2(...) __riscv_vreinterpret_i32mf2(__VA_ARGS__)
#define vreinterpret_i32m1(...) __riscv_vreinterpret_i32m1(__VA_ARGS__)
#define vreinterpret_i32m2(...) __riscv_vreinterpret_i32m2(__VA_ARGS__)
#define vreinterpret_i32m4(...) __riscv_vreinterpret_i32m4(__VA_ARGS__)
#define vreinterpret_i32m8(...) __riscv_vreinterpret_i32m8(__VA_ARGS__)
#define vreinterpret_f64m1(...) __riscv_vreinterpret_f64m1(__VA_ARGS__)
#define vreinterpret_f64m2(...) __riscv_vreinterpret_f64m2(__VA_ARGS__)
#define vreinterpret_f64m4(...) __riscv_vreinterpret_f64m4(__VA_ARGS__)
#define vreinterpret_f64m8(...) __riscv_vreinterpret_f64m8(__VA_ARGS__)
#define vreinterpret_u64m1(...) __riscv_vreinterpret_u64m1(__VA_ARGS__)
#define vreinterpret_u64m2(...) __riscv_vreinterpret_u64m2(__VA_ARGS__)
#define vreinterpret_u64m4(...) __riscv_vreinterpret_u64m4(__VA_ARGS__)
#define vreinterpret_u64m8(...) __riscv_vreinterpret_u64m8(__VA_ARGS__)
#define vreinterpret_i64m1(...) __riscv_vreinterpret_i64m1(__VA_ARGS__)
#define vreinterpret_i64m2(...) __riscv_vreinterpret_i64m2(__VA_ARGS__)
#define vreinterpret_i64m4(...) __riscv_vreinterpret_i64m4(__VA_ARGS__)
#define vreinterpret_i64m8(...) __riscv_vreinterpret_i64m8(__VA_ARGS__)
#define vlmul_ext_f16mf2(...) __riscv_vlmul_ext_f16mf2(__VA_ARGS__)
#define vlmul_ext_f16m1(...) __riscv_vlmul_ext_f16m1(__VA_ARGS__)
#define vlmul_ext_f16m2(...) __riscv_vlmul_ext_f16m2(__VA_ARGS__)
#define vlmul_ext_f16m4(...) __riscv_vlmul_ext_f16m4(__VA_ARGS__)
#define vlmul_ext_f16m8(...) __riscv_vlmul_ext_f16m8(__VA_ARGS__)
#define vlmul_ext_f32m1(...) __riscv_vlmul_ext_f32m1(__VA_ARGS__)
#define vlmul_ext_f32m2(...) __riscv_vlmul_ext_f32m2(__VA_ARGS__)
#define vlmul_ext_f32m4(...) __riscv_vlmul_ext_f32m4(__VA_ARGS__)
#define vlmul_ext_f32m8(...) __riscv_vlmul_ext_f32m8(__VA_ARGS__)
#define vlmul_ext_f64m2(...) __riscv_vlmul_ext_f64m2(__VA_ARGS__)
#define vlmul_ext_f64m4(...) __riscv_vlmul_ext_f64m4(__VA_ARGS__)
#define vlmul_ext_f64m8(...) __riscv_vlmul_ext_f64m8(__VA_ARGS__)
#define vlmul_ext_i8mf4(...) __riscv_vlmul_ext_i8mf4(__VA_ARGS__)
#define vlmul_ext_i8mf2(...) __riscv_vlmul_ext_i8mf2(__VA_ARGS__)
#define vlmul_ext_i8m1(...) __riscv_vlmul_ext_i8m1(__VA_ARGS__)
#define vlmul_ext_i8m2(...) __riscv_vlmul_ext_i8m2(__VA_ARGS__)
#define vlmul_ext_i8m4(...) __riscv_vlmul_ext_i8m4(__VA_ARGS__)
#define vlmul_ext_i8m8(...) __riscv_vlmul_ext_i8m8(__VA_ARGS__)
#define vlmul_ext_i16mf2(...) __riscv_vlmul_ext_i16mf2(__VA_ARGS__)
#define vlmul_ext_i16m1(...) __riscv_vlmul_ext_i16m1(__VA_ARGS__)
#define vlmul_ext_i16m2(...) __riscv_vlmul_ext_i16m2(__VA_ARGS__)
#define vlmul_ext_i16m4(...) __riscv_vlmul_ext_i16m4(__VA_ARGS__)
#define vlmul_ext_i16m8(...) __riscv_vlmul_ext_i16m8(__VA_ARGS__)
#define vlmul_ext_i32m1(...) __riscv_vlmul_ext_i32m1(__VA_ARGS__)
#define vlmul_ext_i32m2(...) __riscv_vlmul_ext_i32m2(__VA_ARGS__)
#define vlmul_ext_i32m4(...) __riscv_vlmul_ext_i32m4(__VA_ARGS__)
#define vlmul_ext_i32m8(...) __riscv_vlmul_ext_i32m8(__VA_ARGS__)
#define vlmul_ext_i64m2(...) __riscv_vlmul_ext_i64m2(__VA_ARGS__)
#define vlmul_ext_i64m4(...) __riscv_vlmul_ext_i64m4(__VA_ARGS__)
#define vlmul_ext_i64m8(...) __riscv_vlmul_ext_i64m8(__VA_ARGS__)
#define vlmul_ext_u8mf4(...) __riscv_vlmul_ext_u8mf4(__VA_ARGS__)
#define vlmul_ext_u8mf2(...) __riscv_vlmul_ext_u8mf2(__VA_ARGS__)
#define vlmul_ext_u8m1(...) __riscv_vlmul_ext_u8m1(__VA_ARGS__)
#define vlmul_ext_u8m2(...) __riscv_vlmul_ext_u8m2(__VA_ARGS__)
#define vlmul_ext_u8m4(...) __riscv_vlmul_ext_u8m4(__VA_ARGS__)
#define vlmul_ext_u8m8(...) __riscv_vlmul_ext_u8m8(__VA_ARGS__)
#define vlmul_ext_u16mf2(...) __riscv_vlmul_ext_u16mf2(__VA_ARGS__)
#define vlmul_ext_u16m1(...) __riscv_vlmul_ext_u16m1(__VA_ARGS__)
#define vlmul_ext_u16m2(...) __riscv_vlmul_ext_u16m2(__VA_ARGS__)
#define vlmul_ext_u16m4(...) __riscv_vlmul_ext_u16m4(__VA_ARGS__)
#define vlmul_ext_u16m8(...) __riscv_vlmul_ext_u16m8(__VA_ARGS__)
#define vlmul_ext_u32m1(...) __riscv_vlmul_ext_u32m1(__VA_ARGS__)
#define vlmul_ext_u32m2(...) __riscv_vlmul_ext_u32m2(__VA_ARGS__)
#define vlmul_ext_u32m4(...) __riscv_vlmul_ext_u32m4(__VA_ARGS__)
#define vlmul_ext_u32m8(...) __riscv_vlmul_ext_u32m8(__VA_ARGS__)
#define vlmul_ext_u64m2(...) __riscv_vlmul_ext_u64m2(__VA_ARGS__)
#define vlmul_ext_u64m4(...) __riscv_vlmul_ext_u64m4(__VA_ARGS__)
#define vlmul_ext_u64m8(...) __riscv_vlmul_ext_u64m8(__VA_ARGS__)
#define vlmul_trunc_f16mf4(...) __riscv_vlmul_trunc_f16mf4(__VA_ARGS__)
#define vlmul_trunc_f16mf2(...) __riscv_vlmul_trunc_f16mf2(__VA_ARGS__)
#define vlmul_trunc_f16m1(...) __riscv_vlmul_trunc_f16m1(__VA_ARGS__)
#define vlmul_trunc_f16m2(...) __riscv_vlmul_trunc_f16m2(__VA_ARGS__)
#define vlmul_trunc_f16m4(...) __riscv_vlmul_trunc_f16m4(__VA_ARGS__)
#define vlmul_trunc_f32mf2(...) __riscv_vlmul_trunc_f32mf2(__VA_ARGS__)
#define vlmul_trunc_f32m1(...) __riscv_vlmul_trunc_f32m1(__VA_ARGS__)
#define vlmul_trunc_f32m2(...) __riscv_vlmul_trunc_f32m2(__VA_ARGS__)
#define vlmul_trunc_f32m4(...) __riscv_vlmul_trunc_f32m4(__VA_ARGS__)
#define vlmul_trunc_f64m1(...) __riscv_vlmul_trunc_f64m1(__VA_ARGS__)
#define vlmul_trunc_f64m2(...) __riscv_vlmul_trunc_f64m2(__VA_ARGS__)
#define vlmul_trunc_f64m4(...) __riscv_vlmul_trunc_f64m4(__VA_ARGS__)
#define vlmul_trunc_i8mf8(...) __riscv_vlmul_trunc_i8mf8(__VA_ARGS__)
#define vlmul_trunc_i8mf4(...) __riscv_vlmul_trunc_i8mf4(__VA_ARGS__)
#define vlmul_trunc_i8mf2(...) __riscv_vlmul_trunc_i8mf2(__VA_ARGS__)
#define vlmul_trunc_i8m1(...) __riscv_vlmul_trunc_i8m1(__VA_ARGS__)
#define vlmul_trunc_i8m2(...) __riscv_vlmul_trunc_i8m2(__VA_ARGS__)
#define vlmul_trunc_i8m4(...) __riscv_vlmul_trunc_i8m4(__VA_ARGS__)
#define vlmul_trunc_i16mf4(...) __riscv_vlmul_trunc_i16mf4(__VA_ARGS__)
#define vlmul_trunc_i16mf2(...) __riscv_vlmul_trunc_i16mf2(__VA_ARGS__)
#define vlmul_trunc_i16m1(...) __riscv_vlmul_trunc_i16m1(__VA_ARGS__)
#define vlmul_trunc_i16m2(...) __riscv_vlmul_trunc_i16m2(__VA_ARGS__)
#define vlmul_trunc_i16m4(...) __riscv_vlmul_trunc_i16m4(__VA_ARGS__)
#define vlmul_trunc_i32mf2(...) __riscv_vlmul_trunc_i32mf2(__VA_ARGS__)
#define vlmul_trunc_i32m1(...) __riscv_vlmul_trunc_i32m1(__VA_ARGS__)
#define vlmul_trunc_i32m2(...) __riscv_vlmul_trunc_i32m2(__VA_ARGS__)
#define vlmul_trunc_i32m4(...) __riscv_vlmul_trunc_i32m4(__VA_ARGS__)
#define vlmul_trunc_i64m1(...) __riscv_vlmul_trunc_i64m1(__VA_ARGS__)
#define vlmul_trunc_i64m2(...) __riscv_vlmul_trunc_i64m2(__VA_ARGS__)
#define vlmul_trunc_i64m4(...) __riscv_vlmul_trunc_i64m4(__VA_ARGS__)
#define vlmul_trunc_u8mf8(...) __riscv_vlmul_trunc_u8mf8(__VA_ARGS__)
#define vlmul_trunc_u8mf4(...) __riscv_vlmul_trunc_u8mf4(__VA_ARGS__)
#define vlmul_trunc_u8mf2(...) __riscv_vlmul_trunc_u8mf2(__VA_ARGS__)
#define vlmul_trunc_u8m1(...) __riscv_vlmul_trunc_u8m1(__VA_ARGS__)
#define vlmul_trunc_u8m2(...) __riscv_vlmul_trunc_u8m2(__VA_ARGS__)
#define vlmul_trunc_u8m4(...) __riscv_vlmul_trunc_u8m4(__VA_ARGS__)
#define vlmul_trunc_u16mf4(...) __riscv_vlmul_trunc_u16mf4(__VA_ARGS__)
#define vlmul_trunc_u16mf2(...) __riscv_vlmul_trunc_u16mf2(__VA_ARGS__)
#define vlmul_trunc_u16m1(...) __riscv_vlmul_trunc_u16m1(__VA_ARGS__)
#define vlmul_trunc_u16m2(...) __riscv_vlmul_trunc_u16m2(__VA_ARGS__)
#define vlmul_trunc_u16m4(...) __riscv_vlmul_trunc_u16m4(__VA_ARGS__)
#define vlmul_trunc_u32mf2(...) __riscv_vlmul_trunc_u32mf2(__VA_ARGS__)
#define vlmul_trunc_u32m1(...) __riscv_vlmul_trunc_u32m1(__VA_ARGS__)
#define vlmul_trunc_u32m2(...) __riscv_vlmul_trunc_u32m2(__VA_ARGS__)
#define vlmul_trunc_u32m4(...) __riscv_vlmul_trunc_u32m4(__VA_ARGS__)
#define vlmul_trunc_u64m1(...) __riscv_vlmul_trunc_u64m1(__VA_ARGS__)
#define vlmul_trunc_u64m2(...) __riscv_vlmul_trunc_u64m2(__VA_ARGS__)
#define vlmul_trunc_u64m4(...) __riscv_vlmul_trunc_u64m4(__VA_ARGS__)
#define vset(...) __riscv_vset(__VA_ARGS__)
#define vget_f16m1(...) __riscv_vget_f16m1(__VA_ARGS__)
#define vget_f16m2(...) __riscv_vget_f16m2(__VA_ARGS__)
#define vget_f16m4(...) __riscv_vget_f16m4(__VA_ARGS__)
#define vget_f32m1(...) __riscv_vget_f32m1(__VA_ARGS__)
#define vget_f32m2(...) __riscv_vget_f32m2(__VA_ARGS__)
#define vget_f32m4(...) __riscv_vget_f32m4(__VA_ARGS__)
#define vget_f64m1(...) __riscv_vget_f64m1(__VA_ARGS__)
#define vget_f64m2(...) __riscv_vget_f64m2(__VA_ARGS__)
#define vget_f64m4(...) __riscv_vget_f64m4(__VA_ARGS__)
#define vget_i8m1(...) __riscv_vget_i8m1(__VA_ARGS__)
#define vget_i8m2(...) __riscv_vget_i8m2(__VA_ARGS__)
#define vget_i8m4(...) __riscv_vget_i8m4(__VA_ARGS__)
#define vget_i16m1(...) __riscv_vget_i16m1(__VA_ARGS__)
#define vget_i16m2(...) __riscv_vget_i16m2(__VA_ARGS__)
#define vget_i16m4(...) __riscv_vget_i16m4(__VA_ARGS__)
#define vget_i32m1(...) __riscv_vget_i32m1(__VA_ARGS__)
#define vget_i32m2(...) __riscv_vget_i32m2(__VA_ARGS__)
#define vget_i32m4(...) __riscv_vget_i32m4(__VA_ARGS__)
#define vget_i64m1(...) __riscv_vget_i64m1(__VA_ARGS__)
#define vget_i64m2(...) __riscv_vget_i64m2(__VA_ARGS__)
#define vget_i64m4(...) __riscv_vget_i64m4(__VA_ARGS__)
#define vget_u8m1(...) __riscv_vget_u8m1(__VA_ARGS__)
#define vget_u8m2(...) __riscv_vget_u8m2(__VA_ARGS__)
#define vget_u8m4(...) __riscv_vget_u8m4(__VA_ARGS__)
#define vget_u16m1(...) __riscv_vget_u16m1(__VA_ARGS__)
#define vget_u16m2(...) __riscv_vget_u16m2(__VA_ARGS__)
#define vget_u16m4(...) __riscv_vget_u16m4(__VA_ARGS__)
#define vget_u32m1(...) __riscv_vget_u32m1(__VA_ARGS__)
#define vget_u32m2(...) __riscv_vget_u32m2(__VA_ARGS__)
#define vget_u32m4(...) __riscv_vget_u32m4(__VA_ARGS__)
#define vget_u64m1(...) __riscv_vget_u64m1(__VA_ARGS__)
#define vget_u64m2(...) __riscv_vget_u64m2(__VA_ARGS__)
#define vget_u64m4(...) __riscv_vget_u64m4(__VA_ARGS__)
#define vle16(...) __riscv_vle16_tumu(__VA_ARGS__)
#define vle32(...) __riscv_vle32_tumu(__VA_ARGS__)
#define vle64(...) __riscv_vle64_tumu(__VA_ARGS__)
#define vle8(...) __riscv_vle8_tumu(__VA_ARGS__)
#define vlse16(...) __riscv_vlse16_tumu(__VA_ARGS__)
#define vlse32(...) __riscv_vlse32_tumu(__VA_ARGS__)
#define vlse64(...) __riscv_vlse64_tumu(__VA_ARGS__)
#define vlse8(...) __riscv_vlse8_tumu(__VA_ARGS__)
#define vle16ff(...) __riscv_vle16ff_tumu(__VA_ARGS__)
#define vle32ff(...) __riscv_vle32ff_tumu(__VA_ARGS__)
#define vle64ff(...) __riscv_vle64ff_tumu(__VA_ARGS__)
#define vle8ff(...) __riscv_vle8ff_tumu(__VA_ARGS__)
#define vlseg2e16(...) __riscv_vlseg2e16_tumu(__VA_ARGS__)
#define vlseg3e16(...) __riscv_vlseg3e16_tumu(__VA_ARGS__)
#define vlseg4e16(...) __riscv_vlseg4e16_tumu(__VA_ARGS__)
#define vlseg5e16(...) __riscv_vlseg5e16_tumu(__VA_ARGS__)
#define vlseg6e16(...) __riscv_vlseg6e16_tumu(__VA_ARGS__)
#define vlseg7e16(...) __riscv_vlseg7e16_tumu(__VA_ARGS__)
#define vlseg8e16(...) __riscv_vlseg8e16_tumu(__VA_ARGS__)
#define vlseg2e32(...) __riscv_vlseg2e32_tumu(__VA_ARGS__)
#define vlseg3e32(...) __riscv_vlseg3e32_tumu(__VA_ARGS__)
#define vlseg4e32(...) __riscv_vlseg4e32_tumu(__VA_ARGS__)
#define vlseg5e32(...) __riscv_vlseg5e32_tumu(__VA_ARGS__)
#define vlseg6e32(...) __riscv_vlseg6e32_tumu(__VA_ARGS__)
#define vlseg7e32(...) __riscv_vlseg7e32_tumu(__VA_ARGS__)
#define vlseg8e32(...) __riscv_vlseg8e32_tumu(__VA_ARGS__)
#define vlseg2e64(...) __riscv_vlseg2e64_tumu(__VA_ARGS__)
#define vlseg3e64(...) __riscv_vlseg3e64_tumu(__VA_ARGS__)
#define vlseg4e64(...) __riscv_vlseg4e64_tumu(__VA_ARGS__)
#define vlseg5e64(...) __riscv_vlseg5e64_tumu(__VA_ARGS__)
#define vlseg6e64(...) __riscv_vlseg6e64_tumu(__VA_ARGS__)
#define vlseg7e64(...) __riscv_vlseg7e64_tumu(__VA_ARGS__)
#define vlseg8e64(...) __riscv_vlseg8e64_tumu(__VA_ARGS__)
#define vlseg2e16ff(...) __riscv_vlseg2e16ff_tumu(__VA_ARGS__)
#define vlseg3e16ff(...) __riscv_vlseg3e16ff_tumu(__VA_ARGS__)
#define vlseg4e16ff(...) __riscv_vlseg4e16ff_tumu(__VA_ARGS__)
#define vlseg5e16ff(...) __riscv_vlseg5e16ff_tumu(__VA_ARGS__)
#define vlseg6e16ff(...) __riscv_vlseg6e16ff_tumu(__VA_ARGS__)
#define vlseg7e16ff(...) __riscv_vlseg7e16ff_tumu(__VA_ARGS__)
#define vlseg8e16ff(...) __riscv_vlseg8e16ff_tumu(__VA_ARGS__)
#define vlseg2e32ff(...) __riscv_vlseg2e32ff_tumu(__VA_ARGS__)
#define vlseg3e32ff(...) __riscv_vlseg3e32ff_tumu(__VA_ARGS__)
#define vlseg4e32ff(...) __riscv_vlseg4e32ff_tumu(__VA_ARGS__)
#define vlseg5e32ff(...) __riscv_vlseg5e32ff_tumu(__VA_ARGS__)
#define vlseg6e32ff(...) __riscv_vlseg6e32ff_tumu(__VA_ARGS__)
#define vlseg7e32ff(...) __riscv_vlseg7e32ff_tumu(__VA_ARGS__)
#define vlseg8e32ff(...) __riscv_vlseg8e32ff_tumu(__VA_ARGS__)
#define vlseg2e64ff(...) __riscv_vlseg2e64ff_tumu(__VA_ARGS__)
#define vlseg3e64ff(...) __riscv_vlseg3e64ff_tumu(__VA_ARGS__)
#define vlseg4e64ff(...) __riscv_vlseg4e64ff_tumu(__VA_ARGS__)
#define vlseg5e64ff(...) __riscv_vlseg5e64ff_tumu(__VA_ARGS__)
#define vlseg6e64ff(...) __riscv_vlseg6e64ff_tumu(__VA_ARGS__)
#define vlseg7e64ff(...) __riscv_vlseg7e64ff_tumu(__VA_ARGS__)
#define vlseg8e64ff(...) __riscv_vlseg8e64ff_tumu(__VA_ARGS__)
#define vlseg2e8(...) __riscv_vlseg2e8_tumu(__VA_ARGS__)
#define vlseg3e8(...) __riscv_vlseg3e8_tumu(__VA_ARGS__)
#define vlseg4e8(...) __riscv_vlseg4e8_tumu(__VA_ARGS__)
#define vlseg5e8(...) __riscv_vlseg5e8_tumu(__VA_ARGS__)
#define vlseg6e8(...) __riscv_vlseg6e8_tumu(__VA_ARGS__)
#define vlseg7e8(...) __riscv_vlseg7e8_tumu(__VA_ARGS__)
#define vlseg8e8(...) __riscv_vlseg8e8_tumu(__VA_ARGS__)
#define vlseg2e8ff(...) __riscv_vlseg2e8ff_tumu(__VA_ARGS__)
#define vlseg3e8ff(...) __riscv_vlseg3e8ff_tumu(__VA_ARGS__)
#define vlseg4e8ff(...) __riscv_vlseg4e8ff_tumu(__VA_ARGS__)
#define vlseg5e8ff(...) __riscv_vlseg5e8ff_tumu(__VA_ARGS__)
#define vlseg6e8ff(...) __riscv_vlseg6e8ff_tumu(__VA_ARGS__)
#define vlseg7e8ff(...) __riscv_vlseg7e8ff_tumu(__VA_ARGS__)
#define vlseg8e8ff(...) __riscv_vlseg8e8ff_tumu(__VA_ARGS__)
#define vlsseg2e16(...) __riscv_vlsseg2e16_tumu(__VA_ARGS__)
#define vlsseg3e16(...) __riscv_vlsseg3e16_tumu(__VA_ARGS__)
#define vlsseg4e16(...) __riscv_vlsseg4e16_tumu(__VA_ARGS__)
#define vlsseg5e16(...) __riscv_vlsseg5e16_tumu(__VA_ARGS__)
#define vlsseg6e16(...) __riscv_vlsseg6e16_tumu(__VA_ARGS__)
#define vlsseg7e16(...) __riscv_vlsseg7e16_tumu(__VA_ARGS__)
#define vlsseg8e16(...) __riscv_vlsseg8e16_tumu(__VA_ARGS__)
#define vlsseg2e32(...) __riscv_vlsseg2e32_tumu(__VA_ARGS__)
#define vlsseg3e32(...) __riscv_vlsseg3e32_tumu(__VA_ARGS__)
#define vlsseg4e32(...) __riscv_vlsseg4e32_tumu(__VA_ARGS__)
#define vlsseg5e32(...) __riscv_vlsseg5e32_tumu(__VA_ARGS__)
#define vlsseg6e32(...) __riscv_vlsseg6e32_tumu(__VA_ARGS__)
#define vlsseg7e32(...) __riscv_vlsseg7e32_tumu(__VA_ARGS__)
#define vlsseg8e32(...) __riscv_vlsseg8e32_tumu(__VA_ARGS__)
#define vlsseg2e64(...) __riscv_vlsseg2e64_tumu(__VA_ARGS__)
#define vlsseg3e64(...) __riscv_vlsseg3e64_tumu(__VA_ARGS__)
#define vlsseg4e64(...) __riscv_vlsseg4e64_tumu(__VA_ARGS__)
#define vlsseg5e64(...) __riscv_vlsseg5e64_tumu(__VA_ARGS__)
#define vlsseg6e64(...) __riscv_vlsseg6e64_tumu(__VA_ARGS__)
#define vlsseg7e64(...) __riscv_vlsseg7e64_tumu(__VA_ARGS__)
#define vlsseg8e64(...) __riscv_vlsseg8e64_tumu(__VA_ARGS__)
#define vlsseg2e8(...) __riscv_vlsseg2e8_tumu(__VA_ARGS__)
#define vlsseg3e8(...) __riscv_vlsseg3e8_tumu(__VA_ARGS__)
#define vlsseg4e8(...) __riscv_vlsseg4e8_tumu(__VA_ARGS__)
#define vlsseg5e8(...) __riscv_vlsseg5e8_tumu(__VA_ARGS__)
#define vlsseg6e8(...) __riscv_vlsseg6e8_tumu(__VA_ARGS__)
#define vlsseg7e8(...) __riscv_vlsseg7e8_tumu(__VA_ARGS__)
#define vlsseg8e8(...) __riscv_vlsseg8e8_tumu(__VA_ARGS__)
#define viota(...) __riscv_viota_tumu(__VA_ARGS__)
#define vid(...) __riscv_vid_tumu(__VA_ARGS__)
#endif
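// A minimal usage sketch (hypothetical names mask/dst/src/stride/vl), assuming
// the v0.12 intrinsics from <riscv_vector.h>: with the aliases above, a legacy
// 0.11-style masked call keeps compiling but now resolves to the explicit
// tail-undisturbed/mask-undisturbed (_tumu) overload.
static inline vint32m1_t rvv_compat_example_load(vbool32_t mask, vint32m1_t dst,
                                                 const int32_t* src, ptrdiff_t stride,
                                                 size_t vl)
{
    return vlse32(mask, dst, src, stride, vl); // -> __riscv_vlse32_tumu(mask, dst, src, stride, vl)
}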

View File

@@ -0,0 +1,33 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// 0.11 -> 0.12 compatibility
#ifndef _RVV_IMPLICIT_VXRM
#define _RVV_IMPLICIT_VXRM __RISCV_VXRM_RNU
#endif
// NOTE: the masked forms must be defined first to avoid an extra substitution (3 args -> 4 args -> 5 args)
// masked
#define __riscv_vaadd(_1, _2, _3, _4) __riscv_vaadd(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vasub(_1, _2, _3, _4) __riscv_vasub(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vaaddu(_1, _2, _3, _4) __riscv_vaaddu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vasubu(_1, _2, _3, _4) __riscv_vasubu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vsmul(_1, _2, _3, _4) __riscv_vsmul(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vssra(_1, _2, _3, _4) __riscv_vssra(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vssrl(_1, _2, _3, _4) __riscv_vssrl(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vnclip(_1, _2, _3, _4) __riscv_vnclip(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vnclipu(_1, _2, _3, _4) __riscv_vnclipu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
// unmasked
#define __riscv_vaadd(_1, _2, _3) __riscv_vaadd(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vasub(_1, _2, _3) __riscv_vasub(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vaaddu(_1, _2, _3) __riscv_vaaddu(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vasubu(_1, _2, _3) __riscv_vasubu(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vsmul(_1, _2, _3) __riscv_vsmul(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vssra(_1, _2, _3) __riscv_vssra(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vssrl(_1, _2, _3) __riscv_vssrl(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vnclip(_1, _2, _3) __riscv_vnclip(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vnclipu(_1, _2, _3) __riscv_vnclipu(_1, _2, _RVV_IMPLICIT_VXRM, _3)
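// Expansion sketch for the unmasked form (hypothetical operands a, b, vl): a
// 0.11-style call __riscv_vaadd(a, b, vl) is rewritten by the macro above into
// the 0.12 signature __riscv_vaadd(a, b, _RVV_IMPLICIT_VXRM, vl). The inner
// __riscv_vaadd is not expanded again (function-like macros do not recurse),
// so it resolves to the real overloaded intrinsic with an explicit fixed-point
// rounding mode.
static inline vint8m1_t rvv_compat_example_vaadd(vint8m1_t a, vint8m1_t b, size_t vl)
{
    return __riscv_vaadd(a, b, vl); // averaging add; rounding mode supplied by the macro
}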

View File

@@ -0,0 +1,213 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_HAL_INTRIN_RVV_COMPAT_OVERLOAD_HPP
#define OPENCV_HAL_INTRIN_RVV_COMPAT_OVERLOAD_HPP
// This file requires VTraits to be defined for vector types
#define OPENCV_HAL_IMPL_RVV_FUN_AND(REG, SUF) \
inline static REG vand(const REG & op1, const REG & op2, size_t vl) \
{ \
return vand_vv_##SUF(op1, op2, vl); \
}
OPENCV_HAL_IMPL_RVV_FUN_AND(vint8m1_t, i8m1)
OPENCV_HAL_IMPL_RVV_FUN_AND(vuint8m1_t, u8m1)
OPENCV_HAL_IMPL_RVV_FUN_AND(vint16m1_t, i16m1)
OPENCV_HAL_IMPL_RVV_FUN_AND(vuint16m1_t, u16m1)
OPENCV_HAL_IMPL_RVV_FUN_AND(vint32m1_t, i32m1)
OPENCV_HAL_IMPL_RVV_FUN_AND(vuint32m1_t, u32m1)
OPENCV_HAL_IMPL_RVV_FUN_AND(vint64m1_t, i64m1)
OPENCV_HAL_IMPL_RVV_FUN_AND(vuint64m1_t, u64m1)
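// Usage sketch (hypothetical helper): the overload set generated above lets
// type-generic HAL code call vand() uniformly across lane types, mimicking
// the overloaded API of newer intrinsics versions.
static inline vuint8m1_t rvv_compat_example_vand(vuint8m1_t v, vuint8m1_t m, size_t vl)
{
    return vand(v, m, vl); // resolves to vand_vv_u8m1(v, m, vl)
}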
#define OPENCV_HAL_IMPL_RVV_FUN_LOXEI(REG, SUF, INDX, ISUF) \
inline static REG vloxe##ISUF(const VTraits<REG>::lane_type *base, INDX bindex, size_t vl) \
{ \
return vloxe##ISUF##_v_##SUF(base, bindex, vl); \
}
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m1_t, i8m1, vuint8m1_t, i8)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m2_t, i8m2, vuint8m2_t, i8)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m4_t, i8m4, vuint8m4_t, i8)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m8_t, i8m8, vuint8m8_t, i8)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m1_t, i8m1, vuint32m4_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m2_t, i8m2, vuint32m8_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint16m1_t, i16m1, vuint32m2_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint32m1_t, i32m1, vuint32m1_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint32m2_t, i32m2, vuint32m2_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint32m4_t, i32m4, vuint32m4_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint32m8_t, i32m8, vuint32m8_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint64m1_t, i64m1, vuint32mf2_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m1_t, u8m1, vuint8m1_t, i8)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m2_t, u8m2, vuint8m2_t, i8)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m4_t, u8m4, vuint8m4_t, i8)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m8_t, u8m8, vuint8m8_t, i8)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vfloat32m1_t, f32m1, vuint32m1_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint32m1_t, u32m1, vuint32m1_t, i32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vfloat64m1_t, f64m1, vuint32mf2_t, i32)
#endif
#define OPENCV_HAL_IMPL_RVV_FUN_MUL(REG, SUF) \
inline static REG##m1_t vmul(const REG##m1_t & op1, const REG##m1_t & op2, size_t vl) \
{ \
return vmul_vv_##SUF##m1(op1, op2, vl); \
} \
inline static REG##m1_t vmul(const REG##m1_t & op1, VTraits<REG##m1_t>::lane_type op2, size_t vl) \
{ \
return vmul_vx_##SUF##m1(op1, op2, vl); \
} \
inline static REG##m2_t vmul(const REG##m2_t & op1, const REG##m2_t & op2, size_t vl) \
{ \
return vmul_vv_##SUF##m2(op1, op2, vl); \
} \
inline static REG##m2_t vmul(const REG##m2_t & op1, VTraits<REG##m2_t>::lane_type op2, size_t vl) \
{ \
return vmul_vx_##SUF##m2(op1, op2, vl); \
} \
inline static REG##m4_t vmul(const REG##m4_t & op1, const REG##m4_t & op2, size_t vl) \
{ \
return vmul_vv_##SUF##m4(op1, op2, vl); \
} \
inline static REG##m4_t vmul(const REG##m4_t & op1, VTraits<REG##m4_t>::lane_type op2, size_t vl) \
{ \
return vmul_vx_##SUF##m4(op1, op2, vl); \
} \
inline static REG##m8_t vmul(const REG##m8_t & op1, const REG##m8_t & op2, size_t vl) \
{ \
return vmul_vv_##SUF##m8(op1, op2, vl); \
} \
inline static REG##m8_t vmul(const REG##m8_t & op1, VTraits<REG##m8_t>::lane_type op2, size_t vl) \
{ \
return vmul_vx_##SUF##m8(op1, op2, vl); \
}
OPENCV_HAL_IMPL_RVV_FUN_MUL(vint8, i8)
OPENCV_HAL_IMPL_RVV_FUN_MUL(vuint8, u8)
OPENCV_HAL_IMPL_RVV_FUN_MUL(vint16, i16)
OPENCV_HAL_IMPL_RVV_FUN_MUL(vuint16, u16)
OPENCV_HAL_IMPL_RVV_FUN_MUL(vint32, i32)
OPENCV_HAL_IMPL_RVV_FUN_MUL(vuint32, u32)
#define OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(REG1, SUF1, REG2, SUF2) \
inline static REG1##m1_t vreinterpret_##SUF1##m1(const REG2##m1_t & src) \
{\
return vreinterpret_v_##SUF2##m1_##SUF1##m1(src); \
} \
inline static REG1##m2_t vreinterpret_##SUF1##m2(const REG2##m2_t & src) \
{\
return vreinterpret_v_##SUF2##m2_##SUF1##m2(src); \
} \
inline static REG1##m4_t vreinterpret_##SUF1##m4(const REG2##m4_t & src) \
{\
return vreinterpret_v_##SUF2##m4_##SUF1##m4(src); \
} \
inline static REG1##m8_t vreinterpret_##SUF1##m8(const REG2##m8_t & src) \
{\
return vreinterpret_v_##SUF2##m8_##SUF1##m8(src); \
}
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vint8, i8, vuint8, u8)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vint16, i16, vuint16, u16)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vint32, i32, vuint32, u32)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vfloat32, f32, vuint32, u32)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vfloat32, f32, vint32, i32)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint32, u32, vfloat32, f32)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vint32, i32, vfloat32, f32)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint8, u8, vint8, i8)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint8, u8, vuint16, u16)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint8, u8, vuint32, u32)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint8, u8, vuint64, u64)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint16, u16, vint16, i16)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint16, u16, vuint8, u8)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint16, u16, vuint32, u32)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint16, u16, vuint64, u64)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint32, u32, vint32, i32)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint32, u32, vuint8, u8)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint32, u32, vuint16, u16)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint32, u32, vuint64, u64)
#define OPENCV_HAL_IMPL_RVV_FUN_STORE(REG, SUF, SZ) \
inline static void vse##SZ(VTraits<REG>::lane_type *base, REG value, size_t vl) \
{ \
return vse##SZ##_v_##SUF##m1(base, value, vl); \
}
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_uint8, u8, 8)
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_int8, i8, 8)
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_uint16, u16, 16)
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_int16, i16, 16)
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_uint32, u32, 32)
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_int32, i32, 32)
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_uint64, u64, 64)
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_int64, i64, 64)
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_float32, f32, 32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_float64, f64, 64)
#endif
#define OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(REG, SUF) \
inline static VTraits<REG>::lane_type vmv_x(const REG & reg) \
{\
return vmv_x_s_##SUF##m1_##SUF(reg); \
}
#define OPENCV_HAL_IMPL_RVV_FUN_EXTRACT_F(REG, SUF) \
inline static VTraits<REG>::lane_type vfmv_f(const REG & reg) \
{\
return vfmv_f_s_##SUF##m1_##SUF(reg); \
}
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_uint8, u8)
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_int8, i8)
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_uint16, u16)
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_int16, i16)
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_uint32, u32)
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_int32, i32)
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_uint64, u64)
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_int64, i64)
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT_F(v_float32, f32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT_F(v_float64, f64)
#endif
#define OPENCV_HAL_IMPL_RVV_FUN_SLIDE(REG, SUF) \
inline static REG vslidedown(const REG & dst, const REG & src, size_t offset, size_t vl) \
{ \
return vslidedown_vx_##SUF##m1(dst, src, offset, vl); \
} \
inline static REG vslideup(const REG & dst, const REG & src, size_t offset, size_t vl) \
{ \
return vslideup_vx_##SUF##m1(dst, src, offset, vl); \
}
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_uint8, u8)
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_int8, i8)
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_uint16, u16)
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_int16, i16)
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_uint32, u32)
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_int32, i32)
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_float32, f32)
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_uint64, u64)
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_int64, i64)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_float64, f64)
#endif
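// Semantics sketch for the wrappers above (hypothetical lane view), assuming
// the 0.10-style non-policy intrinsics: vslidedown(dst, src, k, vl) yields
// src[k], src[k+1], ... in the low lanes, while vslideup(dst, src, k, vl)
// keeps dst's lanes below index k and places src[0], src[1], ... above them.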
inline static vuint32mf2_t vmul(const vuint32mf2_t & op1, uint32_t op2, size_t vl)
{
return vmul_vx_u32mf2(op1, op2, vl);
}
inline static vuint32mf2_t vreinterpret_u32mf2(const vint32mf2_t& val)
{
return vreinterpret_v_i32mf2_u32mf2(val);
}
inline static vuint32mf2_t vreinterpret_u32mf2(const vuint16mf2_t& val)
{
return vreinterpret_v_u16mf2_u32mf2(val);
}
#endif //OPENCV_HAL_INTRIN_RVV_COMPAT_OVERLOAD_HPP

File diff suppressed because it is too large

View File

@@ -347,8 +347,6 @@ namespace hal_sse_internal
#define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
template <> inline _Tpvec v_setzero_() { return v_setzero_##suffix(); } \
template <> inline _Tpvec v_setall_(_Tp v) { return v_setall_##suffix(v); } \
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
{ return _Tpvec(cast(a.val)); }
@@ -366,11 +364,6 @@ inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); }
inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); }
inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); }
template <> inline v_uint64x2 v_setzero_() { return v_setzero_u64(); }
template <> inline v_int64x2 v_setzero_() { return v_setzero_s64(); }
template <> inline v_uint64x2 v_setall_(uint64 val) { return v_setall_u64(val); }
template <> inline v_int64x2 v_setall_(int64 val) { return v_setall_s64(val); }
template<typename _Tpvec> inline
v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); }
template<typename _Tpvec> inline
@@ -742,46 +735,53 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
}
#define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
a.val = intrin(a.val, b.val); \
return a; \
}
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint8x16, _mm_adds_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint8x16, _mm_subs_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int8x16, _mm_adds_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int8x16, _mm_subs_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint16x8, _mm_adds_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint16x8, _mm_subs_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int16x8, _mm_adds_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int16x8, _mm_subs_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_uint32x4, _v128_mullo_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_int32x4, _v128_mullo_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_float32x4, _mm_add_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_float32x4, _mm_sub_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_float32x4, _mm_mul_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_div, v_float32x4, _mm_div_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_float64x2, _mm_add_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_float64x2, _mm_sub_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_float64x2, _mm_mul_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_div, v_float64x2, _mm_div_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint64x2, _mm_sub_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int64x2, _mm_sub_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint32x4, _v128_mullo_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int32x4, _v128_mullo_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)
// saturating multiply 8-bit, 16-bit
#define OPENCV_HAL_IMPL_SSE_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpwvec c, d; \
v_mul_expand(a, b, c, d); \
return v_pack(c, d); \
}
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }
OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint8x16, v_uint16x8)
OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int8x16, v_int16x8)
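// Scalar model of the scheme above (hypothetical helper): widen, multiply
// exactly, then pack with saturation -- the lane-wise effect of v_mul_expand
// followed by v_pack.
static inline schar mul_sat_s8_scalar(schar a, schar b)
{
    int p = (int)a * (int)b;                                // exact product in the wide type
    return (schar)(p < -128 ? -128 : (p > 127 ? 127 : p)); // saturating pack
}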
@@ -845,7 +845,7 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { return v_
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{ return v_int32x4(_mm_madd_epi16(a.val, b.val)); }
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_add(v_dotprod(a, b), c); }
{ return v_dotprod(a, b) + c; }
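// Scalar model of _mm_madd_epi16 (hypothetical 8-lane arrays a[], b[]): each
// 32-bit result lane i is the sum of two adjacent 16-bit products,
//     dst[i] = (int)a[2*i] * b[2*i] + (int)a[2*i+1] * b[2*i+1];
// which is exactly the 16 -> 32 dot product wrapped here.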
// 32 >> 64
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
@@ -872,7 +872,7 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
#endif
}
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{ return v_add(v_dotprod(a, b), c); }
{ return v_dotprod(a, b) + c; }
// 8 >> 32
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
@@ -886,7 +886,7 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
return v_uint32x4(_mm_add_epi32(p0, p1));
}
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
@@ -899,7 +899,7 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
return v_int32x4(_mm_add_epi32(p0, p1));
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
// 16 >> 64
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
@@ -911,14 +911,14 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
v_expand(c, c0, c1);
v_expand(d, d0, d1);
c0 = v_add(c0, c1); d0 = v_add(d0, d1);
c0 += c1; d0 += d1;
return v_uint64x2(_mm_add_epi64(
_mm_unpacklo_epi64(c0.val, d0.val),
_mm_unpackhi_epi64(c0.val, d0.val)
));
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
@@ -931,7 +931,7 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
));
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
// 32 >> 64f
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
@@ -939,8 +939,8 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
#if CV_SSE4_1
return v_cvt_f64(v_dotprod(a, b));
#else
v_float64x2 c = v_mul(v_cvt_f64(a), v_cvt_f64(b));
v_float64x2 d = v_mul(v_cvt_f64_high(a), v_cvt_f64_high(b));
v_float64x2 c = v_cvt_f64(a) * v_cvt_f64(b);
v_float64x2 d = v_cvt_f64_high(a) * v_cvt_f64_high(b);
return v_float64x2(_mm_add_pd(
_mm_unpacklo_pd(c.val, d.val),
@@ -949,7 +949,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
#endif
}
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
//////// Fast Dot Product ////////
@@ -957,13 +957,13 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, cons
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
{ return v_dotprod(a, b); }
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_add(v_dotprod(a, b), c); }
{ return v_dotprod(a, b) + c; }
// 32 >> 64
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_dotprod(a, b); }
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{ return v_add(v_dotprod_fast(a, b), c); }
{ return v_dotprod_fast(a, b) + c; }
// 8 >> 32
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
@@ -977,7 +977,7 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b
return v_uint32x4(_mm_add_epi32(p0, p1));
}
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{
@@ -994,7 +994,7 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
#endif
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
// 16 >> 64
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
@@ -1006,34 +1006,34 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b
v_expand(c, c0, c1);
v_expand(d, d0, d1);
c0 = v_add(c0, c1); d0 = v_add(d0, d1);
return v_add(c0, d0);
c0 += c1; d0 += d1;
return c0 + d0;
}
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{
v_int32x4 prod = v_dotprod(a, b);
v_int64x2 c, d;
v_expand(prod, c, d);
return v_add(c, d);
return c + d;
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
// 32 >> 64f
v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c);
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_mul(v_cvt_f64_high(a), v_cvt_f64_high(b))); }
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }
#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
OPENCV_HAL_IMPL_SSE_BIN_OP(v_and, _Tpvec, _mm_and_##suffix) \
OPENCV_HAL_IMPL_SSE_BIN_OP(v_or, _Tpvec, _mm_or_##suffix) \
OPENCV_HAL_IMPL_SSE_BIN_OP(v_xor, _Tpvec, _mm_xor_##suffix) \
inline _Tpvec v_not(const _Tpvec& a) \
OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \
}
@@ -1182,58 +1182,58 @@ inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b)
}
#define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \
inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
{ return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpuvec v_ne(const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
{ \
__m128i not_mask = _mm_set1_epi32(-1); \
return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpsvec v_ne(const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
{ \
__m128i not_mask = _mm_set1_epi32(-1); \
return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpuvec v_lt(const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
{ \
__m128i smask = _mm_set1_##suffix(sbit); \
return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \
} \
inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
{ \
__m128i smask = _mm_set1_##suffix(sbit); \
return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \
} \
inline _Tpuvec v_le(const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
{ \
__m128i smask = _mm_set1_##suffix(sbit); \
__m128i not_mask = _mm_set1_epi32(-1); \
__m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \
return _Tpuvec(_mm_xor_si128(res, not_mask)); \
} \
inline _Tpuvec v_ge(const _Tpuvec& a, const _Tpuvec& b) \
inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
{ \
__m128i smask = _mm_set1_##suffix(sbit); \
__m128i not_mask = _mm_set1_epi32(-1); \
__m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \
return _Tpuvec(_mm_xor_si128(res, not_mask)); \
} \
inline _Tpsvec v_lt(const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
{ \
return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \
} \
inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
{ \
return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
} \
inline _Tpsvec v_le(const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
{ \
__m128i not_mask = _mm_set1_epi32(-1); \
return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpsvec v_ge(const _Tpsvec& a, const _Tpsvec& b) \
inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
{ \
__m128i not_mask = _mm_set1_epi32(-1); \
return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \
@@ -1244,17 +1244,17 @@ OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000)
#define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }
OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
@@ -1262,17 +1262,17 @@ OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)
#if CV_SSE4_1
#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
inline _Tpvec v_eq (const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpeq_epi64(a.val, b.val)); } \
inline _Tpvec v_ne (const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_eq(a, b)); }
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); }
#else
#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ __m128i cmp = _mm_cmpeq_epi32(a.val, b.val); \
return _Tpvec(_mm_and_si128(cmp, _mm_shuffle_epi32(cmp, _MM_SHUFFLE(2, 3, 0, 1)))); } \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ return v_not(v_eq(a, b)); }
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); }
#endif
OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2)
@@ -1311,17 +1311,17 @@ inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
/** Absolute difference **/
inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
{ return v_add_wrap(a - b, b - a); }
inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); }
{ return v_add_wrap(a - b, b - a); }
inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
{ return v_sub(v_max(a, b), v_min(a, b)); }
{ return v_max(a, b) - v_min(a, b); }
inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
{
v_int8x16 d = v_sub_wrap(a, b);
v_int8x16 m = v_lt(a, b);
return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m));
v_int8x16 m = a < b;
return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
}
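// Why (d ^ m) - m yields |d| here (scalar sketch, hypothetical lane values):
// the comparison mask m is 0 or -1 (all ones) per lane; (x ^ 0) - 0 == x, and
// (x ^ -1) - (-1) == ~x + 1 == -x in two's complement, so the expression
// negates d exactly in the lanes where a < b.
static inline int negate_if_scalar(int x, int m /* 0 or -1 */)
{
    return (x ^ m) - m;
}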
inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
{
@@ -1329,25 +1329,25 @@ inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
}
inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{
v_int32x4 d = v_sub(a, b);
v_int32x4 m = v_lt(a, b);
return v_reinterpret_as_u32(v_sub(v_xor(d, m), m));
v_int32x4 d = a - b;
v_int32x4 m = a < b;
return v_reinterpret_as_u32((d ^ m) - m);
}
/** Saturating absolute difference **/
inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
{
v_int8x16 d = v_sub(a, b);
v_int8x16 m = v_lt(a, b);
return v_sub(v_xor(d, m), m);
v_int8x16 d = a - b;
v_int8x16 m = a < b;
return (d ^ m) - m;
}
inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
{ return v_sub(v_max(a, b), v_min(a, b)); }
{ return v_max(a, b) - v_min(a, b); }
inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
return v_add(v_mul(a, b), c);
return a * b + c;
}
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
@@ -1381,12 +1381,12 @@ inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
} \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpvec res = v_fma(a, a, v_mul(b, b)); \
_Tpvec res = v_fma(a, a, b*b); \
return _Tpvec(_mm_sqrt_##suffix(res.val)); \
} \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
return v_fma(a, a, v_mul(b, b)); \
return v_fma(a, a, b*b); \
} \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ \
@@ -1397,19 +1397,19 @@ OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((
OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1))
#define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
{ \
return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
} \
inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
{ \
return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
} \
inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
{ \
return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
} \
inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
{ \
return _Tpsvec(srai(a.val, imm)); \
} \
@@ -1711,9 +1711,9 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_N
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
inline int v_reduce_sum(const v_int16x8& a)
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline unsigned v_reduce_sum(const v_uint16x8& a)
{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline uint64 v_reduce_sum(const v_uint64x2& a)
{
@@ -1770,13 +1770,13 @@ inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
{
v_uint32x4 l, h;
v_expand(v_absdiff(a, b), l, h);
return v_reduce_sum(v_add(l, h));
return v_reduce_sum(l + h);
}
inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
{
v_uint32x4 l, h;
v_expand(v_absdiff(a, b), l, h);
return v_reduce_sum(v_add(l, h));
return v_reduce_sum(l + h);
}
inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
{
@@ -1805,15 +1805,15 @@ inline v_uint8x16 v_popcount(const v_uint8x16& a)
inline v_uint16x8 v_popcount(const v_uint16x8& a)
{
v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
p = v_add(p, v_rotate_right<1>(p));
return v_and(v_reinterpret_as_u16(p), v_setall_u16(0x00ff));
p += v_rotate_right<1>(p);
return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff);
}
inline v_uint32x4 v_popcount(const v_uint32x4& a)
{
v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
p = v_add(p, v_rotate_right<1>(p));
p = v_add(p, v_rotate_right<2>(p));
return v_and(v_reinterpret_as_u32(p), v_setall_u32(0x000000ff));
p += v_rotate_right<1>(p);
p += v_rotate_right<2>(p);
return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff);
}
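// Scalar sketch of the fold above (hypothetical 32-bit lane holding four byte
// counts): shifts stand in for the byte rotations, summing the per-byte
// popcounts into the low byte of the lane before masking.
static inline unsigned sum_byte_counts_scalar(unsigned p)
{
    p += p >> 8;     // analogue of v_rotate_right<1> within one lane
    p += p >> 16;    // analogue of v_rotate_right<2>
    return p & 0xff; // keep only the accumulated low byte
}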
inline v_uint64x2 v_popcount(const v_uint64x2& a)
{
@@ -3459,21 +3459,6 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
inline void v_cleanup() {}
#include "intrin_math.hpp"
inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond

View File

@@ -261,8 +261,6 @@ OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_float64x2, double)
#define OPENCV_HAL_IMPL_VSX_INITVEC(_Tpvec, _Tp, suffix, cast) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(vec_splats((_Tp)0)); } \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(vec_splats((_Tp)v));} \
template <> inline _Tpvec v_setzero_() { return v_setzero_##suffix(); } \
template <> inline _Tpvec v_setall_(_Tp v) { return v_setall_##suffix(v); } \
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0 &a) \
{ return _Tpvec((cast)a.val); }
@@ -515,44 +513,48 @@ inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d)
/* Element-wise binary and unary operations */
/** Arithmetics **/
#define OPENCV_HAL_IMPL_VSX_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); } \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ a.val = intrin(a.val, b.val); return a; }
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint8x16, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint8x16, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int8x16, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int8x16, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint16x8, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int16x8, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_uint32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_int32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_float32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_float32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_float32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_div, v_float32x4, vec_div)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_float64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_float64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_float64x2, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_div, v_float64x2, vec_div)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint8x16, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint8x16, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int8x16, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int8x16, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint16x8, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int16x8, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float32x4, vec_div)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float64x2, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float64x2, vec_div)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int64x2, vec_sub)
// saturating multiply
#define OPENCV_HAL_IMPL_VSX_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpwvec c, d; \
v_mul_expand(a, b, c, d); \
return v_pack(c, d); \
}
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }
OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int8x16, v_int16x8)
OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint8x16, v_uint16x8)
@@ -594,9 +596,9 @@ OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_mul_wrap, vec_mul)
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \
inline _Tpvec v_shl(const _Tpvec& a, int imm) \
inline _Tpvec operator << (const _Tpvec& a, int imm) \
{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
inline _Tpvec v_shr(const _Tpvec& a, int imm) \
inline _Tpvec operator >> (const _Tpvec& a, int imm) \
{ return _Tpvec(shr(a.val, splfunc(imm))); } \
template<int imm> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
@@ -615,10 +617,10 @@ OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_sra, vec_udword2_sp)
/** Bitwise logic **/
#define OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec) \
OPENCV_HAL_IMPL_VSX_BIN_OP(v_and, _Tpvec, vec_and) \
OPENCV_HAL_IMPL_VSX_BIN_OP(v_or, _Tpvec, vec_or) \
OPENCV_HAL_IMPL_VSX_BIN_OP(v_xor, _Tpvec, vec_xor) \
inline _Tpvec v_not(const _Tpvec& a) \
OPENCV_HAL_IMPL_VSX_BIN_OP(&, _Tpvec, vec_and) \
OPENCV_HAL_IMPL_VSX_BIN_OP(|, _Tpvec, vec_or) \
OPENCV_HAL_IMPL_VSX_BIN_OP(^, _Tpvec, vec_xor) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ return _Tpvec(vec_not(a.val)); }
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint8x16)
@@ -648,17 +650,17 @@ OPENCV_HAL_IMPL_VSX_SELECT(v_float64x2, vec_bdword2_c)
/** Comparison **/
#define OPENCV_HAL_IMPL_VSX_INT_CMP_OP(_Tpvec) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpeq(a.val, b.val)); } \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpne(a.val, b.val)); } \
inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmplt(a.val, b.val)); } \
inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpgt(a.val, b.val)); } \
inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmple(a.val, b.val)); } \
inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpge(a.val, b.val)); }
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint8x16)
@@ -1058,7 +1060,7 @@ OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4)
OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2)
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{ return v_add(v_mul(a, b), c); }
{ return a * b + c; }
// TODO: exp, log, sin, cos
@@ -1087,12 +1089,12 @@ inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{ return v_reinterpret_as_u32(v_sub(v_max(a, b), v_min(a, b))); }
{ return v_reinterpret_as_u32(v_max(a, b) - v_min(a, b)); }
inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
{ return v_abs(v_sub(a, b)); }
{ return v_abs(a - b); }
inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
{ return v_abs(v_sub(a, b)); }
{ return v_abs(a - b); }
/** Absolute difference for signed integers **/
inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
@@ -1440,7 +1442,7 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
return v_int64x2(vec_add(even, odd));
}
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{ return v_add(v_dotprod(a, b), c); }
{ return v_dotprod(a, b) + c; }
// 8 >> 32
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
@@ -1483,7 +1485,7 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
return v_uint64x2(vec_add(s0, s1));
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
@@ -1493,13 +1495,13 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
return v_int64x2(vec_add(vec_mergeh(c.val, d.val), vec_mergel(c.val, d.val)));
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
// 32 >> 64f
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_add(v_dotprod_expand(a, b), c); }
{ return v_dotprod_expand(a, b) + c; }
//////// Fast Dot Product ////////
@@ -1507,7 +1509,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, cons
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
{ return v_dotprod(a, b); }
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_add(v_int32x4(vec_msum(a.val, b.val, vec_int4_z)), c); }
{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)) + c; }
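// Scalar model of vec_msum over 16-bit lanes (hypothetical arrays a[], b[]
// and accumulator acc[]): each 32-bit lane i gathers two adjacent products,
//     dst[i] = (int)a[2*i] * b[2*i] + (int)a[2*i+1] * b[2*i+1] + acc[i];
// here it is seeded with the zero vector and c is added afterwards.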
// 32 >> 64
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_dotprod(a, b); }
@@ -1518,7 +1520,7 @@ inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
{ return v_dotprod_expand(a, b); }
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_add(v_uint32x4(vec_msum(a.val, b.val, vec_uint4_z)), c); }
{ return v_uint32x4(vec_msum(a.val, b.val, vec_uint4_z)) + c; }
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{
@@ -1529,7 +1531,7 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, vec_int4_z)));
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
// 16 >> 64
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
@@ -1542,10 +1544,10 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
v_int32x4 prod = v_dotprod(a, b);
v_int64x2 c, d;
v_expand(prod, c, d);
return v_add(c, d);
return c + d;
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }
{ return v_dotprod_expand_fast(a, b) + c; }
// 32 >> 64f
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
@@ -1596,19 +1598,6 @@ template<int i, typename Tvec>
inline Tvec v_broadcast_element(const Tvec& v)
{ return Tvec(vec_splat(v.val, i)); }
#include "intrin_math.hpp"
inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f<v_float32x4, v_int32x4>(x); }
inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f<v_float32x4, v_int32x4>(x, s, c); }
inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f<v_float32x4, v_int32x4>(x); }
inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f<v_float64x2, v_int64x2>(x); }
inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f<v_float64x2, v_int64x2>(x, s, c); }
inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f<v_float64x2, v_int64x2>(x); }
inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f<v_float64x2, v_int64x2>(x); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

File diff suppressed because it is too large