Dot matrix finished, add OpenCV

2026-01-20 19:55:56 +08:00
parent 59564fd312
commit bc9f2824ed
367 changed files with 162001 additions and 52 deletions

opencv2/core/hal/hal.hpp

@@ -0,0 +1,260 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_HAL_HPP
#define OPENCV_HAL_HPP
#include "opencv2/core/cvdef.h"
#include "opencv2/core/cvstd.hpp"
#include "opencv2/core/hal/interface.h"
namespace cv { namespace hal {
//! @addtogroup core_hal_functions
//! @{
CV_EXPORTS int normHamming(const uchar* a, int n);
CV_EXPORTS int normHamming(const uchar* a, const uchar* b, int n);
CV_EXPORTS int normHamming(const uchar* a, int n, int cellSize);
CV_EXPORTS int normHamming(const uchar* a, const uchar* b, int n, int cellSize);
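// Minimal usage sketch (illustration only, built from the declarations above):
// normHamming(a, b, n) counts the bits that differ between two n-byte buffers,
// the usual distance for binary descriptors such as ORB.
static inline int normHammingExample()
{
    uchar a[32] = {0};
    uchar b[32] = {0};
    b[0] = 0x0F;                   // exactly 4 bits differ
    return normHamming(a, b, 32);  // -> 4
}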
CV_EXPORTS int LU32f(float* A, size_t astep, int m, float* b, size_t bstep, int n);
CV_EXPORTS int LU64f(double* A, size_t astep, int m, double* b, size_t bstep, int n);
CV_EXPORTS bool Cholesky32f(float* A, size_t astep, int m, float* b, size_t bstep, int n);
CV_EXPORTS bool Cholesky64f(double* A, size_t astep, int m, double* b, size_t bstep, int n);
CV_EXPORTS void SVD32f(float* At, size_t astep, float* W, float* U, size_t ustep, float* Vt, size_t vstep, int m, int n, int flags);
CV_EXPORTS void SVD64f(double* At, size_t astep, double* W, double* U, size_t ustep, double* Vt, size_t vstep, int m, int n, int flags);
CV_EXPORTS int QR32f(float* A, size_t astep, int m, int n, int k, float* b, size_t bstep, float* hFactors);
CV_EXPORTS int QR64f(double* A, size_t astep, int m, int n, int k, double* b, size_t bstep, double* hFactors);
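// Minimal usage sketch (illustration only): solving a 3x3 system A*x = b in
// place with LU32f. The astep/bstep arguments are byte strides between rows;
// the solution overwrites b, and a zero return value signals a singular matrix.
static inline bool solve3x3Example(float A[9], float b[3])
{
    return LU32f(A, 3 * sizeof(float), 3, b, sizeof(float), 1) != 0;
}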
CV_EXPORTS void gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
int m_a, int n_a, int n_d, int flags);
CV_EXPORTS void gemm64f(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
int m_a, int n_a, int n_d, int flags);
CV_EXPORTS void gemm32fc(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
int m_a, int n_a, int n_d, int flags);
CV_EXPORTS void gemm64fc(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
int m_a, int n_a, int n_d, int flags);
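// Minimal usage sketch (illustration only): dst = alpha*src1*src2 + beta*src3
// for small row-major 2x2 single-precision matrices. All *_step arguments are
// byte strides; OR-ing CV_HAL_GEMM_*_T flags (see hal/interface.h) into the
// last argument transposes the corresponding operand.
static inline void gemm2x2Example(const float A[4], const float B[4],
                                  const float C[4], float D[4])
{
    const size_t step = 2 * sizeof(float);
    gemm32f(A, step, B, step, 1.0f, C, step, 0.5f, D, step,
            /*m_a=*/2, /*n_a=*/2, /*n_d=*/2, /*flags=*/0);
}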
CV_EXPORTS int normL1_(const uchar* a, const uchar* b, int n);
CV_EXPORTS float normL1_(const float* a, const float* b, int n);
CV_EXPORTS float normL2Sqr_(const float* a, const float* b, int n);
CV_EXPORTS void exp32f(const float* src, float* dst, int n);
CV_EXPORTS void exp64f(const double* src, double* dst, int n);
CV_EXPORTS void log32f(const float* src, float* dst, int n);
CV_EXPORTS void log64f(const double* src, double* dst, int n);
CV_EXPORTS void cartToPolar32f(const float* x, const float* y, float* mag, float* angle, int n, bool angleInDegrees);
CV_EXPORTS void cartToPolar64f(const double* x, const double* y, double* mag, double* angle, int n, bool angleInDegrees);
CV_EXPORTS void fastAtan32f(const float* y, const float* x, float* dst, int n, bool angleInDegrees);
CV_EXPORTS void fastAtan64f(const double* y, const double* x, double* dst, int n, bool angleInDegrees);
CV_EXPORTS void magnitude32f(const float* x, const float* y, float* dst, int n);
CV_EXPORTS void magnitude64f(const double* x, const double* y, double* dst, int n);
CV_EXPORTS void polarToCart32f(const float* mag, const float* angle, float* x, float* y, int n, bool angleInDegrees);
CV_EXPORTS void polarToCart64f(const double* mag, const double* angle, double* x, double* y, int n, bool angleInDegrees);
CV_EXPORTS void sqrt32f(const float* src, float* dst, int len);
CV_EXPORTS void sqrt64f(const double* src, double* dst, int len);
CV_EXPORTS void invSqrt32f(const float* src, float* dst, int len);
CV_EXPORTS void invSqrt64f(const double* src, double* dst, int len);
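// Minimal usage sketch (illustration only): the element-wise math kernels all
// share the same shape -- n input values, n output values, no strides.
static inline void elementwiseMathExample(const float x[8], const float y[8],
                                          float e[8], float m[8])
{
    exp32f(x, e, 8);           // e[i] = exp(x[i])
    magnitude32f(x, y, m, 8);  // m[i] = sqrt(x[i]*x[i] + y[i]*y[i])
}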
CV_EXPORTS void split8u(const uchar* src, uchar** dst, int len, int cn );
CV_EXPORTS void split16u(const ushort* src, ushort** dst, int len, int cn );
CV_EXPORTS void split32s(const int* src, int** dst, int len, int cn );
CV_EXPORTS void split64s(const int64* src, int64** dst, int len, int cn );
CV_EXPORTS void merge8u(const uchar** src, uchar* dst, int len, int cn );
CV_EXPORTS void merge16u(const ushort** src, ushort* dst, int len, int cn );
CV_EXPORTS void merge32s(const int** src, int* dst, int len, int cn );
CV_EXPORTS void merge64s(const int64** src, int64* dst, int len, int cn );
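// Minimal usage sketch (illustration only): deinterleaving four BGR pixels
// into planes. src holds len*cn interleaved values and dst is an array of cn
// plane pointers, each receiving len values; merge8u performs the inverse.
static inline void splitBGRExample(const uchar interleaved[12],
                                   uchar b[4], uchar g[4], uchar r[4])
{
    uchar* planes[3] = { b, g, r };
    split8u(interleaved, planes, /*len=*/4, /*cn=*/3);
}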
CV_EXPORTS void add8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void add8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void add16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
CV_EXPORTS void add16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
CV_EXPORTS void add32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
CV_EXPORTS void add32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
CV_EXPORTS void add64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
CV_EXPORTS void sub8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void sub8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void sub16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
CV_EXPORTS void sub16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
CV_EXPORTS void sub32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
CV_EXPORTS void sub32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
CV_EXPORTS void sub64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
CV_EXPORTS void max8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void max8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void max16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
CV_EXPORTS void max16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
CV_EXPORTS void max32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
CV_EXPORTS void max32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
CV_EXPORTS void max64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
CV_EXPORTS void min8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void min8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void min16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
CV_EXPORTS void min16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
CV_EXPORTS void min32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
CV_EXPORTS void min32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
CV_EXPORTS void min64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
CV_EXPORTS void absdiff8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void absdiff8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void absdiff16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
CV_EXPORTS void absdiff16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
CV_EXPORTS void absdiff32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
CV_EXPORTS void absdiff32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
CV_EXPORTS void absdiff64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
CV_EXPORTS void and8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void or8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void xor8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void not8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
CV_EXPORTS void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
CV_EXPORTS void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
CV_EXPORTS void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
CV_EXPORTS void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
CV_EXPORTS void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
CV_EXPORTS void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
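// Minimal usage sketch (illustration only): a per-pixel "greater than" mask.
// OpenCV's own implementation reads the opaque last argument as *(int*)_cmpop,
// so a pointer to a CV_HAL_CMP_* code from hal/interface.h is passed here
// (treat that calling convention as an assumption, not a documented contract).
static inline void cmpGreaterExample(const uchar* a, const uchar* b, uchar* mask,
                                     int width, int height, size_t step)
{
    int op = 1;  // CV_HAL_CMP_GT
    cmp8u(a, step, b, step, mask, step, width, height, &op);
}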
CV_EXPORTS void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void mul16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void mul32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void mul32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void mul64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void div16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void div32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void div32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void div64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void recip8u( const uchar *, size_t, const uchar * src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void recip8s( const schar *, size_t, const schar * src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void recip16u( const ushort *, size_t, const ushort * src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void recip16s( const short *, size_t, const short * src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void recip32s( const int *, size_t, const int * src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void recip32f( const float *, size_t, const float * src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void recip64f( const double *, size_t, const double * src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void addWeighted8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _scalars );
CV_EXPORTS void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scalars );
CV_EXPORTS void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scalars );
CV_EXPORTS void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scalars );
CV_EXPORTS void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scalars );
CV_EXPORTS void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars );
CV_EXPORTS void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars );
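// Minimal usage sketch (illustration only): dst = alpha*src1 + beta*src2 + gamma.
// In OpenCV's implementation the opaque 'scalars' argument points at the three
// double coefficients in that order (again an implementation convention, not a
// contract spelled out by this header).
static inline void blendExample(const uchar* a, const uchar* b, uchar* dst,
                                int width, int height, size_t step)
{
    double coeffs[3] = { 0.5, 0.5, 0.0 };  // alpha, beta, gamma
    addWeighted8u(a, step, b, step, dst, step, width, height, coeffs);
}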
CV_EXPORTS void cvt16f32f( const hfloat* src, float* dst, int len );
CV_EXPORTS void cvt32f16f( const float* src, hfloat* dst, int len );
CV_EXPORTS void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len );
CV_EXPORTS void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len );
struct CV_EXPORTS DFT1D
{
static Ptr<DFT1D> create(int len, int count, int depth, int flags, bool * useBuffer = 0);
virtual void apply(const uchar *src, uchar *dst) = 0;
virtual ~DFT1D() {}
};
struct CV_EXPORTS DFT2D
{
static Ptr<DFT2D> create(int width, int height, int depth,
int src_channels, int dst_channels,
int flags, int nonzero_rows = 0);
virtual void apply(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step) = 0;
virtual ~DFT2D() {}
};
struct CV_EXPORTS DCT2D
{
static Ptr<DCT2D> create(int width, int height, int depth, int flags);
virtual void apply(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step) = 0;
virtual ~DCT2D() {}
};
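// Minimal usage sketch (illustration only): a forward 2D DFT of a CV_32F
// single-channel image into a two-channel (complex) destination. The flag and
// depth constants come from hal/interface.h; data is passed as uchar* with
// byte strides, matching the apply() signature above.
static inline void dft2dExample(const uchar* src, size_t src_step,
                                uchar* dst, size_t dst_step,
                                int width, int height)
{
    Ptr<DFT2D> dft = DFT2D::create(width, height, CV_32F,
                                   /*src_channels=*/1, /*dst_channels=*/2,
                                   /*flags=*/CV_HAL_DFT_COMPLEX_OUTPUT);
    dft->apply(src, src_step, dst, dst_step);
}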
//! @} core_hal
//=============================================================================
// for binary compatibility with 3.0
//! @cond IGNORED
CV_EXPORTS int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n);
CV_EXPORTS int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n);
CV_EXPORTS bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n);
CV_EXPORTS bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n);
CV_EXPORTS void exp(const float* src, float* dst, int n);
CV_EXPORTS void exp(const double* src, double* dst, int n);
CV_EXPORTS void log(const float* src, float* dst, int n);
CV_EXPORTS void log(const double* src, double* dst, int n);
CV_EXPORTS void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees);
CV_EXPORTS void magnitude(const float* x, const float* y, float* dst, int n);
CV_EXPORTS void magnitude(const double* x, const double* y, double* dst, int n);
CV_EXPORTS void sqrt(const float* src, float* dst, int len);
CV_EXPORTS void sqrt(const double* src, double* dst, int len);
CV_EXPORTS void invSqrt(const float* src, float* dst, int len);
CV_EXPORTS void invSqrt(const double* src, double* dst, int len);
//! @endcond
}} //cv::hal
#endif //OPENCV_HAL_HPP

opencv2/core/hal/interface.h

@@ -0,0 +1,190 @@
#ifndef OPENCV_CORE_HAL_INTERFACE_H
#define OPENCV_CORE_HAL_INTERFACE_H
//! @addtogroup core_hal_interface
//! @{
//! @name Return codes
//! @{
#define CV_HAL_ERROR_OK 0
#define CV_HAL_ERROR_NOT_IMPLEMENTED 1
#define CV_HAL_ERROR_UNKNOWN -1
//! @}
#ifdef __cplusplus
#include <cstddef>
#else
#include <stddef.h>
#include <stdbool.h>
#endif
//! @name Data types
//! primitive types
//! - schar - signed 1 byte integer
//! - uchar - unsigned 1 byte integer
//! - short - signed 2 byte integer
//! - ushort - unsigned 2 byte integer
//! - int - signed 4 byte integer
//! - uint - unsigned 4 byte integer
//! - int64 - signed 8 byte integer
//! - uint64 - unsigned 8 byte integer
//! @{
#if !defined _MSC_VER && !defined __BORLANDC__
# if defined __cplusplus && __cplusplus >= 201103L && !defined __APPLE__
# include <cstdint>
# ifdef __NEWLIB__
typedef unsigned int uint;
# else
typedef std::uint32_t uint;
# endif
# else
# include <stdint.h>
typedef uint32_t uint;
# endif
#else
typedef unsigned uint;
#endif
typedef signed char schar;
#ifndef __IPL_H__
typedef unsigned char uchar;
typedef unsigned short ushort;
#endif
#if defined _MSC_VER || defined __BORLANDC__
typedef __int64 int64;
typedef unsigned __int64 uint64;
# define CV_BIG_INT(n) n##I64
# define CV_BIG_UINT(n) n##UI64
#else
typedef int64_t int64;
typedef uint64_t uint64;
# define CV_BIG_INT(n) n##LL
# define CV_BIG_UINT(n) n##ULL
#endif
#define CV_USRTYPE1 (void)"CV_USRTYPE1 support has been dropped in OpenCV 4.0"
#define CV_CN_MAX 512
#define CV_CN_SHIFT 3
#define CV_DEPTH_MAX (1 << CV_CN_SHIFT)
#define CV_8U 0
#define CV_8S 1
#define CV_16U 2
#define CV_16S 3
#define CV_32S 4
#define CV_32F 5
#define CV_64F 6
#define CV_16F 7
#define CV_MAT_DEPTH_MASK (CV_DEPTH_MAX - 1)
#define CV_MAT_DEPTH(flags) ((flags) & CV_MAT_DEPTH_MASK)
#define CV_MAKETYPE(depth,cn) (CV_MAT_DEPTH(depth) + (((cn)-1) << CV_CN_SHIFT))
#define CV_MAKE_TYPE CV_MAKETYPE
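// Worked example: a type tag packs the depth in the low CV_CN_SHIFT bits and
// (channels - 1) above them, so CV_MAKETYPE(CV_32F,3) = 5 + (2 << 3) = 21 and
// CV_MAT_DEPTH(21) recovers CV_32F (5).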
#define CV_8UC1 CV_MAKETYPE(CV_8U,1)
#define CV_8UC2 CV_MAKETYPE(CV_8U,2)
#define CV_8UC3 CV_MAKETYPE(CV_8U,3)
#define CV_8UC4 CV_MAKETYPE(CV_8U,4)
#define CV_8UC(n) CV_MAKETYPE(CV_8U,(n))
#define CV_8SC1 CV_MAKETYPE(CV_8S,1)
#define CV_8SC2 CV_MAKETYPE(CV_8S,2)
#define CV_8SC3 CV_MAKETYPE(CV_8S,3)
#define CV_8SC4 CV_MAKETYPE(CV_8S,4)
#define CV_8SC(n) CV_MAKETYPE(CV_8S,(n))
#define CV_16UC1 CV_MAKETYPE(CV_16U,1)
#define CV_16UC2 CV_MAKETYPE(CV_16U,2)
#define CV_16UC3 CV_MAKETYPE(CV_16U,3)
#define CV_16UC4 CV_MAKETYPE(CV_16U,4)
#define CV_16UC(n) CV_MAKETYPE(CV_16U,(n))
#define CV_16SC1 CV_MAKETYPE(CV_16S,1)
#define CV_16SC2 CV_MAKETYPE(CV_16S,2)
#define CV_16SC3 CV_MAKETYPE(CV_16S,3)
#define CV_16SC4 CV_MAKETYPE(CV_16S,4)
#define CV_16SC(n) CV_MAKETYPE(CV_16S,(n))
#define CV_32SC1 CV_MAKETYPE(CV_32S,1)
#define CV_32SC2 CV_MAKETYPE(CV_32S,2)
#define CV_32SC3 CV_MAKETYPE(CV_32S,3)
#define CV_32SC4 CV_MAKETYPE(CV_32S,4)
#define CV_32SC(n) CV_MAKETYPE(CV_32S,(n))
#define CV_32FC1 CV_MAKETYPE(CV_32F,1)
#define CV_32FC2 CV_MAKETYPE(CV_32F,2)
#define CV_32FC3 CV_MAKETYPE(CV_32F,3)
#define CV_32FC4 CV_MAKETYPE(CV_32F,4)
#define CV_32FC(n) CV_MAKETYPE(CV_32F,(n))
#define CV_64FC1 CV_MAKETYPE(CV_64F,1)
#define CV_64FC2 CV_MAKETYPE(CV_64F,2)
#define CV_64FC3 CV_MAKETYPE(CV_64F,3)
#define CV_64FC4 CV_MAKETYPE(CV_64F,4)
#define CV_64FC(n) CV_MAKETYPE(CV_64F,(n))
#define CV_16FC1 CV_MAKETYPE(CV_16F,1)
#define CV_16FC2 CV_MAKETYPE(CV_16F,2)
#define CV_16FC3 CV_MAKETYPE(CV_16F,3)
#define CV_16FC4 CV_MAKETYPE(CV_16F,4)
#define CV_16FC(n) CV_MAKETYPE(CV_16F,(n))
//! @}
//! @name Comparison operation
//! @sa cv::CmpTypes
//! @{
#define CV_HAL_CMP_EQ 0
#define CV_HAL_CMP_GT 1
#define CV_HAL_CMP_GE 2
#define CV_HAL_CMP_LT 3
#define CV_HAL_CMP_LE 4
#define CV_HAL_CMP_NE 5
//! @}
//! @name Border processing modes
//! @sa cv::BorderTypes
//! @{
#define CV_HAL_BORDER_CONSTANT 0
#define CV_HAL_BORDER_REPLICATE 1
#define CV_HAL_BORDER_REFLECT 2
#define CV_HAL_BORDER_WRAP 3
#define CV_HAL_BORDER_REFLECT_101 4
#define CV_HAL_BORDER_TRANSPARENT 5
#define CV_HAL_BORDER_ISOLATED 16
//! @}
//! @name DFT flags
//! @{
#define CV_HAL_DFT_INVERSE 1
#define CV_HAL_DFT_SCALE 2
#define CV_HAL_DFT_ROWS 4
#define CV_HAL_DFT_COMPLEX_OUTPUT 16
#define CV_HAL_DFT_REAL_OUTPUT 32
#define CV_HAL_DFT_TWO_STAGE 64
#define CV_HAL_DFT_STAGE_COLS 128
#define CV_HAL_DFT_IS_CONTINUOUS 512
#define CV_HAL_DFT_IS_INPLACE 1024
//! @}
//! @name SVD flags
//! @{
#define CV_HAL_SVD_NO_UV 1
#define CV_HAL_SVD_SHORT_UV 2
#define CV_HAL_SVD_MODIFY_A 4
#define CV_HAL_SVD_FULL_UV 8
//! @}
//! @name Gemm flags
//! @{
#define CV_HAL_GEMM_1_T 1
#define CV_HAL_GEMM_2_T 2
#define CV_HAL_GEMM_3_T 4
//! @}
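// Example: passing (CV_HAL_GEMM_1_T | CV_HAL_GEMM_2_T) to a gemm call requests
// alpha * src1^T * src2^T + beta * src3; each set bit transposes one operand.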
//! @}
#endif

opencv2/core/hal/intrin.hpp

@@ -0,0 +1,988 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_HAL_INTRIN_HPP
#define OPENCV_HAL_INTRIN_HPP
#include <cmath>
#include <float.h>
#include <stdlib.h>
#include "opencv2/core/cvdef.h"
#if defined(__GNUC__) && __GNUC__ == 12
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif
#define OPENCV_HAL_ADD(a, b) ((a) + (b))
#define OPENCV_HAL_AND(a, b) ((a) & (b))
#define OPENCV_HAL_NOP(a) (a)
#define OPENCV_HAL_1ST(a, b) (a)
namespace {
inline unsigned int trailingZeros32(unsigned int value) {
#if defined(_MSC_VER)
#if (_MSC_VER < 1700) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC)
unsigned long index = 0;
_BitScanForward(&index, value);
return (unsigned int)index;
#elif defined(__clang__)
// clang-cl doesn't export _tzcnt_u32 for non-BMI systems
return value ? __builtin_ctz(value) : 32;
#else
return _tzcnt_u32(value);
#endif
#elif defined(__GNUC__) || defined(__GNUG__)
return __builtin_ctz(value);
#elif defined(__ICC) || defined(__INTEL_COMPILER)
return _bit_scan_forward(value);
#elif defined(__clang__)
return llvm.cttz.i32(value, true);
#else
static const int MultiplyDeBruijnBitPosition[32] = {
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 };
return MultiplyDeBruijnBitPosition[((uint32_t)((value & -value) * 0x077CB531U)) >> 27];
#endif
}
}
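// Worked example: trailingZeros32(40) == 3, since 40 = 0b101000 and its lowest
// set bit is bit 3; every branch above is just a different way to count the
// trailing zero bits of the argument.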
// unlike the HAL API, which lives in cv::hal,
// we put the intrinsics directly into the cv namespace
// so that they are easier to use from within OpenCV code
namespace cv {
namespace hal {
enum StoreMode
{
STORE_UNALIGNED = 0,
STORE_ALIGNED = 1,
STORE_ALIGNED_NOCACHE = 2
};
}
// TODO FIXIT: Don't use "God" traits. Split into separate cases.
template<typename _Tp> struct V_TypeTraits
{
};
#define CV_INTRIN_DEF_TYPE_TRAITS(type, int_type_, uint_type_, abs_type_, w_type_, q_type_, sum_type_) \
template<> struct V_TypeTraits<type> \
{ \
typedef type value_type; \
typedef int_type_ int_type; \
typedef abs_type_ abs_type; \
typedef uint_type_ uint_type; \
typedef w_type_ w_type; \
typedef q_type_ q_type; \
typedef sum_type_ sum_type; \
\
static inline int_type reinterpret_int(type x) \
{ \
union { type l; int_type i; } v; \
v.l = x; \
return v.i; \
} \
\
static inline type reinterpret_from_int(int_type x) \
{ \
union { type l; int_type i; } v; \
v.i = x; \
return v.l; \
} \
}
#define CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(type, int_type_, uint_type_, abs_type_, w_type_, sum_type_) \
template<> struct V_TypeTraits<type> \
{ \
typedef type value_type; \
typedef int_type_ int_type; \
typedef abs_type_ abs_type; \
typedef uint_type_ uint_type; \
typedef w_type_ w_type; \
typedef sum_type_ sum_type; \
\
static inline int_type reinterpret_int(type x) \
{ \
union { type l; int_type i; } v; \
v.l = x; \
return v.i; \
} \
\
static inline type reinterpret_from_int(int_type x) \
{ \
union { type l; int_type i; } v; \
v.i = x; \
return v.l; \
} \
}
CV_INTRIN_DEF_TYPE_TRAITS(uchar, schar, uchar, uchar, ushort, unsigned, unsigned);
CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int);
CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned);
CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(unsigned, int, unsigned, unsigned, uint64, unsigned);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int, int, unsigned, unsigned, int64, int);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(float, int, unsigned, float, double, float);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(uint64, int64, uint64, uint64, void, uint64);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int64, int64, uint64, uint64, void, int64);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(double, int64, uint64, double, void, double);
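// Minimal usage sketch (illustration only): V_TypeTraits gives the same-width
// integer view of a lane type, which is handy for bit manipulation on floats.
static inline float absViaTraitsExample(float x)
{
    typedef V_TypeTraits<float>::int_type itype;           // int
    itype bits = V_TypeTraits<float>::reinterpret_int(x);
    bits &= 0x7fffffff;                                    // clear the sign bit
    return V_TypeTraits<float>::reinterpret_from_int(bits);
}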
#ifndef CV_DOXYGEN
#ifndef CV_CPU_OPTIMIZATION_HAL_NAMESPACE
#ifdef CV_FORCE_SIMD128_CPP
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_EMULATOR_CPP
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_EMULATOR_CPP {
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
#elif defined(CV_CPU_DISPATCH_MODE)
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE __CV_CAT(hal_, CV_CPU_DISPATCH_MODE)
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) {
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
#else
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_baseline
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_baseline {
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
#endif
#endif // CV_CPU_OPTIMIZATION_HAL_NAMESPACE
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
template <typename _VecTp> inline _VecTp v_setzero_();
template <typename _VecTp> inline _VecTp v_setall_(uchar);
template <typename _VecTp> inline _VecTp v_setall_(schar);
template <typename _VecTp> inline _VecTp v_setall_(ushort);
template <typename _VecTp> inline _VecTp v_setall_(short);
template <typename _VecTp> inline _VecTp v_setall_(unsigned);
template <typename _VecTp> inline _VecTp v_setall_(int);
template <typename _VecTp> inline _VecTp v_setall_(uint64);
template <typename _VecTp> inline _VecTp v_setall_(int64);
template <typename _VecTp> inline _VecTp v_setall_(float);
template <typename _VecTp> inline _VecTp v_setall_(double);
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
#endif
}
#ifdef CV_DOXYGEN
# undef CV_AVX2
# undef CV_SSE2
# undef CV_NEON
# undef CV_VSX
# undef CV_FP16
# undef CV_MSA
# undef CV_RVV
#endif
#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071 || CV_LSX) && !defined(CV_FORCE_SIMD128_CPP)
#define CV__SIMD_FORWARD 128
#include "opencv2/core/hal/intrin_forward.hpp"
#endif
#if CV_SSE2 && !defined(CV_FORCE_SIMD128_CPP)
#include "opencv2/core/hal/intrin_sse_em.hpp"
#include "opencv2/core/hal/intrin_sse.hpp"
#elif CV_NEON && !defined(CV_FORCE_SIMD128_CPP)
#include "opencv2/core/hal/intrin_neon.hpp"
#elif CV_RVV071 && !defined(CV_FORCE_SIMD128_CPP)
#define CV_SIMD128_CPP 0
#include "opencv2/core/hal/intrin_rvv071.hpp"
#elif CV_VSX && !defined(CV_FORCE_SIMD128_CPP)
#include "opencv2/core/hal/intrin_vsx.hpp"
#elif CV_MSA && !defined(CV_FORCE_SIMD128_CPP)
#include "opencv2/core/hal/intrin_msa.hpp"
#elif CV_WASM_SIMD && !defined(CV_FORCE_SIMD128_CPP)
#include "opencv2/core/hal/intrin_wasm.hpp"
#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP)
#include "opencv2/core/hal/intrin_rvv_scalable.hpp"
#elif CV_LSX && !defined(CV_FORCE_SIMD128_CPP)
#include "opencv2/core/hal/intrin_lsx.hpp"
#else
#include "opencv2/core/hal/intrin_cpp.hpp"
#endif
// AVX2 can be used together with SSE2, so
// we define those two sets of intrinsics at once.
// Most of the intrinsics do not conflict (the proper overloaded variant is
// resolved by the argument types, e.g. v_float32x4 ~ SSE2, v_float32x8 ~ AVX2),
// but some of the AVX2 intrinsics get the v256_ prefix instead of v_, e.g. v256_load() vs v_load().
// Correspondingly, the wide intrinsics (which map to the "widest"
// available instruction set) get the vx_ prefix
// and resolve to their v256_ counterparts (e.g. vx_load() => v256_load()).
#if CV_AVX2
#define CV__SIMD_FORWARD 256
#include "opencv2/core/hal/intrin_forward.hpp"
#include "opencv2/core/hal/intrin_avx.hpp"
#endif
// AVX512 can be used together with SSE2 and AVX2, so
// we define those sets of intrinsics at once.
// Some of the AVX512 intrinsics get the v512_ prefix instead of v_, e.g. v512_load() vs v_load().
// Wide intrinsics are mapped to their v512_ counterparts in this case (e.g. vx_load() => v512_load()).
#if CV_AVX512_SKX
#define CV__SIMD_FORWARD 512
#include "opencv2/core/hal/intrin_forward.hpp"
#include "opencv2/core/hal/intrin_avx512.hpp"
#endif
#if CV_LASX
#define CV__SIMD_FORWARD 256
#include "opencv2/core/hal/intrin_forward.hpp"
#include "opencv2/core/hal/intrin_lasx.hpp"
#endif
//! @cond IGNORED
namespace cv {
#ifndef CV_DOXYGEN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#endif
#ifndef CV_SIMD128
#define CV_SIMD128 0
#endif
#ifndef CV_SIMD128_CPP
#define CV_SIMD128_CPP 0
#endif
#ifndef CV_SIMD128_64F
#define CV_SIMD128_64F 0
#endif
#ifndef CV_SIMD256
#define CV_SIMD256 0
#endif
#ifndef CV_SIMD256_64F
#define CV_SIMD256_64F 0
#endif
#ifndef CV_SIMD512
#define CV_SIMD512 0
#endif
#ifndef CV_SIMD512_64F
#define CV_SIMD512_64F 0
#endif
#ifndef CV_SIMD128_FP16
#define CV_SIMD128_FP16 0
#endif
#ifndef CV_SIMD256_FP16
#define CV_SIMD256_FP16 0
#endif
#ifndef CV_SIMD512_FP16
#define CV_SIMD512_FP16 0
#endif
#ifndef CV_SIMD_SCALABLE
#define CV_SIMD_SCALABLE 0
#endif
#ifndef CV_SIMD_SCALABLE_64F
#define CV_SIMD_SCALABLE_64F 0
#endif
//==================================================================================================
template<typename _Tp> struct V_RegTraits
{
};
#define CV_DEF_REG_TRAITS(prefix, _reg, lane_type, suffix, _u_reg, _w_reg, _q_reg, _int_reg, _round_reg) \
template<> struct V_RegTraits<_reg> \
{ \
typedef _reg reg; \
typedef _u_reg u_reg; \
typedef _w_reg w_reg; \
typedef _q_reg q_reg; \
typedef _int_reg int_reg; \
typedef _round_reg round_reg; \
}
#if CV_SIMD128 || CV_SIMD128_CPP
CV_DEF_REG_TRAITS(v, v_uint8x16, uchar, u8, v_uint8x16, v_uint16x8, v_uint32x4, v_int8x16, void);
CV_DEF_REG_TRAITS(v, v_int8x16, schar, s8, v_uint8x16, v_int16x8, v_int32x4, v_int8x16, void);
CV_DEF_REG_TRAITS(v, v_uint16x8, ushort, u16, v_uint16x8, v_uint32x4, v_uint64x2, v_int16x8, void);
CV_DEF_REG_TRAITS(v, v_int16x8, short, s16, v_uint16x8, v_int32x4, v_int64x2, v_int16x8, void);
CV_DEF_REG_TRAITS(v, v_uint32x4, unsigned, u32, v_uint32x4, v_uint64x2, void, v_int32x4, void);
CV_DEF_REG_TRAITS(v, v_int32x4, int, s32, v_uint32x4, v_int64x2, void, v_int32x4, void);
#if CV_SIMD128_64F || CV_SIMD128_CPP
CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, v_float64x2, void, v_int32x4, v_int32x4);
#else
CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, void, void, v_int32x4, v_int32x4);
#endif
CV_DEF_REG_TRAITS(v, v_uint64x2, uint64, u64, v_uint64x2, void, void, v_int64x2, void);
CV_DEF_REG_TRAITS(v, v_int64x2, int64, s64, v_uint64x2, void, void, v_int64x2, void);
#if CV_SIMD128_64F
CV_DEF_REG_TRAITS(v, v_float64x2, double, f64, v_float64x2, void, void, v_int64x2, v_int32x4);
#endif
#endif
#if CV_SIMD256
CV_DEF_REG_TRAITS(v256, v_uint8x32, uchar, u8, v_uint8x32, v_uint16x16, v_uint32x8, v_int8x32, void);
CV_DEF_REG_TRAITS(v256, v_int8x32, schar, s8, v_uint8x32, v_int16x16, v_int32x8, v_int8x32, void);
CV_DEF_REG_TRAITS(v256, v_uint16x16, ushort, u16, v_uint16x16, v_uint32x8, v_uint64x4, v_int16x16, void);
CV_DEF_REG_TRAITS(v256, v_int16x16, short, s16, v_uint16x16, v_int32x8, v_int64x4, v_int16x16, void);
CV_DEF_REG_TRAITS(v256, v_uint32x8, unsigned, u32, v_uint32x8, v_uint64x4, void, v_int32x8, void);
CV_DEF_REG_TRAITS(v256, v_int32x8, int, s32, v_uint32x8, v_int64x4, void, v_int32x8, void);
CV_DEF_REG_TRAITS(v256, v_float32x8, float, f32, v_float32x8, v_float64x4, void, v_int32x8, v_int32x8);
CV_DEF_REG_TRAITS(v256, v_uint64x4, uint64, u64, v_uint64x4, void, void, v_int64x4, void);
CV_DEF_REG_TRAITS(v256, v_int64x4, int64, s64, v_uint64x4, void, void, v_int64x4, void);
CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8);
#endif
#if CV_SIMD512
CV_DEF_REG_TRAITS(v512, v_uint8x64, uchar, u8, v_uint8x64, v_uint16x32, v_uint32x16, v_int8x64, void);
CV_DEF_REG_TRAITS(v512, v_int8x64, schar, s8, v_uint8x64, v_int16x32, v_int32x16, v_int8x64, void);
CV_DEF_REG_TRAITS(v512, v_uint16x32, ushort, u16, v_uint16x32, v_uint32x16, v_uint64x8, v_int16x32, void);
CV_DEF_REG_TRAITS(v512, v_int16x32, short, s16, v_uint16x32, v_int32x16, v_int64x8, v_int16x32, void);
CV_DEF_REG_TRAITS(v512, v_uint32x16, unsigned, u32, v_uint32x16, v_uint64x8, void, v_int32x16, void);
CV_DEF_REG_TRAITS(v512, v_int32x16, int, s32, v_uint32x16, v_int64x8, void, v_int32x16, void);
CV_DEF_REG_TRAITS(v512, v_float32x16, float, f32, v_float32x16, v_float64x8, void, v_int32x16, v_int32x16);
CV_DEF_REG_TRAITS(v512, v_uint64x8, uint64, u64, v_uint64x8, void, void, v_int64x8, void);
CV_DEF_REG_TRAITS(v512, v_int64x8, int64, s64, v_uint64x8, void, void, v_int64x8, void);
CV_DEF_REG_TRAITS(v512, v_float64x8, double, f64, v_float64x8, void, void, v_int64x8, v_int32x16);
#endif
#if CV_SIMD_SCALABLE
CV_DEF_REG_TRAITS(v, v_uint8, uchar, u8, v_uint8, v_uint16, v_uint32, v_int8, void);
CV_DEF_REG_TRAITS(v, v_int8, schar, s8, v_uint8, v_int16, v_int32, v_int8, void);
CV_DEF_REG_TRAITS(v, v_uint16, ushort, u16, v_uint16, v_uint32, v_uint64, v_int16, void);
CV_DEF_REG_TRAITS(v, v_int16, short, s16, v_uint16, v_int32, v_int64, v_int16, void);
CV_DEF_REG_TRAITS(v, v_uint32, unsigned, u32, v_uint32, v_uint64, void, v_int32, void);
CV_DEF_REG_TRAITS(v, v_int32, int, s32, v_uint32, v_int64, void, v_int32, void);
CV_DEF_REG_TRAITS(v, v_float32, float, f32, v_float32, v_float64, void, v_int32, v_int32);
CV_DEF_REG_TRAITS(v, v_uint64, uint64, u64, v_uint64, void, void, v_int64, void);
CV_DEF_REG_TRAITS(v, v_int64, int64, s64, v_uint64, void, void, v_int64, void);
CV_DEF_REG_TRAITS(v, v_float64, double, f64, v_float64, void, void, v_int64, v_int32);
#endif
//! @endcond
#if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
#define CV__SIMD_NAMESPACE simd512
namespace CV__SIMD_NAMESPACE {
#define CV_SIMD 1
#define CV_SIMD_64F CV_SIMD512_64F
#define CV_SIMD_FP16 CV_SIMD512_FP16
#define CV_SIMD_WIDTH 64
//! @addtogroup core_hal_intrin
//! @{
//! @brief Maximum available capacity vector of 8-bit unsigned integer values
typedef v_uint8x64 v_uint8;
//! @brief Maximum available capacity vector of 8-bit signed integer values
typedef v_int8x64 v_int8;
//! @brief Maximum available capacity vector of 16-bit unsigned integer values
typedef v_uint16x32 v_uint16;
//! @brief Maximum available capacity vector of 16-bit signed integer values
typedef v_int16x32 v_int16;
//! @brief Maximum available capacity vector of 32-bit unsigned integer values
typedef v_uint32x16 v_uint32;
//! @brief Maximum available capacity vector of 32-bit signed integer values
typedef v_int32x16 v_int32;
//! @brief Maximum available capacity vector of 64-bit unsigned integer values
typedef v_uint64x8 v_uint64;
//! @brief Maximum available capacity vector of 64-bit signed integer values
typedef v_int64x8 v_int64;
//! @brief Maximum available capacity vector of 32-bit floating point values (single precision)
typedef v_float32x16 v_float32;
#if CV_SIMD512_64F
//! @brief Maximum available capacity vector of 64-bit floating point values (double precision)
typedef v_float64x8 v_float64;
#endif
//! @}
#define VXPREFIX(func) v512##func
} // namespace
using namespace CV__SIMD_NAMESPACE;
#elif CV_SIMD256 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 256)
#define CV__SIMD_NAMESPACE simd256
namespace CV__SIMD_NAMESPACE {
#define CV_SIMD 1
#define CV_SIMD_64F CV_SIMD256_64F
#define CV_SIMD_FP16 CV_SIMD256_FP16
#define CV_SIMD_WIDTH 32
//! @addtogroup core_hal_intrin
//! @{
//! @brief Maximum available capacity vector of 8-bit unsigned integer values
typedef v_uint8x32 v_uint8;
//! @brief Maximum available capacity vector of 8-bit signed integer values
typedef v_int8x32 v_int8;
//! @brief Maximum available capacity vector of 16-bit unsigned integer values
typedef v_uint16x16 v_uint16;
//! @brief Maximum available capacity vector of 16-bit signed integer values
typedef v_int16x16 v_int16;
//! @brief Maximum available capacity vector of 32-bit unsigned integer values
typedef v_uint32x8 v_uint32;
//! @brief Maximum available capacity vector of 32-bit signed integer values
typedef v_int32x8 v_int32;
//! @brief Maximum available capacity vector of 64-bit unsigned integer values
typedef v_uint64x4 v_uint64;
//! @brief Maximum available capacity vector of 64-bit signed integer values
typedef v_int64x4 v_int64;
//! @brief Maximum available capacity vector of 32-bit floating point values (single precision)
typedef v_float32x8 v_float32;
#if CV_SIMD256_64F
//! @brief Maximum available capacity vector of 64-bit floating point values (double precision)
typedef v_float64x4 v_float64;
#endif
//! @}
#define VXPREFIX(func) v256##func
} // namespace
using namespace CV__SIMD_NAMESPACE;
#elif (CV_SIMD128 || CV_SIMD128_CPP) && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 128)
#if defined CV_SIMD128_CPP
#define CV__SIMD_NAMESPACE simd128_cpp
#else
#define CV__SIMD_NAMESPACE simd128
#endif
namespace CV__SIMD_NAMESPACE {
#define CV_SIMD CV_SIMD128
#define CV_SIMD_64F CV_SIMD128_64F
#define CV_SIMD_WIDTH 16
//! @addtogroup core_hal_intrin
//! @{
//! @brief Maximum available capacity vector of 8-bit unsigned integer values
typedef v_uint8x16 v_uint8;
//! @brief Maximum available capacity vector of 8-bit signed integer values
typedef v_int8x16 v_int8;
//! @brief Maximum available capacity vector of 16-bit unsigned integer values
typedef v_uint16x8 v_uint16;
//! @brief Maximum available capacity vector of 16-bit signed integer values
typedef v_int16x8 v_int16;
//! @brief Maximum available capacity vector of 32-bit unsigned integer values
typedef v_uint32x4 v_uint32;
//! @brief Maximum available capacity vector of 32-bit signed integer values
typedef v_int32x4 v_int32;
//! @brief Maximum available capacity vector of 64-bit unsigned integer values
typedef v_uint64x2 v_uint64;
//! @brief Maximum available capacity vector of 64-bit signed integer values
typedef v_int64x2 v_int64;
//! @brief Maximum available capacity vector of 32-bit floating point values (single precision)
typedef v_float32x4 v_float32;
#if CV_SIMD128_64F
//! @brief Maximum available capacity vector of 64-bit floating point values (double precision)
typedef v_float64x2 v_float64;
#endif
//! @}
#define VXPREFIX(func) v##func
} // namespace
using namespace CV__SIMD_NAMESPACE;
#elif CV_SIMD_SCALABLE
#define CV__SIMD_NAMESPACE simd
namespace CV__SIMD_NAMESPACE {
#define CV_SIMD 0
#define CV_SIMD_WIDTH 128 /* 1024/8 */
#define VXPREFIX(func) v##func
} // namespace
using namespace CV__SIMD_NAMESPACE;
#endif
//! @cond IGNORED
#ifndef CV_SIMD_64F
#define CV_SIMD_64F 0
#endif
namespace CV__SIMD_NAMESPACE {
//! @addtogroup core_hal_intrin
//! @{
//! @name Wide init with value
//! @{
//! @brief Create maximum available capacity vector with elements set to a specific value
inline v_uint8 vx_setall_u8(uchar v) { return VXPREFIX(_setall_u8)(v); }
inline v_int8 vx_setall_s8(schar v) { return VXPREFIX(_setall_s8)(v); }
inline v_uint16 vx_setall_u16(ushort v) { return VXPREFIX(_setall_u16)(v); }
inline v_int16 vx_setall_s16(short v) { return VXPREFIX(_setall_s16)(v); }
inline v_int32 vx_setall_s32(int v) { return VXPREFIX(_setall_s32)(v); }
inline v_uint32 vx_setall_u32(unsigned v) { return VXPREFIX(_setall_u32)(v); }
inline v_float32 vx_setall_f32(float v) { return VXPREFIX(_setall_f32)(v); }
inline v_int64 vx_setall_s64(int64 v) { return VXPREFIX(_setall_s64)(v); }
inline v_uint64 vx_setall_u64(uint64 v) { return VXPREFIX(_setall_u64)(v); }
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
inline v_float64 vx_setall_f64(double v) { return VXPREFIX(_setall_f64)(v); }
#endif
//! @}
//! @name Wide init with zero
//! @{
//! @brief Create maximum available capacity vector with elements set to zero
inline v_uint8 vx_setzero_u8() { return VXPREFIX(_setzero_u8)(); }
inline v_int8 vx_setzero_s8() { return VXPREFIX(_setzero_s8)(); }
inline v_uint16 vx_setzero_u16() { return VXPREFIX(_setzero_u16)(); }
inline v_int16 vx_setzero_s16() { return VXPREFIX(_setzero_s16)(); }
inline v_int32 vx_setzero_s32() { return VXPREFIX(_setzero_s32)(); }
inline v_uint32 vx_setzero_u32() { return VXPREFIX(_setzero_u32)(); }
inline v_float32 vx_setzero_f32() { return VXPREFIX(_setzero_f32)(); }
inline v_int64 vx_setzero_s64() { return VXPREFIX(_setzero_s64)(); }
inline v_uint64 vx_setzero_u64() { return VXPREFIX(_setzero_u64)(); }
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
inline v_float64 vx_setzero_f64() { return VXPREFIX(_setzero_f64)(); }
#endif
//! @}
//! @name Wide load from memory
//! @{
//! @brief Load maximum available capacity register contents from memory
inline v_uint8 vx_load(const uchar * ptr) { return VXPREFIX(_load)(ptr); }
inline v_int8 vx_load(const schar * ptr) { return VXPREFIX(_load)(ptr); }
inline v_uint16 vx_load(const ushort * ptr) { return VXPREFIX(_load)(ptr); }
inline v_int16 vx_load(const short * ptr) { return VXPREFIX(_load)(ptr); }
inline v_int32 vx_load(const int * ptr) { return VXPREFIX(_load)(ptr); }
inline v_uint32 vx_load(const unsigned * ptr) { return VXPREFIX(_load)(ptr); }
inline v_float32 vx_load(const float * ptr) { return VXPREFIX(_load)(ptr); }
inline v_int64 vx_load(const int64 * ptr) { return VXPREFIX(_load)(ptr); }
inline v_uint64 vx_load(const uint64 * ptr) { return VXPREFIX(_load)(ptr); }
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
inline v_float64 vx_load(const double * ptr) { return VXPREFIX(_load)(ptr); }
#endif
//! @}
//! @name Wide load from memory (aligned)
//! @{
//! @brief Load maximum available capacity register contents from aligned memory
inline v_uint8 vx_load_aligned(const uchar * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_int8 vx_load_aligned(const schar * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_uint16 vx_load_aligned(const ushort * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_int16 vx_load_aligned(const short * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_int32 vx_load_aligned(const int * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_uint32 vx_load_aligned(const unsigned * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_float32 vx_load_aligned(const float * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_int64 vx_load_aligned(const int64 * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_uint64 vx_load_aligned(const uint64 * ptr) { return VXPREFIX(_load_aligned)(ptr); }
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
inline v_float64 vx_load_aligned(const double * ptr) { return VXPREFIX(_load_aligned)(ptr); }
#endif
//! @}
//! @name Wide load lower half from memory
//! @{
//! @brief Load lower half of maximum available capacity register from memory
inline v_uint8 vx_load_low(const uchar * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_int8 vx_load_low(const schar * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_uint16 vx_load_low(const ushort * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_int16 vx_load_low(const short * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_int32 vx_load_low(const int * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_uint32 vx_load_low(const unsigned * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_float32 vx_load_low(const float * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_int64 vx_load_low(const int64 * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_uint64 vx_load_low(const uint64 * ptr) { return VXPREFIX(_load_low)(ptr); }
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
inline v_float64 vx_load_low(const double * ptr) { return VXPREFIX(_load_low)(ptr); }
#endif
//! @}
//! @name Wide load halves from memory
//! @{
//! @brief Load maximum available capacity register contents from two memory blocks
inline v_uint8 vx_load_halves(const uchar * ptr0, const uchar * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_int8 vx_load_halves(const schar * ptr0, const schar * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_uint16 vx_load_halves(const ushort * ptr0, const ushort * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_int16 vx_load_halves(const short * ptr0, const short * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_int32 vx_load_halves(const int * ptr0, const int * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_uint32 vx_load_halves(const unsigned * ptr0, const unsigned * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_float32 vx_load_halves(const float * ptr0, const float * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_int64 vx_load_halves(const int64 * ptr0, const int64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_uint64 vx_load_halves(const uint64 * ptr0, const uint64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
inline v_float64 vx_load_halves(const double * ptr0, const double * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
#endif
//! @}
//! @name Wide LUT of elements
//! @{
//! @brief Load maximum available capacity register contents with array elements by provided indexes
inline v_uint8 vx_lut(const uchar * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_int8 vx_lut(const schar * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_uint16 vx_lut(const ushort * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_int16 vx_lut(const short* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_int32 vx_lut(const int* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_uint32 vx_lut(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_float32 vx_lut(const float* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_int64 vx_lut(const int64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_uint64 vx_lut(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
inline v_float64 vx_lut(const double* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
#endif
//! @}
//! @name Wide LUT of element pairs
//! @{
//! @brief Load maximum available capacity register contents with array element pairs by provided indexes
inline v_uint8 vx_lut_pairs(const uchar * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_int8 vx_lut_pairs(const schar * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_uint16 vx_lut_pairs(const ushort * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_int16 vx_lut_pairs(const short* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_int32 vx_lut_pairs(const int* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_uint32 vx_lut_pairs(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_float32 vx_lut_pairs(const float* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_int64 vx_lut_pairs(const int64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_uint64 vx_lut_pairs(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
inline v_float64 vx_lut_pairs(const double* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
#endif
//! @}
//! @name Wide LUT of element quads
//! @{
//! @brief Load maximum available capacity register contents with array element quads by provided indexes
inline v_uint8 vx_lut_quads(const uchar* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_int8 vx_lut_quads(const schar* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_uint16 vx_lut_quads(const ushort* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_int16 vx_lut_quads(const short* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_int32 vx_lut_quads(const int* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_uint32 vx_lut_quads(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_float32 vx_lut_quads(const float* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
//! @}
//! @name Wide load with double expansion
//! @{
//! @brief Load maximum available capacity register contents from memory with double expand
inline v_uint16 vx_load_expand(const uchar * ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_int16 vx_load_expand(const schar * ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_uint32 vx_load_expand(const ushort * ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_int32 vx_load_expand(const short* ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_int64 vx_load_expand(const int* ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_uint64 vx_load_expand(const unsigned* ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_float32 vx_load_expand(const hfloat * ptr) { return VXPREFIX(_load_expand)(ptr); }
//! @}
//! @name Wide load with quad expansion
//! @{
//! @brief Load maximum available capacity register contents from memory with quad expand
inline v_uint32 vx_load_expand_q(const uchar * ptr) { return VXPREFIX(_load_expand_q)(ptr); }
inline v_int32 vx_load_expand_q(const schar * ptr) { return VXPREFIX(_load_expand_q)(ptr); }
//! @}
/** @brief SIMD processing state cleanup call */
inline void vx_cleanup() { VXPREFIX(_cleanup)(); }
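// Minimal usage sketch (illustration only, fixed-width backends): a
// width-agnostic loop built from the vx_* wrappers above together with
// v_add/v_store from the backend headers; n is assumed to be a multiple of
// the lane count (a scalable backend would query VTraits instead).
static inline void addScalarExample(const float* src, float* dst, int n, float s)
{
    const v_float32 vs = vx_setall_f32(s);
    const int lanes = CV_SIMD_WIDTH / (int)sizeof(float);  // f32 lanes per register
    for (int i = 0; i < n; i += lanes)
        v_store(dst + i, v_add(vx_load(src + i), vs));
    vx_cleanup();
}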
#if !CV_SIMD_SCALABLE
// Compatibility layer
#if !(CV_NEON && !defined(CV_FORCE_SIMD128_CPP))
template<typename T> struct VTraits {
static inline int vlanes() { return T::nlanes; }
enum { nlanes = T::nlanes, max_nlanes = T::nlanes };
using lane_type = typename T::lane_type;
};
//////////// get0 ////////////
#define OPENCV_HAL_WRAP_GRT0(_Tpvec) \
inline typename VTraits<_Tpvec>::lane_type v_get0(const _Tpvec& v) \
{ \
return v.get0(); \
}
OPENCV_HAL_WRAP_GRT0(v_uint8)
OPENCV_HAL_WRAP_GRT0(v_int8)
OPENCV_HAL_WRAP_GRT0(v_uint16)
OPENCV_HAL_WRAP_GRT0(v_int16)
OPENCV_HAL_WRAP_GRT0(v_uint32)
OPENCV_HAL_WRAP_GRT0(v_int32)
OPENCV_HAL_WRAP_GRT0(v_uint64)
OPENCV_HAL_WRAP_GRT0(v_int64)
OPENCV_HAL_WRAP_GRT0(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_GRT0(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_GRT0(v_uint8x16)
OPENCV_HAL_WRAP_GRT0(v_uint16x8)
OPENCV_HAL_WRAP_GRT0(v_uint32x4)
OPENCV_HAL_WRAP_GRT0(v_uint64x2)
OPENCV_HAL_WRAP_GRT0(v_int8x16)
OPENCV_HAL_WRAP_GRT0(v_int16x8)
OPENCV_HAL_WRAP_GRT0(v_int32x4)
OPENCV_HAL_WRAP_GRT0(v_int64x2)
OPENCV_HAL_WRAP_GRT0(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_GRT0(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_GRT0(v_uint8x32)
OPENCV_HAL_WRAP_GRT0(v_uint16x16)
OPENCV_HAL_WRAP_GRT0(v_uint32x8)
OPENCV_HAL_WRAP_GRT0(v_uint64x4)
OPENCV_HAL_WRAP_GRT0(v_int8x32)
OPENCV_HAL_WRAP_GRT0(v_int16x16)
OPENCV_HAL_WRAP_GRT0(v_int32x8)
OPENCV_HAL_WRAP_GRT0(v_int64x4)
OPENCV_HAL_WRAP_GRT0(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_GRT0(v_float64x4)
#endif
#endif
#endif
#define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \
template<typename... Args> \
inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const _Tpvec& f3, const Args&... vf) { \
return v_add(v_add(f1, f2), f3, vf...); \
}
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
// when we use CV_SIMD128 with 256/512 bit SIMD (e.g. AVX2 or AVX512)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x2)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
// when we use CV_SIMD256 with 512 bit SIMD (e.g. AVX512)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4)
#endif
#endif
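/* Illustrative sketch (editorial addition): the variadic wrapper folds any
   number of operands left to right, so several partial accumulators collapse
   in one statement (a, b, c, d assumed to be v_float32):

     v_float32 total = v_add(a, b, c, d);      // ((a + b) + c) + d
*/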
#define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
template<typename... Args> \
inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const _Tpvec& f3, const Args&... vf) { \
return v_mul(v_mul(f1, f2), f3, vf...); \
}
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x4)
#endif
#endif
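/* Illustrative sketch (editorial addition): the same left-to-right folding
   applies to multiplication, e.g. composing two per-pixel gains in one call:

     v_float32 out = v_mul(px, gain, flat);    // (px * gain) * flat
*/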
#define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \
inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \
{ \
return v_extract_n<VTraits<_Tpvec>::nlanes-1>(v); \
}
OPENCV_HAL_WRAP_EXTRACT(v_uint8)
OPENCV_HAL_WRAP_EXTRACT(v_int8)
OPENCV_HAL_WRAP_EXTRACT(v_uint16)
OPENCV_HAL_WRAP_EXTRACT(v_int16)
OPENCV_HAL_WRAP_EXTRACT(v_uint32)
OPENCV_HAL_WRAP_EXTRACT(v_int32)
OPENCV_HAL_WRAP_EXTRACT(v_uint64)
OPENCV_HAL_WRAP_EXTRACT(v_int64)
OPENCV_HAL_WRAP_EXTRACT(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_EXTRACT(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_EXTRACT(v_uint8x16)
OPENCV_HAL_WRAP_EXTRACT(v_uint16x8)
OPENCV_HAL_WRAP_EXTRACT(v_uint32x4)
OPENCV_HAL_WRAP_EXTRACT(v_uint64x2)
OPENCV_HAL_WRAP_EXTRACT(v_int8x16)
OPENCV_HAL_WRAP_EXTRACT(v_int16x8)
OPENCV_HAL_WRAP_EXTRACT(v_int32x4)
OPENCV_HAL_WRAP_EXTRACT(v_int64x2)
OPENCV_HAL_WRAP_EXTRACT(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_EXTRACT(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_EXTRACT(v_uint8x32)
OPENCV_HAL_WRAP_EXTRACT(v_uint16x16)
OPENCV_HAL_WRAP_EXTRACT(v_uint32x8)
OPENCV_HAL_WRAP_EXTRACT(v_uint64x4)
OPENCV_HAL_WRAP_EXTRACT(v_int8x32)
OPENCV_HAL_WRAP_EXTRACT(v_int16x16)
OPENCV_HAL_WRAP_EXTRACT(v_int32x8)
OPENCV_HAL_WRAP_EXTRACT(v_int64x4)
OPENCV_HAL_WRAP_EXTRACT(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_EXTRACT(v_float64x4)
#endif
#endif
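/* Illustrative sketch (editorial addition): v_extract_highest reads the last
   lane, e.g. the running total after an inclusive prefix sum held in acc:

     float total = v_extract_highest(acc);     // same as v_extract_n<nlanes-1>(acc)
*/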
#define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \
inline _Tpvec v_broadcast_highest(const _Tpvec& v) \
{ \
return v_broadcast_element<VTraits<_Tpvec>::nlanes-1>(v); \
}
OPENCV_HAL_WRAP_BROADCAST(v_uint32)
OPENCV_HAL_WRAP_BROADCAST(v_int32)
OPENCV_HAL_WRAP_BROADCAST(v_float32)
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BROADCAST(v_uint32x4)
OPENCV_HAL_WRAP_BROADCAST(v_int32x4)
OPENCV_HAL_WRAP_BROADCAST(v_float32x4)
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BROADCAST(v_uint32x8)
OPENCV_HAL_WRAP_BROADCAST(v_int32x8)
OPENCV_HAL_WRAP_BROADCAST(v_float32x8)
#endif
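/* Illustrative sketch (editorial addition): v_broadcast_highest splats the
   last lane to every lane, handy for normalizing by a final cumulative value:

     v_float32 denom = v_broadcast_highest(acc);
     v_float32 norm  = v_div(vals, denom);
*/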
#endif //!CV_SIMD_SCALABLE
//! @cond IGNORED
// backward compatibility
template<typename _Tp, typename _Tvec> static inline
void vx_store(_Tp* dst, const _Tvec& v) { return v_store(dst, v); }
// backward compatibility
template<typename _Tp, typename _Tvec> static inline
void vx_store_aligned(_Tp* dst, const _Tvec& v) { return v_store_aligned(dst, v); }
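/* Illustrative sketch (editorial addition): the vx_store wrappers pair with
   vx_load for simple transform loops over the widest enabled register:

     v_float32 v = vx_load(src + i);
     vx_store(dst + i, v_mul(v, scale));
*/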
//! @endcond
//! @}
#undef VXPREFIX
} // namespace
#ifndef CV_SIMD_FP16
#define CV_SIMD_FP16 0 //!< Defined to 1 on native support of operations with float16x8_t / float16x16_t (SIMD256) types
#endif
#ifndef CV_SIMD
#define CV_SIMD 0
#endif
#include "simd_utils.impl.hpp"
#ifndef CV_DOXYGEN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
#endif
} // cv::
//! @endcond
#if defined(__GNUC__) && __GNUC__ == 12
#pragma GCC diagnostic pop
#endif
#endif

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,191 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#ifndef CV__SIMD_FORWARD
#error "Need to pre-define forward width"
#endif
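/* Illustrative note (editorial addition): this header is consumed with the
   width macro pre-defined by a backend implementation, presumably like:

     #define CV__SIMD_FORWARD 256
     #include "opencv2/core/hal/intrin_forward.hpp"   // forward-declares v256_* types
*/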
namespace cv
{
//! @cond IGNORED
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
/** Types **/
#if CV__SIMD_FORWARD == 1024
// [todo] 1024
#error "1024-long ops not implemented yet"
#elif CV__SIMD_FORWARD == 512
// 512
#define __CV_VX(fun) v512_##fun
#define __CV_V_UINT8 v_uint8x64
#define __CV_V_INT8 v_int8x64
#define __CV_V_UINT16 v_uint16x32
#define __CV_V_INT16 v_int16x32
#define __CV_V_UINT32 v_uint32x16
#define __CV_V_INT32 v_int32x16
#define __CV_V_UINT64 v_uint64x8
#define __CV_V_INT64 v_int64x8
#define __CV_V_FLOAT32 v_float32x16
#define __CV_V_FLOAT64 v_float64x8
struct v_uint8x64;
struct v_int8x64;
struct v_uint16x32;
struct v_int16x32;
struct v_uint32x16;
struct v_int32x16;
struct v_uint64x8;
struct v_int64x8;
struct v_float32x16;
struct v_float64x8;
#elif CV__SIMD_FORWARD == 256
// 256
#define __CV_VX(fun) v256_##fun
#define __CV_V_UINT8 v_uint8x32
#define __CV_V_INT8 v_int8x32
#define __CV_V_UINT16 v_uint16x16
#define __CV_V_INT16 v_int16x16
#define __CV_V_UINT32 v_uint32x8
#define __CV_V_INT32 v_int32x8
#define __CV_V_UINT64 v_uint64x4
#define __CV_V_INT64 v_int64x4
#define __CV_V_FLOAT32 v_float32x8
#define __CV_V_FLOAT64 v_float64x4
struct v_uint8x32;
struct v_int8x32;
struct v_uint16x16;
struct v_int16x16;
struct v_uint32x8;
struct v_int32x8;
struct v_uint64x4;
struct v_int64x4;
struct v_float32x8;
struct v_float64x4;
#else
// 128
#define __CV_VX(fun) v_##fun
#define __CV_V_UINT8 v_uint8x16
#define __CV_V_INT8 v_int8x16
#define __CV_V_UINT16 v_uint16x8
#define __CV_V_INT16 v_int16x8
#define __CV_V_UINT32 v_uint32x4
#define __CV_V_INT32 v_int32x4
#define __CV_V_UINT64 v_uint64x2
#define __CV_V_INT64 v_int64x2
#define __CV_V_FLOAT32 v_float32x4
#define __CV_V_FLOAT64 v_float64x2
struct v_uint8x16;
struct v_int8x16;
struct v_uint16x8;
struct v_int16x8;
struct v_uint32x4;
struct v_int32x4;
struct v_uint64x2;
struct v_int64x2;
struct v_float32x4;
struct v_float64x2;
#endif
/** Value reordering **/
// Expansion
void v_expand(const __CV_V_UINT8&, __CV_V_UINT16&, __CV_V_UINT16&);
void v_expand(const __CV_V_INT8&, __CV_V_INT16&, __CV_V_INT16&);
void v_expand(const __CV_V_UINT16&, __CV_V_UINT32&, __CV_V_UINT32&);
void v_expand(const __CV_V_INT16&, __CV_V_INT32&, __CV_V_INT32&);
void v_expand(const __CV_V_UINT32&, __CV_V_UINT64&, __CV_V_UINT64&);
void v_expand(const __CV_V_INT32&, __CV_V_INT64&, __CV_V_INT64&);
// Low Expansion
__CV_V_UINT16 v_expand_low(const __CV_V_UINT8&);
__CV_V_INT16 v_expand_low(const __CV_V_INT8&);
__CV_V_UINT32 v_expand_low(const __CV_V_UINT16&);
__CV_V_INT32 v_expand_low(const __CV_V_INT16&);
__CV_V_UINT64 v_expand_low(const __CV_V_UINT32&);
__CV_V_INT64 v_expand_low(const __CV_V_INT32&);
// High Expansion
__CV_V_UINT16 v_expand_high(const __CV_V_UINT8&);
__CV_V_INT16 v_expand_high(const __CV_V_INT8&);
__CV_V_UINT32 v_expand_high(const __CV_V_UINT16&);
__CV_V_INT32 v_expand_high(const __CV_V_INT16&);
__CV_V_UINT64 v_expand_high(const __CV_V_UINT32&);
__CV_V_INT64 v_expand_high(const __CV_V_INT32&);
// Load & Low Expansion
__CV_V_UINT16 __CV_VX(load_expand)(const uchar*);
__CV_V_INT16 __CV_VX(load_expand)(const schar*);
__CV_V_UINT32 __CV_VX(load_expand)(const ushort*);
__CV_V_INT32 __CV_VX(load_expand)(const short*);
__CV_V_UINT64 __CV_VX(load_expand)(const uint*);
__CV_V_INT64 __CV_VX(load_expand)(const int*);
// Load lower 8-bit and expand into 32-bit
__CV_V_UINT32 __CV_VX(load_expand_q)(const uchar*);
__CV_V_INT32 __CV_VX(load_expand_q)(const schar*);
// Saturating Pack
__CV_V_UINT8 v_pack(const __CV_V_UINT16&, const __CV_V_UINT16&);
__CV_V_INT8 v_pack(const __CV_V_INT16&, const __CV_V_INT16&);
__CV_V_UINT16 v_pack(const __CV_V_UINT32&, const __CV_V_UINT32&);
__CV_V_INT16 v_pack(const __CV_V_INT32&, const __CV_V_INT32&);
// Non-saturating Pack
__CV_V_UINT32 v_pack(const __CV_V_UINT64&, const __CV_V_UINT64&);
__CV_V_INT32 v_pack(const __CV_V_INT64&, const __CV_V_INT64&);
// Pack signed integers with unsigned saturation
__CV_V_UINT8 v_pack_u(const __CV_V_INT16&, const __CV_V_INT16&);
__CV_V_UINT16 v_pack_u(const __CV_V_INT32&, const __CV_V_INT32&);
/** Arithmetic, bitwise and comparison operations **/
// Non-saturating multiply
#if CV_VSX
template<typename Tvec>
Tvec v_mul_wrap(const Tvec& a, const Tvec& b);
#else
__CV_V_UINT8 v_mul_wrap(const __CV_V_UINT8&, const __CV_V_UINT8&);
__CV_V_INT8 v_mul_wrap(const __CV_V_INT8&, const __CV_V_INT8&);
__CV_V_UINT16 v_mul_wrap(const __CV_V_UINT16&, const __CV_V_UINT16&);
__CV_V_INT16 v_mul_wrap(const __CV_V_INT16&, const __CV_V_INT16&);
#endif
// Multiply and expand
#if CV_VSX
template<typename Tvec, typename Twvec>
void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d);
#else
void v_mul_expand(const __CV_V_UINT8&, const __CV_V_UINT8&, __CV_V_UINT16&, __CV_V_UINT16&);
void v_mul_expand(const __CV_V_INT8&, const __CV_V_INT8&, __CV_V_INT16&, __CV_V_INT16&);
void v_mul_expand(const __CV_V_UINT16&, const __CV_V_UINT16&, __CV_V_UINT32&, __CV_V_UINT32&);
void v_mul_expand(const __CV_V_INT16&, const __CV_V_INT16&, __CV_V_INT32&, __CV_V_INT32&);
void v_mul_expand(const __CV_V_UINT32&, const __CV_V_UINT32&, __CV_V_UINT64&, __CV_V_UINT64&);
void v_mul_expand(const __CV_V_INT32&, const __CV_V_INT32&, __CV_V_INT64&, __CV_V_INT64&);
#endif
// Conversions
__CV_V_FLOAT32 v_cvt_f32(const __CV_V_INT32& a);
__CV_V_FLOAT32 v_cvt_f32(const __CV_V_FLOAT64& a);
__CV_V_FLOAT32 v_cvt_f32(const __CV_V_FLOAT64& a, const __CV_V_FLOAT64& b);
__CV_V_FLOAT64 v_cvt_f64(const __CV_V_INT32& a);
__CV_V_FLOAT64 v_cvt_f64_high(const __CV_V_INT32& a);
__CV_V_FLOAT64 v_cvt_f64(const __CV_V_FLOAT32& a);
__CV_V_FLOAT64 v_cvt_f64_high(const __CV_V_FLOAT32& a);
__CV_V_FLOAT64 v_cvt_f64(const __CV_V_INT64& a);
/** Cleanup **/
#undef CV__SIMD_FORWARD
#undef __CV_VX
#undef __CV_V_UINT8
#undef __CV_V_INT8
#undef __CV_V_UINT16
#undef __CV_V_INT16
#undef __CV_V_UINT32
#undef __CV_V_INT32
#undef __CV_V_UINT64
#undef __CV_V_INT64
#undef __CV_V_FLOAT32
#undef __CV_V_FLOAT64
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond
} // cv::

File diff suppressed because it is too large

@@ -0,0 +1,111 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
// This file has been created for compatibility with older versions of Universal Intrinsics
// Binary operators for vector types have been removed since version 4.11
// Include this file manually after OpenCV headers if you need these operators
#ifndef OPENCV_HAL_INTRIN_LEGACY_OPS_HPP
#define OPENCV_HAL_INTRIN_LEGACY_OPS_HPP
#ifdef __OPENCV_BUILD
#error "Universal Intrinsics operators are deprecated and should not be used in OpenCV library"
#endif
#ifdef __riscv
#warning "Operators might conflict with built-in functions on RISC-V platform"
#endif
#if defined(CV_VERSION) && CV_VERSION_MAJOR == 4 && CV_VERSION_MINOR < 9
#warning "Older versions of OpenCV (<4.9) already have Universal Intrinscs operators"
#endif
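/* Illustrative usage (editorial addition): include this header after the core
   intrinsics so the operator templates can see the vector types:

     #include <opencv2/core/hal/intrin.hpp>
     #include <opencv2/core/hal/intrin_legacy_ops.hpp>

   after which expressions such as a + b on v_float32 compile again. */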
namespace cv { namespace hal {
#define BIN_OP(OP, FUN) \
template <typename R> R operator OP (const R & lhs, const R & rhs) { return FUN(lhs, rhs); }
#define BIN_A_OP(OP, FUN) \
template <typename R> R & operator OP (R & res, const R & val) { res = FUN(res, val); return res; }
#define UN_OP(OP, FUN) \
template <typename R> R operator OP (const R & val) { return FUN(val); }
BIN_OP(+, v_add)
BIN_OP(-, v_sub)
BIN_OP(*, v_mul)
BIN_OP(/, v_div)
BIN_OP(&, v_and)
BIN_OP(|, v_or)
BIN_OP(^, v_xor)
BIN_OP(==, v_eq)
BIN_OP(!=, v_ne)
BIN_OP(<, v_lt)
BIN_OP(>, v_gt)
BIN_OP(<=, v_le)
BIN_OP(>=, v_ge)
BIN_A_OP(+=, v_add)
BIN_A_OP(-=, v_sub)
BIN_A_OP(*=, v_mul)
BIN_A_OP(/=, v_div)
BIN_A_OP(&=, v_and)
BIN_A_OP(|=, v_or)
BIN_A_OP(^=, v_xor)
UN_OP(~, v_not)
// TODO: shift operators?
}} // cv::hal::
//==============================================================================
#ifdef OPENCV_ENABLE_INLINE_INTRIN_OPERATOR_TEST
namespace cv { namespace hal {
inline static void opencv_operator_compile_test()
{
using namespace cv;
v_float32 a, b, c;
uint8_t shift = 1;
a = b + c;
a = b - c;
a = b * c;
a = b / c;
a = b & c;
a = b | c;
a = b ^ c;
// a = b >> shift;
// a = b << shift;
a = (b == c);
a = (b != c);
a = (b < c);
a = (b > c);
a = (b <= c);
a = (b >= c);
a += b;
a -= b;
a *= b;
a /= b;
a &= b;
a |= b;
a ^= b;
// a <<= shift;
// a >>= shift;
a = ~b;
}
}} // cv::hal::
#endif
#endif // OPENCV_HAL_INTRIN_LEGACY_OPS_HPP

File diff suppressed because it is too large

@@ -0,0 +1,687 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
/* Universal Intrinsics implementation of sin, cos, exp and log
Inspired by Intel Approximate Math library, and based on the
corresponding algorithms of the cephes math library
*/
/* Copyright (C) 2010,2011 RJVB - extensions */
/* Copyright (C) 2011 Julien Pommier
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
(this is the zlib license)
*/
#ifndef OPENCV_HAL_INTRIN_MATH_HPP
#define OPENCV_HAL_INTRIN_MATH_HPP
//! @name Exponential
//! @{
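/* Editorial note: all variants below follow the classic Cephes scheme
     exp(x) = 2^m * exp(r),   m = round(x * log2(e)),   r = x - m * ln(2),
   where ln(2) is subtracted in two parts (C1, C2) for extra precision,
   exp(r) is approximated by a polynomial (a rational, for the f64 variant),
   and 2^m is built by adding the exponent bias (0xf / 0x7f / 0x3ff) to m and
   shifting it into the exponent field (by 10 / 23 / 52 bits). */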
// Implementation is the same as float32 vector.
template<typename _TpVec16F, typename _TpVec16S>
inline _TpVec16F v_exp_default_16f(const _TpVec16F &x) {
const _TpVec16F _vexp_lo_f16 = v_setall_<_TpVec16F>(-10.7421875f);
const _TpVec16F _vexp_hi_f16 = v_setall_<_TpVec16F>(11.f);
const _TpVec16F _vexp_half_fp16 = v_setall_<_TpVec16F>(0.5f);
const _TpVec16F _vexp_one_fp16 = v_setall_<_TpVec16F>(1.f);
const _TpVec16F _vexp_LOG2EF_f16 = v_setall_<_TpVec16F>(1.44269504088896341f);
const _TpVec16F _vexp_C1_f16 = v_setall_<_TpVec16F>(-6.93359375E-1f);
const _TpVec16F _vexp_C2_f16 = v_setall_<_TpVec16F>(2.12194440E-4f);
const _TpVec16F _vexp_p0_f16 = v_setall_<_TpVec16F>(1.9875691500E-4f);
const _TpVec16F _vexp_p1_f16 = v_setall_<_TpVec16F>(1.3981999507E-3f);
const _TpVec16F _vexp_p2_f16 = v_setall_<_TpVec16F>(8.3334519073E-3f);
const _TpVec16F _vexp_p3_f16 = v_setall_<_TpVec16F>(4.1665795894E-2f);
const _TpVec16F _vexp_p4_f16 = v_setall_<_TpVec16F>(1.6666665459E-1f);
const _TpVec16F _vexp_p5_f16 = v_setall_<_TpVec16F>(5.0000001201E-1f);
_TpVec16F _vexp_, _vexp_x, _vexp_y, _vexp_xx;
_TpVec16S _vexp_mm;
const _TpVec16S _vexp_bias_s16 = v_setall_<_TpVec16S>((short)0xf);
// compute exponential of x
_vexp_x = v_max(x, _vexp_lo_f16);
_vexp_x = v_min(_vexp_x, _vexp_hi_f16);
_vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f16, _vexp_half_fp16);
_vexp_mm = v_floor(_vexp_);
_vexp_ = v_cvt_f16(_vexp_mm);
_vexp_mm = v_add(_vexp_mm, _vexp_bias_s16);
_vexp_mm = v_shl(_vexp_mm, 10);
_vexp_x = v_fma(_vexp_, _vexp_C1_f16, _vexp_x);
_vexp_x = v_fma(_vexp_, _vexp_C2_f16, _vexp_x);
_vexp_xx = v_mul(_vexp_x, _vexp_x);
_vexp_y = v_fma(_vexp_x, _vexp_p0_f16, _vexp_p1_f16);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f16);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f16);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f16);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f16);
_vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x);
_vexp_y = v_add(_vexp_y, _vexp_one_fp16);
_vexp_y = v_mul(_vexp_y, v_reinterpret_as_f16(_vexp_mm));
// exp(NAN) -> NAN
_TpVec16F mask_not_nan = v_not_nan(x);
return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00)));
}
template<typename _TpVec32F, typename _TpVec32S>
inline _TpVec32F v_exp_default_32f(const _TpVec32F &x) {
const _TpVec32F _vexp_lo_f32 = v_setall_<_TpVec32F>(-88.3762626647949f);
const _TpVec32F _vexp_hi_f32 = v_setall_<_TpVec32F>(89.f);
const _TpVec32F _vexp_half_fp32 = v_setall_<_TpVec32F>(0.5f);
const _TpVec32F _vexp_one_fp32 = v_setall_<_TpVec32F>(1.f);
const _TpVec32F _vexp_LOG2EF_f32 = v_setall_<_TpVec32F>(1.44269504088896341f);
const _TpVec32F _vexp_C1_f32 = v_setall_<_TpVec32F>(-6.93359375E-1f);
const _TpVec32F _vexp_C2_f32 = v_setall_<_TpVec32F>(2.12194440E-4f);
const _TpVec32F _vexp_p0_f32 = v_setall_<_TpVec32F>(1.9875691500E-4f);
const _TpVec32F _vexp_p1_f32 = v_setall_<_TpVec32F>(1.3981999507E-3f);
const _TpVec32F _vexp_p2_f32 = v_setall_<_TpVec32F>(8.3334519073E-3f);
const _TpVec32F _vexp_p3_f32 = v_setall_<_TpVec32F>(4.1665795894E-2f);
const _TpVec32F _vexp_p4_f32 = v_setall_<_TpVec32F>(1.6666665459E-1f);
const _TpVec32F _vexp_p5_f32 = v_setall_<_TpVec32F>(5.0000001201E-1f);
_TpVec32F _vexp_, _vexp_x, _vexp_y, _vexp_xx;
_TpVec32S _vexp_mm;
const _TpVec32S _vexp_bias_s32 = v_setall_<_TpVec32S>((int)0x7f);
// compute exponential of x
_vexp_x = v_max(x, _vexp_lo_f32);
_vexp_x = v_min(_vexp_x, _vexp_hi_f32);
_vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f32, _vexp_half_fp32);
_vexp_mm = v_floor(_vexp_);
_vexp_ = v_cvt_f32(_vexp_mm);
_vexp_mm = v_add(_vexp_mm, _vexp_bias_s32);
_vexp_mm = v_shl(_vexp_mm, 23);
_vexp_x = v_fma(_vexp_, _vexp_C1_f32, _vexp_x);
_vexp_x = v_fma(_vexp_, _vexp_C2_f32, _vexp_x);
_vexp_xx = v_mul(_vexp_x, _vexp_x);
_vexp_y = v_fma(_vexp_x, _vexp_p0_f32, _vexp_p1_f32);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f32);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f32);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f32);
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f32);
_vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x);
_vexp_y = v_add(_vexp_y, _vexp_one_fp32);
_vexp_y = v_mul(_vexp_y, v_reinterpret_as_f32(_vexp_mm));
// exp(NAN) -> NAN
_TpVec32F mask_not_nan = v_not_nan(x);
return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000)));
}
template<typename _TpVec64F, typename _TpVec64S>
inline _TpVec64F v_exp_default_64f(const _TpVec64F &x) {
const _TpVec64F _vexp_lo_f64 = v_setall_<_TpVec64F>(-709.43613930310391424428);
const _TpVec64F _vexp_hi_f64 = v_setall_<_TpVec64F>(710.);
const _TpVec64F _vexp_half_f64 = v_setall_<_TpVec64F>(0.5);
const _TpVec64F _vexp_one_f64 = v_setall_<_TpVec64F>(1.0);
const _TpVec64F _vexp_two_f64 = v_setall_<_TpVec64F>(2.0);
const _TpVec64F _vexp_LOG2EF_f64 = v_setall_<_TpVec64F>(1.44269504088896340736);
const _TpVec64F _vexp_C1_f64 = v_setall_<_TpVec64F>(-6.93145751953125E-1);
const _TpVec64F _vexp_C2_f64 = v_setall_<_TpVec64F>(-1.42860682030941723212E-6);
const _TpVec64F _vexp_p0_f64 = v_setall_<_TpVec64F>(1.26177193074810590878E-4);
const _TpVec64F _vexp_p1_f64 = v_setall_<_TpVec64F>(3.02994407707441961300E-2);
const _TpVec64F _vexp_p2_f64 = v_setall_<_TpVec64F>(9.99999999999999999910E-1);
const _TpVec64F _vexp_q0_f64 = v_setall_<_TpVec64F>(3.00198505138664455042E-6);
const _TpVec64F _vexp_q1_f64 = v_setall_<_TpVec64F>(2.52448340349684104192E-3);
const _TpVec64F _vexp_q2_f64 = v_setall_<_TpVec64F>(2.27265548208155028766E-1);
const _TpVec64F _vexp_q3_f64 = v_setall_<_TpVec64F>(2.00000000000000000009E0);
_TpVec64F _vexp_, _vexp_x, _vexp_y, _vexp_z, _vexp_xx;
_TpVec64S _vexp_mm;
const _TpVec64S _vexp_bias_s64 = v_setall_<_TpVec64S>((int64)0x3ff);
// compute exponential of x
_vexp_x = v_max(x, _vexp_lo_f64);
_vexp_x = v_min(_vexp_x, _vexp_hi_f64);
_vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f64, _vexp_half_f64);
_vexp_mm = v_expand_low(v_floor(_vexp_));
_vexp_ = v_cvt_f64(_vexp_mm);
_vexp_mm = v_add(_vexp_mm, _vexp_bias_s64);
_vexp_mm = v_shl(_vexp_mm, 52);
_vexp_x = v_fma(_vexp_, _vexp_C1_f64, _vexp_x);
_vexp_x = v_fma(_vexp_, _vexp_C2_f64, _vexp_x);
_vexp_xx = v_mul(_vexp_x, _vexp_x);
_vexp_y = v_fma(_vexp_xx, _vexp_p0_f64, _vexp_p1_f64);
_vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_p2_f64);
_vexp_y = v_mul(_vexp_y, _vexp_x);
_vexp_z = v_fma(_vexp_xx, _vexp_q0_f64, _vexp_q1_f64);
_vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q2_f64);
_vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q3_f64);
_vexp_z = v_div(_vexp_y, v_sub(_vexp_z, _vexp_y));
_vexp_z = v_fma(_vexp_two_f64, _vexp_z, _vexp_one_f64);
_vexp_z = v_mul(_vexp_z, v_reinterpret_as_f64(_vexp_mm));
// exp(NAN) -> NAN
_TpVec64F mask_not_nan = v_not_nan(x);
return v_select(mask_not_nan, _vexp_z, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7FF8000000000000)));
}
//! @}
//! @name Natural Logarithm
//! @{
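/* Editorial note: the bit-level split x = m * 2^e (with m forced into
   [0.5, 1) by masking the exponent field) gives log(x) = log(m) + e*ln(2).
   When m < sqrt(1/2), m is doubled and e decremented to centre the
   polynomial, and ln(2) is again applied in two parts for accuracy. */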
template<typename _TpVec16F, typename _TpVec16S>
inline _TpVec16F v_log_default_16f(const _TpVec16F &x) {
const _TpVec16F _vlog_one_fp16 = v_setall_<_TpVec16F>(1.0f);
const _TpVec16F _vlog_SQRTHF_fp16 = v_setall_<_TpVec16F>(0.707106781186547524f);
const _TpVec16F _vlog_q1_fp16 = v_setall_<_TpVec16F>(-2.12194440E-4f);
const _TpVec16F _vlog_q2_fp16 = v_setall_<_TpVec16F>(0.693359375f);
const _TpVec16F _vlog_p0_fp16 = v_setall_<_TpVec16F>(7.0376836292E-2f);
const _TpVec16F _vlog_p1_fp16 = v_setall_<_TpVec16F>(-1.1514610310E-1f);
const _TpVec16F _vlog_p2_fp16 = v_setall_<_TpVec16F>(1.1676998740E-1f);
const _TpVec16F _vlog_p3_fp16 = v_setall_<_TpVec16F>(-1.2420140846E-1f);
const _TpVec16F _vlog_p4_fp16 = v_setall_<_TpVec16F>(1.4249322787E-1f);
const _TpVec16F _vlog_p5_fp16 = v_setall_<_TpVec16F>(-1.6668057665E-1f);
const _TpVec16F _vlog_p6_fp16 = v_setall_<_TpVec16F>(2.0000714765E-1f);
const _TpVec16F _vlog_p7_fp16 = v_setall_<_TpVec16F>(-2.4999993993E-1f);
const _TpVec16F _vlog_p8_fp16 = v_setall_<_TpVec16F>(3.3333331174E-1f);
_TpVec16F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp;
_TpVec16S _vlog_ux, _vlog_emm0;
const _TpVec16S _vlog_inv_mant_mask_s16 = v_setall_<_TpVec16S>((short)~0x7c00);
_vlog_ux = v_reinterpret_as_s16(x);
_vlog_emm0 = v_shr(_vlog_ux, 10);
_vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s16);
_vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s16(v_setall_<_TpVec16F>(0.5f)));
_vlog_x = v_reinterpret_as_f16(_vlog_ux);
_vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec16S>((short)0xf));
_vlog_e = v_cvt_f16(_vlog_emm0);
_vlog_e = v_add(_vlog_e, _vlog_one_fp16);
_TpVec16F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp16);
_vlog_tmp = v_and(_vlog_x, _vlog_mask);
_vlog_x = v_sub(_vlog_x, _vlog_one_fp16);
_vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp16, _vlog_mask));
_vlog_x = v_add(_vlog_x, _vlog_tmp);
_vlog_z = v_mul(_vlog_x, _vlog_x);
_vlog_y = v_fma(_vlog_p0_fp16, _vlog_x, _vlog_p1_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp16);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp16);
_vlog_y = v_mul(_vlog_y, _vlog_x);
_vlog_y = v_mul(_vlog_y, _vlog_z);
_vlog_y = v_fma(_vlog_e, _vlog_q1_fp16, _vlog_y);
_vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, v_setall_<_TpVec16F>(0.5f)));
_vlog_x = v_add(_vlog_x, _vlog_y);
_vlog_x = v_fma(_vlog_e, _vlog_q2_fp16, _vlog_x);
// log(0) -> -INF
_TpVec16F mask_zero = v_eq(x, v_setzero_<_TpVec16F>());
_vlog_x = v_select(mask_zero, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0xfc00)), _vlog_x);
// log(NEG), log(NAN) -> NAN
_TpVec16F mask_not_nan = v_ge(x, v_setzero_<_TpVec16F>());
_vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00)));
// log(INF) -> INF
_TpVec16F mask_inf = v_eq(x, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7c00)));
_vlog_x = v_select(mask_inf, x, _vlog_x);
return _vlog_x;
}
template<typename _TpVec32F, typename _TpVec32S>
inline _TpVec32F v_log_default_32f(const _TpVec32F &x) {
const _TpVec32F _vlog_one_fp32 = v_setall_<_TpVec32F>(1.0f);
const _TpVec32F _vlog_SQRTHF_fp32 = v_setall_<_TpVec32F>(0.707106781186547524f);
const _TpVec32F _vlog_q1_fp32 = v_setall_<_TpVec32F>(-2.12194440E-4f);
const _TpVec32F _vlog_q2_fp32 = v_setall_<_TpVec32F>(0.693359375f);
const _TpVec32F _vlog_p0_fp32 = v_setall_<_TpVec32F>(7.0376836292E-2f);
const _TpVec32F _vlog_p1_fp32 = v_setall_<_TpVec32F>(-1.1514610310E-1f);
const _TpVec32F _vlog_p2_fp32 = v_setall_<_TpVec32F>(1.1676998740E-1f);
const _TpVec32F _vlog_p3_fp32 = v_setall_<_TpVec32F>(-1.2420140846E-1f);
const _TpVec32F _vlog_p4_fp32 = v_setall_<_TpVec32F>(1.4249322787E-1f);
const _TpVec32F _vlog_p5_fp32 = v_setall_<_TpVec32F>(-1.6668057665E-1f);
const _TpVec32F _vlog_p6_fp32 = v_setall_<_TpVec32F>(2.0000714765E-1f);
const _TpVec32F _vlog_p7_fp32 = v_setall_<_TpVec32F>(-2.4999993993E-1f);
const _TpVec32F _vlog_p8_fp32 = v_setall_<_TpVec32F>(3.3333331174E-1f);
_TpVec32F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp;
_TpVec32S _vlog_ux, _vlog_emm0;
const _TpVec32S _vlog_inv_mant_mask_s32 = v_setall_<_TpVec32S>((int)~0x7f800000);
_vlog_ux = v_reinterpret_as_s32(x);
_vlog_emm0 = v_shr(_vlog_ux, 23);
_vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s32);
_vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s32(v_setall_<_TpVec32F>(0.5f)));
_vlog_x = v_reinterpret_as_f32(_vlog_ux);
_vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec32S>((int)0x7f));
_vlog_e = v_cvt_f32(_vlog_emm0);
_vlog_e = v_add(_vlog_e, _vlog_one_fp32);
_TpVec32F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp32);
_vlog_tmp = v_and(_vlog_x, _vlog_mask);
_vlog_x = v_sub(_vlog_x, _vlog_one_fp32);
_vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp32, _vlog_mask));
_vlog_x = v_add(_vlog_x, _vlog_tmp);
_vlog_z = v_mul(_vlog_x, _vlog_x);
_vlog_y = v_fma(_vlog_p0_fp32, _vlog_x, _vlog_p1_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp32);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp32);
_vlog_y = v_mul(_vlog_y, _vlog_x);
_vlog_y = v_mul(_vlog_y, _vlog_z);
_vlog_y = v_fma(_vlog_e, _vlog_q1_fp32, _vlog_y);
_vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, v_setall_<_TpVec32F>(0.5f)));
_vlog_x = v_add(_vlog_x, _vlog_y);
_vlog_x = v_fma(_vlog_e, _vlog_q2_fp32, _vlog_x);
// log(0) -> -INF
_TpVec32F mask_zero = v_eq(x, v_setzero_<_TpVec32F>());
_vlog_x = v_select(mask_zero, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0xff800000)), _vlog_x);
// log(NEG), log(NAN) -> NAN
_TpVec32F mask_not_nan = v_ge(x, v_setzero_<_TpVec32F>());
_vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000)));
// log(INF) -> INF
_TpVec32F mask_inf = v_eq(x, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7f800000)));
_vlog_x = v_select(mask_inf, x, _vlog_x);
return _vlog_x;
}
template<typename _TpVec64F, typename _TpVec64S>
inline _TpVec64F v_log_default_64f(const _TpVec64F &x) {
const _TpVec64F _vlog_one_fp64 = v_setall_<_TpVec64F>(1.0);
const _TpVec64F _vlog_SQRTHF_fp64 = v_setall_<_TpVec64F>(0.7071067811865475244);
const _TpVec64F _vlog_p0_fp64 = v_setall_<_TpVec64F>(1.01875663804580931796E-4);
const _TpVec64F _vlog_p1_fp64 = v_setall_<_TpVec64F>(4.97494994976747001425E-1);
const _TpVec64F _vlog_p2_fp64 = v_setall_<_TpVec64F>(4.70579119878881725854);
const _TpVec64F _vlog_p3_fp64 = v_setall_<_TpVec64F>(1.44989225341610930846E1);
const _TpVec64F _vlog_p4_fp64 = v_setall_<_TpVec64F>(1.79368678507819816313E1);
const _TpVec64F _vlog_p5_fp64 = v_setall_<_TpVec64F>(7.70838733755885391666);
const _TpVec64F _vlog_q0_fp64 = v_setall_<_TpVec64F>(1.12873587189167450590E1);
const _TpVec64F _vlog_q1_fp64 = v_setall_<_TpVec64F>(4.52279145837532221105E1);
const _TpVec64F _vlog_q2_fp64 = v_setall_<_TpVec64F>(8.29875266912776603211E1);
const _TpVec64F _vlog_q3_fp64 = v_setall_<_TpVec64F>(7.11544750618563894466E1);
const _TpVec64F _vlog_q4_fp64 = v_setall_<_TpVec64F>(2.31251620126765340583E1);
const _TpVec64F _vlog_C0_fp64 = v_setall_<_TpVec64F>(2.121944400546905827679e-4);
const _TpVec64F _vlog_C1_fp64 = v_setall_<_TpVec64F>(0.693359375);
_TpVec64F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp, _vlog_xx;
_TpVec64S _vlog_ux, _vlog_emm0;
const _TpVec64S _vlog_inv_mant_mask_s64 = v_setall_<_TpVec64S>((int64)~0x7ff0000000000000);
_vlog_ux = v_reinterpret_as_s64(x);
_vlog_emm0 = v_shr(_vlog_ux, 52);
_vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s64);
_vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s64(v_setall_<_TpVec64F>(0.5)));
_vlog_x = v_reinterpret_as_f64(_vlog_ux);
_vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec64S>((int64)0x3ff));
_vlog_e = v_cvt_f64(_vlog_emm0);
_vlog_e = v_add(_vlog_e, _vlog_one_fp64);
_TpVec64F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp64);
_vlog_tmp = v_and(_vlog_x, _vlog_mask);
_vlog_x = v_sub(_vlog_x, _vlog_one_fp64);
_vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp64, _vlog_mask));
_vlog_x = v_add(_vlog_x, _vlog_tmp);
_vlog_xx = v_mul(_vlog_x, _vlog_x);
_vlog_y = v_fma(_vlog_p0_fp64, _vlog_x, _vlog_p1_fp64);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp64);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp64);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp64);
_vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp64);
_vlog_y = v_mul(_vlog_y, _vlog_x);
_vlog_y = v_mul(_vlog_y, _vlog_xx);
_vlog_z = v_add(_vlog_x, _vlog_q0_fp64);
_vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q1_fp64);
_vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q2_fp64);
_vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q3_fp64);
_vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q4_fp64);
_vlog_z = v_div(_vlog_y, _vlog_z);
_vlog_z = v_sub(_vlog_z, v_mul(_vlog_e, _vlog_C0_fp64));
_vlog_z = v_sub(_vlog_z, v_mul(_vlog_xx, v_setall_<_TpVec64F>(0.5)));
_vlog_z = v_add(_vlog_z, _vlog_x);
_vlog_z = v_fma(_vlog_e, _vlog_C1_fp64, _vlog_z);
// log(0) -> -INF
_TpVec64F mask_zero = v_eq(x, v_setzero_<_TpVec64F>());
_vlog_z = v_select(mask_zero, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0xfff0000000000000)), _vlog_z);
// log(NEG), log(NAN) -> NAN
_TpVec64F mask_not_nan = v_ge(x, v_setzero_<_TpVec64F>());
_vlog_z = v_select(mask_not_nan, _vlog_z, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff8000000000000)));
// log(INF) -> INF
_TpVec64F mask_inf = v_eq(x, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff0000000000000)));
_vlog_z = v_select(mask_inf, x, _vlog_z);
return _vlog_z;
}
//! @}
//! @name Sine and Cosine
//! @{
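/* Editorial note: Cephes-style argument reduction. j = trunc(|x| * 4/pi) is
   rounded up to an even octant, the reduced argument is formed by subtracting
   j*pi/4 in three extended-precision pieces (DP1..DP3), and per-octant masks
   select between the sine and cosine polynomials and fix the signs. */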
template<typename _TpVec16F, typename _TpVec16S>
inline void v_sincos_default_16f(const _TpVec16F &x, _TpVec16F &ysin, _TpVec16F &ycos) {
const _TpVec16F v_cephes_FOPI = v_setall_<_TpVec16F>(hfloat(1.27323954473516f)); // 4 / M_PI
const _TpVec16F v_minus_DP1 = v_setall_<_TpVec16F>(hfloat(-0.78515625f));
const _TpVec16F v_minus_DP2 = v_setall_<_TpVec16F>(hfloat(-2.4187564849853515625E-4f));
const _TpVec16F v_minus_DP3 = v_setall_<_TpVec16F>(hfloat(-3.77489497744594108E-8f));
const _TpVec16F v_sincof_p0 = v_setall_<_TpVec16F>(hfloat(-1.9515295891E-4f));
const _TpVec16F v_sincof_p1 = v_setall_<_TpVec16F>(hfloat(8.3321608736E-3f));
const _TpVec16F v_sincof_p2 = v_setall_<_TpVec16F>(hfloat(-1.6666654611E-1f));
const _TpVec16F v_coscof_p0 = v_setall_<_TpVec16F>(hfloat(2.443315711809948E-5f));
const _TpVec16F v_coscof_p1 = v_setall_<_TpVec16F>(hfloat(-1.388731625493765E-3f));
const _TpVec16F v_coscof_p2 = v_setall_<_TpVec16F>(hfloat(4.166664568298827E-2f));
const _TpVec16F v_nan = v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00));
const _TpVec16F v_neg_zero = v_setall_<_TpVec16F>(hfloat(-0.f));
_TpVec16F _vx, _vy, sign_mask_sin, sign_mask_cos;
_TpVec16S emm2;
sign_mask_sin = v_lt(x, v_setzero_<_TpVec16F>());
_vx = v_abs(x);
_vy = v_mul(_vx, v_cephes_FOPI);
emm2 = v_trunc(_vy);
emm2 = v_add(emm2, v_setall_<_TpVec16S>((short)1));
emm2 = v_and(emm2, v_setall_<_TpVec16S>((short)~1));
_vy = v_cvt_f16(emm2);
_TpVec16F poly_mask = v_reinterpret_as_f16(v_eq(v_and(emm2, v_setall_<_TpVec16S>((short)2)), v_setall_<_TpVec16S>((short)0)));
_vx = v_fma(_vy, v_minus_DP1, _vx);
_vx = v_fma(_vy, v_minus_DP2, _vx);
_vx = v_fma(_vy, v_minus_DP3, _vx);
sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f16(v_eq(v_and(emm2, v_setall_<_TpVec16S>((short)4)), v_setall_<_TpVec16S>((short)0))));
sign_mask_cos = v_reinterpret_as_f16(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec16S>((short)2)), v_setall_<_TpVec16S>((short)4)), v_setall_<_TpVec16S>((short)0)));
_TpVec16F _vxx = v_mul(_vx, _vx);
_TpVec16F y1, y2;
y1 = v_fma(v_coscof_p0, _vxx, v_coscof_p1);
y1 = v_fma(y1, _vxx, v_coscof_p2);
y1 = v_fma(y1, _vxx, v_setall_<_TpVec16F>(hfloat(-0.5f)));
y1 = v_fma(y1, _vxx, v_setall_<_TpVec16F>(hfloat(1.f)));
y2 = v_fma(v_sincof_p0, _vxx, v_sincof_p1);
y2 = v_fma(y2, _vxx, v_sincof_p2);
y2 = v_mul(y2, _vxx);
y2 = v_fma(y2, _vx, _vx);
ysin = v_select(poly_mask, y2, y1);
ycos = v_select(poly_mask, y1, y2);
ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin));
ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos);
// sincos(NAN) -> NAN, sincos(±INF) -> NAN
_TpVec16F mask_inf = v_eq(_vx, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7c00)));
_TpVec16F mask_nan = v_or(mask_inf, v_ne(x, x));
ysin = v_select(mask_nan, v_nan, ysin);
ycos = v_select(mask_nan, v_nan, ycos);
}
template<typename _TpVec16F, typename _TpVec16S>
inline _TpVec16F v_sin_default_16f(const _TpVec16F &x) {
_TpVec16F ysin, ycos;
v_sincos_default_16f<_TpVec16F, _TpVec16S>(x, ysin, ycos);
return ysin;
}
template<typename _TpVec16F, typename _TpVec16S>
inline _TpVec16F v_cos_default_16f(const _TpVec16F &x) {
_TpVec16F ysin, ycos;
v_sincos_default_16f<_TpVec16F, _TpVec16S>(x, ysin, ycos);
return ycos;
}
template<typename _TpVec32F, typename _TpVec32S>
inline void v_sincos_default_32f(const _TpVec32F &x, _TpVec32F &ysin, _TpVec32F &ycos) {
const _TpVec32F v_cephes_FOPI = v_setall_<_TpVec32F>(1.27323954473516f); // 4 / M_PI
const _TpVec32F v_minus_DP1 = v_setall_<_TpVec32F>(-0.78515625f);
const _TpVec32F v_minus_DP2 = v_setall_<_TpVec32F>(-2.4187564849853515625E-4f);
const _TpVec32F v_minus_DP3 = v_setall_<_TpVec32F>(-3.77489497744594108E-8f);
const _TpVec32F v_sincof_p0 = v_setall_<_TpVec32F>(-1.9515295891E-4f);
const _TpVec32F v_sincof_p1 = v_setall_<_TpVec32F>(8.3321608736E-3f);
const _TpVec32F v_sincof_p2 = v_setall_<_TpVec32F>(-1.6666654611E-1f);
const _TpVec32F v_coscof_p0 = v_setall_<_TpVec32F>(2.443315711809948E-5f);
const _TpVec32F v_coscof_p1 = v_setall_<_TpVec32F>(-1.388731625493765E-3f);
const _TpVec32F v_coscof_p2 = v_setall_<_TpVec32F>(4.166664568298827E-2f);
const _TpVec32F v_nan = v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000));
const _TpVec32F v_neg_zero = v_setall_<_TpVec32F>(-0.f);
_TpVec32F _vx, _vy, sign_mask_sin, sign_mask_cos;
_TpVec32S emm2;
sign_mask_sin = v_lt(x, v_setzero_<_TpVec32F>());
_vx = v_abs(x);
_vy = v_mul(_vx, v_cephes_FOPI);
emm2 = v_trunc(_vy);
emm2 = v_add(emm2, v_setall_<_TpVec32S>(1));
emm2 = v_and(emm2, v_setall_<_TpVec32S>(~1));
_vy = v_cvt_f32(emm2);
_TpVec32F poly_mask = v_reinterpret_as_f32(v_eq(v_and(emm2, v_setall_<_TpVec32S>(2)), v_setall_<_TpVec32S>(0)));
_vx = v_fma(_vy, v_minus_DP1, _vx);
_vx = v_fma(_vy, v_minus_DP2, _vx);
_vx = v_fma(_vy, v_minus_DP3, _vx);
sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f32(v_eq(v_and(emm2, v_setall_<_TpVec32S>(4)), v_setall_<_TpVec32S>(0))));
sign_mask_cos = v_reinterpret_as_f32(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec32S>(2)), v_setall_<_TpVec32S>(4)), v_setall_<_TpVec32S>(0)));
_TpVec32F _vxx = v_mul(_vx, _vx);
_TpVec32F y1, y2;
y1 = v_fma(v_coscof_p0, _vxx, v_coscof_p1);
y1 = v_fma(y1, _vxx, v_coscof_p2);
y1 = v_fma(y1, _vxx, v_setall_<_TpVec32F>(-0.5f));
y1 = v_fma(y1, _vxx, v_setall_<_TpVec32F>(1.f));
y2 = v_fma(v_sincof_p0, _vxx, v_sincof_p1);
y2 = v_fma(y2, _vxx, v_sincof_p2);
y2 = v_mul(y2, _vxx);
y2 = v_fma(y2, _vx, _vx);
ysin = v_select(poly_mask, y2, y1);
ycos = v_select(poly_mask, y1, y2);
ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin));
ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos);
// sincos(NAN) -> NAN, sincos(±INF) -> NAN
_TpVec32F mask_inf = v_eq(_vx, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7f800000)));
_TpVec32F mask_nan = v_or(mask_inf, v_ne(x, x));
ysin = v_select(mask_nan, v_nan, ysin);
ycos = v_select(mask_nan, v_nan, ycos);
}
template<typename _TpVec32F, typename _TpVec32S>
inline _TpVec32F v_sin_default_32f(const _TpVec32F &x) {
_TpVec32F ysin, ycos;
v_sincos_default_32f<_TpVec32F, _TpVec32S>(x, ysin, ycos);
return ysin;
}
template<typename _TpVec32F, typename _TpVec32S>
inline _TpVec32F v_cos_default_32f(const _TpVec32F &x) {
_TpVec32F ysin, ycos;
v_sincos_default_32f<_TpVec32F, _TpVec32S>(x, ysin, ycos);
return ycos;
}
template<typename _TpVec64F, typename _TpVec64S>
inline void v_sincos_default_64f(const _TpVec64F &x, _TpVec64F &ysin, _TpVec64F &ycos) {
const _TpVec64F v_cephes_FOPI = v_setall_<_TpVec64F>(1.2732395447351626861510701069801148); // 4 / M_PI
const _TpVec64F v_minus_DP1 = v_setall_<_TpVec64F>(-7.853981554508209228515625E-1);
const _TpVec64F v_minus_DP2 = v_setall_<_TpVec64F>(-7.94662735614792836714E-9);
const _TpVec64F v_minus_DP3 = v_setall_<_TpVec64F>(-3.06161699786838294307E-17);
const _TpVec64F v_sin_C1 = v_setall_<_TpVec64F>(1.58962301576546568060E-10);
const _TpVec64F v_sin_C2 = v_setall_<_TpVec64F>(-2.50507477628578072866E-8);
const _TpVec64F v_sin_C3 = v_setall_<_TpVec64F>(2.75573136213857245213E-6);
const _TpVec64F v_sin_C4 = v_setall_<_TpVec64F>(-1.98412698295895385996E-4);
const _TpVec64F v_sin_C5 = v_setall_<_TpVec64F>(8.33333333332211858878E-3);
const _TpVec64F v_sin_C6 = v_setall_<_TpVec64F>(-1.66666666666666307295E-1);
const _TpVec64F v_cos_C1 = v_setall_<_TpVec64F>(-1.13585365213876817300E-11);
const _TpVec64F v_cos_C2 = v_setall_<_TpVec64F>(2.08757008419747316778E-9);
const _TpVec64F v_cos_C3 = v_setall_<_TpVec64F>(-2.75573141792967388112E-7);
const _TpVec64F v_cos_C4 = v_setall_<_TpVec64F>(2.48015872888517045348E-5);
const _TpVec64F v_cos_C5 = v_setall_<_TpVec64F>(-1.38888888888730564116E-3);
const _TpVec64F v_cos_C6 = v_setall_<_TpVec64F>(4.16666666666665929218E-2);
const _TpVec64F v_nan = v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff8000000000000));
const _TpVec64F v_neg_zero = v_setall_<_TpVec64F>(-0.0);
_TpVec64F _vx, _vy, sign_mask_sin, sign_mask_cos;
_TpVec64S emm2;
sign_mask_sin = v_lt(x, v_setzero_<_TpVec64F>());
_vx = v_abs(x);
_vy = v_mul(_vx, v_cephes_FOPI);
emm2 = v_expand_low(v_trunc(_vy));
emm2 = v_add(emm2, v_setall_<_TpVec64S>((int64)1));
emm2 = v_and(emm2, v_setall_<_TpVec64S>((int64)~1));
_vy = v_cvt_f64(emm2);
_TpVec64F poly_mask = v_reinterpret_as_f64(v_eq(v_and(emm2, v_setall_<_TpVec64S>((int64)2)), v_setall_<_TpVec64S>((int64)0)));
_vx = v_fma(_vy, v_minus_DP1, _vx);
_vx = v_fma(_vy, v_minus_DP2, _vx);
_vx = v_fma(_vy, v_minus_DP3, _vx);
sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f64(v_eq(v_and(emm2, v_setall_<_TpVec64S>((int64)4)), v_setall_<_TpVec64S>((int64)0))));
sign_mask_cos = v_reinterpret_as_f64(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec64S>((int64)2)), v_setall_<_TpVec64S>((int64)4)), v_setall_<_TpVec64S>((int64)0)));
_TpVec64F _vxx = v_mul(_vx, _vx);
_TpVec64F y1, y2;
y1 = v_fma(v_cos_C1, _vxx, v_cos_C2);
y1 = v_fma(y1, _vxx, v_cos_C3);
y1 = v_fma(y1, _vxx, v_cos_C4);
y1 = v_fma(y1, _vxx, v_cos_C5);
y1 = v_fma(y1, _vxx, v_cos_C6);
y1 = v_fma(y1, _vxx, v_setall_<_TpVec64F>(-0.5));
y1 = v_fma(y1, _vxx, v_setall_<_TpVec64F>(1.0));
y2 = v_fma(v_sin_C1, _vxx, v_sin_C2);
y2 = v_fma(y2, _vxx, v_sin_C3);
y2 = v_fma(y2, _vxx, v_sin_C4);
y2 = v_fma(y2, _vxx, v_sin_C5);
y2 = v_fma(y2, _vxx, v_sin_C6);
y2 = v_mul(y2, _vxx);
y2 = v_fma(y2, _vx, _vx);
ysin = v_select(poly_mask, y2, y1);
ycos = v_select(poly_mask, y1, y2);
ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin));
ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos);
// sincos(NAN) -> NAN, sincos(±INF) -> NAN
_TpVec64F mask_inf = v_eq(_vx, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff0000000000000)));
_TpVec64F mask_nan = v_or(mask_inf, v_ne(x, x));
ysin = v_select(mask_nan, v_nan, ysin);
ycos = v_select(mask_nan, v_nan, ycos);
}
template<typename _TpVec64F, typename _TpVec64S>
inline _TpVec64F v_sin_default_64f(const _TpVec64F &x) {
_TpVec64F ysin, ycos;
v_sincos_default_64f<_TpVec64F, _TpVec64S>(x, ysin, ycos);
return ysin;
}
template<typename _TpVec64F, typename _TpVec64S>
inline _TpVec64F v_cos_default_64f(const _TpVec64F &x) {
_TpVec64F ysin, ycos;
v_sincos_default_64f<_TpVec64F, _TpVec64S>(x, ysin, ycos);
return ycos;
}
//! @}
/* This implementation is derived from the error function (erf) approximation used in PyTorch:
   https://github.com/pytorch/pytorch/blob/9c50ecc84b9a6e699a7f058891b889aafbf976c7/aten/src/ATen/cpu/vec/vec512/vec512_float.h#L189-L220
*/
//! @name Error Function
//! @{
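/* Editorial note: the constants match the classic Abramowitz & Stegun 7.1.26
   fit: erf(x) ~= 1 - (a1*t + a2*t^2 + ... + a5*t^5) * exp(-x^2), with
   t = 1/(1 + p*|x|) and p = 0.3275911, applied with the sign of x. */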
template<typename _TpVec32F, typename _TpVec32S>
inline _TpVec32F v_erf_default_32f(const _TpVec32F &v) {
const _TpVec32F coef0 = v_setall_<_TpVec32F>(0.3275911f),
coef1 = v_setall_<_TpVec32F>(1.061405429f),
coef2 = v_setall_<_TpVec32F>(-1.453152027f),
coef3 = v_setall_<_TpVec32F>(1.421413741f),
coef4 = v_setall_<_TpVec32F>(-0.284496736f),
coef5 = v_setall_<_TpVec32F>(0.254829592f),
ones = v_setall_<_TpVec32F>(1.0f),
neg_zeros = v_setall_<_TpVec32F>(-0.f);
_TpVec32F t = v_abs(v);
// sign(v)
_TpVec32F sign_mask = v_and(neg_zeros, v);
t = v_div(ones, v_fma(coef0, t, ones));
_TpVec32F r = v_fma(coef1, t, coef2);
r = v_fma(r, t, coef3);
r = v_fma(r, t, coef4);
r = v_fma(r, t, coef5);
// - v * v
_TpVec32F v2 = v_mul(v, v);
_TpVec32F mv2 = v_xor(neg_zeros, v2);
// - exp(- v * v)
_TpVec32F exp = v_exp_default_32f<_TpVec32F, _TpVec32S>(mv2);
_TpVec32F neg_exp = v_xor(neg_zeros, exp);
_TpVec32F res = v_mul(t, neg_exp);
res = v_fma(r, res, ones);
return v_xor(sign_mask, res);
}
//! @}
#endif // OPENCV_HAL_INTRIN_MATH_HPP

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,180 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#ifndef OPENCV_HAL_INTRIN_SSE_EM_HPP
#define OPENCV_HAL_INTRIN_SSE_EM_HPP
namespace cv
{
//! @cond IGNORED
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#define OPENCV_HAL_SSE_WRAP_1(fun, tp) \
inline tp _v128_##fun(const tp& a) \
{ return _mm_##fun(a); }
#define OPENCV_HAL_SSE_WRAP_2(fun, tp) \
inline tp _v128_##fun(const tp& a, const tp& b) \
{ return _mm_##fun(a, b); }
#define OPENCV_HAL_SSE_WRAP_3(fun, tp) \
inline tp _v128_##fun(const tp& a, const tp& b, const tp& c) \
{ return _mm_##fun(a, b, c); }
///////////////////////////// XOP /////////////////////////////
// [todo] define CV_XOP
#if 1 // CV_XOP
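// Editorial note: SSE2 only offers a signed 32-bit compare; XOR-ing both
// operands with 0x80000000 maps unsigned order onto signed order, so the
// signed compare below implements unsigned a > b.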
inline __m128i _v128_comgt_epu32(const __m128i& a, const __m128i& b)
{
const __m128i delta = _mm_set1_epi32((int)0x80000000);
return _mm_cmpgt_epi32(_mm_xor_si128(a, delta), _mm_xor_si128(b, delta));
}
// wrapping XOP
#else
OPENCV_HAL_SSE_WRAP_2(_v128_comgt_epu32, __m128i)
#endif // !CV_XOP
///////////////////////////// SSE4.1 /////////////////////////////
#if !CV_SSE4_1
/** Swizzle **/
inline __m128i _v128_blendv_epi8(const __m128i& a, const __m128i& b, const __m128i& mask)
{ return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(b, a), mask)); }
/** Convert **/
// 8 >> 16
inline __m128i _v128_cvtepu8_epi16(const __m128i& a)
{
const __m128i z = _mm_setzero_si128();
return _mm_unpacklo_epi8(a, z);
}
inline __m128i _v128_cvtepi8_epi16(const __m128i& a)
{ return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); }
// 8 >> 32
inline __m128i _v128_cvtepu8_epi32(const __m128i& a)
{
const __m128i z = _mm_setzero_si128();
return _mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z);
}
inline __m128i _v128_cvtepi8_epi32(const __m128i& a)
{
__m128i r = _mm_unpacklo_epi8(a, a);
r = _mm_unpacklo_epi8(r, r);
return _mm_srai_epi32(r, 24);
}
// 16 >> 32
inline __m128i _v128_cvtepu16_epi32(const __m128i& a)
{
const __m128i z = _mm_setzero_si128();
return _mm_unpacklo_epi16(a, z);
}
inline __m128i _v128_cvtepi16_epi32(const __m128i& a)
{ return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); }
// 32 >> 64
inline __m128i _v128_cvtepu32_epi64(const __m128i& a)
{
const __m128i z = _mm_setzero_si128();
return _mm_unpacklo_epi32(a, z);
}
inline __m128i _v128_cvtepi32_epi64(const __m128i& a)
{ return _mm_unpacklo_epi32(a, _mm_srai_epi32(a, 31)); }
/** Arithmetic **/
inline __m128i _v128_mullo_epi32(const __m128i& a, const __m128i& b)
{
__m128i c0 = _mm_mul_epu32(a, b);
__m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32));
__m128i d0 = _mm_unpacklo_epi32(c0, c1);
__m128i d1 = _mm_unpackhi_epi32(c0, c1);
return _mm_unpacklo_epi64(d0, d1);
}
/** Math **/
inline __m128i _v128_min_epu32(const __m128i& a, const __m128i& b)
{ return _v128_blendv_epi8(a, b, _v128_comgt_epu32(a, b)); }
// wrapping SSE4.1
#else
OPENCV_HAL_SSE_WRAP_1(cvtepu8_epi16, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepi8_epi16, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepu8_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepi8_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepu16_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepi16_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepu32_epi64, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepi32_epi64, __m128i)
OPENCV_HAL_SSE_WRAP_2(min_epu32, __m128i)
OPENCV_HAL_SSE_WRAP_2(mullo_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_3(blendv_epi8, __m128i)
#endif // !CV_SSE4_1
///////////////////////////// Revolutionary /////////////////////////////
/** Convert **/
// 16 << 8
inline __m128i _v128_cvtepu8_epi16_high(const __m128i& a)
{
const __m128i z = _mm_setzero_si128();
return _mm_unpackhi_epi8(a, z);
}
inline __m128i _v128_cvtepi8_epi16_high(const __m128i& a)
{ return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8); }
// 32 << 16
inline __m128i _v128_cvtepu16_epi32_high(const __m128i& a)
{
const __m128i z = _mm_setzero_si128();
return _mm_unpackhi_epi16(a, z);
}
inline __m128i _v128_cvtepi16_epi32_high(const __m128i& a)
{ return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16); }
// 64 << 32
inline __m128i _v128_cvtepu32_epi64_high(const __m128i& a)
{
const __m128i z = _mm_setzero_si128();
return _mm_unpackhi_epi32(a, z);
}
inline __m128i _v128_cvtepi32_epi64_high(const __m128i& a)
{ return _mm_unpackhi_epi32(a, _mm_srai_epi32(a, 31)); }
/** Miscellaneous **/
inline __m128i _v128_packs_epu32(const __m128i& a, const __m128i& b)
{
const __m128i m = _mm_set1_epi32(65535);
__m128i am = _v128_min_epu32(a, m);
__m128i bm = _v128_min_epu32(b, m);
#if CV_SSE4_1
return _mm_packus_epi32(am, bm);
#else
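// Editorial note: without SSE4.1's packus_epi32, bias the clamped inputs
// by -32768 so they fit the signed pack, then remove the bias from the
// packed 16-bit result.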
const __m128i d = _mm_set1_epi32(32768), nd = _mm_set1_epi16(-32768);
am = _mm_sub_epi32(am, d);
bm = _mm_sub_epi32(bm, d);
am = _mm_packs_epi32(am, bm);
return _mm_sub_epi16(am, nd);
#endif
}
template<int i>
inline int64 _v128_extract_epi64(const __m128i& a)
{
#if defined(CV__SIMD_HAVE_mm_extract_epi64) || (CV_SSE4_1 && (defined(__x86_64__)/*GCC*/ || defined(_M_X64)/*MSVC*/))
#define CV__SIMD_NATIVE_mm_extract_epi64 1
return _mm_extract_epi64(a, i);
#else
CV_DECL_ALIGNED(16) int64 tmp[2];
_mm_store_si128((__m128i*)tmp, a);
return tmp[i];
#endif
}
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond
} // cv::
#endif // OPENCV_HAL_INTRIN_SSE_EM_HPP

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,186 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
// This header is not standalone. Don't include directly, use "intrin.hpp" instead.
#ifdef OPENCV_HAL_INTRIN_HPP // defined in intrin.hpp
#if CV_SIMD128 || CV_SIMD128_CPP
template<typename _T> struct Type2Vec128_Traits;
#define CV_INTRIN_DEF_TYPE2VEC128_TRAITS(type_, vec_type_) \
template<> struct Type2Vec128_Traits<type_> \
{ \
typedef vec_type_ vec_type; \
}
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(uchar, v_uint8x16);
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(schar, v_int8x16);
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(ushort, v_uint16x8);
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(short, v_int16x8);
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(unsigned, v_uint32x4);
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(int, v_int32x4);
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(float, v_float32x4);
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(uint64, v_uint64x2);
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(int64, v_int64x2);
#if CV_SIMD128_64F
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(double, v_float64x2);
#endif
template<typename _T> static inline
typename Type2Vec128_Traits<_T>::vec_type v_setall(const _T& a);
template<> inline Type2Vec128_Traits< uchar>::vec_type v_setall< uchar>(const uchar& a) { return v_setall_u8(a); }
template<> inline Type2Vec128_Traits< schar>::vec_type v_setall< schar>(const schar& a) { return v_setall_s8(a); }
template<> inline Type2Vec128_Traits<ushort>::vec_type v_setall<ushort>(const ushort& a) { return v_setall_u16(a); }
template<> inline Type2Vec128_Traits< short>::vec_type v_setall< short>(const short& a) { return v_setall_s16(a); }
template<> inline Type2Vec128_Traits< uint>::vec_type v_setall< uint>(const uint& a) { return v_setall_u32(a); }
template<> inline Type2Vec128_Traits< int>::vec_type v_setall< int>(const int& a) { return v_setall_s32(a); }
template<> inline Type2Vec128_Traits<uint64>::vec_type v_setall<uint64>(const uint64& a) { return v_setall_u64(a); }
template<> inline Type2Vec128_Traits< int64>::vec_type v_setall< int64>(const int64& a) { return v_setall_s64(a); }
template<> inline Type2Vec128_Traits< float>::vec_type v_setall< float>(const float& a) { return v_setall_f32(a); }
#if CV_SIMD128_64F
template<> inline Type2Vec128_Traits<double>::vec_type v_setall<double>(const double& a) { return v_setall_f64(a); }
#endif
#endif // SIMD128
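/* Illustrative sketch (editorial addition): the traits map a scalar type to
   its 128-bit vector type, so generic code can splat a value without spelling
   out the type suffix:

     template<typename T>
     typename Type2Vec128_Traits<T>::vec_type splat128(const T& v) { return v_setall(v); }
*/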
#if CV_SIMD256
template<typename _T> struct Type2Vec256_Traits;
#define CV_INTRIN_DEF_TYPE2VEC256_TRAITS(type_, vec_type_) \
template<> struct Type2Vec256_Traits<type_> \
{ \
typedef vec_type_ vec_type; \
}
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(uchar, v_uint8x32);
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(schar, v_int8x32);
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(ushort, v_uint16x16);
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(short, v_int16x16);
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(unsigned, v_uint32x8);
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(int, v_int32x8);
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(float, v_float32x8);
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(uint64, v_uint64x4);
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(int64, v_int64x4);
#if CV_SIMD256_64F
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(double, v_float64x4);
#endif
template<typename _T> static inline
typename Type2Vec256_Traits<_T>::vec_type v256_setall(const _T& a);
template<> inline Type2Vec256_Traits< uchar>::vec_type v256_setall< uchar>(const uchar& a) { return v256_setall_u8(a); }
template<> inline Type2Vec256_Traits< schar>::vec_type v256_setall< schar>(const schar& a) { return v256_setall_s8(a); }
template<> inline Type2Vec256_Traits<ushort>::vec_type v256_setall<ushort>(const ushort& a) { return v256_setall_u16(a); }
template<> inline Type2Vec256_Traits< short>::vec_type v256_setall< short>(const short& a) { return v256_setall_s16(a); }
template<> inline Type2Vec256_Traits< uint>::vec_type v256_setall< uint>(const uint& a) { return v256_setall_u32(a); }
template<> inline Type2Vec256_Traits< int>::vec_type v256_setall< int>(const int& a) { return v256_setall_s32(a); }
template<> inline Type2Vec256_Traits<uint64>::vec_type v256_setall<uint64>(const uint64& a) { return v256_setall_u64(a); }
template<> inline Type2Vec256_Traits< int64>::vec_type v256_setall< int64>(const int64& a) { return v256_setall_s64(a); }
template<> inline Type2Vec256_Traits< float>::vec_type v256_setall< float>(const float& a) { return v256_setall_f32(a); }
#if CV_SIMD256_64F
template<> inline Type2Vec256_Traits<double>::vec_type v256_setall<double>(const double& a) { return v256_setall_f64(a); }
#endif
#endif // SIMD256
#if CV_SIMD512
template<typename _T> struct Type2Vec512_Traits;
#define CV_INTRIN_DEF_TYPE2VEC512_TRAITS(type_, vec_type_) \
template<> struct Type2Vec512_Traits<type_> \
{ \
typedef vec_type_ vec_type; \
}
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(uchar, v_uint8x64);
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(schar, v_int8x64);
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(ushort, v_uint16x32);
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(short, v_int16x32);
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(unsigned, v_uint32x16);
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(int, v_int32x16);
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(float, v_float32x16);
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(uint64, v_uint64x8);
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(int64, v_int64x8);
#if CV_SIMD512_64F
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(double, v_float64x8);
#endif
template<typename _T> static inline
typename Type2Vec512_Traits<_T>::vec_type v512_setall(const _T& a);
template<> inline Type2Vec512_Traits< uchar>::vec_type v512_setall< uchar>(const uchar& a) { return v512_setall_u8(a); }
template<> inline Type2Vec512_Traits< schar>::vec_type v512_setall< schar>(const schar& a) { return v512_setall_s8(a); }
template<> inline Type2Vec512_Traits<ushort>::vec_type v512_setall<ushort>(const ushort& a) { return v512_setall_u16(a); }
template<> inline Type2Vec512_Traits< short>::vec_type v512_setall< short>(const short& a) { return v512_setall_s16(a); }
template<> inline Type2Vec512_Traits< uint>::vec_type v512_setall< uint>(const uint& a) { return v512_setall_u32(a); }
template<> inline Type2Vec512_Traits< int>::vec_type v512_setall< int>(const int& a) { return v512_setall_s32(a); }
template<> inline Type2Vec512_Traits<uint64>::vec_type v512_setall<uint64>(const uint64& a) { return v512_setall_u64(a); }
template<> inline Type2Vec512_Traits< int64>::vec_type v512_setall< int64>(const int64& a) { return v512_setall_s64(a); }
template<> inline Type2Vec512_Traits< float>::vec_type v512_setall< float>(const float& a) { return v512_setall_f32(a); }
#if CV_SIMD512_64F
template<> inline Type2Vec512_Traits<double>::vec_type v512_setall<double>(const double& a) { return v512_setall_f64(a); }
#endif
#endif // SIMD512
#if CV_SIMD_SCALABLE
template<typename _T> struct Type2Vec_Traits;
#define CV_INTRIN_DEF_TYPE2VEC_TRAITS(type_, vec_type_) \
template<> struct Type2Vec_Traits<type_> \
{ \
typedef vec_type_ vec_type; \
}
CV_INTRIN_DEF_TYPE2VEC_TRAITS(uchar, v_uint8);
CV_INTRIN_DEF_TYPE2VEC_TRAITS(schar, v_int8);
CV_INTRIN_DEF_TYPE2VEC_TRAITS(ushort, v_uint16);
CV_INTRIN_DEF_TYPE2VEC_TRAITS(short, v_int16);
CV_INTRIN_DEF_TYPE2VEC_TRAITS(unsigned, v_uint32);
CV_INTRIN_DEF_TYPE2VEC_TRAITS(int, v_int32);
CV_INTRIN_DEF_TYPE2VEC_TRAITS(float, v_float32);
CV_INTRIN_DEF_TYPE2VEC_TRAITS(uint64, v_uint64);
CV_INTRIN_DEF_TYPE2VEC_TRAITS(int64, v_int64);
#if CV_SIMD_SCALABLE_64F
CV_INTRIN_DEF_TYPE2VEC_TRAITS(double, v_float64);
#endif
template<typename _T> static inline
typename Type2Vec_Traits<_T>::vec_type v_setall(const _T& a);
template<> inline Type2Vec_Traits< uchar>::vec_type v_setall< uchar>(const uchar& a) { return v_setall_u8(a); }
template<> inline Type2Vec_Traits< schar>::vec_type v_setall< schar>(const schar& a) { return v_setall_s8(a); }
template<> inline Type2Vec_Traits<ushort>::vec_type v_setall<ushort>(const ushort& a) { return v_setall_u16(a); }
template<> inline Type2Vec_Traits< short>::vec_type v_setall< short>(const short& a) { return v_setall_s16(a); }
template<> inline Type2Vec_Traits< uint>::vec_type v_setall< uint>(const uint& a) { return v_setall_u32(a); }
template<> inline Type2Vec_Traits< int>::vec_type v_setall< int>(const int& a) { return v_setall_s32(a); }
template<> inline Type2Vec_Traits<uint64>::vec_type v_setall<uint64>(const uint64& a) { return v_setall_u64(a); }
template<> inline Type2Vec_Traits< int64>::vec_type v_setall< int64>(const int64& a) { return v_setall_s64(a); }
template<> inline Type2Vec_Traits< float>::vec_type v_setall< float>(const float& a) { return v_setall_f32(a); }
#if CV_SIMD_SCALABLE_64F
template<> inline Type2Vec_Traits<double>::vec_type v_setall<double>(const double& a) { return v_setall_f64(a); }
#endif
#endif
#if CV_SIMD_SCALABLE
template<typename _T> static inline
typename Type2Vec_Traits<_T>::vec_type vx_setall(const _T& a) { return v_setall(a); }
#elif CV_SIMD_WIDTH == 16
template<typename _T> static inline
typename Type2Vec128_Traits<_T>::vec_type vx_setall(const _T& a) { return v_setall(a); }
#elif CV_SIMD_WIDTH == 32
template<typename _T> static inline
typename Type2Vec256_Traits<_T>::vec_type vx_setall(const _T& a) { return v256_setall(a); }
#elif CV_SIMD_WIDTH == 64
template<typename _T> static inline
typename Type2Vec512_Traits<_T>::vec_type vx_setall(const _T& a) { return v512_setall(a); }
#else
#error "Build configuration error, unsupported CV_SIMD_WIDTH"
#endif
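/* Illustrative sketch (editorial addition): vx_setall resolves to the widest
   enabled backend, so width-agnostic kernels can splat constants generically:

     template<typename T>
     void add_bias(T* buf, int n, T bias)      // n assumed a multiple of the lane count
     {
         auto vb = vx_setall(bias);
         for (int i = 0; i < n; i += VTraits<decltype(vb)>::vlanes())
             v_store(buf + i, v_add(vx_load(buf + i), vb));
     }
*/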
#endif // OPENCV_HAL_INTRIN_HPP