Dot-matrix font support finished; add OpenCV
260
3rdpart/OpenCV/include/opencv2/core/hal/hal.hpp
Normal file
@@ -0,0 +1,260 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef OPENCV_HAL_HPP
#define OPENCV_HAL_HPP

#include "opencv2/core/cvdef.h"
#include "opencv2/core/cvstd.hpp"
#include "opencv2/core/hal/interface.h"

namespace cv { namespace hal {

//! @addtogroup core_hal_functions
//! @{

CV_EXPORTS int normHamming(const uchar* a, int n);
CV_EXPORTS int normHamming(const uchar* a, const uchar* b, int n);

CV_EXPORTS int normHamming(const uchar* a, int n, int cellSize);
CV_EXPORTS int normHamming(const uchar* a, const uchar* b, int n, int cellSize);

CV_EXPORTS int LU32f(float* A, size_t astep, int m, float* b, size_t bstep, int n);
CV_EXPORTS int LU64f(double* A, size_t astep, int m, double* b, size_t bstep, int n);
CV_EXPORTS bool Cholesky32f(float* A, size_t astep, int m, float* b, size_t bstep, int n);
CV_EXPORTS bool Cholesky64f(double* A, size_t astep, int m, double* b, size_t bstep, int n);
CV_EXPORTS void SVD32f(float* At, size_t astep, float* W, float* U, size_t ustep, float* Vt, size_t vstep, int m, int n, int flags);
CV_EXPORTS void SVD64f(double* At, size_t astep, double* W, double* U, size_t ustep, double* Vt, size_t vstep, int m, int n, int flags);
CV_EXPORTS int QR32f(float* A, size_t astep, int m, int n, int k, float* b, size_t bstep, float* hFactors);
CV_EXPORTS int QR64f(double* A, size_t astep, int m, int n, int k, double* b, size_t bstep, double* hFactors);

CV_EXPORTS void gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
                        float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
                        int m_a, int n_a, int n_d, int flags);
CV_EXPORTS void gemm64f(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
                        double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
                        int m_a, int n_a, int n_d, int flags);
CV_EXPORTS void gemm32fc(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
                         float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
                         int m_a, int n_a, int n_d, int flags);
CV_EXPORTS void gemm64fc(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
                         double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
                         int m_a, int n_a, int n_d, int flags);

CV_EXPORTS int normL1_(const uchar* a, const uchar* b, int n);
CV_EXPORTS float normL1_(const float* a, const float* b, int n);
CV_EXPORTS float normL2Sqr_(const float* a, const float* b, int n);

CV_EXPORTS void exp32f(const float* src, float* dst, int n);
CV_EXPORTS void exp64f(const double* src, double* dst, int n);
CV_EXPORTS void log32f(const float* src, float* dst, int n);
CV_EXPORTS void log64f(const double* src, double* dst, int n);

CV_EXPORTS void cartToPolar32f(const float* x, const float* y, float* mag, float* angle, int n, bool angleInDegrees);
CV_EXPORTS void cartToPolar64f(const double* x, const double* y, double* mag, double* angle, int n, bool angleInDegrees);
CV_EXPORTS void fastAtan32f(const float* y, const float* x, float* dst, int n, bool angleInDegrees);
CV_EXPORTS void fastAtan64f(const double* y, const double* x, double* dst, int n, bool angleInDegrees);
CV_EXPORTS void magnitude32f(const float* x, const float* y, float* dst, int n);
CV_EXPORTS void magnitude64f(const double* x, const double* y, double* dst, int n);
CV_EXPORTS void polarToCart32f(const float* mag, const float* angle, float* x, float* y, int n, bool angleInDegrees);
CV_EXPORTS void polarToCart64f(const double* mag, const double* angle, double* x, double* y, int n, bool angleInDegrees);
CV_EXPORTS void sqrt32f(const float* src, float* dst, int len);
CV_EXPORTS void sqrt64f(const double* src, double* dst, int len);
CV_EXPORTS void invSqrt32f(const float* src, float* dst, int len);
CV_EXPORTS void invSqrt64f(const double* src, double* dst, int len);

CV_EXPORTS void split8u(const uchar* src, uchar** dst, int len, int cn );
CV_EXPORTS void split16u(const ushort* src, ushort** dst, int len, int cn );
CV_EXPORTS void split32s(const int* src, int** dst, int len, int cn );
CV_EXPORTS void split64s(const int64* src, int64** dst, int len, int cn );

CV_EXPORTS void merge8u(const uchar** src, uchar* dst, int len, int cn );
CV_EXPORTS void merge16u(const ushort** src, ushort* dst, int len, int cn );
CV_EXPORTS void merge32s(const int** src, int* dst, int len, int cn );
CV_EXPORTS void merge64s(const int64** src, int64* dst, int len, int cn );

CV_EXPORTS void add8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void add8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void add16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
CV_EXPORTS void add16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
CV_EXPORTS void add32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
CV_EXPORTS void add32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
CV_EXPORTS void add64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );

CV_EXPORTS void sub8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void sub8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void sub16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
CV_EXPORTS void sub16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
CV_EXPORTS void sub32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
CV_EXPORTS void sub32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
CV_EXPORTS void sub64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );

CV_EXPORTS void max8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void max8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void max16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
CV_EXPORTS void max16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
CV_EXPORTS void max32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
CV_EXPORTS void max32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
CV_EXPORTS void max64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );

CV_EXPORTS void min8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void min8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void min16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
CV_EXPORTS void min16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
CV_EXPORTS void min32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
CV_EXPORTS void min32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
CV_EXPORTS void min64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );

CV_EXPORTS void absdiff8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void absdiff8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void absdiff16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
CV_EXPORTS void absdiff16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
CV_EXPORTS void absdiff32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
CV_EXPORTS void absdiff32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
CV_EXPORTS void absdiff64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );

CV_EXPORTS void and8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void or8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void xor8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void not8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );

CV_EXPORTS void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
CV_EXPORTS void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
CV_EXPORTS void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
CV_EXPORTS void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
CV_EXPORTS void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
CV_EXPORTS void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
CV_EXPORTS void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);

CV_EXPORTS void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void mul16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void mul32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void mul32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void mul64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);

CV_EXPORTS void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void div16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void div32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void div32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void div64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);

CV_EXPORTS void recip8u( const uchar *, size_t, const uchar * src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void recip8s( const schar *, size_t, const schar * src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void recip16u( const ushort *, size_t, const ushort * src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void recip16s( const short *, size_t, const short * src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void recip32s( const int *, size_t, const int * src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void recip32f( const float *, size_t, const float * src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void recip64f( const double *, size_t, const double * src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);

CV_EXPORTS void addWeighted8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _scalars );
CV_EXPORTS void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scalars );
CV_EXPORTS void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scalars );
CV_EXPORTS void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scalars );
CV_EXPORTS void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scalars );
CV_EXPORTS void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars );
CV_EXPORTS void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars );

CV_EXPORTS void cvt16f32f( const hfloat* src, float* dst, int len );
CV_EXPORTS void cvt32f16f( const float* src, hfloat* dst, int len );

CV_EXPORTS void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len );
CV_EXPORTS void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len );

struct CV_EXPORTS DFT1D
{
    static Ptr<DFT1D> create(int len, int count, int depth, int flags, bool * useBuffer = 0);
    virtual void apply(const uchar *src, uchar *dst) = 0;
    virtual ~DFT1D() {}
};

struct CV_EXPORTS DFT2D
{
    static Ptr<DFT2D> create(int width, int height, int depth,
                             int src_channels, int dst_channels,
                             int flags, int nonzero_rows = 0);
    virtual void apply(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step) = 0;
    virtual ~DFT2D() {}
};

struct CV_EXPORTS DCT2D
{
    static Ptr<DCT2D> create(int width, int height, int depth, int flags);
    virtual void apply(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step) = 0;
    virtual ~DCT2D() {}
};
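
// Usage sketch (illustrative, not part of the upstream header): all three
// transform contexts above follow one pattern -- build a context once through
// the static create() factory, then call apply() on raw byte pointers.
// The length, count, and buffer variables below are placeholders.
//
//     bool useBuffer = false;
//     cv::Ptr<cv::hal::DFT1D> ctx =
//         cv::hal::DFT1D::create(len, count, CV_32F, CV_HAL_DFT_ROWS, &useBuffer);
//     ctx->apply(srcBytes, dstBytes);  // src/dst are uchar* views of the row data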

//! @} core_hal

//=============================================================================
// for binary compatibility with 3.0

//! @cond IGNORED

CV_EXPORTS int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n);
CV_EXPORTS int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n);
CV_EXPORTS bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n);
CV_EXPORTS bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n);

CV_EXPORTS void exp(const float* src, float* dst, int n);
CV_EXPORTS void exp(const double* src, double* dst, int n);
CV_EXPORTS void log(const float* src, float* dst, int n);
CV_EXPORTS void log(const double* src, double* dst, int n);

CV_EXPORTS void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees);
CV_EXPORTS void magnitude(const float* x, const float* y, float* dst, int n);
CV_EXPORTS void magnitude(const double* x, const double* y, double* dst, int n);
CV_EXPORTS void sqrt(const float* src, float* dst, int len);
CV_EXPORTS void sqrt(const double* src, double* dst, int len);
CV_EXPORTS void invSqrt(const float* src, float* dst, int len);
CV_EXPORTS void invSqrt(const double* src, double* dst, int len);

//! @endcond

}} //cv::hal

#endif //OPENCV_HAL_HPP
190
3rdpart/OpenCV/include/opencv2/core/hal/interface.h
Normal file
@@ -0,0 +1,190 @@
#ifndef OPENCV_CORE_HAL_INTERFACE_H
#define OPENCV_CORE_HAL_INTERFACE_H

//! @addtogroup core_hal_interface
//! @{

//! @name Return codes
//! @{
#define CV_HAL_ERROR_OK 0
#define CV_HAL_ERROR_NOT_IMPLEMENTED 1
#define CV_HAL_ERROR_UNKNOWN -1
//! @}

#ifdef __cplusplus
#include <cstddef>
#else
#include <stddef.h>
#include <stdbool.h>
#endif

//! @name Data types
//! primitive types
//! - schar  - signed 1 byte integer
//! - uchar  - unsigned 1 byte integer
//! - short  - signed 2 byte integer
//! - ushort - unsigned 2 byte integer
//! - int    - signed 4 byte integer
//! - uint   - unsigned 4 byte integer
//! - int64  - signed 8 byte integer
//! - uint64 - unsigned 8 byte integer
//! @{
#if !defined _MSC_VER && !defined __BORLANDC__
#  if defined __cplusplus && __cplusplus >= 201103L && !defined __APPLE__
#    include <cstdint>
#    ifdef __NEWLIB__
        typedef unsigned int uint;
#    else
        typedef std::uint32_t uint;
#    endif
#  else
#    include <stdint.h>
     typedef uint32_t uint;
#  endif
#else
   typedef unsigned uint;
#endif

typedef signed char schar;

#ifndef __IPL_H__
   typedef unsigned char uchar;
   typedef unsigned short ushort;
#endif

#if defined _MSC_VER || defined __BORLANDC__
   typedef __int64 int64;
   typedef unsigned __int64 uint64;
#  define CV_BIG_INT(n)   n##I64
#  define CV_BIG_UINT(n)  n##UI64
#else
   typedef int64_t int64;
   typedef uint64_t uint64;
#  define CV_BIG_INT(n)   n##LL
#  define CV_BIG_UINT(n)  n##ULL
#endif

#define CV_USRTYPE1 (void)"CV_USRTYPE1 support has been dropped in OpenCV 4.0"

#define CV_CN_MAX     512
#define CV_CN_SHIFT   3
#define CV_DEPTH_MAX  (1 << CV_CN_SHIFT)

#define CV_8U   0
#define CV_8S   1
#define CV_16U  2
#define CV_16S  3
#define CV_32S  4
#define CV_32F  5
#define CV_64F  6
#define CV_16F  7

#define CV_MAT_DEPTH_MASK       (CV_DEPTH_MAX - 1)
#define CV_MAT_DEPTH(flags)     ((flags) & CV_MAT_DEPTH_MASK)

#define CV_MAKETYPE(depth,cn) (CV_MAT_DEPTH(depth) + (((cn)-1) << CV_CN_SHIFT))
#define CV_MAKE_TYPE CV_MAKETYPE
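
// Worked example (follows directly from the two macros above; not upstream
// text): a type flag packs the depth in the low CV_CN_SHIFT bits and
// (channels - 1) above them, so CV_MAKETYPE(CV_8U, 3) == 0 + (2 << 3) == 16,
// which is CV_8UC3, and CV_MAT_DEPTH(16) == (16 & 7) == 0 recovers CV_8U.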

#define CV_8UC1 CV_MAKETYPE(CV_8U,1)
#define CV_8UC2 CV_MAKETYPE(CV_8U,2)
#define CV_8UC3 CV_MAKETYPE(CV_8U,3)
#define CV_8UC4 CV_MAKETYPE(CV_8U,4)
#define CV_8UC(n) CV_MAKETYPE(CV_8U,(n))

#define CV_8SC1 CV_MAKETYPE(CV_8S,1)
#define CV_8SC2 CV_MAKETYPE(CV_8S,2)
#define CV_8SC3 CV_MAKETYPE(CV_8S,3)
#define CV_8SC4 CV_MAKETYPE(CV_8S,4)
#define CV_8SC(n) CV_MAKETYPE(CV_8S,(n))

#define CV_16UC1 CV_MAKETYPE(CV_16U,1)
#define CV_16UC2 CV_MAKETYPE(CV_16U,2)
#define CV_16UC3 CV_MAKETYPE(CV_16U,3)
#define CV_16UC4 CV_MAKETYPE(CV_16U,4)
#define CV_16UC(n) CV_MAKETYPE(CV_16U,(n))

#define CV_16SC1 CV_MAKETYPE(CV_16S,1)
#define CV_16SC2 CV_MAKETYPE(CV_16S,2)
#define CV_16SC3 CV_MAKETYPE(CV_16S,3)
#define CV_16SC4 CV_MAKETYPE(CV_16S,4)
#define CV_16SC(n) CV_MAKETYPE(CV_16S,(n))

#define CV_32SC1 CV_MAKETYPE(CV_32S,1)
#define CV_32SC2 CV_MAKETYPE(CV_32S,2)
#define CV_32SC3 CV_MAKETYPE(CV_32S,3)
#define CV_32SC4 CV_MAKETYPE(CV_32S,4)
#define CV_32SC(n) CV_MAKETYPE(CV_32S,(n))

#define CV_32FC1 CV_MAKETYPE(CV_32F,1)
#define CV_32FC2 CV_MAKETYPE(CV_32F,2)
#define CV_32FC3 CV_MAKETYPE(CV_32F,3)
#define CV_32FC4 CV_MAKETYPE(CV_32F,4)
#define CV_32FC(n) CV_MAKETYPE(CV_32F,(n))

#define CV_64FC1 CV_MAKETYPE(CV_64F,1)
#define CV_64FC2 CV_MAKETYPE(CV_64F,2)
#define CV_64FC3 CV_MAKETYPE(CV_64F,3)
#define CV_64FC4 CV_MAKETYPE(CV_64F,4)
#define CV_64FC(n) CV_MAKETYPE(CV_64F,(n))

#define CV_16FC1 CV_MAKETYPE(CV_16F,1)
#define CV_16FC2 CV_MAKETYPE(CV_16F,2)
#define CV_16FC3 CV_MAKETYPE(CV_16F,3)
#define CV_16FC4 CV_MAKETYPE(CV_16F,4)
#define CV_16FC(n) CV_MAKETYPE(CV_16F,(n))
//! @}

//! @name Comparison operation
//! @sa cv::CmpTypes
//! @{
#define CV_HAL_CMP_EQ 0
#define CV_HAL_CMP_GT 1
#define CV_HAL_CMP_GE 2
#define CV_HAL_CMP_LT 3
#define CV_HAL_CMP_LE 4
#define CV_HAL_CMP_NE 5
//! @}

//! @name Border processing modes
//! @sa cv::BorderTypes
//! @{
#define CV_HAL_BORDER_CONSTANT 0
#define CV_HAL_BORDER_REPLICATE 1
#define CV_HAL_BORDER_REFLECT 2
#define CV_HAL_BORDER_WRAP 3
#define CV_HAL_BORDER_REFLECT_101 4
#define CV_HAL_BORDER_TRANSPARENT 5
#define CV_HAL_BORDER_ISOLATED 16
//! @}

//! @name DFT flags
//! @{
#define CV_HAL_DFT_INVERSE 1
#define CV_HAL_DFT_SCALE 2
#define CV_HAL_DFT_ROWS 4
#define CV_HAL_DFT_COMPLEX_OUTPUT 16
#define CV_HAL_DFT_REAL_OUTPUT 32
#define CV_HAL_DFT_TWO_STAGE 64
#define CV_HAL_DFT_STAGE_COLS 128
#define CV_HAL_DFT_IS_CONTINUOUS 512
#define CV_HAL_DFT_IS_INPLACE 1024
//! @}

//! @name SVD flags
//! @{
#define CV_HAL_SVD_NO_UV 1
#define CV_HAL_SVD_SHORT_UV 2
#define CV_HAL_SVD_MODIFY_A 4
#define CV_HAL_SVD_FULL_UV 8
//! @}

//! @name Gemm flags
//! @{
#define CV_HAL_GEMM_1_T 1
#define CV_HAL_GEMM_2_T 2
#define CV_HAL_GEMM_3_T 4
//! @}

//! @}

#endif
988
3rdpart/OpenCV/include/opencv2/core/hal/intrin.hpp
Normal file
@@ -0,0 +1,988 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef OPENCV_HAL_INTRIN_HPP
#define OPENCV_HAL_INTRIN_HPP

#include <cmath>
#include <float.h>
#include <stdlib.h>
#include "opencv2/core/cvdef.h"

#if defined(__GNUC__) && __GNUC__ == 12
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif

#define OPENCV_HAL_ADD(a, b) ((a) + (b))
#define OPENCV_HAL_AND(a, b) ((a) & (b))
#define OPENCV_HAL_NOP(a) (a)
#define OPENCV_HAL_1ST(a, b) (a)

namespace {
inline unsigned int trailingZeros32(unsigned int value) {
#if defined(_MSC_VER)
#if (_MSC_VER < 1700) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC)
    unsigned long index = 0;
    _BitScanForward(&index, value);
    return (unsigned int)index;
#elif defined(__clang__)
    // clang-cl doesn't export _tzcnt_u32 for non BMI systems
    return value ? __builtin_ctz(value) : 32;
#else
    return _tzcnt_u32(value);
#endif
#elif defined(__GNUC__) || defined(__GNUG__)
    return __builtin_ctz(value);
#elif defined(__ICC) || defined(__INTEL_COMPILER)
    return _bit_scan_forward(value);
#elif defined(__clang__)
    return llvm.cttz.i32(value, true);
#else
    static const int MultiplyDeBruijnBitPosition[32] = {
        0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
        31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 };
    return MultiplyDeBruijnBitPosition[((uint32_t)((value & -value) * 0x077CB531U)) >> 27];
#endif
}
}
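
// Note on the portable #else branch above (explanatory comment, not upstream
// code): (value & -value) isolates the lowest set bit; multiplying by the
// De Bruijn constant 0x077CB531U places a unique 5-bit pattern in the top
// bits, so ">> 27" indexes the lookup table. For example, value = 8 maps to
// table index 7, and MultiplyDeBruijnBitPosition[7] == 3, the trailing-zero
// count of 8.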

// unlike HAL API, which is in cv::hal,
// we put intrinsics into cv namespace to make its
// access from within opencv code more accessible
namespace cv {

namespace hal {

enum StoreMode
{
    STORE_UNALIGNED = 0,
    STORE_ALIGNED = 1,
    STORE_ALIGNED_NOCACHE = 2
};

}

// TODO FIXIT: Don't use "God" traits. Split on separate cases.
template<typename _Tp> struct V_TypeTraits
{
};

#define CV_INTRIN_DEF_TYPE_TRAITS(type, int_type_, uint_type_, abs_type_, w_type_, q_type_, sum_type_) \
    template<> struct V_TypeTraits<type> \
    { \
        typedef type value_type; \
        typedef int_type_ int_type; \
        typedef abs_type_ abs_type; \
        typedef uint_type_ uint_type; \
        typedef w_type_ w_type; \
        typedef q_type_ q_type; \
        typedef sum_type_ sum_type; \
        \
        static inline int_type reinterpret_int(type x) \
        { \
            union { type l; int_type i; } v; \
            v.l = x; \
            return v.i; \
        } \
        \
        static inline type reinterpret_from_int(int_type x) \
        { \
            union { type l; int_type i; } v; \
            v.i = x; \
            return v.l; \
        } \
    }

#define CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(type, int_type_, uint_type_, abs_type_, w_type_, sum_type_) \
    template<> struct V_TypeTraits<type> \
    { \
        typedef type value_type; \
        typedef int_type_ int_type; \
        typedef abs_type_ abs_type; \
        typedef uint_type_ uint_type; \
        typedef w_type_ w_type; \
        typedef sum_type_ sum_type; \
        \
        static inline int_type reinterpret_int(type x) \
        { \
            union { type l; int_type i; } v; \
            v.l = x; \
            return v.i; \
        } \
        \
        static inline type reinterpret_from_int(int_type x) \
        { \
            union { type l; int_type i; } v; \
            v.i = x; \
            return v.l; \
        } \
    }

CV_INTRIN_DEF_TYPE_TRAITS(uchar, schar, uchar, uchar, ushort, unsigned, unsigned);
CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int);
CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned);
CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(unsigned, int, unsigned, unsigned, uint64, unsigned);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int, int, unsigned, unsigned, int64, int);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(float, int, unsigned, float, double, float);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(uint64, int64, uint64, uint64, void, uint64);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int64, int64, uint64, uint64, void, int64);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(double, int64, uint64, double, void, double);
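
// Reading the trait table above (illustrative note, not upstream code):
// V_TypeTraits<uchar>::w_type is ushort (the widened accumulator type) and
// its sum_type is unsigned, while V_TypeTraits<float>::int_type is int, so
// reinterpret_int() round-trips a float through its 32-bit integer
// representation via the union.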

#ifndef CV_DOXYGEN

#ifndef CV_CPU_OPTIMIZATION_HAL_NAMESPACE
#ifdef CV_FORCE_SIMD128_CPP
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_EMULATOR_CPP
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_EMULATOR_CPP {
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
#elif defined(CV_CPU_DISPATCH_MODE)
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE __CV_CAT(hal_, CV_CPU_DISPATCH_MODE)
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) {
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
#else
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_baseline
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_baseline {
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
#endif
#endif // CV_CPU_OPTIMIZATION_HAL_NAMESPACE

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

template <typename _VecTp> inline _VecTp v_setzero_();
template <typename _VecTp> inline _VecTp v_setall_(uchar);
template <typename _VecTp> inline _VecTp v_setall_(schar);
template <typename _VecTp> inline _VecTp v_setall_(ushort);
template <typename _VecTp> inline _VecTp v_setall_(short);
template <typename _VecTp> inline _VecTp v_setall_(unsigned);
template <typename _VecTp> inline _VecTp v_setall_(int);
template <typename _VecTp> inline _VecTp v_setall_(uint64);
template <typename _VecTp> inline _VecTp v_setall_(int64);
template <typename _VecTp> inline _VecTp v_setall_(float);
template <typename _VecTp> inline _VecTp v_setall_(double);

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
#endif
}

#ifdef CV_DOXYGEN
#   undef CV_AVX2
#   undef CV_SSE2
#   undef CV_NEON
#   undef CV_VSX
#   undef CV_FP16
#   undef CV_MSA
#   undef CV_RVV
#endif

#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071 || CV_LSX) && !defined(CV_FORCE_SIMD128_CPP)
#define CV__SIMD_FORWARD 128
#include "opencv2/core/hal/intrin_forward.hpp"
#endif

#if CV_SSE2 && !defined(CV_FORCE_SIMD128_CPP)

#include "opencv2/core/hal/intrin_sse_em.hpp"
#include "opencv2/core/hal/intrin_sse.hpp"

#elif CV_NEON && !defined(CV_FORCE_SIMD128_CPP)

#include "opencv2/core/hal/intrin_neon.hpp"

#elif CV_RVV071 && !defined(CV_FORCE_SIMD128_CPP)
#define CV_SIMD128_CPP 0
#include "opencv2/core/hal/intrin_rvv071.hpp"

#elif CV_VSX && !defined(CV_FORCE_SIMD128_CPP)

#include "opencv2/core/hal/intrin_vsx.hpp"

#elif CV_MSA && !defined(CV_FORCE_SIMD128_CPP)

#include "opencv2/core/hal/intrin_msa.hpp"

#elif CV_WASM_SIMD && !defined(CV_FORCE_SIMD128_CPP)
#include "opencv2/core/hal/intrin_wasm.hpp"

#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP)
#include "opencv2/core/hal/intrin_rvv_scalable.hpp"

#elif CV_LSX && !defined(CV_FORCE_SIMD128_CPP)

#include "opencv2/core/hal/intrin_lsx.hpp"

#else

#include "opencv2/core/hal/intrin_cpp.hpp"

#endif

// AVX2 can be used together with SSE2, so
// we define those two sets of intrinsics at once.
// Most of the intrinsics do not conflict (the proper overloaded variant is
// resolved by the argument types, e.g. v_float32x4 ~ SSE2, v_float32x8 ~ AVX2),
// but some of AVX2 intrinsics get v256_ prefix instead of v_, e.g. v256_load() vs v_load().
// Correspondingly, the wide intrinsics (which are mapped to the "widest"
// available instruction set) will get vx_ prefix
// (and will be mapped to v256_ counterparts) (e.g. vx_load() => v256_load())
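// For example (illustrative, assuming an AVX2-enabled build): v_load(p) on a
// const float* yields a v_float32x4 (128-bit SSE2 form), v256_load(p) yields
// a v_float32x8 (256-bit AVX2 form), and vx_load(p) resolves to the widest
// available form, here v_float32x8.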
|
||||
#if CV_AVX2
|
||||
|
||||
#define CV__SIMD_FORWARD 256
|
||||
#include "opencv2/core/hal/intrin_forward.hpp"
|
||||
#include "opencv2/core/hal/intrin_avx.hpp"
|
||||
|
||||
#endif
|
||||
|
||||
// AVX512 can be used together with SSE2 and AVX2, so
|
||||
// we define those sets of intrinsics at once.
|
||||
// For some of AVX512 intrinsics get v512_ prefix instead of v_, e.g. v512_load() vs v_load().
|
||||
// Wide intrinsics will be mapped to v512_ counterparts in this case(e.g. vx_load() => v512_load())
|
||||
#if CV_AVX512_SKX
|
||||
|
||||
#define CV__SIMD_FORWARD 512
|
||||
#include "opencv2/core/hal/intrin_forward.hpp"
|
||||
#include "opencv2/core/hal/intrin_avx512.hpp"
|
||||
|
||||
#endif
|
||||
|
||||
#if CV_LASX
|
||||
|
||||
#define CV__SIMD_FORWARD 256
|
||||
#include "opencv2/core/hal/intrin_forward.hpp"
|
||||
#include "opencv2/core/hal/intrin_lasx.hpp"
|
||||
|
||||
#endif
|
||||
|
||||
//! @cond IGNORED
|
||||
|
||||
namespace cv {
|
||||
|
||||
#ifndef CV_DOXYGEN
|
||||
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
|
||||
#endif
|
||||
|
||||
#ifndef CV_SIMD128
|
||||
#define CV_SIMD128 0
|
||||
#endif
|
||||
|
||||
#ifndef CV_SIMD128_CPP
|
||||
#define CV_SIMD128_CPP 0
|
||||
#endif
|
||||
|
||||
#ifndef CV_SIMD128_64F
|
||||
#define CV_SIMD128_64F 0
|
||||
#endif
|
||||
|
||||
#ifndef CV_SIMD256
|
||||
#define CV_SIMD256 0
|
||||
#endif
|
||||
|
||||
#ifndef CV_SIMD256_64F
|
||||
#define CV_SIMD256_64F 0
|
||||
#endif
|
||||
|
||||
#ifndef CV_SIMD512
|
||||
#define CV_SIMD512 0
|
||||
#endif
|
||||
|
||||
#ifndef CV_SIMD512_64F
|
||||
#define CV_SIMD512_64F 0
|
||||
#endif
|
||||
|
||||
#ifndef CV_SIMD128_FP16
|
||||
#define CV_SIMD128_FP16 0
|
||||
#endif
|
||||
|
||||
#ifndef CV_SIMD256_FP16
|
||||
#define CV_SIMD256_FP16 0
|
||||
#endif
|
||||
|
||||
#ifndef CV_SIMD512_FP16
|
||||
#define CV_SIMD512_FP16 0
|
||||
#endif
|
||||
|
||||
#ifndef CV_SIMD_SCALABLE
|
||||
#define CV_SIMD_SCALABLE 0
|
||||
#endif
|
||||
|
||||
#ifndef CV_SIMD_SCALABLE_64F
|
||||
#define CV_SIMD_SCALABLE_64F 0
|
||||
#endif
|
||||
|
||||
//==================================================================================================
|
||||
|
||||
template<typename _Tp> struct V_RegTraits
|
||||
{
|
||||
};
|
||||
|
||||
#define CV_DEF_REG_TRAITS(prefix, _reg, lane_type, suffix, _u_reg, _w_reg, _q_reg, _int_reg, _round_reg) \
|
||||
template<> struct V_RegTraits<_reg> \
|
||||
{ \
|
||||
typedef _reg reg; \
|
||||
typedef _u_reg u_reg; \
|
||||
typedef _w_reg w_reg; \
|
||||
typedef _q_reg q_reg; \
|
||||
typedef _int_reg int_reg; \
|
||||
typedef _round_reg round_reg; \
|
||||
}
|
||||
|
||||
#if CV_SIMD128 || CV_SIMD128_CPP
|
||||
CV_DEF_REG_TRAITS(v, v_uint8x16, uchar, u8, v_uint8x16, v_uint16x8, v_uint32x4, v_int8x16, void);
|
||||
CV_DEF_REG_TRAITS(v, v_int8x16, schar, s8, v_uint8x16, v_int16x8, v_int32x4, v_int8x16, void);
|
||||
CV_DEF_REG_TRAITS(v, v_uint16x8, ushort, u16, v_uint16x8, v_uint32x4, v_uint64x2, v_int16x8, void);
|
||||
CV_DEF_REG_TRAITS(v, v_int16x8, short, s16, v_uint16x8, v_int32x4, v_int64x2, v_int16x8, void);
|
||||
CV_DEF_REG_TRAITS(v, v_uint32x4, unsigned, u32, v_uint32x4, v_uint64x2, void, v_int32x4, void);
|
||||
CV_DEF_REG_TRAITS(v, v_int32x4, int, s32, v_uint32x4, v_int64x2, void, v_int32x4, void);
|
||||
#if CV_SIMD128_64F || CV_SIMD128_CPP
|
||||
CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, v_float64x2, void, v_int32x4, v_int32x4);
|
||||
#else
|
||||
CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, void, void, v_int32x4, v_int32x4);
|
||||
#endif
|
||||
CV_DEF_REG_TRAITS(v, v_uint64x2, uint64, u64, v_uint64x2, void, void, v_int64x2, void);
|
||||
CV_DEF_REG_TRAITS(v, v_int64x2, int64, s64, v_uint64x2, void, void, v_int64x2, void);
|
||||
#if CV_SIMD128_64F
|
||||
CV_DEF_REG_TRAITS(v, v_float64x2, double, f64, v_float64x2, void, void, v_int64x2, v_int32x4);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if CV_SIMD256
|
||||
CV_DEF_REG_TRAITS(v256, v_uint8x32, uchar, u8, v_uint8x32, v_uint16x16, v_uint32x8, v_int8x32, void);
|
||||
CV_DEF_REG_TRAITS(v256, v_int8x32, schar, s8, v_uint8x32, v_int16x16, v_int32x8, v_int8x32, void);
|
||||
CV_DEF_REG_TRAITS(v256, v_uint16x16, ushort, u16, v_uint16x16, v_uint32x8, v_uint64x4, v_int16x16, void);
|
||||
CV_DEF_REG_TRAITS(v256, v_int16x16, short, s16, v_uint16x16, v_int32x8, v_int64x4, v_int16x16, void);
|
||||
CV_DEF_REG_TRAITS(v256, v_uint32x8, unsigned, u32, v_uint32x8, v_uint64x4, void, v_int32x8, void);
|
||||
CV_DEF_REG_TRAITS(v256, v_int32x8, int, s32, v_uint32x8, v_int64x4, void, v_int32x8, void);
|
||||
CV_DEF_REG_TRAITS(v256, v_float32x8, float, f32, v_float32x8, v_float64x4, void, v_int32x8, v_int32x8);
|
||||
CV_DEF_REG_TRAITS(v256, v_uint64x4, uint64, u64, v_uint64x4, void, void, v_int64x4, void);
|
||||
CV_DEF_REG_TRAITS(v256, v_int64x4, int64, s64, v_uint64x4, void, void, v_int64x4, void);
|
||||
CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8);
|
||||
#endif
|
||||
|
||||
#if CV_SIMD512
|
||||
CV_DEF_REG_TRAITS(v512, v_uint8x64, uchar, u8, v_uint8x64, v_uint16x32, v_uint32x16, v_int8x64, void);
|
||||
CV_DEF_REG_TRAITS(v512, v_int8x64, schar, s8, v_uint8x64, v_int16x32, v_int32x16, v_int8x64, void);
|
||||
CV_DEF_REG_TRAITS(v512, v_uint16x32, ushort, u16, v_uint16x32, v_uint32x16, v_uint64x8, v_int16x32, void);
|
||||
CV_DEF_REG_TRAITS(v512, v_int16x32, short, s16, v_uint16x32, v_int32x16, v_int64x8, v_int16x32, void);
|
||||
CV_DEF_REG_TRAITS(v512, v_uint32x16, unsigned, u32, v_uint32x16, v_uint64x8, void, v_int32x16, void);
|
||||
CV_DEF_REG_TRAITS(v512, v_int32x16, int, s32, v_uint32x16, v_int64x8, void, v_int32x16, void);
|
||||
CV_DEF_REG_TRAITS(v512, v_float32x16, float, f32, v_float32x16, v_float64x8, void, v_int32x16, v_int32x16);
|
||||
CV_DEF_REG_TRAITS(v512, v_uint64x8, uint64, u64, v_uint64x8, void, void, v_int64x8, void);
|
||||
CV_DEF_REG_TRAITS(v512, v_int64x8, int64, s64, v_uint64x8, void, void, v_int64x8, void);
|
||||
CV_DEF_REG_TRAITS(v512, v_float64x8, double, f64, v_float64x8, void, void, v_int64x8, v_int32x16);
|
||||
#endif
|
||||
#if CV_SIMD_SCALABLE
|
||||
CV_DEF_REG_TRAITS(v, v_uint8, uchar, u8, v_uint8, v_uint16, v_uint32, v_int8, void);
|
||||
CV_DEF_REG_TRAITS(v, v_int8, schar, s8, v_uint8, v_int16, v_int32, v_int8, void);
|
||||
CV_DEF_REG_TRAITS(v, v_uint16, ushort, u16, v_uint16, v_uint32, v_uint64, v_int16, void);
|
||||
CV_DEF_REG_TRAITS(v, v_int16, short, s16, v_uint16, v_int32, v_int64, v_int16, void);
|
||||
CV_DEF_REG_TRAITS(v, v_uint32, unsigned, u32, v_uint32, v_uint64, void, v_int32, void);
|
||||
CV_DEF_REG_TRAITS(v, v_int32, int, s32, v_uint32, v_int64, void, v_int32, void);
|
||||
CV_DEF_REG_TRAITS(v, v_float32, float, f32, v_float32, v_float64, void, v_int32, v_int32);
|
||||
CV_DEF_REG_TRAITS(v, v_uint64, uint64, u64, v_uint64, void, void, v_int64, void);
|
||||
CV_DEF_REG_TRAITS(v, v_int64, int64, s64, v_uint64, void, void, v_int64, void);
|
||||
CV_DEF_REG_TRAITS(v, v_float64, double, f64, v_float64, void, void, v_int64, v_int32);
|
||||
#endif
|
||||
//! @endcond
|
||||
|
||||
#if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
|
||||
#define CV__SIMD_NAMESPACE simd512
|
||||
namespace CV__SIMD_NAMESPACE {
|
||||
#define CV_SIMD 1
|
||||
#define CV_SIMD_64F CV_SIMD512_64F
|
||||
#define CV_SIMD_FP16 CV_SIMD512_FP16
|
||||
#define CV_SIMD_WIDTH 64
|
||||
//! @addtogroup core_hal_intrin
|
||||
//! @{
|
||||
//! @brief Maximum available vector register capacity 8-bit unsigned integer values
|
||||
typedef v_uint8x64 v_uint8;
|
||||
//! @brief Maximum available vector register capacity 8-bit signed integer values
|
||||
typedef v_int8x64 v_int8;
|
||||
//! @brief Maximum available vector register capacity 16-bit unsigned integer values
|
||||
typedef v_uint16x32 v_uint16;
|
||||
//! @brief Maximum available vector register capacity 16-bit signed integer values
|
||||
typedef v_int16x32 v_int16;
|
||||
//! @brief Maximum available vector register capacity 32-bit unsigned integer values
|
||||
typedef v_uint32x16 v_uint32;
|
||||
//! @brief Maximum available vector register capacity 32-bit signed integer values
|
||||
typedef v_int32x16 v_int32;
|
||||
//! @brief Maximum available vector register capacity 64-bit unsigned integer values
|
||||
typedef v_uint64x8 v_uint64;
|
||||
//! @brief Maximum available vector register capacity 64-bit signed integer values
|
||||
typedef v_int64x8 v_int64;
|
||||
//! @brief Maximum available vector register capacity 32-bit floating point values (single precision)
|
||||
typedef v_float32x16 v_float32;
|
||||
#if CV_SIMD512_64F
|
||||
//! @brief Maximum available vector register capacity 64-bit floating point values (double precision)
|
||||
typedef v_float64x8 v_float64;
|
||||
#endif
|
||||
//! @}
|
||||
|
||||
#define VXPREFIX(func) v512##func
|
||||
} // namespace
|
||||
using namespace CV__SIMD_NAMESPACE;
|
||||
#elif CV_SIMD256 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 256)
|
||||
#define CV__SIMD_NAMESPACE simd256
|
||||
namespace CV__SIMD_NAMESPACE {
|
||||
#define CV_SIMD 1
|
||||
#define CV_SIMD_64F CV_SIMD256_64F
|
||||
#define CV_SIMD_FP16 CV_SIMD256_FP16
|
||||
#define CV_SIMD_WIDTH 32
|
||||
//! @addtogroup core_hal_intrin
|
||||
//! @{
|
||||
//! @brief Maximum available vector register capacity 8-bit unsigned integer values
|
||||
typedef v_uint8x32 v_uint8;
|
||||
//! @brief Maximum available vector register capacity 8-bit signed integer values
|
||||
typedef v_int8x32 v_int8;
|
||||
//! @brief Maximum available vector register capacity 16-bit unsigned integer values
|
||||
typedef v_uint16x16 v_uint16;
|
||||
//! @brief Maximum available vector register capacity 16-bit signed integer values
|
||||
typedef v_int16x16 v_int16;
|
||||
//! @brief Maximum available vector register capacity 32-bit unsigned integer values
|
||||
typedef v_uint32x8 v_uint32;
|
||||
//! @brief Maximum available vector register capacity 32-bit signed integer values
|
||||
typedef v_int32x8 v_int32;
|
||||
//! @brief Maximum available vector register capacity 64-bit unsigned integer values
|
||||
typedef v_uint64x4 v_uint64;
|
||||
//! @brief Maximum available vector register capacity 64-bit signed integer values
|
||||
typedef v_int64x4 v_int64;
|
||||
//! @brief Maximum available vector register capacity 32-bit floating point values (single precision)
|
||||
typedef v_float32x8 v_float32;
|
||||
#if CV_SIMD256_64F
|
||||
//! @brief Maximum available vector register capacity 64-bit floating point values (double precision)
|
||||
typedef v_float64x4 v_float64;
|
||||
#endif
|
||||
//! @}
|
||||
|
||||
#define VXPREFIX(func) v256##func
|
||||
} // namespace
|
||||
using namespace CV__SIMD_NAMESPACE;
|
||||
#elif (CV_SIMD128 || CV_SIMD128_CPP) && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 128)
|
||||
#if defined CV_SIMD128_CPP
|
||||
#define CV__SIMD_NAMESPACE simd128_cpp
|
||||
#else
|
||||
#define CV__SIMD_NAMESPACE simd128
|
||||
#endif
|
||||
namespace CV__SIMD_NAMESPACE {
|
||||
#define CV_SIMD CV_SIMD128
|
||||
#define CV_SIMD_64F CV_SIMD128_64F
|
||||
#define CV_SIMD_WIDTH 16
|
||||
//! @addtogroup core_hal_intrin
|
||||
//! @{
|
||||
//! @brief Maximum available vector register capacity 8-bit unsigned integer values
|
||||
typedef v_uint8x16 v_uint8;
|
||||
//! @brief Maximum available vector register capacity 8-bit signed integer values
|
||||
typedef v_int8x16 v_int8;
|
||||
//! @brief Maximum available vector register capacity 16-bit unsigned integer values
|
||||
typedef v_uint16x8 v_uint16;
|
||||
//! @brief Maximum available vector register capacity 16-bit signed integer values
|
||||
typedef v_int16x8 v_int16;
|
||||
//! @brief Maximum available vector register capacity 32-bit unsigned integer values
|
||||
typedef v_uint32x4 v_uint32;
|
||||
//! @brief Maximum available vector register capacity 32-bit signed integer values
|
||||
typedef v_int32x4 v_int32;
|
||||
//! @brief Maximum available vector register capacity 64-bit unsigned integer values
|
||||
typedef v_uint64x2 v_uint64;
|
||||
//! @brief Maximum available vector register capacity 64-bit signed integer values
|
||||
typedef v_int64x2 v_int64;
|
||||
//! @brief Maximum available vector register capacity 32-bit floating point values (single precision)
|
||||
typedef v_float32x4 v_float32;
|
||||
#if CV_SIMD128_64F
|
||||
//! @brief Maximum available vector register capacity 64-bit floating point values (double precision)
|
||||
typedef v_float64x2 v_float64;
|
||||
#endif
|
||||
//! @}
|
||||
|
||||
#define VXPREFIX(func) v##func
|
||||
} // namespace
|
||||
using namespace CV__SIMD_NAMESPACE;
|
||||
|
||||
#elif CV_SIMD_SCALABLE
|
||||
#define CV__SIMD_NAMESPACE simd
|
||||
namespace CV__SIMD_NAMESPACE {
|
||||
#define CV_SIMD 0
|
||||
#define CV_SIMD_WIDTH 128 /* 1024/8 */
|
||||
|
||||
#define VXPREFIX(func) v##func
|
||||
} // namespace
|
||||
using namespace CV__SIMD_NAMESPACE;
|
||||
|
||||
#endif
|
||||
|
||||
//! @cond IGNORED
|
||||
#ifndef CV_SIMD_64F
|
||||
#define CV_SIMD_64F 0
|
||||
#endif
|
||||
|
||||
namespace CV__SIMD_NAMESPACE {
|
||||
//! @addtogroup core_hal_intrin
|
||||
//! @{
|
||||
//! @name Wide init with value
|
||||
//! @{
|
||||
//! @brief Create maximum available capacity vector with elements set to a specific value
|
||||
inline v_uint8 vx_setall_u8(uchar v) { return VXPREFIX(_setall_u8)(v); }
|
||||
inline v_int8 vx_setall_s8(schar v) { return VXPREFIX(_setall_s8)(v); }
|
||||
inline v_uint16 vx_setall_u16(ushort v) { return VXPREFIX(_setall_u16)(v); }
|
||||
inline v_int16 vx_setall_s16(short v) { return VXPREFIX(_setall_s16)(v); }
|
||||
inline v_int32 vx_setall_s32(int v) { return VXPREFIX(_setall_s32)(v); }
|
||||
inline v_uint32 vx_setall_u32(unsigned v) { return VXPREFIX(_setall_u32)(v); }
|
||||
inline v_float32 vx_setall_f32(float v) { return VXPREFIX(_setall_f32)(v); }
|
||||
inline v_int64 vx_setall_s64(int64 v) { return VXPREFIX(_setall_s64)(v); }
|
||||
inline v_uint64 vx_setall_u64(uint64 v) { return VXPREFIX(_setall_u64)(v); }
|
||||
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
|
||||
inline v_float64 vx_setall_f64(double v) { return VXPREFIX(_setall_f64)(v); }
|
||||
#endif
|
||||
//! @}
|
||||
|
||||
//! @name Wide init with zero
|
||||
//! @{
|
||||
//! @brief Create maximum available capacity vector with elements set to zero
|
||||
inline v_uint8 vx_setzero_u8() { return VXPREFIX(_setzero_u8)(); }
|
||||
inline v_int8 vx_setzero_s8() { return VXPREFIX(_setzero_s8)(); }
|
||||
inline v_uint16 vx_setzero_u16() { return VXPREFIX(_setzero_u16)(); }
|
||||
inline v_int16 vx_setzero_s16() { return VXPREFIX(_setzero_s16)(); }
|
||||
inline v_int32 vx_setzero_s32() { return VXPREFIX(_setzero_s32)(); }
|
||||
inline v_uint32 vx_setzero_u32() { return VXPREFIX(_setzero_u32)(); }
|
||||
inline v_float32 vx_setzero_f32() { return VXPREFIX(_setzero_f32)(); }
|
||||
inline v_int64 vx_setzero_s64() { return VXPREFIX(_setzero_s64)(); }
|
||||
inline v_uint64 vx_setzero_u64() { return VXPREFIX(_setzero_u64)(); }
|
||||
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
|
||||
inline v_float64 vx_setzero_f64() { return VXPREFIX(_setzero_f64)(); }
|
||||
#endif
|
||||
//! @}
|
||||

//! @name Wide load from memory
//! @{
//! @brief Load maximum available capacity register contents from memory
inline v_uint8 vx_load(const uchar * ptr) { return VXPREFIX(_load)(ptr); }
inline v_int8 vx_load(const schar * ptr) { return VXPREFIX(_load)(ptr); }
inline v_uint16 vx_load(const ushort * ptr) { return VXPREFIX(_load)(ptr); }
inline v_int16 vx_load(const short * ptr) { return VXPREFIX(_load)(ptr); }
inline v_int32 vx_load(const int * ptr) { return VXPREFIX(_load)(ptr); }
inline v_uint32 vx_load(const unsigned * ptr) { return VXPREFIX(_load)(ptr); }
inline v_float32 vx_load(const float * ptr) { return VXPREFIX(_load)(ptr); }
inline v_int64 vx_load(const int64 * ptr) { return VXPREFIX(_load)(ptr); }
inline v_uint64 vx_load(const uint64 * ptr) { return VXPREFIX(_load)(ptr); }
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
inline v_float64 vx_load(const double * ptr) { return VXPREFIX(_load)(ptr); }
#endif
//! @}

//! @name Wide load from memory (aligned)
//! @{
//! @brief Load maximum available capacity register contents from aligned memory
inline v_uint8 vx_load_aligned(const uchar * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_int8 vx_load_aligned(const schar * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_uint16 vx_load_aligned(const ushort * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_int16 vx_load_aligned(const short * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_int32 vx_load_aligned(const int * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_uint32 vx_load_aligned(const unsigned * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_float32 vx_load_aligned(const float * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_int64 vx_load_aligned(const int64 * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_uint64 vx_load_aligned(const uint64 * ptr) { return VXPREFIX(_load_aligned)(ptr); }
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
inline v_float64 vx_load_aligned(const double * ptr) { return VXPREFIX(_load_aligned)(ptr); }
#endif
//! @}

//! @name Wide load lower half from memory
//! @{
//! @brief Load lower half of maximum available capacity register from memory
inline v_uint8 vx_load_low(const uchar * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_int8 vx_load_low(const schar * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_uint16 vx_load_low(const ushort * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_int16 vx_load_low(const short * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_int32 vx_load_low(const int * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_uint32 vx_load_low(const unsigned * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_float32 vx_load_low(const float * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_int64 vx_load_low(const int64 * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_uint64 vx_load_low(const uint64 * ptr) { return VXPREFIX(_load_low)(ptr); }
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
inline v_float64 vx_load_low(const double * ptr) { return VXPREFIX(_load_low)(ptr); }
#endif
//! @}

//! @name Wide load halves from memory
//! @{
//! @brief Load maximum available capacity register contents from two memory blocks
inline v_uint8 vx_load_halves(const uchar * ptr0, const uchar * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_int8 vx_load_halves(const schar * ptr0, const schar * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_uint16 vx_load_halves(const ushort * ptr0, const ushort * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_int16 vx_load_halves(const short * ptr0, const short * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_int32 vx_load_halves(const int * ptr0, const int * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_uint32 vx_load_halves(const unsigned * ptr0, const unsigned * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_float32 vx_load_halves(const float * ptr0, const float * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_int64 vx_load_halves(const int64 * ptr0, const int64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_uint64 vx_load_halves(const uint64 * ptr0, const uint64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
inline v_float64 vx_load_halves(const double * ptr0, const double * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
#endif
//! @}
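//! A brief sketch of the usual load/compute/store pattern, assuming a float
//! buffer at least one register wide (the function name is illustrative):
//! @code
//! void scale_prefix(float* data, int n, float k)
//! {
//!     const int step = VTraits<v_float32>::vlanes();
//!     v_float32 vk = vx_setall_f32(k);
//!     for (int i = 0; i + step <= n; i += step)
//!         v_store(data + i, v_mul(vx_load(data + i), vk));  // vx_load accepts unaligned pointers
//! }
//! @endcode
//! vx_load_aligned additionally requires full register alignment, while
//! vx_load_halves gathers the two register halves from separate addresses.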

//! @name Wide LUT of elements
//! @{
//! @brief Load maximum available capacity register contents with array elements by provided indexes
inline v_uint8 vx_lut(const uchar * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_int8 vx_lut(const schar * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_uint16 vx_lut(const ushort * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_int16 vx_lut(const short* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_int32 vx_lut(const int* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_uint32 vx_lut(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_float32 vx_lut(const float* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_int64 vx_lut(const int64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_uint64 vx_lut(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
inline v_float64 vx_lut(const double* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
#endif
//! @}

//! @name Wide LUT of element pairs
//! @{
//! @brief Load maximum available capacity register contents with array element pairs by provided indexes
inline v_uint8 vx_lut_pairs(const uchar * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_int8 vx_lut_pairs(const schar * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_uint16 vx_lut_pairs(const ushort * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_int16 vx_lut_pairs(const short* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_int32 vx_lut_pairs(const int* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_uint32 vx_lut_pairs(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_float32 vx_lut_pairs(const float* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_int64 vx_lut_pairs(const int64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_uint64 vx_lut_pairs(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
inline v_float64 vx_lut_pairs(const double* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
#endif
//! @}

//! @name Wide LUT of element quads
//! @{
//! @brief Load maximum available capacity register contents with array element quads by provided indexes
inline v_uint8 vx_lut_quads(const uchar* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_int8 vx_lut_quads(const schar* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_uint16 vx_lut_quads(const ushort* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_int16 vx_lut_quads(const short* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_int32 vx_lut_quads(const int* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_uint32 vx_lut_quads(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_float32 vx_lut_quads(const float* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
//! @}
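//! An illustrative gather through the LUT loads, assuming idx holds one index
//! per destination lane (vx_lut), lane pair (vx_lut_pairs) or lane quad
//! (vx_lut_quads); the names below are hypothetical:
//! @code
//! float table[256];
//! int idx[VTraits<v_float32>::max_nlanes];   // filled elsewhere
//! v_float32 gathered = vx_lut(table, idx);   // gathered[i] == table[idx[i]]
//! @endcode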

//! @name Wide load with double expansion
//! @{
//! @brief Load maximum available capacity register contents from memory with double expand
inline v_uint16 vx_load_expand(const uchar * ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_int16 vx_load_expand(const schar * ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_uint32 vx_load_expand(const ushort * ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_int32 vx_load_expand(const short* ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_int64 vx_load_expand(const int* ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_uint64 vx_load_expand(const unsigned* ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_float32 vx_load_expand(const hfloat * ptr) { return VXPREFIX(_load_expand)(ptr); }
//! @}

//! @name Wide load with quad expansion
//! @{
//! @brief Load maximum available capacity register contents from memory with quad expand
inline v_uint32 vx_load_expand_q(const uchar * ptr) { return VXPREFIX(_load_expand_q)(ptr); }
inline v_int32 vx_load_expand_q(const schar * ptr) { return VXPREFIX(_load_expand_q)(ptr); }
//! @}
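//! Widening loads read narrow elements and extend them in one step, so only
//! part of a full register's worth of memory is consumed. A short sketch:
//! @code
//! uchar pixels[64];                        // illustrative source buffer
//! v_uint16 w = vx_load_expand(pixels);     // 8-bit -> 16-bit lanes
//! v_uint32 q = vx_load_expand_q(pixels);   // 8-bit -> 32-bit lanes
//! @endcode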

/** @brief SIMD processing state cleanup call */
inline void vx_cleanup() { VXPREFIX(_cleanup)(); }
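// Informal usage note: vx_cleanup() is intended to be called once after a
// block of universal-intrinsics code; on most modern backends it is a no-op,
// and on the remaining targets it restores SIMD/FPU state shared with scalar
// code, so placing it after each vectorized loop is a safe habit.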

#if !CV_SIMD_SCALABLE
// Compatibility layer
#if !(CV_NEON && !defined(CV_FORCE_SIMD128_CPP))
template<typename T> struct VTraits {
    static inline int vlanes() { return T::nlanes; }
    enum { nlanes = T::nlanes, max_nlanes = T::nlanes };
    using lane_type = typename T::lane_type;
};
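// Illustrative only: VTraits lets generic code size stack buffers and query
// the lane type at compile time on these fixed-width backends, e.g.
//   typename VTraits<v_int32>::lane_type buf[VTraits<v_int32>::max_nlanes];
//   v_store(buf, vx_setzero_s32());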

//////////// get0 ////////////
#define OPENCV_HAL_WRAP_GRT0(_Tpvec) \
inline typename VTraits<_Tpvec>::lane_type v_get0(const _Tpvec& v) \
{ \
    return v.get0(); \
}

OPENCV_HAL_WRAP_GRT0(v_uint8)
OPENCV_HAL_WRAP_GRT0(v_int8)
OPENCV_HAL_WRAP_GRT0(v_uint16)
OPENCV_HAL_WRAP_GRT0(v_int16)
OPENCV_HAL_WRAP_GRT0(v_uint32)
OPENCV_HAL_WRAP_GRT0(v_int32)
OPENCV_HAL_WRAP_GRT0(v_uint64)
OPENCV_HAL_WRAP_GRT0(v_int64)
OPENCV_HAL_WRAP_GRT0(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_GRT0(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_GRT0(v_uint8x16)
OPENCV_HAL_WRAP_GRT0(v_uint16x8)
OPENCV_HAL_WRAP_GRT0(v_uint32x4)
OPENCV_HAL_WRAP_GRT0(v_uint64x2)
OPENCV_HAL_WRAP_GRT0(v_int8x16)
OPENCV_HAL_WRAP_GRT0(v_int16x8)
OPENCV_HAL_WRAP_GRT0(v_int32x4)
OPENCV_HAL_WRAP_GRT0(v_int64x2)
OPENCV_HAL_WRAP_GRT0(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_GRT0(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_GRT0(v_uint8x32)
OPENCV_HAL_WRAP_GRT0(v_uint16x16)
OPENCV_HAL_WRAP_GRT0(v_uint32x8)
OPENCV_HAL_WRAP_GRT0(v_uint64x4)
OPENCV_HAL_WRAP_GRT0(v_int8x32)
OPENCV_HAL_WRAP_GRT0(v_int16x16)
OPENCV_HAL_WRAP_GRT0(v_int32x8)
OPENCV_HAL_WRAP_GRT0(v_int64x4)
OPENCV_HAL_WRAP_GRT0(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_GRT0(v_float64x4)
#endif
#endif
#endif

#define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \
template<typename... Args> \
inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const _Tpvec& f3, const Args&... vf) { \
    return v_add(v_add(f1, f2), f3, vf...); \
}
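// The variadic wrapper above folds any number of operands left to right, so a
// multi-term sum stays readable. Illustrative:
//   v_float32 s = v_add(a, b, c, d);  // == v_add(v_add(v_add(a, b), c), d)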

OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
// when we use CV_SIMD128 with 256/512 bit SIMD (e.g. AVX2 or AVX512)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x2)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
// when we use CV_SIMD256 with 512 bit SIMD (e.g. AVX512)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4)
#endif
#endif

#define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
template<typename... Args> \
inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const _Tpvec& f3, const Args&... vf) { \
    return v_mul(v_mul(f1, f2), f3, vf...); \
}
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x4)
#endif
#endif

#define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \
inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \
{ \
    return v_extract_n<VTraits<_Tpvec>::nlanes-1>(v); \
}
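// Illustrative: v_extract_highest reads the last lane back as a scalar,
// e.g. after a computation that leaves its result in the top lane:
//   float last = v_extract_highest(vx_load(buf));  // buf[vlanes() - 1]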

OPENCV_HAL_WRAP_EXTRACT(v_uint8)
OPENCV_HAL_WRAP_EXTRACT(v_int8)
OPENCV_HAL_WRAP_EXTRACT(v_uint16)
OPENCV_HAL_WRAP_EXTRACT(v_int16)
OPENCV_HAL_WRAP_EXTRACT(v_uint32)
OPENCV_HAL_WRAP_EXTRACT(v_int32)
OPENCV_HAL_WRAP_EXTRACT(v_uint64)
OPENCV_HAL_WRAP_EXTRACT(v_int64)
OPENCV_HAL_WRAP_EXTRACT(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_EXTRACT(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_EXTRACT(v_uint8x16)
OPENCV_HAL_WRAP_EXTRACT(v_uint16x8)
OPENCV_HAL_WRAP_EXTRACT(v_uint32x4)
OPENCV_HAL_WRAP_EXTRACT(v_uint64x2)
OPENCV_HAL_WRAP_EXTRACT(v_int8x16)
OPENCV_HAL_WRAP_EXTRACT(v_int16x8)
OPENCV_HAL_WRAP_EXTRACT(v_int32x4)
OPENCV_HAL_WRAP_EXTRACT(v_int64x2)
OPENCV_HAL_WRAP_EXTRACT(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_EXTRACT(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_EXTRACT(v_uint8x32)
OPENCV_HAL_WRAP_EXTRACT(v_uint16x16)
OPENCV_HAL_WRAP_EXTRACT(v_uint32x8)
OPENCV_HAL_WRAP_EXTRACT(v_uint64x4)
OPENCV_HAL_WRAP_EXTRACT(v_int8x32)
OPENCV_HAL_WRAP_EXTRACT(v_int16x16)
OPENCV_HAL_WRAP_EXTRACT(v_int32x8)
OPENCV_HAL_WRAP_EXTRACT(v_int64x4)
OPENCV_HAL_WRAP_EXTRACT(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_EXTRACT(v_float64x4)
#endif
#endif

#define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \
inline _Tpvec v_broadcast_highest(const _Tpvec& v) \
{ \
    return v_broadcast_element<VTraits<_Tpvec>::nlanes-1>(v); \
}

OPENCV_HAL_WRAP_BROADCAST(v_uint32)
OPENCV_HAL_WRAP_BROADCAST(v_int32)
OPENCV_HAL_WRAP_BROADCAST(v_float32)
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BROADCAST(v_uint32x4)
OPENCV_HAL_WRAP_BROADCAST(v_int32x4)
OPENCV_HAL_WRAP_BROADCAST(v_float32x4)
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BROADCAST(v_uint32x8)
OPENCV_HAL_WRAP_BROADCAST(v_int32x8)
OPENCV_HAL_WRAP_BROADCAST(v_float32x8)
#endif

#endif //!CV_SIMD_SCALABLE

//! @cond IGNORED

// backward compatibility
template<typename _Tp, typename _Tvec> static inline
void vx_store(_Tp* dst, const _Tvec& v) { return v_store(dst, v); }
// backward compatibility
template<typename _Tp, typename _Tvec> static inline
void vx_store_aligned(_Tp* dst, const _Tvec& v) { return v_store_aligned(dst, v); }

//! @endcond

//! @}
#undef VXPREFIX
} // namespace

#ifndef CV_SIMD_FP16
#define CV_SIMD_FP16 0 //!< Defined to 1 on native support of operations with float16x8_t / float16x16_t (SIMD256) types
#endif

#ifndef CV_SIMD
#define CV_SIMD 0
#endif

#include "simd_utils.impl.hpp"

#ifndef CV_DOXYGEN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
#endif

} // cv::

//! @endcond

#if defined(__GNUC__) && __GNUC__ == 12
#pragma GCC diagnostic pop
#endif

#endif
3189
3rdpart/OpenCV/include/opencv2/core/hal/intrin_avx.hpp
Normal file
File diff suppressed because it is too large
3101
3rdpart/OpenCV/include/opencv2/core/hal/intrin_avx512.hpp
Normal file
File diff suppressed because it is too large
3388
3rdpart/OpenCV/include/opencv2/core/hal/intrin_cpp.hpp
Normal file
File diff suppressed because it is too large
191
3rdpart/OpenCV/include/opencv2/core/hal/intrin_forward.hpp
Normal file
@@ -0,0 +1,191 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html

#ifndef CV__SIMD_FORWARD
#error "Need to pre-define forward width"
#endif

namespace cv
{

//! @cond IGNORED

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

/** Types **/
#if CV__SIMD_FORWARD == 1024
// [todo] 1024
#error "1024-long ops not implemented yet"
#elif CV__SIMD_FORWARD == 512
// 512
#define __CV_VX(fun) v512_##fun
#define __CV_V_UINT8 v_uint8x64
#define __CV_V_INT8 v_int8x64
#define __CV_V_UINT16 v_uint16x32
#define __CV_V_INT16 v_int16x32
#define __CV_V_UINT32 v_uint32x16
#define __CV_V_INT32 v_int32x16
#define __CV_V_UINT64 v_uint64x8
#define __CV_V_INT64 v_int64x8
#define __CV_V_FLOAT32 v_float32x16
#define __CV_V_FLOAT64 v_float64x8
struct v_uint8x64;
struct v_int8x64;
struct v_uint16x32;
struct v_int16x32;
struct v_uint32x16;
struct v_int32x16;
struct v_uint64x8;
struct v_int64x8;
struct v_float32x16;
struct v_float64x8;
#elif CV__SIMD_FORWARD == 256
// 256
#define __CV_VX(fun) v256_##fun
#define __CV_V_UINT8 v_uint8x32
#define __CV_V_INT8 v_int8x32
#define __CV_V_UINT16 v_uint16x16
#define __CV_V_INT16 v_int16x16
#define __CV_V_UINT32 v_uint32x8
#define __CV_V_INT32 v_int32x8
#define __CV_V_UINT64 v_uint64x4
#define __CV_V_INT64 v_int64x4
#define __CV_V_FLOAT32 v_float32x8
#define __CV_V_FLOAT64 v_float64x4
struct v_uint8x32;
struct v_int8x32;
struct v_uint16x16;
struct v_int16x16;
struct v_uint32x8;
struct v_int32x8;
struct v_uint64x4;
struct v_int64x4;
struct v_float32x8;
struct v_float64x4;
#else
// 128
#define __CV_VX(fun) v_##fun
#define __CV_V_UINT8 v_uint8x16
#define __CV_V_INT8 v_int8x16
#define __CV_V_UINT16 v_uint16x8
#define __CV_V_INT16 v_int16x8
#define __CV_V_UINT32 v_uint32x4
#define __CV_V_INT32 v_int32x4
#define __CV_V_UINT64 v_uint64x2
#define __CV_V_INT64 v_int64x2
#define __CV_V_FLOAT32 v_float32x4
#define __CV_V_FLOAT64 v_float64x2
struct v_uint8x16;
struct v_int8x16;
struct v_uint16x8;
struct v_int16x8;
struct v_uint32x4;
struct v_int32x4;
struct v_uint64x2;
struct v_int64x2;
struct v_float32x4;
struct v_float64x2;
#endif

/** Value reordering **/

// Expansion
void v_expand(const __CV_V_UINT8&, __CV_V_UINT16&, __CV_V_UINT16&);
void v_expand(const __CV_V_INT8&, __CV_V_INT16&, __CV_V_INT16&);
void v_expand(const __CV_V_UINT16&, __CV_V_UINT32&, __CV_V_UINT32&);
void v_expand(const __CV_V_INT16&, __CV_V_INT32&, __CV_V_INT32&);
void v_expand(const __CV_V_UINT32&, __CV_V_UINT64&, __CV_V_UINT64&);
void v_expand(const __CV_V_INT32&, __CV_V_INT64&, __CV_V_INT64&);
// Low Expansion
__CV_V_UINT16 v_expand_low(const __CV_V_UINT8&);
__CV_V_INT16 v_expand_low(const __CV_V_INT8&);
__CV_V_UINT32 v_expand_low(const __CV_V_UINT16&);
__CV_V_INT32 v_expand_low(const __CV_V_INT16&);
__CV_V_UINT64 v_expand_low(const __CV_V_UINT32&);
__CV_V_INT64 v_expand_low(const __CV_V_INT32&);
// High Expansion
__CV_V_UINT16 v_expand_high(const __CV_V_UINT8&);
__CV_V_INT16 v_expand_high(const __CV_V_INT8&);
__CV_V_UINT32 v_expand_high(const __CV_V_UINT16&);
__CV_V_INT32 v_expand_high(const __CV_V_INT16&);
__CV_V_UINT64 v_expand_high(const __CV_V_UINT32&);
__CV_V_INT64 v_expand_high(const __CV_V_INT32&);
// Load & Low Expansion
__CV_V_UINT16 __CV_VX(load_expand)(const uchar*);
__CV_V_INT16 __CV_VX(load_expand)(const schar*);
__CV_V_UINT32 __CV_VX(load_expand)(const ushort*);
__CV_V_INT32 __CV_VX(load_expand)(const short*);
__CV_V_UINT64 __CV_VX(load_expand)(const uint*);
__CV_V_INT64 __CV_VX(load_expand)(const int*);
// Load lower 8-bit and expand into 32-bit
__CV_V_UINT32 __CV_VX(load_expand_q)(const uchar*);
__CV_V_INT32 __CV_VX(load_expand_q)(const schar*);

// Saturating Pack
__CV_V_UINT8 v_pack(const __CV_V_UINT16&, const __CV_V_UINT16&);
__CV_V_INT8 v_pack(const __CV_V_INT16&, const __CV_V_INT16&);
__CV_V_UINT16 v_pack(const __CV_V_UINT32&, const __CV_V_UINT32&);
__CV_V_INT16 v_pack(const __CV_V_INT32&, const __CV_V_INT32&);
// Non-saturating Pack
__CV_V_UINT32 v_pack(const __CV_V_UINT64&, const __CV_V_UINT64&);
__CV_V_INT32 v_pack(const __CV_V_INT64&, const __CV_V_INT64&);
// Pack signed integers with unsigned saturation
__CV_V_UINT8 v_pack_u(const __CV_V_INT16&, const __CV_V_INT16&);
__CV_V_UINT16 v_pack_u(const __CV_V_INT32&, const __CV_V_INT32&);

/** Arithmetic, bitwise and comparison operations **/

// Non-saturating multiply
#if CV_VSX
template<typename Tvec>
Tvec v_mul_wrap(const Tvec& a, const Tvec& b);
#else
__CV_V_UINT8 v_mul_wrap(const __CV_V_UINT8&, const __CV_V_UINT8&);
__CV_V_INT8 v_mul_wrap(const __CV_V_INT8&, const __CV_V_INT8&);
__CV_V_UINT16 v_mul_wrap(const __CV_V_UINT16&, const __CV_V_UINT16&);
__CV_V_INT16 v_mul_wrap(const __CV_V_INT16&, const __CV_V_INT16&);
#endif

// Multiply and expand
#if CV_VSX
template<typename Tvec, typename Twvec>
void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d);
#else
void v_mul_expand(const __CV_V_UINT8&, const __CV_V_UINT8&, __CV_V_UINT16&, __CV_V_UINT16&);
void v_mul_expand(const __CV_V_INT8&, const __CV_V_INT8&, __CV_V_INT16&, __CV_V_INT16&);
void v_mul_expand(const __CV_V_UINT16&, const __CV_V_UINT16&, __CV_V_UINT32&, __CV_V_UINT32&);
void v_mul_expand(const __CV_V_INT16&, const __CV_V_INT16&, __CV_V_INT32&, __CV_V_INT32&);
void v_mul_expand(const __CV_V_UINT32&, const __CV_V_UINT32&, __CV_V_UINT64&, __CV_V_UINT64&);
void v_mul_expand(const __CV_V_INT32&, const __CV_V_INT32&, __CV_V_INT64&, __CV_V_INT64&);
#endif

// Conversions
__CV_V_FLOAT32 v_cvt_f32(const __CV_V_INT32& a);
__CV_V_FLOAT32 v_cvt_f32(const __CV_V_FLOAT64& a);
__CV_V_FLOAT32 v_cvt_f32(const __CV_V_FLOAT64& a, const __CV_V_FLOAT64& b);
__CV_V_FLOAT64 v_cvt_f64(const __CV_V_INT32& a);
__CV_V_FLOAT64 v_cvt_f64_high(const __CV_V_INT32& a);
__CV_V_FLOAT64 v_cvt_f64(const __CV_V_FLOAT32& a);
__CV_V_FLOAT64 v_cvt_f64_high(const __CV_V_FLOAT32& a);
__CV_V_FLOAT64 v_cvt_f64(const __CV_V_INT64& a);

/** Cleanup **/
#undef CV__SIMD_FORWARD
#undef __CV_VX
#undef __CV_V_UINT8
#undef __CV_V_INT8
#undef __CV_V_UINT16
#undef __CV_V_INT16
#undef __CV_V_UINT32
#undef __CV_V_INT32
#undef __CV_V_UINT64
#undef __CV_V_INT64
#undef __CV_V_FLOAT32
#undef __CV_V_FLOAT64

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

//! @endcond

} // cv::
3036
3rdpart/OpenCV/include/opencv2/core/hal/intrin_lasx.hpp
Normal file
File diff suppressed because it is too large
111
3rdpart/OpenCV/include/opencv2/core/hal/intrin_legacy_ops.h
Normal file
@@ -0,0 +1,111 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html

// This file has been created for compatibility with older versions of Universal Intrinsics
// Binary operators for vector types have been removed since version 4.11
// Include this file manually after OpenCV headers if you need these operators
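// A minimal sketch of the intended include order (paths assume a standard
// OpenCV layout; the snippet is illustrative, not shipped code):
//
//   #include <opencv2/core.hpp>
//   #include <opencv2/core/hal/intrin.hpp>
//   #include <opencv2/core/hal/intrin_legacy_ops.h>
//
//   cv::v_float32 a = cv::vx_setall_f32(1.f), b = cv::vx_setall_f32(2.f);
//   cv::v_float32 c = a + b;  // resolves to v_add(a, b) via the templates below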

#ifndef OPENCV_HAL_INTRIN_LEGACY_OPS_HPP
#define OPENCV_HAL_INTRIN_LEGACY_OPS_HPP

#ifdef __OPENCV_BUILD
#error "Universal Intrinsics operators are deprecated and should not be used in OpenCV library"
#endif

#ifdef __riscv
#warning "Operators might conflict with built-in functions on RISC-V platform"
#endif

#if defined(CV_VERSION) && CV_VERSION_MAJOR == 4 && CV_VERSION_MINOR < 9
#warning "Older versions of OpenCV (<4.9) already have Universal Intrinsics operators"
#endif


namespace cv { namespace hal {

#define BIN_OP(OP, FUN) \
template <typename R> R operator OP (const R & lhs, const R & rhs) { return FUN(lhs, rhs); }

#define BIN_A_OP(OP, FUN) \
template <typename R> R & operator OP (R & res, const R & val) { res = FUN(res, val); return res; }

#define UN_OP(OP, FUN) \
template <typename R> R operator OP (const R & val) { return FUN(val); }

BIN_OP(+, v_add)
BIN_OP(-, v_sub)
BIN_OP(*, v_mul)
BIN_OP(/, v_div)
BIN_OP(&, v_and)
BIN_OP(|, v_or)
BIN_OP(^, v_xor)

BIN_OP(==, v_eq)
BIN_OP(!=, v_ne)
BIN_OP(<, v_lt)
BIN_OP(>, v_gt)
BIN_OP(<=, v_le)
BIN_OP(>=, v_ge)

BIN_A_OP(+=, v_add)
BIN_A_OP(-=, v_sub)
BIN_A_OP(*=, v_mul)
BIN_A_OP(/=, v_div)
BIN_A_OP(&=, v_and)
BIN_A_OP(|=, v_or)
BIN_A_OP(^=, v_xor)

UN_OP(~, v_not)

// TODO: shift operators?

}} // cv::hal::

//==============================================================================

#ifdef OPENCV_ENABLE_INLINE_INTRIN_OPERATOR_TEST

namespace cv { namespace hal {

inline static void opencv_operator_compile_test()
{
    using namespace cv;
    v_float32 a, b, c;
    uint8_t shift = 1;
    a = b + c;
    a = b - c;
    a = b * c;
    a = b / c;
    a = b & c;
    a = b | c;
    a = b ^ c;
    // a = b >> shift;
    // a = b << shift;

    a = (b == c);
    a = (b != c);
    a = (b < c);
    a = (b > c);
    a = (b <= c);
    a = (b >= c);

    a += b;
    a -= b;
    a *= b;
    a /= b;
    a &= b;
    a |= b;
    a ^= b;
    // a <<= shift;
    // a >>= shift;

    a = ~b;
}

}} // cv::hal::

#endif


#endif // OPENCV_HAL_INTRIN_LEGACY_OPS_HPP
2546
3rdpart/OpenCV/include/opencv2/core/hal/intrin_lsx.hpp
Normal file
File diff suppressed because it is too large
687
3rdpart/OpenCV/include/opencv2/core/hal/intrin_math.hpp
Normal file
@@ -0,0 +1,687 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html


/* Universal Intrinsics implementation of sin, cos, exp and log

   Inspired by Intel Approximate Math library, and based on the
   corresponding algorithms of the cephes math library
*/

/* Copyright (C) 2010,2011 RJVB - extensions */
/* Copyright (C) 2011 Julien Pommier

  This software is provided 'as-is', without any express or implied
  warranty. In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  (this is the zlib license)
*/
#ifndef OPENCV_HAL_INTRIN_MATH_HPP
#define OPENCV_HAL_INTRIN_MATH_HPP

//! @name Exponential
//! @{
// The f16 implementation follows the same scheme as the float32 version below.
template<typename _TpVec16F, typename _TpVec16S>
inline _TpVec16F v_exp_default_16f(const _TpVec16F &x) {
    const _TpVec16F _vexp_lo_f16 = v_setall_<_TpVec16F>(-10.7421875f);
    const _TpVec16F _vexp_hi_f16 = v_setall_<_TpVec16F>(11.f);
    const _TpVec16F _vexp_half_fp16 = v_setall_<_TpVec16F>(0.5f);
    const _TpVec16F _vexp_one_fp16 = v_setall_<_TpVec16F>(1.f);
    const _TpVec16F _vexp_LOG2EF_f16 = v_setall_<_TpVec16F>(1.44269504088896341f);
    const _TpVec16F _vexp_C1_f16 = v_setall_<_TpVec16F>(-6.93359375E-1f);
    const _TpVec16F _vexp_C2_f16 = v_setall_<_TpVec16F>(2.12194440E-4f);
    const _TpVec16F _vexp_p0_f16 = v_setall_<_TpVec16F>(1.9875691500E-4f);
    const _TpVec16F _vexp_p1_f16 = v_setall_<_TpVec16F>(1.3981999507E-3f);
    const _TpVec16F _vexp_p2_f16 = v_setall_<_TpVec16F>(8.3334519073E-3f);
    const _TpVec16F _vexp_p3_f16 = v_setall_<_TpVec16F>(4.1665795894E-2f);
    const _TpVec16F _vexp_p4_f16 = v_setall_<_TpVec16F>(1.6666665459E-1f);
    const _TpVec16F _vexp_p5_f16 = v_setall_<_TpVec16F>(5.0000001201E-1f);

    _TpVec16F _vexp_, _vexp_x, _vexp_y, _vexp_xx;
    _TpVec16S _vexp_mm;
    const _TpVec16S _vexp_bias_s16 = v_setall_<_TpVec16S>((short)0xf);

    // compute exponential of x
    _vexp_x = v_max(x, _vexp_lo_f16);
    _vexp_x = v_min(_vexp_x, _vexp_hi_f16);

    _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f16, _vexp_half_fp16);
    _vexp_mm = v_floor(_vexp_);
    _vexp_ = v_cvt_f16(_vexp_mm);
    _vexp_mm = v_add(_vexp_mm, _vexp_bias_s16);
    _vexp_mm = v_shl(_vexp_mm, 10);

    _vexp_x = v_fma(_vexp_, _vexp_C1_f16, _vexp_x);
    _vexp_x = v_fma(_vexp_, _vexp_C2_f16, _vexp_x);
    _vexp_xx = v_mul(_vexp_x, _vexp_x);

    _vexp_y = v_fma(_vexp_x, _vexp_p0_f16, _vexp_p1_f16);
    _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f16);
    _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f16);
    _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f16);
    _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f16);

    _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x);
    _vexp_y = v_add(_vexp_y, _vexp_one_fp16);
    _vexp_y = v_mul(_vexp_y, v_reinterpret_as_f16(_vexp_mm));

    // exp(NAN) -> NAN
    _TpVec16F mask_not_nan = v_not_nan(x);
    return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00)));
}

template<typename _TpVec32F, typename _TpVec32S>
inline _TpVec32F v_exp_default_32f(const _TpVec32F &x) {
    const _TpVec32F _vexp_lo_f32 = v_setall_<_TpVec32F>(-88.3762626647949f);
    const _TpVec32F _vexp_hi_f32 = v_setall_<_TpVec32F>(89.f);
    const _TpVec32F _vexp_half_fp32 = v_setall_<_TpVec32F>(0.5f);
    const _TpVec32F _vexp_one_fp32 = v_setall_<_TpVec32F>(1.f);
    const _TpVec32F _vexp_LOG2EF_f32 = v_setall_<_TpVec32F>(1.44269504088896341f);
    const _TpVec32F _vexp_C1_f32 = v_setall_<_TpVec32F>(-6.93359375E-1f);
    const _TpVec32F _vexp_C2_f32 = v_setall_<_TpVec32F>(2.12194440E-4f);
    const _TpVec32F _vexp_p0_f32 = v_setall_<_TpVec32F>(1.9875691500E-4f);
    const _TpVec32F _vexp_p1_f32 = v_setall_<_TpVec32F>(1.3981999507E-3f);
    const _TpVec32F _vexp_p2_f32 = v_setall_<_TpVec32F>(8.3334519073E-3f);
    const _TpVec32F _vexp_p3_f32 = v_setall_<_TpVec32F>(4.1665795894E-2f);
    const _TpVec32F _vexp_p4_f32 = v_setall_<_TpVec32F>(1.6666665459E-1f);
    const _TpVec32F _vexp_p5_f32 = v_setall_<_TpVec32F>(5.0000001201E-1f);

    _TpVec32F _vexp_, _vexp_x, _vexp_y, _vexp_xx;
    _TpVec32S _vexp_mm;
    const _TpVec32S _vexp_bias_s32 = v_setall_<_TpVec32S>((int)0x7f);

    // compute exponential of x
    _vexp_x = v_max(x, _vexp_lo_f32);
    _vexp_x = v_min(_vexp_x, _vexp_hi_f32);

    _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f32, _vexp_half_fp32);
    _vexp_mm = v_floor(_vexp_);
    _vexp_ = v_cvt_f32(_vexp_mm);
    _vexp_mm = v_add(_vexp_mm, _vexp_bias_s32);
    _vexp_mm = v_shl(_vexp_mm, 23);

    _vexp_x = v_fma(_vexp_, _vexp_C1_f32, _vexp_x);
    _vexp_x = v_fma(_vexp_, _vexp_C2_f32, _vexp_x);
    _vexp_xx = v_mul(_vexp_x, _vexp_x);

    _vexp_y = v_fma(_vexp_x, _vexp_p0_f32, _vexp_p1_f32);
    _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f32);
    _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f32);
    _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f32);
    _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f32);

    _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x);
    _vexp_y = v_add(_vexp_y, _vexp_one_fp32);
    _vexp_y = v_mul(_vexp_y, v_reinterpret_as_f32(_vexp_mm));

    // exp(NAN) -> NAN
    _TpVec32F mask_not_nan = v_not_nan(x);
    return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000)));
}
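// Informal sketch of the scheme used above (standard cephes-style exp):
// write x = n*ln(2) + r with |r| <= ln(2)/2, so that exp(x) = 2^n * exp(r).
// n = floor(x*log2(e) + 0.5) comes from v_floor, r is recovered with the split
// constants C1 + C2 ~= -ln(2) for extra precision, exp(r) is approximated by
// the p0..p5 polynomial, and 2^n is assembled by adding n to the exponent bias
// and shifting it into the exponent field (shift 10 for f16, 23 for f32, and
// 52 with a rational refinement for f64 below).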

template<typename _TpVec64F, typename _TpVec64S>
inline _TpVec64F v_exp_default_64f(const _TpVec64F &x) {
    const _TpVec64F _vexp_lo_f64 = v_setall_<_TpVec64F>(-709.43613930310391424428);
    const _TpVec64F _vexp_hi_f64 = v_setall_<_TpVec64F>(710.);
    const _TpVec64F _vexp_half_f64 = v_setall_<_TpVec64F>(0.5);
    const _TpVec64F _vexp_one_f64 = v_setall_<_TpVec64F>(1.0);
    const _TpVec64F _vexp_two_f64 = v_setall_<_TpVec64F>(2.0);
    const _TpVec64F _vexp_LOG2EF_f64 = v_setall_<_TpVec64F>(1.44269504088896340736);
    const _TpVec64F _vexp_C1_f64 = v_setall_<_TpVec64F>(-6.93145751953125E-1);
    const _TpVec64F _vexp_C2_f64 = v_setall_<_TpVec64F>(-1.42860682030941723212E-6);
    const _TpVec64F _vexp_p0_f64 = v_setall_<_TpVec64F>(1.26177193074810590878E-4);
    const _TpVec64F _vexp_p1_f64 = v_setall_<_TpVec64F>(3.02994407707441961300E-2);
    const _TpVec64F _vexp_p2_f64 = v_setall_<_TpVec64F>(9.99999999999999999910E-1);
    const _TpVec64F _vexp_q0_f64 = v_setall_<_TpVec64F>(3.00198505138664455042E-6);
    const _TpVec64F _vexp_q1_f64 = v_setall_<_TpVec64F>(2.52448340349684104192E-3);
    const _TpVec64F _vexp_q2_f64 = v_setall_<_TpVec64F>(2.27265548208155028766E-1);
    const _TpVec64F _vexp_q3_f64 = v_setall_<_TpVec64F>(2.00000000000000000009E0);

    _TpVec64F _vexp_, _vexp_x, _vexp_y, _vexp_z, _vexp_xx;
    _TpVec64S _vexp_mm;
    const _TpVec64S _vexp_bias_s64 = v_setall_<_TpVec64S>((int64)0x3ff);

    // compute exponential of x
    _vexp_x = v_max(x, _vexp_lo_f64);
    _vexp_x = v_min(_vexp_x, _vexp_hi_f64);

    _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f64, _vexp_half_f64);
    _vexp_mm = v_expand_low(v_floor(_vexp_));
    _vexp_ = v_cvt_f64(_vexp_mm);
    _vexp_mm = v_add(_vexp_mm, _vexp_bias_s64);
    _vexp_mm = v_shl(_vexp_mm, 52);

    _vexp_x = v_fma(_vexp_, _vexp_C1_f64, _vexp_x);
    _vexp_x = v_fma(_vexp_, _vexp_C2_f64, _vexp_x);
    _vexp_xx = v_mul(_vexp_x, _vexp_x);

    _vexp_y = v_fma(_vexp_xx, _vexp_p0_f64, _vexp_p1_f64);
    _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_p2_f64);
    _vexp_y = v_mul(_vexp_y, _vexp_x);

    _vexp_z = v_fma(_vexp_xx, _vexp_q0_f64, _vexp_q1_f64);
    _vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q2_f64);
    _vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q3_f64);

    _vexp_z = v_div(_vexp_y, v_sub(_vexp_z, _vexp_y));
    _vexp_z = v_fma(_vexp_two_f64, _vexp_z, _vexp_one_f64);
    _vexp_z = v_mul(_vexp_z, v_reinterpret_as_f64(_vexp_mm));

    // exp(NAN) -> NAN
    _TpVec64F mask_not_nan = v_not_nan(x);
    return v_select(mask_not_nan, _vexp_z, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7FF8000000000000)));
}
//! @}

//! @name Natural Logarithm
//! @{
template<typename _TpVec16F, typename _TpVec16S>
inline _TpVec16F v_log_default_16f(const _TpVec16F &x) {
    const _TpVec16F _vlog_one_fp16 = v_setall_<_TpVec16F>(1.0f);
    const _TpVec16F _vlog_SQRTHF_fp16 = v_setall_<_TpVec16F>(0.707106781186547524f);
    const _TpVec16F _vlog_q1_fp16 = v_setall_<_TpVec16F>(-2.12194440E-4f);
    const _TpVec16F _vlog_q2_fp16 = v_setall_<_TpVec16F>(0.693359375f);
    const _TpVec16F _vlog_p0_fp16 = v_setall_<_TpVec16F>(7.0376836292E-2f);
    const _TpVec16F _vlog_p1_fp16 = v_setall_<_TpVec16F>(-1.1514610310E-1f);
    const _TpVec16F _vlog_p2_fp16 = v_setall_<_TpVec16F>(1.1676998740E-1f);
    const _TpVec16F _vlog_p3_fp16 = v_setall_<_TpVec16F>(-1.2420140846E-1f);
    const _TpVec16F _vlog_p4_fp16 = v_setall_<_TpVec16F>(1.4249322787E-1f);
    const _TpVec16F _vlog_p5_fp16 = v_setall_<_TpVec16F>(-1.6668057665E-1f);
    const _TpVec16F _vlog_p6_fp16 = v_setall_<_TpVec16F>(2.0000714765E-1f);
    const _TpVec16F _vlog_p7_fp16 = v_setall_<_TpVec16F>(-2.4999993993E-1f);
    const _TpVec16F _vlog_p8_fp16 = v_setall_<_TpVec16F>(3.3333331174E-1f);

    _TpVec16F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp;
    _TpVec16S _vlog_ux, _vlog_emm0;
    const _TpVec16S _vlog_inv_mant_mask_s16 = v_setall_<_TpVec16S>((short)~0x7c00);

    _vlog_ux = v_reinterpret_as_s16(x);
    _vlog_emm0 = v_shr(_vlog_ux, 10);

    _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s16);
    _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s16(v_setall_<_TpVec16F>(0.5f)));
    _vlog_x = v_reinterpret_as_f16(_vlog_ux);

    _vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec16S>((short)0xf));
    _vlog_e = v_cvt_f16(_vlog_emm0);

    _vlog_e = v_add(_vlog_e, _vlog_one_fp16);

    _TpVec16F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp16);
    _vlog_tmp = v_and(_vlog_x, _vlog_mask);
    _vlog_x = v_sub(_vlog_x, _vlog_one_fp16);
    _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp16, _vlog_mask));
    _vlog_x = v_add(_vlog_x, _vlog_tmp);

    _vlog_z = v_mul(_vlog_x, _vlog_x);

    _vlog_y = v_fma(_vlog_p0_fp16, _vlog_x, _vlog_p1_fp16);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp16);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp16);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp16);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp16);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp16);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp16);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp16);
    _vlog_y = v_mul(_vlog_y, _vlog_x);
    _vlog_y = v_mul(_vlog_y, _vlog_z);

    _vlog_y = v_fma(_vlog_e, _vlog_q1_fp16, _vlog_y);

    _vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, v_setall_<_TpVec16F>(0.5f)));

    _vlog_x = v_add(_vlog_x, _vlog_y);
    _vlog_x = v_fma(_vlog_e, _vlog_q2_fp16, _vlog_x);
    // log(0) -> -INF
    _TpVec16F mask_zero = v_eq(x, v_setzero_<_TpVec16F>());
    _vlog_x = v_select(mask_zero, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0xfc00)), _vlog_x);
    // log(NEG), log(NAN) -> NAN
    _TpVec16F mask_not_nan = v_ge(x, v_setzero_<_TpVec16F>());
    _vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00)));
    // log(INF) -> INF
    _TpVec16F mask_inf = v_eq(x, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7c00)));
    _vlog_x = v_select(mask_inf, x, _vlog_x);
    return _vlog_x;
}

template<typename _TpVec32F, typename _TpVec32S>
inline _TpVec32F v_log_default_32f(const _TpVec32F &x) {
    const _TpVec32F _vlog_one_fp32 = v_setall_<_TpVec32F>(1.0f);
    const _TpVec32F _vlog_SQRTHF_fp32 = v_setall_<_TpVec32F>(0.707106781186547524f);
    const _TpVec32F _vlog_q1_fp32 = v_setall_<_TpVec32F>(-2.12194440E-4f);
    const _TpVec32F _vlog_q2_fp32 = v_setall_<_TpVec32F>(0.693359375f);
    const _TpVec32F _vlog_p0_fp32 = v_setall_<_TpVec32F>(7.0376836292E-2f);
    const _TpVec32F _vlog_p1_fp32 = v_setall_<_TpVec32F>(-1.1514610310E-1f);
    const _TpVec32F _vlog_p2_fp32 = v_setall_<_TpVec32F>(1.1676998740E-1f);
    const _TpVec32F _vlog_p3_fp32 = v_setall_<_TpVec32F>(-1.2420140846E-1f);
    const _TpVec32F _vlog_p4_fp32 = v_setall_<_TpVec32F>(1.4249322787E-1f);
    const _TpVec32F _vlog_p5_fp32 = v_setall_<_TpVec32F>(-1.6668057665E-1f);
    const _TpVec32F _vlog_p6_fp32 = v_setall_<_TpVec32F>(2.0000714765E-1f);
    const _TpVec32F _vlog_p7_fp32 = v_setall_<_TpVec32F>(-2.4999993993E-1f);
    const _TpVec32F _vlog_p8_fp32 = v_setall_<_TpVec32F>(3.3333331174E-1f);

    _TpVec32F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp;
    _TpVec32S _vlog_ux, _vlog_emm0;
    const _TpVec32S _vlog_inv_mant_mask_s32 = v_setall_<_TpVec32S>((int)~0x7f800000);

    _vlog_ux = v_reinterpret_as_s32(x);
    _vlog_emm0 = v_shr(_vlog_ux, 23);

    _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s32);
    _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s32(v_setall_<_TpVec32F>(0.5f)));
    _vlog_x = v_reinterpret_as_f32(_vlog_ux);

    _vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec32S>((int)0x7f));
    _vlog_e = v_cvt_f32(_vlog_emm0);

    _vlog_e = v_add(_vlog_e, _vlog_one_fp32);

    _TpVec32F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp32);
    _vlog_tmp = v_and(_vlog_x, _vlog_mask);
    _vlog_x = v_sub(_vlog_x, _vlog_one_fp32);
    _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp32, _vlog_mask));
    _vlog_x = v_add(_vlog_x, _vlog_tmp);

    _vlog_z = v_mul(_vlog_x, _vlog_x);

    _vlog_y = v_fma(_vlog_p0_fp32, _vlog_x, _vlog_p1_fp32);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp32);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp32);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp32);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp32);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp32);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp32);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp32);
    _vlog_y = v_mul(_vlog_y, _vlog_x);
    _vlog_y = v_mul(_vlog_y, _vlog_z);

    _vlog_y = v_fma(_vlog_e, _vlog_q1_fp32, _vlog_y);

    _vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, v_setall_<_TpVec32F>(0.5f)));

    _vlog_x = v_add(_vlog_x, _vlog_y);
    _vlog_x = v_fma(_vlog_e, _vlog_q2_fp32, _vlog_x);
    // log(0) -> -INF
    _TpVec32F mask_zero = v_eq(x, v_setzero_<_TpVec32F>());
    _vlog_x = v_select(mask_zero, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0xff800000)), _vlog_x);
    // log(NEG), log(NAN) -> NAN
    _TpVec32F mask_not_nan = v_ge(x, v_setzero_<_TpVec32F>());
    _vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000)));
    // log(INF) -> INF
    _TpVec32F mask_inf = v_eq(x, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7f800000)));
    _vlog_x = v_select(mask_inf, x, _vlog_x);
    return _vlog_x;
}
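// Informal sketch of the decomposition above: write x = 2^e * m with
// m in [0.5, 1) by masking the exponent field, so log(x) = e*ln(2) + log(m);
// m below 1/sqrt(2) is folded into the previous octave to center the range,
// log(m) is a polynomial in (m - 1), and e*ln(2) is added back through split
// constants (q1/q2 here) to limit rounding error.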

template<typename _TpVec64F, typename _TpVec64S>
inline _TpVec64F v_log_default_64f(const _TpVec64F &x) {
    const _TpVec64F _vlog_one_fp64 = v_setall_<_TpVec64F>(1.0);
    const _TpVec64F _vlog_SQRTHF_fp64 = v_setall_<_TpVec64F>(0.7071067811865475244);
    const _TpVec64F _vlog_p0_fp64 = v_setall_<_TpVec64F>(1.01875663804580931796E-4);
    const _TpVec64F _vlog_p1_fp64 = v_setall_<_TpVec64F>(4.97494994976747001425E-1);
    const _TpVec64F _vlog_p2_fp64 = v_setall_<_TpVec64F>(4.70579119878881725854);
    const _TpVec64F _vlog_p3_fp64 = v_setall_<_TpVec64F>(1.44989225341610930846E1);
    const _TpVec64F _vlog_p4_fp64 = v_setall_<_TpVec64F>(1.79368678507819816313E1);
    const _TpVec64F _vlog_p5_fp64 = v_setall_<_TpVec64F>(7.70838733755885391666);
    const _TpVec64F _vlog_q0_fp64 = v_setall_<_TpVec64F>(1.12873587189167450590E1);
    const _TpVec64F _vlog_q1_fp64 = v_setall_<_TpVec64F>(4.52279145837532221105E1);
    const _TpVec64F _vlog_q2_fp64 = v_setall_<_TpVec64F>(8.29875266912776603211E1);
    const _TpVec64F _vlog_q3_fp64 = v_setall_<_TpVec64F>(7.11544750618563894466E1);
    const _TpVec64F _vlog_q4_fp64 = v_setall_<_TpVec64F>(2.31251620126765340583E1);

    const _TpVec64F _vlog_C0_fp64 = v_setall_<_TpVec64F>(2.121944400546905827679e-4);
    const _TpVec64F _vlog_C1_fp64 = v_setall_<_TpVec64F>(0.693359375);

    _TpVec64F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp, _vlog_xx;
    _TpVec64S _vlog_ux, _vlog_emm0;
    const _TpVec64S _vlog_inv_mant_mask_s64 = v_setall_<_TpVec64S>((int64)~0x7ff0000000000000);

    _vlog_ux = v_reinterpret_as_s64(x);
    _vlog_emm0 = v_shr(_vlog_ux, 52);

    _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s64);
    _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s64(v_setall_<_TpVec64F>(0.5)));
    _vlog_x = v_reinterpret_as_f64(_vlog_ux);

    _vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec64S>((int64)0x3ff));
    _vlog_e = v_cvt_f64(_vlog_emm0);

    _vlog_e = v_add(_vlog_e, _vlog_one_fp64);

    _TpVec64F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp64);
    _vlog_tmp = v_and(_vlog_x, _vlog_mask);
    _vlog_x = v_sub(_vlog_x, _vlog_one_fp64);
    _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp64, _vlog_mask));
    _vlog_x = v_add(_vlog_x, _vlog_tmp);

    _vlog_xx = v_mul(_vlog_x, _vlog_x);

    _vlog_y = v_fma(_vlog_p0_fp64, _vlog_x, _vlog_p1_fp64);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp64);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp64);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp64);
    _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp64);
    _vlog_y = v_mul(_vlog_y, _vlog_x);
    _vlog_y = v_mul(_vlog_y, _vlog_xx);

    _vlog_z = v_add(_vlog_x, _vlog_q0_fp64);
    _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q1_fp64);
    _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q2_fp64);
    _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q3_fp64);
    _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q4_fp64);

    _vlog_z = v_div(_vlog_y, _vlog_z);
    _vlog_z = v_sub(_vlog_z, v_mul(_vlog_e, _vlog_C0_fp64));
    _vlog_z = v_sub(_vlog_z, v_mul(_vlog_xx, v_setall_<_TpVec64F>(0.5)));

    _vlog_z = v_add(_vlog_z, _vlog_x);
    _vlog_z = v_fma(_vlog_e, _vlog_C1_fp64, _vlog_z);

    // log(0) -> -INF
    _TpVec64F mask_zero = v_eq(x, v_setzero_<_TpVec64F>());
    _vlog_z = v_select(mask_zero, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0xfff0000000000000)), _vlog_z);
    // log(NEG), log(NAN) -> NAN
    _TpVec64F mask_not_nan = v_ge(x, v_setzero_<_TpVec64F>());
    _vlog_z = v_select(mask_not_nan, _vlog_z, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff8000000000000)));
    // log(INF) -> INF
    _TpVec64F mask_inf = v_eq(x, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff0000000000000)));
    _vlog_z = v_select(mask_inf, x, _vlog_z);
    return _vlog_z;
}
//! @}

//! @name Sine and Cosine
//! @{
template<typename _TpVec16F, typename _TpVec16S>
inline void v_sincos_default_16f(const _TpVec16F &x, _TpVec16F &ysin, _TpVec16F &ycos) {
    const _TpVec16F v_cephes_FOPI = v_setall_<_TpVec16F>(hfloat(1.27323954473516f)); // 4 / M_PI
    const _TpVec16F v_minus_DP1 = v_setall_<_TpVec16F>(hfloat(-0.78515625f));
    const _TpVec16F v_minus_DP2 = v_setall_<_TpVec16F>(hfloat(-2.4187564849853515625E-4f));
    const _TpVec16F v_minus_DP3 = v_setall_<_TpVec16F>(hfloat(-3.77489497744594108E-8f));
    const _TpVec16F v_sincof_p0 = v_setall_<_TpVec16F>(hfloat(-1.9515295891E-4f));
    const _TpVec16F v_sincof_p1 = v_setall_<_TpVec16F>(hfloat(8.3321608736E-3f));
    const _TpVec16F v_sincof_p2 = v_setall_<_TpVec16F>(hfloat(-1.6666654611E-1f));
    const _TpVec16F v_coscof_p0 = v_setall_<_TpVec16F>(hfloat(2.443315711809948E-5f));
    const _TpVec16F v_coscof_p1 = v_setall_<_TpVec16F>(hfloat(-1.388731625493765E-3f));
    const _TpVec16F v_coscof_p2 = v_setall_<_TpVec16F>(hfloat(4.166664568298827E-2f));
    const _TpVec16F v_nan = v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00));
    const _TpVec16F v_neg_zero = v_setall_<_TpVec16F>(hfloat(-0.f));

    _TpVec16F _vx, _vy, sign_mask_sin, sign_mask_cos;
    _TpVec16S emm2;

    sign_mask_sin = v_lt(x, v_setzero_<_TpVec16F>());
    _vx = v_abs(x);
    _vy = v_mul(_vx, v_cephes_FOPI);

    emm2 = v_trunc(_vy);
    emm2 = v_add(emm2, v_setall_<_TpVec16S>((short)1));
    emm2 = v_and(emm2, v_setall_<_TpVec16S>((short)~1));
    _vy = v_cvt_f16(emm2);

    _TpVec16F poly_mask = v_reinterpret_as_f16(v_eq(v_and(emm2, v_setall_<_TpVec16S>((short)2)), v_setall_<_TpVec16S>((short)0)));

    _vx = v_fma(_vy, v_minus_DP1, _vx);
    _vx = v_fma(_vy, v_minus_DP2, _vx);
    _vx = v_fma(_vy, v_minus_DP3, _vx);

    sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f16(v_eq(v_and(emm2, v_setall_<_TpVec16S>((short)4)), v_setall_<_TpVec16S>((short)0))));
    sign_mask_cos = v_reinterpret_as_f16(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec16S>((short)2)), v_setall_<_TpVec16S>((short)4)), v_setall_<_TpVec16S>((short)0)));

    _TpVec16F _vxx = v_mul(_vx, _vx);
    _TpVec16F y1, y2;

    y1 = v_fma(v_coscof_p0, _vxx, v_coscof_p1);
    y1 = v_fma(y1, _vxx, v_coscof_p2);
    y1 = v_fma(y1, _vxx, v_setall_<_TpVec16F>(hfloat(-0.5f)));
    y1 = v_fma(y1, _vxx, v_setall_<_TpVec16F>(hfloat(1.f)));

    y2 = v_fma(v_sincof_p0, _vxx, v_sincof_p1);
    y2 = v_fma(y2, _vxx, v_sincof_p2);
    y2 = v_mul(y2, _vxx);
    y2 = v_fma(y2, _vx, _vx);

    ysin = v_select(poly_mask, y2, y1);
    ycos = v_select(poly_mask, y1, y2);
    ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin));
    ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos);

    // sincos(NAN) -> NAN, sincos(±INF) -> NAN
    _TpVec16F mask_inf = v_eq(_vx, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7c00)));
    _TpVec16F mask_nan = v_or(mask_inf, v_ne(x, x));
    ysin = v_select(mask_nan, v_nan, ysin);
    ycos = v_select(mask_nan, v_nan, ycos);
}

template<typename _TpVec16F, typename _TpVec16S>
inline _TpVec16F v_sin_default_16f(const _TpVec16F &x) {
    _TpVec16F ysin, ycos;
    v_sincos_default_16f<_TpVec16F, _TpVec16S>(x, ysin, ycos);
    return ysin;
}

template<typename _TpVec16F, typename _TpVec16S>
inline _TpVec16F v_cos_default_16f(const _TpVec16F &x) {
    _TpVec16F ysin, ycos;
    v_sincos_default_16f<_TpVec16F, _TpVec16S>(x, ysin, ycos);
    return ycos;
}

template<typename _TpVec32F, typename _TpVec32S>
inline void v_sincos_default_32f(const _TpVec32F &x, _TpVec32F &ysin, _TpVec32F &ycos) {
    const _TpVec32F v_cephes_FOPI = v_setall_<_TpVec32F>(1.27323954473516f); // 4 / M_PI
    const _TpVec32F v_minus_DP1 = v_setall_<_TpVec32F>(-0.78515625f);
    const _TpVec32F v_minus_DP2 = v_setall_<_TpVec32F>(-2.4187564849853515625E-4f);
    const _TpVec32F v_minus_DP3 = v_setall_<_TpVec32F>(-3.77489497744594108E-8f);
    const _TpVec32F v_sincof_p0 = v_setall_<_TpVec32F>(-1.9515295891E-4f);
    const _TpVec32F v_sincof_p1 = v_setall_<_TpVec32F>(8.3321608736E-3f);
    const _TpVec32F v_sincof_p2 = v_setall_<_TpVec32F>(-1.6666654611E-1f);
    const _TpVec32F v_coscof_p0 = v_setall_<_TpVec32F>(2.443315711809948E-5f);
    const _TpVec32F v_coscof_p1 = v_setall_<_TpVec32F>(-1.388731625493765E-3f);
    const _TpVec32F v_coscof_p2 = v_setall_<_TpVec32F>(4.166664568298827E-2f);
    const _TpVec32F v_nan = v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000));
    const _TpVec32F v_neg_zero = v_setall_<_TpVec32F>(-0.f);

    _TpVec32F _vx, _vy, sign_mask_sin, sign_mask_cos;
    _TpVec32S emm2;

    sign_mask_sin = v_lt(x, v_setzero_<_TpVec32F>());
    _vx = v_abs(x);
    _vy = v_mul(_vx, v_cephes_FOPI);

    emm2 = v_trunc(_vy);
    emm2 = v_add(emm2, v_setall_<_TpVec32S>(1));
    emm2 = v_and(emm2, v_setall_<_TpVec32S>(~1));
    _vy = v_cvt_f32(emm2);

    _TpVec32F poly_mask = v_reinterpret_as_f32(v_eq(v_and(emm2, v_setall_<_TpVec32S>(2)), v_setall_<_TpVec32S>(0)));

    _vx = v_fma(_vy, v_minus_DP1, _vx);
    _vx = v_fma(_vy, v_minus_DP2, _vx);
    _vx = v_fma(_vy, v_minus_DP3, _vx);

    sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f32(v_eq(v_and(emm2, v_setall_<_TpVec32S>(4)), v_setall_<_TpVec32S>(0))));
    sign_mask_cos = v_reinterpret_as_f32(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec32S>(2)), v_setall_<_TpVec32S>(4)), v_setall_<_TpVec32S>(0)));

    _TpVec32F _vxx = v_mul(_vx, _vx);
    _TpVec32F y1, y2;

    y1 = v_fma(v_coscof_p0, _vxx, v_coscof_p1);
    y1 = v_fma(y1, _vxx, v_coscof_p2);
    y1 = v_fma(y1, _vxx, v_setall_<_TpVec32F>(-0.5f));
    y1 = v_fma(y1, _vxx, v_setall_<_TpVec32F>(1.f));

    y2 = v_fma(v_sincof_p0, _vxx, v_sincof_p1);
    y2 = v_fma(y2, _vxx, v_sincof_p2);
    y2 = v_mul(y2, _vxx);
    y2 = v_fma(y2, _vx, _vx);

    ysin = v_select(poly_mask, y2, y1);
    ycos = v_select(poly_mask, y1, y2);
    ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin));
    ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos);

    // sincos(NAN) -> NAN, sincos(±INF) -> NAN
    _TpVec32F mask_inf = v_eq(_vx, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7f800000)));
    _TpVec32F mask_nan = v_or(mask_inf, v_ne(x, x));
    ysin = v_select(mask_nan, v_nan, ysin);
    ycos = v_select(mask_nan, v_nan, ycos);
}
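// Informal sketch of the argument reduction above (cephes-style): j = |x|*4/pi
// rounded to an even integer selects the octant; x is reduced to the octant
// remainder in extended precision via the DP1/DP2/DP3 split of pi/4; the low
// octant bits then choose between the sine and cosine polynomials (poly_mask)
// and fix the signs of both outputs, so a single pass yields sin and cos.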
|
||||
template<typename _TpVec32F, typename _TpVec32S>
inline _TpVec32F v_sin_default_32f(const _TpVec32F &x) {
    _TpVec32F ysin, ycos;
    v_sincos_default_32f<_TpVec32F, _TpVec32S>(x, ysin, ycos);
    return ysin;
}

template<typename _TpVec32F, typename _TpVec32S>
inline _TpVec32F v_cos_default_32f(const _TpVec32F &x) {
    _TpVec32F ysin, ycos;
    v_sincos_default_32f<_TpVec32F, _TpVec32S>(x, ysin, ycos);
    return ycos;
}
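
// A minimal usage sketch (illustrative only; it assumes a backend where the
// 128-bit universal-intrinsic types v_float32x4 / v_int32x4 are available):
//
//   v_float32x4 x = v_setall_f32(0.5f);
//   v_float32x4 s = v_sin_default_32f<v_float32x4, v_int32x4>(x);
//   v_float32x4 c = v_cos_default_32f<v_float32x4, v_int32x4>(x);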

template<typename _TpVec64F, typename _TpVec64S>
inline void v_sincos_default_64f(const _TpVec64F &x, _TpVec64F &ysin, _TpVec64F &ycos) {
    const _TpVec64F v_cephes_FOPI = v_setall_<_TpVec64F>(1.2732395447351626861510701069801148); // 4 / M_PI
    const _TpVec64F v_minus_DP1 = v_setall_<_TpVec64F>(-7.853981554508209228515625E-1);
    const _TpVec64F v_minus_DP2 = v_setall_<_TpVec64F>(-7.94662735614792836714E-9);
    const _TpVec64F v_minus_DP3 = v_setall_<_TpVec64F>(-3.06161699786838294307E-17);
    const _TpVec64F v_sin_C1 = v_setall_<_TpVec64F>(1.58962301576546568060E-10);
    const _TpVec64F v_sin_C2 = v_setall_<_TpVec64F>(-2.50507477628578072866E-8);
    const _TpVec64F v_sin_C3 = v_setall_<_TpVec64F>(2.75573136213857245213E-6);
    const _TpVec64F v_sin_C4 = v_setall_<_TpVec64F>(-1.98412698295895385996E-4);
    const _TpVec64F v_sin_C5 = v_setall_<_TpVec64F>(8.33333333332211858878E-3);
    const _TpVec64F v_sin_C6 = v_setall_<_TpVec64F>(-1.66666666666666307295E-1);
    const _TpVec64F v_cos_C1 = v_setall_<_TpVec64F>(-1.13585365213876817300E-11);
    const _TpVec64F v_cos_C2 = v_setall_<_TpVec64F>(2.08757008419747316778E-9);
    const _TpVec64F v_cos_C3 = v_setall_<_TpVec64F>(-2.75573141792967388112E-7);
    const _TpVec64F v_cos_C4 = v_setall_<_TpVec64F>(2.48015872888517045348E-5);
    const _TpVec64F v_cos_C5 = v_setall_<_TpVec64F>(-1.38888888888730564116E-3);
    const _TpVec64F v_cos_C6 = v_setall_<_TpVec64F>(4.16666666666665929218E-2);
    const _TpVec64F v_nan = v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff8000000000000));
    const _TpVec64F v_neg_zero = v_setall_<_TpVec64F>(-0.0);

    _TpVec64F _vx, _vy, sign_mask_sin, sign_mask_cos;
    _TpVec64S emm2;

    sign_mask_sin = v_lt(x, v_setzero_<_TpVec64F>());
    _vx = v_abs(x);
    _vy = v_mul(_vx, v_cephes_FOPI);

    emm2 = v_expand_low(v_trunc(_vy));
    emm2 = v_add(emm2, v_setall_<_TpVec64S>((int64)1));
    emm2 = v_and(emm2, v_setall_<_TpVec64S>((int64)~1));
    _vy = v_cvt_f64(emm2);

    _TpVec64F poly_mask = v_reinterpret_as_f64(v_eq(v_and(emm2, v_setall_<_TpVec64S>((int64)2)), v_setall_<_TpVec64S>((int64)0)));

    _vx = v_fma(_vy, v_minus_DP1, _vx);
    _vx = v_fma(_vy, v_minus_DP2, _vx);
    _vx = v_fma(_vy, v_minus_DP3, _vx);

    sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f64(v_eq(v_and(emm2, v_setall_<_TpVec64S>((int64)4)), v_setall_<_TpVec64S>((int64)0))));
    sign_mask_cos = v_reinterpret_as_f64(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec64S>((int64)2)), v_setall_<_TpVec64S>((int64)4)), v_setall_<_TpVec64S>((int64)0)));

    _TpVec64F _vxx = v_mul(_vx, _vx);
    _TpVec64F y1, y2;

    y1 = v_fma(v_cos_C1, _vxx, v_cos_C2);
    y1 = v_fma(y1, _vxx, v_cos_C3);
    y1 = v_fma(y1, _vxx, v_cos_C4);
    y1 = v_fma(y1, _vxx, v_cos_C5);
    y1 = v_fma(y1, _vxx, v_cos_C6);
    y1 = v_fma(y1, _vxx, v_setall_<_TpVec64F>(-0.5));
    y1 = v_fma(y1, _vxx, v_setall_<_TpVec64F>(1.0));

    y2 = v_fma(v_sin_C1, _vxx, v_sin_C2);
    y2 = v_fma(y2, _vxx, v_sin_C3);
    y2 = v_fma(y2, _vxx, v_sin_C4);
    y2 = v_fma(y2, _vxx, v_sin_C5);
    y2 = v_fma(y2, _vxx, v_sin_C6);
    y2 = v_mul(y2, _vxx);
    y2 = v_fma(y2, _vx, _vx);

    ysin = v_select(poly_mask, y2, y1);
    ycos = v_select(poly_mask, y1, y2);
    ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin));
    ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos);

    // sincos(NAN) -> NAN, sincos(±INF) -> NAN
    _TpVec64F mask_inf = v_eq(_vx, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff0000000000000)));
    _TpVec64F mask_nan = v_or(mask_inf, v_ne(x, x));
    ysin = v_select(mask_nan, v_nan, ysin);
    ycos = v_select(mask_nan, v_nan, ycos);
}
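
// Compared with the float32 path, the float64 variant widens the truncated
// quadrant index to 64-bit lanes (hence the v_expand_low after v_trunc, whose
// result here is a 32-bit integer vector) and uses degree-6 minimax
// polynomials plus a longer DP1..DP3 split constant to reach double precision.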

template<typename _TpVec64F, typename _TpVec64S>
inline _TpVec64F v_sin_default_64f(const _TpVec64F &x) {
    _TpVec64F ysin, ycos;
    v_sincos_default_64f<_TpVec64F, _TpVec64S>(x, ysin, ycos);
    return ysin;
}

template<typename _TpVec64F, typename _TpVec64S>
inline _TpVec64F v_cos_default_64f(const _TpVec64F &x) {
    _TpVec64F ysin, ycos;
    v_sincos_default_64f<_TpVec64F, _TpVec64S>(x, ysin, ycos);
    return ycos;
}

//! @}

/* This implementation is derived from the approximation approach of the Error Function (Erf) in PyTorch:
   https://github.com/pytorch/pytorch/blob/9c50ecc84b9a6e699a7f058891b889aafbf976c7/aten/src/ATen/cpu/vec/vec512/vec512_float.h#L189-L220
*/

//! @name Error Function
//! @{
template<typename _TpVec32F, typename _TpVec32S>
inline _TpVec32F v_erf_default_32f(const _TpVec32F &v) {
    const _TpVec32F coef0 = v_setall_<_TpVec32F>(0.3275911f),
                    coef1 = v_setall_<_TpVec32F>(1.061405429f),
                    coef2 = v_setall_<_TpVec32F>(-1.453152027f),
                    coef3 = v_setall_<_TpVec32F>(1.421413741f),
                    coef4 = v_setall_<_TpVec32F>(-0.284496736f),
                    coef5 = v_setall_<_TpVec32F>(0.254829592f),
                    ones = v_setall_<_TpVec32F>(1.0f),
                    neg_zeros = v_setall_<_TpVec32F>(-0.f);
    _TpVec32F t = v_abs(v);
    // sign(v)
    _TpVec32F sign_mask = v_and(neg_zeros, v);

    t = v_div(ones, v_fma(coef0, t, ones));
    _TpVec32F r = v_fma(coef1, t, coef2);
    r = v_fma(r, t, coef3);
    r = v_fma(r, t, coef4);
    r = v_fma(r, t, coef5);
    // - v * v
    _TpVec32F v2 = v_mul(v, v);
    _TpVec32F mv2 = v_xor(neg_zeros, v2);
    // - exp(- v * v)
    _TpVec32F exp = v_exp_default_32f<_TpVec32F, _TpVec32S>(mv2);
    _TpVec32F neg_exp = v_xor(neg_zeros, exp);
    _TpVec32F res = v_mul(t, neg_exp);
    res = v_fma(r, res, ones);
    return v_xor(sign_mask, res);
}
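// The rational approximation above matches Abramowitz & Stegun formula 7.1.26:
//   erf(x) ~= 1 - (a1*t + a2*t^2 + a3*t^3 + a4*t^4 + a5*t^5) * exp(-x^2),
//   t = 1 / (1 + p*|x|),  p = 0.3275911,
// evaluated here with Horner/FMA steps and the sign of x restored at the end;
// its stated maximum absolute error (about 1.5e-7) is adequate for float32.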
//! @}

#endif // OPENCV_HAL_INTRIN_MATH_HPP
1886
3rdpart/OpenCV/include/opencv2/core/hal/intrin_msa.hpp
Normal file
File diff suppressed because it is too large
2680
3rdpart/OpenCV/include/opencv2/core/hal/intrin_neon.hpp
Normal file
File diff suppressed because it is too large
2888
3rdpart/OpenCV/include/opencv2/core/hal/intrin_rvv071.hpp
Normal file
File diff suppressed because it is too large
2194
3rdpart/OpenCV/include/opencv2/core/hal/intrin_rvv_scalable.hpp
Normal file
File diff suppressed because it is too large
3483
3rdpart/OpenCV/include/opencv2/core/hal/intrin_sse.hpp
Normal file
File diff suppressed because it is too large
180
3rdpart/OpenCV/include/opencv2/core/hal/intrin_sse_em.hpp
Normal file
@@ -0,0 +1,180 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html

#ifndef OPENCV_HAL_INTRIN_SSE_EM_HPP
#define OPENCV_HAL_INTRIN_SSE_EM_HPP

namespace cv
{

//! @cond IGNORED

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#define OPENCV_HAL_SSE_WRAP_1(fun, tp) \
inline tp _v128_##fun(const tp& a) \
{ return _mm_##fun(a); }

#define OPENCV_HAL_SSE_WRAP_2(fun, tp) \
inline tp _v128_##fun(const tp& a, const tp& b) \
{ return _mm_##fun(a, b); }

#define OPENCV_HAL_SSE_WRAP_3(fun, tp) \
inline tp _v128_##fun(const tp& a, const tp& b, const tp& c) \
{ return _mm_##fun(a, b, c); }

///////////////////////////// XOP /////////////////////////////

// [todo] define CV_XOP
#if 1 // CV_XOP
inline __m128i _v128_comgt_epu32(const __m128i& a, const __m128i& b)
{
    const __m128i delta = _mm_set1_epi32((int)0x80000000);
    return _mm_cmpgt_epi32(_mm_xor_si128(a, delta), _mm_xor_si128(b, delta));
}
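// XOR-ing both operands with 0x80000000 flips the sign bit, which maps the
// unsigned ordering onto the signed one, so SSE2's signed _mm_cmpgt_epi32 can
// stand in for the missing unsigned compare.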
// wrapping XOP
#else
OPENCV_HAL_SSE_WRAP_2(_v128_comgt_epu32, __m128i)
#endif // !CV_XOP

///////////////////////////// SSE4.1 /////////////////////////////

#if !CV_SSE4_1

/** Swizzle **/
inline __m128i _v128_blendv_epi8(const __m128i& a, const __m128i& b, const __m128i& mask)
{ return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(b, a), mask)); }
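// The bitwise-select identity a ^ ((b ^ a) & mask) returns b wherever mask
// bits are set and a elsewhere, reproducing _mm_blendv_epi8 with SSE2-only
// operations (assuming each mask byte is all-zeros or all-ones).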

/** Convert **/
// 8 >> 16
inline __m128i _v128_cvtepu8_epi16(const __m128i& a)
{
    const __m128i z = _mm_setzero_si128();
    return _mm_unpacklo_epi8(a, z);
}
inline __m128i _v128_cvtepi8_epi16(const __m128i& a)
{ return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); }
// 8 >> 32
inline __m128i _v128_cvtepu8_epi32(const __m128i& a)
{
    const __m128i z = _mm_setzero_si128();
    return _mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z);
}
inline __m128i _v128_cvtepi8_epi32(const __m128i& a)
{
    __m128i r = _mm_unpacklo_epi8(a, a);
    r = _mm_unpacklo_epi8(r, r);
    return _mm_srai_epi32(r, 24);
}
// 16 >> 32
inline __m128i _v128_cvtepu16_epi32(const __m128i& a)
{
    const __m128i z = _mm_setzero_si128();
    return _mm_unpacklo_epi16(a, z);
}
inline __m128i _v128_cvtepi16_epi32(const __m128i& a)
{ return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); }
// 32 >> 64
inline __m128i _v128_cvtepu32_epi64(const __m128i& a)
{
    const __m128i z = _mm_setzero_si128();
    return _mm_unpacklo_epi32(a, z);
}
inline __m128i _v128_cvtepi32_epi64(const __m128i& a)
{ return _mm_unpacklo_epi32(a, _mm_srai_epi32(a, 31)); }

/** Arithmetic **/
inline __m128i _v128_mullo_epi32(const __m128i& a, const __m128i& b)
{
    __m128i c0 = _mm_mul_epu32(a, b);
    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32));
    __m128i d0 = _mm_unpacklo_epi32(c0, c1);
    __m128i d1 = _mm_unpackhi_epi32(c0, c1);
    return _mm_unpacklo_epi64(d0, d1);
}
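// SSE2 lacks a 32-bit low multiply: _mm_mul_epu32 only multiplies the even
// lanes into 64-bit products, so the emulation above runs it twice (once on
// the odd lanes shifted down) and re-interleaves the low 32 bits of each
// product back into the original lane order.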

/** Math **/
inline __m128i _v128_min_epu32(const __m128i& a, const __m128i& b)
{ return _v128_blendv_epi8(a, b, _v128_comgt_epu32(a, b)); }

// wrapping SSE4.1
#else
OPENCV_HAL_SSE_WRAP_1(cvtepu8_epi16, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepi8_epi16, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepu8_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepi8_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepu16_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepi16_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepu32_epi64, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepi32_epi64, __m128i)
OPENCV_HAL_SSE_WRAP_2(min_epu32, __m128i)
OPENCV_HAL_SSE_WRAP_2(mullo_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_3(blendv_epi8, __m128i)
#endif // !CV_SSE4_1

///////////////////////////// Revolutionary /////////////////////////////

/** Convert **/
// 16 << 8
inline __m128i _v128_cvtepu8_epi16_high(const __m128i& a)
{
    const __m128i z = _mm_setzero_si128();
    return _mm_unpackhi_epi8(a, z);
}
inline __m128i _v128_cvtepi8_epi16_high(const __m128i& a)
{ return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8); }
// 32 << 16
inline __m128i _v128_cvtepu16_epi32_high(const __m128i& a)
{
    const __m128i z = _mm_setzero_si128();
    return _mm_unpackhi_epi16(a, z);
}
inline __m128i _v128_cvtepi16_epi32_high(const __m128i& a)
{ return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16); }
// 64 << 32
inline __m128i _v128_cvtepu32_epi64_high(const __m128i& a)
{
    const __m128i z = _mm_setzero_si128();
    return _mm_unpackhi_epi32(a, z);
}
inline __m128i _v128_cvtepi32_epi64_high(const __m128i& a)
{ return _mm_unpackhi_epi32(a, _mm_srai_epi32(a, 31)); }

/** Miscellaneous **/
inline __m128i _v128_packs_epu32(const __m128i& a, const __m128i& b)
{
    const __m128i m = _mm_set1_epi32(65535);
    __m128i am = _v128_min_epu32(a, m);
    __m128i bm = _v128_min_epu32(b, m);
#if CV_SSE4_1
    return _mm_packus_epi32(am, bm);
#else
    const __m128i d = _mm_set1_epi32(32768), nd = _mm_set1_epi16(-32768);
    am = _mm_sub_epi32(am, d);
    bm = _mm_sub_epi32(bm, d);
    am = _mm_packs_epi32(am, bm);
    return _mm_sub_epi16(am, nd);
#endif
}
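// Without SSE4.1's _mm_packus_epi32, the inputs (already clamped to
// [0, 65535]) are biased by -32768 into signed 16-bit range, packed with the
// signed saturating _mm_packs_epi32, and the bias is undone by subtracting
// -32768 in 16-bit lanes.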

template<int i>
inline int64 _v128_extract_epi64(const __m128i& a)
{
#if defined(CV__SIMD_HAVE_mm_extract_epi64) || (CV_SSE4_1 && (defined(__x86_64__)/*GCC*/ || defined(_M_X64)/*MSVC*/))
#define CV__SIMD_NATIVE_mm_extract_epi64 1
    return _mm_extract_epi64(a, i);
#else
    CV_DECL_ALIGNED(16) int64 tmp[2];
    _mm_store_si128((__m128i*)tmp, a);
    return tmp[i];
#endif
}

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

//! @endcond

} // cv::

#endif // OPENCV_HAL_INTRIN_SSE_EM_HPP
1619
3rdpart/OpenCV/include/opencv2/core/hal/intrin_vsx.hpp
Normal file
File diff suppressed because it is too large
2801
3rdpart/OpenCV/include/opencv2/core/hal/intrin_wasm.hpp
Normal file
File diff suppressed because it is too large
1558
3rdpart/OpenCV/include/opencv2/core/hal/msa_macros.h
Normal file
File diff suppressed because it is too large
186
3rdpart/OpenCV/include/opencv2/core/hal/simd_utils.impl.hpp
Normal file
@@ -0,0 +1,186 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html

// This header is not standalone. Don't include directly, use "intrin.hpp" instead.
#ifdef OPENCV_HAL_INTRIN_HPP // defined in intrin.hpp

#if CV_SIMD128 || CV_SIMD128_CPP

template<typename _T> struct Type2Vec128_Traits;
#define CV_INTRIN_DEF_TYPE2VEC128_TRAITS(type_, vec_type_) \
template<> struct Type2Vec128_Traits<type_> \
{ \
    typedef vec_type_ vec_type; \
}

CV_INTRIN_DEF_TYPE2VEC128_TRAITS(uchar, v_uint8x16);
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(schar, v_int8x16);
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(ushort, v_uint16x8);
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(short, v_int16x8);
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(unsigned, v_uint32x4);
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(int, v_int32x4);
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(float, v_float32x4);
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(uint64, v_uint64x2);
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(int64, v_int64x2);
#if CV_SIMD128_64F
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(double, v_float64x2);
#endif

template<typename _T> static inline
typename Type2Vec128_Traits<_T>::vec_type v_setall(const _T& a);

template<> inline Type2Vec128_Traits< uchar>::vec_type v_setall< uchar>(const uchar& a) { return v_setall_u8(a); }
template<> inline Type2Vec128_Traits< schar>::vec_type v_setall< schar>(const schar& a) { return v_setall_s8(a); }
template<> inline Type2Vec128_Traits<ushort>::vec_type v_setall<ushort>(const ushort& a) { return v_setall_u16(a); }
template<> inline Type2Vec128_Traits< short>::vec_type v_setall< short>(const short& a) { return v_setall_s16(a); }
template<> inline Type2Vec128_Traits< uint>::vec_type v_setall< uint>(const uint& a) { return v_setall_u32(a); }
template<> inline Type2Vec128_Traits< int>::vec_type v_setall< int>(const int& a) { return v_setall_s32(a); }
template<> inline Type2Vec128_Traits<uint64>::vec_type v_setall<uint64>(const uint64& a) { return v_setall_u64(a); }
template<> inline Type2Vec128_Traits< int64>::vec_type v_setall< int64>(const int64& a) { return v_setall_s64(a); }
template<> inline Type2Vec128_Traits< float>::vec_type v_setall< float>(const float& a) { return v_setall_f32(a); }
#if CV_SIMD128_64F
template<> inline Type2Vec128_Traits<double>::vec_type v_setall<double>(const double& a) { return v_setall_f64(a); }
#endif

#endif // SIMD128
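
// A minimal usage sketch (illustrative only): the traits map a scalar type to
// its vector type, so generic code can broadcast a value without spelling out
// the type-suffixed intrinsic, e.g.
//
//   auto v = v_setall<float>(1.5f);      // v_float32x4 via v_setall_f32
//   auto w = v_setall<short>((short)3);  // v_int16x8 via v_setall_s16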

#if CV_SIMD256

template<typename _T> struct Type2Vec256_Traits;
#define CV_INTRIN_DEF_TYPE2VEC256_TRAITS(type_, vec_type_) \
template<> struct Type2Vec256_Traits<type_> \
{ \
    typedef vec_type_ vec_type; \
}

CV_INTRIN_DEF_TYPE2VEC256_TRAITS(uchar, v_uint8x32);
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(schar, v_int8x32);
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(ushort, v_uint16x16);
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(short, v_int16x16);
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(unsigned, v_uint32x8);
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(int, v_int32x8);
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(float, v_float32x8);
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(uint64, v_uint64x4);
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(int64, v_int64x4);
#if CV_SIMD256_64F
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(double, v_float64x4);
#endif

template<typename _T> static inline
typename Type2Vec256_Traits<_T>::vec_type v256_setall(const _T& a);

template<> inline Type2Vec256_Traits< uchar>::vec_type v256_setall< uchar>(const uchar& a) { return v256_setall_u8(a); }
template<> inline Type2Vec256_Traits< schar>::vec_type v256_setall< schar>(const schar& a) { return v256_setall_s8(a); }
template<> inline Type2Vec256_Traits<ushort>::vec_type v256_setall<ushort>(const ushort& a) { return v256_setall_u16(a); }
template<> inline Type2Vec256_Traits< short>::vec_type v256_setall< short>(const short& a) { return v256_setall_s16(a); }
template<> inline Type2Vec256_Traits< uint>::vec_type v256_setall< uint>(const uint& a) { return v256_setall_u32(a); }
template<> inline Type2Vec256_Traits< int>::vec_type v256_setall< int>(const int& a) { return v256_setall_s32(a); }
template<> inline Type2Vec256_Traits<uint64>::vec_type v256_setall<uint64>(const uint64& a) { return v256_setall_u64(a); }
template<> inline Type2Vec256_Traits< int64>::vec_type v256_setall< int64>(const int64& a) { return v256_setall_s64(a); }
template<> inline Type2Vec256_Traits< float>::vec_type v256_setall< float>(const float& a) { return v256_setall_f32(a); }
#if CV_SIMD256_64F
template<> inline Type2Vec256_Traits<double>::vec_type v256_setall<double>(const double& a) { return v256_setall_f64(a); }
#endif

#endif // SIMD256

#if CV_SIMD512

template<typename _T> struct Type2Vec512_Traits;
#define CV_INTRIN_DEF_TYPE2VEC512_TRAITS(type_, vec_type_) \
template<> struct Type2Vec512_Traits<type_> \
{ \
    typedef vec_type_ vec_type; \
}

CV_INTRIN_DEF_TYPE2VEC512_TRAITS(uchar, v_uint8x64);
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(schar, v_int8x64);
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(ushort, v_uint16x32);
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(short, v_int16x32);
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(unsigned, v_uint32x16);
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(int, v_int32x16);
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(float, v_float32x16);
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(uint64, v_uint64x8);
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(int64, v_int64x8);
#if CV_SIMD512_64F
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(double, v_float64x8);
#endif

template<typename _T> static inline
typename Type2Vec512_Traits<_T>::vec_type v512_setall(const _T& a);

template<> inline Type2Vec512_Traits< uchar>::vec_type v512_setall< uchar>(const uchar& a) { return v512_setall_u8(a); }
template<> inline Type2Vec512_Traits< schar>::vec_type v512_setall< schar>(const schar& a) { return v512_setall_s8(a); }
template<> inline Type2Vec512_Traits<ushort>::vec_type v512_setall<ushort>(const ushort& a) { return v512_setall_u16(a); }
template<> inline Type2Vec512_Traits< short>::vec_type v512_setall< short>(const short& a) { return v512_setall_s16(a); }
template<> inline Type2Vec512_Traits< uint>::vec_type v512_setall< uint>(const uint& a) { return v512_setall_u32(a); }
template<> inline Type2Vec512_Traits< int>::vec_type v512_setall< int>(const int& a) { return v512_setall_s32(a); }
template<> inline Type2Vec512_Traits<uint64>::vec_type v512_setall<uint64>(const uint64& a) { return v512_setall_u64(a); }
template<> inline Type2Vec512_Traits< int64>::vec_type v512_setall< int64>(const int64& a) { return v512_setall_s64(a); }
template<> inline Type2Vec512_Traits< float>::vec_type v512_setall< float>(const float& a) { return v512_setall_f32(a); }
#if CV_SIMD512_64F
template<> inline Type2Vec512_Traits<double>::vec_type v512_setall<double>(const double& a) { return v512_setall_f64(a); }
#endif

#endif // SIMD512

#if CV_SIMD_SCALABLE
template<typename _T> struct Type2Vec_Traits;
#define CV_INTRIN_DEF_TYPE2VEC_TRAITS(type_, vec_type_) \
template<> struct Type2Vec_Traits<type_> \
{ \
    typedef vec_type_ vec_type; \
}

CV_INTRIN_DEF_TYPE2VEC_TRAITS(uchar, v_uint8);
CV_INTRIN_DEF_TYPE2VEC_TRAITS(schar, v_int8);
CV_INTRIN_DEF_TYPE2VEC_TRAITS(ushort, v_uint16);
CV_INTRIN_DEF_TYPE2VEC_TRAITS(short, v_int16);
CV_INTRIN_DEF_TYPE2VEC_TRAITS(unsigned, v_uint32);
CV_INTRIN_DEF_TYPE2VEC_TRAITS(int, v_int32);
CV_INTRIN_DEF_TYPE2VEC_TRAITS(float, v_float32);
CV_INTRIN_DEF_TYPE2VEC_TRAITS(uint64, v_uint64);
CV_INTRIN_DEF_TYPE2VEC_TRAITS(int64, v_int64);
#if CV_SIMD_SCALABLE_64F
CV_INTRIN_DEF_TYPE2VEC_TRAITS(double, v_float64);
#endif

template<typename _T> static inline
typename Type2Vec_Traits<_T>::vec_type v_setall(const _T& a);

template<> inline Type2Vec_Traits< uchar>::vec_type v_setall< uchar>(const uchar& a) { return v_setall_u8(a); }
template<> inline Type2Vec_Traits< schar>::vec_type v_setall< schar>(const schar& a) { return v_setall_s8(a); }
template<> inline Type2Vec_Traits<ushort>::vec_type v_setall<ushort>(const ushort& a) { return v_setall_u16(a); }
template<> inline Type2Vec_Traits< short>::vec_type v_setall< short>(const short& a) { return v_setall_s16(a); }
template<> inline Type2Vec_Traits< uint>::vec_type v_setall< uint>(const uint& a) { return v_setall_u32(a); }
template<> inline Type2Vec_Traits< int>::vec_type v_setall< int>(const int& a) { return v_setall_s32(a); }
template<> inline Type2Vec_Traits<uint64>::vec_type v_setall<uint64>(const uint64& a) { return v_setall_u64(a); }
template<> inline Type2Vec_Traits< int64>::vec_type v_setall< int64>(const int64& a) { return v_setall_s64(a); }
template<> inline Type2Vec_Traits< float>::vec_type v_setall< float>(const float& a) { return v_setall_f32(a); }
#if CV_SIMD_SCALABLE_64F
template<> inline Type2Vec_Traits<double>::vec_type v_setall<double>(const double& a) { return v_setall_f64(a); }
#endif
#endif

#if CV_SIMD_SCALABLE
template<typename _T> static inline
typename Type2Vec_Traits<_T>::vec_type vx_setall(const _T& a) { return v_setall(a); }
#elif CV_SIMD_WIDTH == 16
template<typename _T> static inline
typename Type2Vec128_Traits<_T>::vec_type vx_setall(const _T& a) { return v_setall(a); }
#elif CV_SIMD_WIDTH == 32
template<typename _T> static inline
typename Type2Vec256_Traits<_T>::vec_type vx_setall(const _T& a) { return v256_setall(a); }
#elif CV_SIMD_WIDTH == 64
template<typename _T> static inline
typename Type2Vec512_Traits<_T>::vec_type vx_setall(const _T& a) { return v512_setall(a); }
#else
#error "Build configuration error, unsupported CV_SIMD_WIDTH"
#endif
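
// vx_setall thus forwards to whichever traits layer matches the SIMD width
// selected at build time (scalable, 128-, 256- or 512-bit), so callers can
// broadcast scalars width-agnostically; any other CV_SIMD_WIDTH is rejected
// at compile time by the #error above.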

#endif // OPENCV_HAL_INTRIN_HPP