#ifndef DataFormat_Math_SSEVec_H
#define DataFormat_Math_SSEVec_H

#if !defined(__arm__) && !defined(__aarch64__) && !defined(__MIC__) && !defined(__powerpc64__) && \
    !defined(__PPC64__) && !defined(__powerpc__) && !defined(__NVCC__) && !defined(__riscv)
#if defined(__GNUC__)
#include <x86intrin.h>
#define CMS_USE_SSE
#ifdef __SSE4_1__
// _mm_dp_ps is slower than the SSE3 horizontal-add sequence, so CMS_USE_SSE4 stays disabled
// #define CMS_USE_SSE4
#endif
#ifdef __AVX2__
#define CMS_USE_AVX2
#endif /* __AVX2__ */
#endif /* defined(__GNUC__) */
#endif /* x86 targets only (full condition above) */
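// CMS_USE_SSE enables the 128-bit SIMD implementations below; CMS_USE_AVX2
// additionally switches Vec4<double> to a 256-bit implementation (see
// private/AVXVec.h, included further down).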

#include <cmath>

namespace mathSSE {
  template <typename T>
  inline T sqrt(T t) {
    return std::sqrt(t);
  }
}  // namespace mathSSE
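// The generic scalar sqrt above is specialized for the SIMD vector types
// (Vec4F, Vec2F, Vec2D, Vec4D) at the end of this header.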

namespace mathSSE {
#ifdef CMS_USE_SSE
  // dot product: returns v1·v2 broadcast to all four lanes
  inline __m128 __attribute__((always_inline)) __attribute__((pure)) _mm_dot_ps(__m128 v1, __m128 v2) {
#ifdef CMS_USE_SSE4
    // note: this is slower than the scalar version!
    return _mm_dp_ps(v1, v2, 0xff);
#else

    __m128 mul = _mm_mul_ps(v1, v2);
#ifdef __SSE3__
    mul = _mm_hadd_ps(mul, mul);
    return _mm_hadd_ps(mul, mul);
#else
    __m128 swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 0, 3, 2));
    mul = _mm_add_ps(mul, swp);
    swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
    return _mm_add_ps(mul, swp);
#endif
#endif
  }
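  // Above: every lane of the result holds
  //   v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2] + v1[3]*v2[3].
  // The SSE3 path reduces with two horizontal adds; the SSE2 fallback pairs
  // the lanes with shuffles instead.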

  // cross product (3D only: the w lane of the result is 0)
  inline __m128 __attribute__((always_inline)) __attribute__((pure)) _mm_cross_ps(__m128 v1, __m128 v2) {
    // note: the identity shuffle is _MM_SHUFFLE(3, 2, 1, 0)
    // lanes (low to high): z2, z2, x1, -
    __m128 v3 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 0, 2, 2));
    // lanes (low to high): y1, x1, y2, -
    __m128 v4 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 1, 0, 1));

    __m128 v5 = _mm_mul_ps(v3, v4);

    // lanes (low to high): z1, z1, x2, -
    v3 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 0, 2, 2));
    // lanes (low to high): y2, x2, y1, -
    v4 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 1, 0, 1));

    v3 = _mm_mul_ps(v3, v4);
    // v5 - v3 = (y1*z2 - y2*z1, x1*z2 - x2*z1, x1*y2 - x2*y1, 0);
    // flipping the sign of the y lane completes the cross product
    const __m128i neg = _mm_set_epi32(0, 0, 0x80000000, 0);
    __m128i ret = __m128i(_mm_sub_ps(v5, v3));
    return __m128(_mm_xor_si128(ret, neg));
  }
#endif  // CMS_USE_SSE

#ifdef CMS_USE_AVX2
  inline __m256d __attribute__((always_inline)) __attribute__((pure)) _mm256_dot_pd(__m256d v1, __m256d v2) {
    __m256d mul = _mm256_mul_pd(v1, v2);
    mul = _mm256_hadd_pd(mul, mul);
    __m256d tmp = _mm256_permute2f128_pd(mul, mul, 1);
    return _mm256_add_pd(mul, tmp);
  }

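  // Same scheme as _mm_cross_ps above: the subtraction yields
  // (y1*z2 - y2*z1, x1*z2 - x2*z1, x1*y2 - x2*y1, 0) and the final XOR
  // flips the sign of the y lane.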
  inline __m256d __attribute__((always_inline)) __attribute__((pure)) _mm256_cross_pd(__m256d v1, __m256d v2) {
    __m256d v3 = _mm256_permute2f128_pd(v2, v1, (2 << 4) + 1);
    v3 = _mm256_permute_pd(v3, 0);

    __m256d v4 = _mm256_permute2f128_pd(v1, v2, (2 << 4));
    v4 = _mm256_permute_pd(v4, 5);

    __m256d v5 = _mm256_mul_pd(v3, v4);

    v3 = _mm256_permute2f128_pd(v1, v2, (2 << 4) + 1);
    v3 = _mm256_permute_pd(v3, 0);

    v4 = _mm256_permute2f128_pd(v2, v1, (2 << 4));
    v4 = _mm256_permute_pd(v4, 5);

    v3 = _mm256_mul_pd(v3, v4);
    __m256d ret = _mm256_sub_pd(v5, v3);
    const __m256i neg = _mm256_set_epi64x(0, 0, 0x8000000000000000, 0);
    return __m256d(_mm256_xor_si256(__m256i(ret), neg));
  }

#endif  //  CMS_USE_AVX2

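  // A plain (x, y, z, w) struct with the same size and alignment as Vec4;
  // Vec4 exposes it through its union member o.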
  template <typename T>
  struct OldVec {
    T theX;
    T theY;
    T theZ;
    T theW;
  } __attribute__((aligned(16)));
#ifdef CMS_USE_AVX2
  template <>
  struct OldVec<double> {
    double theX;
    double theY;
    double theZ;
    double theW;
  } __attribute__((aligned(32)));
#endif

  template <typename T>
  union Vec4;

  template <typename T>
  union Vec2 {
    Vec2() {
      arr[0] = 0;
      arr[1] = 0;
    }
    Vec2(T f1, T f2) {
      arr[0] = f1;
      arr[1] = f2;
    }
    explicit Vec2(T f1) {
      arr[0] = f1;
      arr[1] = f1;
    }

    void set(T f1, T f2) {
      arr[0] = f1;
      arr[1] = f2;
    }

    template <int N>
    Vec2 get1() const {
      return Vec2(arr[N], arr[N]);
    }

    /*
    Vec2 get1(unsigned int n) const {
      return Vec2(arr[n],arr[n]);
    }
    */

    template <typename U>
    Vec2(Vec2<U> v) {
      arr[0] = v[0];
      arr[1] = v[1];
    }

    inline Vec2(Vec4<T> v4);

    T &operator[](unsigned int n) { return arr[n]; }

    T operator[](unsigned int n) const { return arr[n]; }

    T arr[2];
  };

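  // Generic (scalar) 4-element vector, specialized for SSE/AVX2 below.
  // Illustrative usage, common to the scalar and SIMD variants (sketch only):
  //   mathSSE::Vec4<float> a(1.f, 2.f, 3.f);  // w defaults to 0
  //   a.set1(0.5f);                           // broadcast to all lanes
  //   mathSSE::Vec2<float> lo = a.xy();       // first two components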
  template <typename T>
  union Vec4 {
    Vec4() {
      arr[0] = 0;
      arr[1] = 0;
      arr[2] = 0;
      arr[3] = 0;
    }
    Vec4(T f1, T f2, T f3, T f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }
    explicit Vec4(T f1) { set1(f1); }
    void set(T f1, T f2, T f3, T f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }
    void set1(T f1) {
      arr[0] = f1;
      arr[1] = f1;
      arr[2] = f1;
      arr[3] = f1;
    }
    template <int N>
    Vec4 get1() const {
      return Vec4(arr[N], arr[N], arr[N], arr[N]);
    }
    /*
    Vec4 get1(unsigned int n) const {
      return Vec4(arr[n],arr[n],arr[n],arr[n]);
    }
    */

    Vec2<T> xy() const { return Vec2<T>(arr[0], arr[1]); }
    Vec2<T> zw() const { return Vec2<T>(arr[2], arr[3]); }

    T __attribute__((aligned(16))) arr[4];
    OldVec<T> o;
  };

  template <typename T>
  inline Vec2<T>::Vec2(Vec4<T> v4) {
    arr[0] = v4[0];
    arr[1] = v4[1];
  }

#ifdef CMS_USE_SSE

  template <>
  union Vec2<double>;
  template <>
  union Vec4<double>;

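  // Mask unions: assemble the bit pattern of a SIMD register from integer
  // masks (used by the setMask() members of the vector types below).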
  template <typename T>
  union Mask2 {};
  template <typename T>
  union Mask4 {};

  template <>
  union Mask4<float> {
    __m128 vec;
    unsigned int __attribute__((aligned(16))) mask[4];
    Mask4() { vec = _mm_setzero_ps(); }
    Mask4(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };

#ifdef CMS_USE_AVX2
  template <>
  union Mask4<double> {
    __m256d vec;
    unsigned long long __attribute__((aligned(32))) mask[4];
    Mask4() { vec = _mm256_setzero_pd(); }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };
#else
  template <>
  union Mask4<double> {
    __m128d vec[2];
    unsigned long long __attribute__((aligned(16))) mask[4];
    Mask4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };
#endif

  template <>
  union Mask2<double> {
    __m128d vec;
    unsigned long long __attribute__((aligned(16))) mask[2];
    Mask2() { vec = _mm_setzero_pd(); }
    Mask2(unsigned long long m1, unsigned long long m2) {
      mask[0] = m1;
      mask[1] = m2;
    }
  };

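  // In the specializations below, vec, arr and o alias the same bytes; reading
  // a member other than the one last written relies on union type punning,
  // which GCC supports as a documented extension.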
  template <>
  union Vec4<float> {
    typedef __m128 nativeType;
    __m128 vec;
    float __attribute__((aligned(16))) arr[4];
    OldVec<float> o;

    Vec4(__m128 ivec) : vec(ivec) {}

    Vec4(OldVec<float> const &ivec) : o(ivec) {}

    Vec4() { vec = _mm_setzero_ps(); }

    inline Vec4(Vec4<double> ivec);

    explicit Vec4(float f1) { set1(f1); }

    Vec4(float f1, float f2, float f3, float f4 = 0) { vec = _mm_set_ps(f4, f3, f2, f1); }

    Vec4(Vec2<float> ivec0, Vec2<float> ivec1) {
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
      arr[2] = ivec1.arr[0];
      arr[3] = ivec1.arr[1];
    }

    Vec4(Vec2<float> ivec0, float f3, float f4 = 0) {
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
      arr[2] = f3;
      arr[3] = f4;
    }

    Vec4(Vec2<float> ivec0) {
      vec = _mm_setzero_ps();
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
    }

    // for masking
    void setMask(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      Mask4<float> mask(m1, m2, m3, m4);
      vec = mask.vec;
    }

    void set(float f1, float f2, float f3, float f4 = 0) { vec = _mm_set_ps(f4, f3, f2, f1); }

    void set1(float f1) { vec = _mm_set1_ps(f1); }

    template <int N>
    Vec4 get1() const {
      return _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(N, N, N, N));
    }

    float &operator[](unsigned int n) { return arr[n]; }

    float operator[](unsigned int n) const { return arr[n]; }

    Vec2<float> xy() const { return Vec2<float>(arr[0], arr[1]); }
    Vec2<float> zw() const { return Vec2<float>(arr[2], arr[3]); }
  };

  template <>
  union Vec2<double> {
    typedef __m128d nativeType;
    __m128d vec;
    double __attribute__((aligned(16))) arr[2];

    Vec2(__m128d ivec) : vec(ivec) {}

    Vec2() { vec = _mm_setzero_pd(); }

    inline Vec2(Vec2<float> ivec);

    Vec2(double f1, double f2) { vec = _mm_set_pd(f2, f1); }

    explicit Vec2(double f1) { set1(f1); }

    inline Vec2(Vec4<double> v4);

    // for masking
    void setMask(unsigned long long m1, unsigned long long m2) {
      Mask2<double> mask(m1, m2);
      vec = mask.vec;
    }

    void set(double f1, double f2) { vec = _mm_set_pd(f2, f1); }

    void set1(double f1) { vec = _mm_set1_pd(f1); }

    template <int N>
    Vec2 get1() const {
      return Vec2(arr[N], arr[N]);
    }
    /*
    Vec2 get1(unsigned int n) const {
      return Vec2(arr[n],arr[n]);
    }
    */
    double &operator[](unsigned int n) { return arr[n]; }
    double operator[](unsigned int n) const { return arr[n]; }
  };

#ifdef CMS_USE_AVX2
}  // namespace mathSSE
#include "private/AVXVec.h"

namespace mathSSE {
#else
  template <>
  union Vec4<double> {
    __m128d vec[2];
    double __attribute__((aligned(16))) arr[4];
    OldVec<double> o;

    Vec4(__m128d ivec[]) {
      vec[0] = ivec[0];
      vec[1] = ivec[1];
    }

    Vec4(__m128d ivec0, __m128d ivec1) {
      vec[0] = ivec0;
      vec[1] = ivec1;
    }

    Vec4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }

    Vec4(Vec4<float> ivec) {
      vec[0] = _mm_cvtps_pd(ivec.vec);
      vec[1] = _mm_cvtps_pd(_mm_shuffle_ps(ivec.vec, ivec.vec, _MM_SHUFFLE(1, 0, 3, 2)));
    }

    explicit Vec4(double f1) { set1(f1); }

    Vec4(double f1, double f2, double f3, double f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }

    Vec4(Vec2<double> ivec0, Vec2<double> ivec1) {
      vec[0] = ivec0.vec;
      vec[1] = ivec1.vec;
    }

    Vec4(Vec2<double> ivec0, double f3, double f4 = 0) {
      vec[0] = ivec0.vec;
      arr[2] = f3;
      arr[3] = f4;
    }

    Vec4(Vec2<double> ivec0) {
      vec[0] = ivec0.vec;
      vec[1] = _mm_setzero_pd();
    }

    Vec4(OldVec<double> const &ivec) : o(ivec) {}

    // for masking
    void setMask(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      Mask4<double> mask(m1, m2, m3, m4);
      vec[0] = mask.vec[0];
      vec[1] = mask.vec[1];
    }

    void set(double f1, double f2, double f3, double f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }

    void set1(double f1) { vec[0] = vec[1] = _mm_set1_pd(f1); }

    template <int N>
    Vec4 get1() const {
      return Vec4(arr[N], arr[N], arr[N], arr[N]);
    }

    double &operator[](unsigned int n) { return arr[n]; }

    double operator[](unsigned int n) const { return arr[n]; }

    Vec2<double> xy() const { return vec[0]; }
    Vec2<double> zw() const { return vec[1]; }
  };

  inline Vec4<float>::Vec4(Vec4<double> ivec) {
    vec = _mm_cvtpd_ps(ivec.vec[0]);
    __m128 v2 = _mm_cvtpd_ps(ivec.vec[1]);
    vec = _mm_shuffle_ps(vec, v2, _MM_SHUFFLE(1, 0, 1, 0));
  }
#endif  // CMS_USE_AVX2

#endif  // CMS_USE_SSE

  typedef Vec2<float> Vec2F;
  typedef Vec4<float> Vec4F;
  typedef Vec4<float> Vec3F;
  typedef Vec2<double> Vec2D;
  typedef Vec4<double> Vec3D;
  typedef Vec4<double> Vec4D;
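  // Vec3F and Vec3D are plain aliases of the 4-element types: 3D data lives in
  // the first three lanes, with w normally kept at 0 by the constructors.
  // Illustrative usage with the SSE operators defined below (sketch only):
  //   mathSSE::Vec3F u(1.f, 0.f, 0.f), v(0.f, 1.f, 0.f);
  //   mathSSE::Vec3F n = cross(u, v);  // (0, 0, 1, 0)
  //   float d3 = dot3(u, v);           // 0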
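  // Adapter for stream output: wraps a Vec4 so that the operator<< overloads
  // declared at the end of this header can print just the 3D (x, y, z) part.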
  template <typename T>
  struct As3D {
    Vec4<T> const &v;
    As3D(Vec4<T> const &iv) : v(iv) {}
  };

  template <typename T>
  inline As3D<T> as3D(Vec4<T> const &v) {
    return v;
  }

}  // namespace mathSSE

#ifdef CMS_USE_SSE

// float operations

inline bool operator==(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_movemask_ps(_mm_cmpeq_ps(a.vec, b.vec)) == 0xf;
}

inline mathSSE::Vec4F cmpeq(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_cmpeq_ps(a.vec, b.vec); }

inline mathSSE::Vec4F cmpgt(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_cmpgt_ps(a.vec, b.vec); }

#ifdef __SSE3__
inline mathSSE::Vec4F hadd(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_hadd_ps(a.vec, b.vec); }
#endif

inline mathSSE::Vec4F operator-(mathSSE::Vec4F a) {
  const __m128 neg = _mm_set_ps(-0.0, -0.0, -0.0, -0.0);
  return _mm_xor_ps(a.vec, neg);
}

inline mathSSE::Vec4F operator&(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_and_ps(a.vec, b.vec); }
inline mathSSE::Vec4F operator|(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_or_ps(a.vec, b.vec); }
inline mathSSE::Vec4F operator^(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_xor_ps(a.vec, b.vec); }
inline mathSSE::Vec4F andnot(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_andnot_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator+(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_add_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator-(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_sub_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator*(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_mul_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator/(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_div_ps(a.vec, b.vec); }

inline mathSSE::Vec4F min(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_min_ps(a.vec, b.vec); }

inline mathSSE::Vec4F max(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_max_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator*(float a, mathSSE::Vec4F b) { return _mm_mul_ps(_mm_set1_ps(a), b.vec); }

inline mathSSE::Vec4F operator*(mathSSE::Vec4F b, float a) { return _mm_mul_ps(_mm_set1_ps(a), b.vec); }

inline mathSSE::Vec4F operator/(mathSSE::Vec4F b, float a) { return _mm_div_ps(b.vec, _mm_set1_ps(a)); }

inline float dot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_dot_ps;
  float s;
  _mm_store_ss(&s, _mm_dot_ps(a.vec, b.vec));
  return s;
}

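// note: dot() sums all four lanes, so dot3() matches the 3D dot product only
// when the w lanes are 0 (as the constructors ensure)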
inline float dot3(mathSSE::Vec4F a, mathSSE::Vec4F b) { return dot(a, b); }

inline mathSSE::Vec4F cross(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_cross_ps;
  return _mm_cross_ps(a.vec, b.vec);
}

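// dot product of the x and y lanes only (the z and w products are discarded)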
inline float dotxy(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  mathSSE::Vec4F mul = a * b;
#ifdef __SSE3__
  mul = hadd(mul, mul);
#else
  __m128 swp = _mm_shuffle_ps(mul.vec, mul.vec, _MM_SHUFFLE(2, 3, 0, 1));
  mul.vec = _mm_add_ps(mul.vec, swp);
#endif
  float s;
  _mm_store_ss(&s, mul.vec);
  return s;
}

//
// float operations, 2D (implemented via the 4D ones)
//
inline mathSSE::Vec4F operator-(mathSSE::Vec2F a) { return -mathSSE::Vec4F(a); }

inline mathSSE::Vec4F operator+(mathSSE::Vec2F a, mathSSE::Vec2F b) { return mathSSE::Vec4F(a) + mathSSE::Vec4F(b); }

inline mathSSE::Vec4F operator-(mathSSE::Vec2F a, mathSSE::Vec2F b) { return mathSSE::Vec4F(a) - mathSSE::Vec4F(b); }

inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, mathSSE::Vec2F b) { return mathSSE::Vec4F(a) * mathSSE::Vec4F(b); }

inline mathSSE::Vec4F operator/(mathSSE::Vec2F a, mathSSE::Vec2F b) { return mathSSE::Vec4F(a) / mathSSE::Vec4F(b); }

inline mathSSE::Vec4F min(mathSSE::Vec2F a, mathSSE::Vec2F b) { return min(mathSSE::Vec4F(a), mathSSE::Vec4F(b)); }

inline mathSSE::Vec4F max(mathSSE::Vec2F a, mathSSE::Vec2F b) { return ::max(mathSSE::Vec4F(a), mathSSE::Vec4F(b)); }

inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, float s) { return s * mathSSE::Vec4F(a); }

inline mathSSE::Vec4F operator*(float s, mathSSE::Vec2F a) { return s * mathSSE::Vec4F(a); }

inline mathSSE::Vec4F operator/(mathSSE::Vec2F a, float s) { return mathSSE::Vec4F(a) / s; }

inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__((pure));

inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b) { return a.arr[0] * b.arr[0] + a.arr[1] * b.arr[1]; }

inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__((pure));

inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) { return a.arr[0] * b.arr[1] - a.arr[1] * b.arr[0]; }

//
// double operations, 2D
//

inline mathSSE::Vec2D::Vec2(Vec4D v4) {
#ifdef CMS_USE_AVX2
  vec = _mm256_castpd256_pd128(v4.vec);
#else
  vec = v4.vec[0];
#endif
}

inline mathSSE::Vec2D::Vec2(Vec2F ivec) {
  arr[0] = ivec.arr[0];
  arr[1] = ivec.arr[1];
}

inline mathSSE::Vec2D operator-(mathSSE::Vec2D a) {
  const __m128d neg = _mm_set_pd(-0.0, -0.0);
  return _mm_xor_pd(a.vec, neg);
}

inline mathSSE::Vec2D operator&(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_and_pd(a.vec, b.vec); }
inline mathSSE::Vec2D operator|(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_or_pd(a.vec, b.vec); }
inline mathSSE::Vec2D operator^(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_xor_pd(a.vec, b.vec); }
inline mathSSE::Vec2D andnot(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_andnot_pd(a.vec, b.vec); }

#ifdef __SSE3__
inline mathSSE::Vec2D hadd(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_hadd_pd(a.vec, b.vec); }
#endif

inline mathSSE::Vec2D operator+(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_add_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator-(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_sub_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator*(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_mul_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator/(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_div_pd(a.vec, b.vec); }

inline mathSSE::Vec2D min(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_min_pd(a.vec, b.vec); }

inline mathSSE::Vec2D max(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_max_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator*(double a, mathSSE::Vec2D b) { return _mm_mul_pd(_mm_set1_pd(a), b.vec); }

inline mathSSE::Vec2D operator*(mathSSE::Vec2D b, double a) { return _mm_mul_pd(_mm_set1_pd(a), b.vec); }

inline mathSSE::Vec2D operator/(mathSSE::Vec2D b, double a) { return _mm_div_pd(b.vec, _mm_set1_pd(a)); }

inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__((pure));

inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  __m128d res = _mm_mul_pd(a.vec, b.vec);
#ifdef __SSE3__
  res = _mm_hadd_pd(res, res);
#else
  res = _mm_add_sd(_mm_shuffle_pd(res, res, 1), res);
#endif
  double s;
  _mm_store_sd(&s, res);
  return s;
}

inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__((pure));

inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  __m128d res = _mm_shuffle_pd(b.vec, b.vec, 1);
  res = _mm_mul_pd(a.vec, res);
  res = _mm_sub_sd(res, _mm_shuffle_pd(res, res, 1));
  double s;
  _mm_store_sd(&s, res);
  return s;
}

#ifndef CMS_USE_AVX2
// double operations, 3D

#ifdef __SSE3__
// consistent with AVX...
inline mathSSE::Vec4D hadd(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(hadd(mathSSE::Vec2D(a.vec[0]), mathSSE::Vec2D(b.vec[0])),
                        hadd(mathSSE::Vec2D(a.vec[1]), mathSSE::Vec2D(b.vec[1])));
}
#endif

inline bool operator==(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return _mm_movemask_pd(_mm_cmpeq_pd(a.vec[0], b.vec[0])) == 0x3 &&
         _mm_movemask_pd(_mm_cmpeq_pd(a.vec[1], b.vec[1])) == 0x3;
}

inline mathSSE::Vec4D operator-(mathSSE::Vec4D a) {
  const __m128d neg = _mm_set_pd(-0.0, -0.0);
  return mathSSE::Vec4D(_mm_xor_pd(a.vec[0], neg), _mm_xor_pd(a.vec[1], neg));
}

inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_add_pd(a.vec[0], b.vec[0]), _mm_add_pd(a.vec[1], b.vec[1]));
}

inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_sub_pd(a.vec[0], b.vec[0]), _mm_sub_pd(a.vec[1], b.vec[1]));
}
inline mathSSE::Vec4D operator*(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_mul_pd(a.vec[0], b.vec[0]), _mm_mul_pd(a.vec[1], b.vec[1]));
}
inline mathSSE::Vec4D operator/(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_div_pd(a.vec[0], b.vec[0]), _mm_div_pd(a.vec[1], b.vec[1]));
}

inline mathSSE::Vec4D min(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  using namespace mathSSE;
  return mathSSE::Vec4D(::min(mathSSE::Vec2D(a.vec[0]), mathSSE::Vec2D(b.vec[0])),
                        ::min(mathSSE::Vec2D(a.vec[1]), mathSSE::Vec2D(b.vec[1])));
}

inline mathSSE::Vec4D max(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  using namespace mathSSE;
  return mathSSE::Vec4D(::max(mathSSE::Vec2D(a.vec[0]), mathSSE::Vec2D(b.vec[0])),
                        ::max(mathSSE::Vec2D(a.vec[1]), mathSSE::Vec2D(b.vec[1])));
}

inline mathSSE::Vec4D operator*(double a, mathSSE::Vec4D b) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res, b.vec[0]), _mm_mul_pd(res, b.vec[1]));
}

inline mathSSE::Vec4D operator*(mathSSE::Vec4D b, double a) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res, b.vec[0]), _mm_mul_pd(res, b.vec[1]));
}

inline mathSSE::Vec4D operator/(mathSSE::Vec4D b, double a) {
  a = 1. / a;
  return a * b;
}

// mix algebra (creates ambiguities...)
/*
inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4F b) {
  return a + mathSSE::Vec4D(b);
}
inline mathSSE::Vec4D operator+(mathSSE::Vec4D b, mathSSE::Vec4F a) {
  return a + mathSSE::Vec4D(b);
}
inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4F b) {
  return a - mathSSE::Vec4D(b);
}
inline mathSSE::Vec4D operator-(mathSSE::Vec4D b, mathSSE::Vec4F a) {
  return a - mathSSE::Vec4D(b);
}
*/

inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__((pure));

inline double __attribute__((always_inline)) __attribute__((pure)) dot3(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return dot(a, b);
}

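// note: unlike the float version, this sums only the x, y and z products
// (the w lanes never enter the result)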
inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  __m128d res = _mm_add_sd(_mm_mul_pd(a.vec[0], b.vec[0]), _mm_mul_sd(a.vec[1], b.vec[1]));
  res = _mm_add_sd(_mm_unpackhi_pd(res, res), res);
  double s;
  _mm_store_sd(&s, res);
  return s;
}

inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__((pure));

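// note: the w lane of the result is left unspecified (not zeroed)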
inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  const __m128i neg = _mm_set_epi64x(0, 0x8000000000000000);
  // lh .z * rh .x , lh .z * rh .y
  __m128d l1 = _mm_mul_pd(_mm_unpacklo_pd(a.vec[1], a.vec[1]), b.vec[0]);
  // rh .z * lh .x , rh .z * lh .y
  __m128d l2 = _mm_mul_pd(_mm_unpacklo_pd(b.vec[1], b.vec[1]), a.vec[0]);
  __m128d m1 = _mm_sub_pd(l1, l2);                // l1 - l2
  m1 = _mm_shuffle_pd(m1, m1, 1);                 // switch the elements
  m1 = __m128d(_mm_xor_si128(__m128i(m1), neg));  // change the sign of the first element
  // lh .x * rh .y , lh .y * rh .x
  l1 = _mm_mul_pd(a.vec[0], _mm_shuffle_pd(b.vec[0], b.vec[0], 1));
  // lh .x * rh .y - lh .y * rh .x
  __m128d m2 = _mm_sub_sd(l1, _mm_unpackhi_pd(l1, l1));
  return mathSSE::Vec4D(m1, m2);
}

inline double __attribute__((always_inline)) __attribute__((pure)) dotxy(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return dot(a.xy(), b.xy());
}

#endif  // CMS_USE_AVX2

// sqrt
namespace mathSSE {
  template <>
  inline Vec4F sqrt(Vec4F v) {
    return _mm_sqrt_ps(v.vec);
  }
  template <>
  inline Vec2F sqrt(Vec2F v) {
    return sqrt(Vec4F(v));
  }
  template <>
  inline Vec2D sqrt(Vec2D v) {
    return _mm_sqrt_pd(v.vec);
  }
#ifdef CMS_USE_AVX2
  template <>
  inline Vec4D sqrt(Vec4D v) {
    return _mm256_sqrt_pd(v.vec);
  }
#else
  template <>
  inline Vec4D sqrt(Vec4D v) {
    return Vec4D(_mm_sqrt_pd(v.vec[0]), _mm_sqrt_pd(v.vec[1]));
  }
#endif
}  // namespace mathSSE

#endif  // CMS_USE_SSE

#include <iosfwd>
std::ostream &operator<<(std::ostream &out, mathSSE::Vec2D const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::Vec2F const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::Vec4F const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::Vec4D const &v);

std::ostream &operator<<(std::ostream &out, mathSSE::As3D<float> const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::As3D<double> const &v);

#endif  // DataFormat_Math_SSEVec_H