#ifndef DataFormat_Math_SSEVec_H
#define DataFormat_Math_SSEVec_H

#if !defined(__arm__) && !defined(__aarch64__) && !defined(__MIC__) && !defined(__powerpc64__) && \
    !defined(__PPC64__) && !defined(__powerpc__) && !defined(__NVCC__)
#if defined(__GNUC__)
#include <x86intrin.h>
#define CMS_USE_SSE
#ifdef __AVX2__
#define CMS_USE_AVX2
#endif /* __AVX2__ */
#endif /* defined(__GNUC__) */
#endif /* x86 targets only */

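// CMS_USE_SSE is defined only for x86 targets built with a GCC-compatible
// compiler; CMS_USE_AVX2 is defined in addition when the compiler targets
// AVX2. All SIMD-specific code below is guarded by these macros, so the
// header still compiles, with scalar fallbacks, on other architectures.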
#include <cmath>

namespace mathSSE {
  // Generic scalar fallback; specializations for the SIMD vector types are
  // provided near the end of this file when CMS_USE_SSE is defined.
  template <typename T>
  inline T sqrt(T t) {
    return std::sqrt(t);
  }
}  // namespace mathSSE

namespace mathSSE {
#ifdef CMS_USE_SSE
  // dot product of all four float lanes, result broadcast to every lane
  inline __m128 __attribute__((always_inline)) __attribute__((pure)) _mm_dot_ps(__m128 v1, __m128 v2) {
#ifdef __SSE4_1__
    // single instruction: multiply all four lanes, sum, broadcast
    return _mm_dp_ps(v1, v2, 0xff);
#else
    __m128 mul = _mm_mul_ps(v1, v2);
#ifdef __SSE3__
    // two horizontal adds reduce the four products to one sum in every lane
    mul = _mm_hadd_ps(mul, mul);
    return _mm_hadd_ps(mul, mul);
#else
    // SSE2: swap the 64-bit halves and add, then swap within halves and add
    __m128 swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 0, 3, 2));
    mul = _mm_add_ps(mul, swp);
    swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
    return _mm_add_ps(mul, swp);
#endif
#endif
  }

  // cross product (3x3 only; lane 3 of the result is zero)
  inline __m128 __attribute__((always_inline)) __attribute__((pure)) _mm_cross_ps(__m128 v1, __m128 v2) {
    // lanes 0..3 of v3: (z2, z2, x1, w1)
    __m128 v3 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 0, 2, 2));
    // lanes 0..3 of v4: (y1, x1, y2, w2)
    __m128 v4 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 1, 0, 1));

    __m128 v5 = _mm_mul_ps(v3, v4);

    // lanes 0..3 of v3: (z1, z1, x2, w2)
    v3 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 0, 2, 2));
    // lanes 0..3 of v4: (y2, x2, y1, w1)
    v4 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 1, 0, 1));

    v3 = _mm_mul_ps(v3, v4);
    // v5 - v3 = (y1*z2 - y2*z1, x1*z2 - x2*z1, x1*y2 - x2*y1, 0);
    // lane 1 has the opposite sign of the cross product's y component,
    // so flip its sign bit with an integer xor
    const __m128i neg = _mm_set_epi32(0, 0, 0x80000000, 0);
    __m128i ret = __m128i(_mm_sub_ps(v5, v3));
    return __m128(_mm_xor_si128(ret, neg));
  }
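
  // Minimal usage sketch (hypothetical values), assuming an SSE build;
  // vectors are (x, y, z, w) with w unused by cross and summed by dot:
  //   __m128 a = _mm_set_ps(0.f, 3.f, 2.f, 1.f);  // (1, 2, 3, 0)
  //   __m128 b = _mm_set_ps(0.f, 6.f, 5.f, 4.f);  // (4, 5, 6, 0)
  //   float d = _mm_cvtss_f32(_mm_dot_ps(a, b));  // 1*4 + 2*5 + 3*6 = 32
  //   __m128 c = _mm_cross_ps(a, b);              // (-3, 6, -3, 0)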
#endif  // CMS_USE_SSE

#ifdef CMS_USE_AVX2
  // dot product of all four double lanes, result broadcast to every lane
  inline __m256d __attribute__((always_inline)) __attribute__((pure)) _mm256_dot_pd(__m256d v1, __m256d v2) {
    __m256d mul = _mm256_mul_pd(v1, v2);
    mul = _mm256_hadd_pd(mul, mul);                     // (a+b, a+b, c+d, c+d)
    __m256d tmp = _mm256_permute2f128_pd(mul, mul, 1);  // swap the 128-bit halves
    return _mm256_add_pd(mul, tmp);
  }

  // cross product (3x3 only; lane 3 of the result is zero), double version
  inline __m256d __attribute__((always_inline)) __attribute__((pure)) _mm256_cross_pd(__m256d v1, __m256d v2) {
    // build (z2, z2, x1, x1)
    __m256d v3 = _mm256_permute2f128_pd(v2, v1, (2 << 4) + 1);
    v3 = _mm256_permute_pd(v3, 0);

    // build (y1, x1, y2, x2)
    __m256d v4 = _mm256_permute2f128_pd(v1, v2, (2 << 4));
    v4 = _mm256_permute_pd(v4, 5);

    __m256d v5 = _mm256_mul_pd(v3, v4);

    // build (z1, z1, x2, x2)
    v3 = _mm256_permute2f128_pd(v1, v2, (2 << 4) + 1);
    v3 = _mm256_permute_pd(v3, 0);

    // build (y2, x2, y1, x1)
    v4 = _mm256_permute2f128_pd(v2, v1, (2 << 4));
    v4 = _mm256_permute_pd(v4, 5);

    v3 = _mm256_mul_pd(v3, v4);
    // v5 - v3 = (y1*z2 - y2*z1, x1*z2 - x2*z1, x1*y2 - x2*y1, 0);
    // flip the sign of lane 1 to get the cross product
    __m256d ret = _mm256_sub_pd(v5, v3);
    const __m256i neg = _mm256_set_epi64x(0, 0, 0x8000000000000000, 0);
    return __m256d(_mm256_xor_si256(__m256i(ret), neg));
  }
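
  // Same conventions as the SSE float versions above: dot sums all four
  // lanes (so 3-vectors must keep lane 3 at zero), cross zeroes lane 3.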

#endif  //  CMS_USE_AVX2

  // Plain aligned x/y/z/w storage, presumably matching the layout of the
  // older non-SIMD vector classes; overlaid on the SIMD registers in the
  // unions below.
  template <typename T>
  struct OldVec {
    T theX;
    T theY;
    T theZ;
    T theW;
  } __attribute__((aligned(16)));
#ifdef CMS_USE_AVX2
  template <>
  struct OldVec<double> {
    double theX;
    double theY;
    double theZ;
    double theW;
  } __attribute__((aligned(32)));
#endif

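  // Vec2 and Vec4 are unions: the same bytes are accessible as a plain
  // array (arr), as an OldVec (o, Vec4 only), and, in the SSE
  // specializations below, as a native SIMD register (vec).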
  template <typename T>
  union Vec4;

  template <typename T>
  union Vec2 {
    Vec2() {
      arr[0] = 0;
      arr[1] = 0;
    }
    Vec2(T f1, T f2) {
      arr[0] = f1;
      arr[1] = f2;
    }
    explicit Vec2(T f1) {
      arr[0] = f1;
      arr[1] = f1;
    }

    void set(T f1, T f2) {
      arr[0] = f1;
      arr[1] = f2;
    }

    template <int N>
    Vec2 get1() const {
      return Vec2(arr[N], arr[N]);
    }

    /*
    Vec2 get1(unsigned int n) const {
      return Vec2(arr[n],arr[n]);
    }
    */

    template <typename U>
    Vec2(Vec2<U> v) {
      arr[0] = v[0];
      arr[1] = v[1];
    }

    inline Vec2(Vec4<T> v4);

    T &operator[](unsigned int n) { return arr[n]; }

    T operator[](unsigned int n) const { return arr[n]; }

    T arr[2];
  };

  template <typename T>
  union Vec4 {
    Vec4() {
      arr[0] = 0;
      arr[1] = 0;
      arr[2] = 0;
      arr[3] = 0;
    }
    Vec4(T f1, T f2, T f3, T f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }
    explicit Vec4(T f1) { set1(f1); }
    void set(T f1, T f2, T f3, T f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }
    void set1(T f1) {
      arr[0] = f1;
      arr[1] = f1;
      arr[2] = f1;
      arr[3] = f1;
    }
    template <int N>
    Vec4 get1() const {
      return Vec4(arr[N], arr[N], arr[N], arr[N]);
    }
    /*
    Vec4 get1(unsigned int n) const {
      return Vec4(arr[n],arr[n],arr[n],arr[n]);
    }
    */

    Vec2<T> xy() const { return Vec2<T>(arr[0], arr[1]); }
    Vec2<T> zw() const { return Vec2<T>(arr[2], arr[3]); }

    T __attribute__((aligned(16))) arr[4];
    OldVec<T> o;
  };

  template <typename T>
  inline Vec2<T>::Vec2(Vec4<T> v4) {
    arr[0] = v4[0];
    arr[1] = v4[1];
  }

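  // Portable usage sketch (hypothetical values); these generic unions are
  // the fallback used when CMS_USE_SSE is not defined:
  //   mathSSE::Vec4<float> v(1.f, 2.f, 3.f);  // f4 defaults to 0
  //   mathSSE::Vec2<float> p = v.xy();        // (1, 2)
  //   float y = v.arr[1];                     // 2, element access via arr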
#ifdef CMS_USE_SSE

  template <>
  union Vec2<double>;
  template <>
  union Vec4<double>;

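  // The Mask unions overlay raw bit patterns on SIMD registers; Vec2/Vec4
  // setMask() loads them into vec for use with the bitwise operators
  // (&, |, ^, andnot) defined later in this file.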
  template <typename T>
  union Mask2 {};
  template <typename T>
  union Mask4 {};

  template <>
  union Mask4<float> {
    __m128 vec;
    unsigned int __attribute__((aligned(16))) mask[4];
    Mask4() { vec = _mm_setzero_ps(); }
    Mask4(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };

#ifdef CMS_USE_AVX2
  template <>
  union Mask4<double> {
    __m256d vec;
    unsigned long long __attribute__((aligned(32))) mask[4];
    Mask4() { vec = _mm256_setzero_pd(); }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };
#else
  template <>
  union Mask4<double> {
    __m128d vec[2];
    unsigned long long __attribute__((aligned(16))) mask[4];
    Mask4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };
#endif

  template <>
  union Mask2<double> {
    __m128d vec;
    unsigned long long __attribute__((aligned(16))) mask[2];
    Mask2() { vec = _mm_setzero_pd(); }
    Mask2(unsigned long long m1, unsigned long long m2) {
      mask[0] = m1;
      mask[1] = m2;
    }
  };

  template <>
  union Vec4<float> {
    typedef __m128 nativeType;
    __m128 vec;
    float __attribute__((aligned(16))) arr[4];
    OldVec<float> o;

    Vec4(__m128 ivec) : vec(ivec) {}

    Vec4(OldVec<float> const &ivec) : o(ivec) {}

    Vec4() { vec = _mm_setzero_ps(); }

    inline Vec4(Vec4<double> ivec);

    explicit Vec4(float f1) { set1(f1); }

    Vec4(float f1, float f2, float f3, float f4 = 0) { vec = _mm_set_ps(f4, f3, f2, f1); }

    Vec4(Vec2<float> ivec0, Vec2<float> ivec1) {
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
      arr[2] = ivec1.arr[0];
      arr[3] = ivec1.arr[1];
    }

    Vec4(Vec2<float> ivec0, float f3, float f4 = 0) {
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
      arr[2] = f3;
      arr[3] = f4;
    }

    Vec4(Vec2<float> ivec0) {
      vec = _mm_setzero_ps();
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
    }

    // for masking
    void setMask(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      Mask4<float> mask(m1, m2, m3, m4);
      vec = mask.vec;
    }

    void set(float f1, float f2, float f3, float f4 = 0) { vec = _mm_set_ps(f4, f3, f2, f1); }

    void set1(float f1) { vec = _mm_set1_ps(f1); }

    template <int N>
    Vec4 get1() const {
      return _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(N, N, N, N));
    }

    float &operator[](unsigned int n) { return arr[n]; }

    float operator[](unsigned int n) const { return arr[n]; }

    Vec2<float> xy() const { return Vec2<float>(arr[0], arr[1]); }
    Vec2<float> zw() const { return Vec2<float>(arr[2], arr[3]); }
  };
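
  // Sketch (hypothetical values): get1<N>() broadcasts one lane:
  //   mathSSE::Vec4F v(1.f, 2.f, 3.f, 4.f);
  //   mathSSE::Vec4F z = v.get1<2>();  // (3, 3, 3, 3) via a single shuffle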

  template <>
  union Vec2<double> {
    typedef __m128d nativeType;
    __m128d vec;
    double __attribute__((aligned(16))) arr[2];

    Vec2(__m128d ivec) : vec(ivec) {}

    Vec2() { vec = _mm_setzero_pd(); }

    inline Vec2(Vec2<float> ivec);

    Vec2(double f1, double f2) { vec = _mm_set_pd(f2, f1); }

    explicit Vec2(double f1) { set1(f1); }

    inline Vec2(Vec4<double> v4);

    // for masking
    void setMask(unsigned long long m1, unsigned long long m2) {
      Mask2<double> mask(m1, m2);
      vec = mask.vec;
    }

    void set(double f1, double f2) { vec = _mm_set_pd(f2, f1); }

    void set1(double f1) { vec = _mm_set1_pd(f1); }

    template <int N>
    Vec2 get1() const {
      return Vec2(arr[N], arr[N]);
    }
    /*
    Vec2 get1(unsigned int n) const {
      return Vec2(arr[n],arr[n]);
    }
    */
    double &operator[](unsigned int n) { return arr[n]; }
    double operator[](unsigned int n) const { return arr[n]; }
  };
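
  // Sketch (hypothetical values): setMask() enables bit tricks such as a
  // two-lane fabs, using the bitwise operators defined later in this file:
  //   mathSSE::Vec2D v(-1.5, 2.5), m;
  //   m.setMask(0x7fffffffffffffffULL, 0x7fffffffffffffffULL);
  //   mathSSE::Vec2D r = v & m;  // (1.5, 2.5)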

#ifdef CMS_USE_AVX2
}  // namespace mathSSE
// With AVX2, Vec4<double> is a single 256-bit register; its definition and
// operators live in the private AVXVec.h header.
#include "private/AVXVec.h"

namespace mathSSE {
#else
  // Without AVX2, Vec4<double> packs its four lanes into two 128-bit
  // registers: vec[0] holds (x, y), vec[1] holds (z, w).
  template <>
  union Vec4<double> {
    __m128d vec[2];
    double __attribute__((aligned(16))) arr[4];
    OldVec<double> o;

    Vec4(__m128d ivec[]) {
      vec[0] = ivec[0];
      vec[1] = ivec[1];
    }

    Vec4(__m128d ivec0, __m128d ivec1) {
      vec[0] = ivec0;
      vec[1] = ivec1;
    }

    Vec4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }

    Vec4(Vec4<float> ivec) {
      vec[0] = _mm_cvtps_pd(ivec.vec);
      vec[1] = _mm_cvtps_pd(_mm_shuffle_ps(ivec.vec, ivec.vec, _MM_SHUFFLE(1, 0, 3, 2)));
    }

    explicit Vec4(double f1) { set1(f1); }

    Vec4(double f1, double f2, double f3, double f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }

    Vec4(Vec2<double> ivec0, Vec2<double> ivec1) {
      vec[0] = ivec0.vec;
      vec[1] = ivec1.vec;
    }

    Vec4(Vec2<double> ivec0, double f3, double f4 = 0) {
      vec[0] = ivec0.vec;
      arr[2] = f3;
      arr[3] = f4;
    }

    Vec4(Vec2<double> ivec0) {
      vec[0] = ivec0.vec;
      vec[1] = _mm_setzero_pd();
    }

    Vec4(OldVec<double> const &ivec) : o(ivec) {}

    // for masking
    void setMask(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      Mask4<double> mask(m1, m2, m3, m4);
      vec[0] = mask.vec[0];
      vec[1] = mask.vec[1];
    }

    void set(double f1, double f2, double f3, double f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }

    void set1(double f1) { vec[0] = vec[1] = _mm_set1_pd(f1); }

    template <int N>
    Vec4 get1() const {
      return Vec4(arr[N], arr[N], arr[N], arr[N]);
    }

    double &operator[](unsigned int n) { return arr[n]; }

    double operator[](unsigned int n) const { return arr[n]; }

    Vec2<double> xy() const { return vec[0]; }
    Vec2<double> zw() const { return vec[1]; }
  };

  // narrow a double Vec4 to float: convert each 128-bit half and repack
  inline Vec4<float>::Vec4(Vec4<double> ivec) {
    vec = _mm_cvtpd_ps(ivec.vec[0]);
    __m128 v2 = _mm_cvtpd_ps(ivec.vec[1]);
    vec = _mm_shuffle_ps(vec, v2, _MM_SHUFFLE(1, 0, 1, 0));
  }
#endif  // CMS_USE_AVX2

#endif  // CMS_USE_SSE

  typedef Vec2<float> Vec2F;
  typedef Vec4<float> Vec4F;
  // 3-vectors reuse the 4-lane types; the 4th lane is kept at zero/ignored
  typedef Vec4<float> Vec3F;
  typedef Vec2<double> Vec2D;
  typedef Vec4<double> Vec3D;
  typedef Vec4<double> Vec4D;

  // Wrapper used by the stream operators declared at the end of this file,
  // presumably to print only the three spatial components of a Vec4.
  template <typename T>
  struct As3D {
    Vec4<T> const &v;
    As3D(Vec4<T> const &iv) : v(iv) {}
  };

  template <typename T>
  inline As3D<T> as3D(Vec4<T> const &v) {
    return v;
  }

}  // namespace mathSSE

#ifdef CMS_USE_SSE

// float operations

inline bool operator==(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_movemask_ps(_mm_cmpeq_ps(a.vec, b.vec)) == 0xf;
}

inline mathSSE::Vec4F cmpeq(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_cmpeq_ps(a.vec, b.vec); }

inline mathSSE::Vec4F cmpgt(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_cmpgt_ps(a.vec, b.vec); }

#ifdef __SSE3__
inline mathSSE::Vec4F hadd(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_hadd_ps(a.vec, b.vec); }
#endif

inline mathSSE::Vec4F operator-(mathSSE::Vec4F a) {
  // flip the sign bit of every lane by xor-ing with -0.0f
  const __m128 neg = _mm_set_ps(-0.0, -0.0, -0.0, -0.0);
  return _mm_xor_ps(a.vec, neg);
}

inline mathSSE::Vec4F operator&(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_and_ps(a.vec, b.vec); }
inline mathSSE::Vec4F operator|(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_or_ps(a.vec, b.vec); }
inline mathSSE::Vec4F operator^(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_xor_ps(a.vec, b.vec); }
inline mathSSE::Vec4F andnot(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_andnot_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator+(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_add_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator-(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_sub_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator*(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_mul_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator/(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_div_ps(a.vec, b.vec); }

inline mathSSE::Vec4F min(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_min_ps(a.vec, b.vec); }

inline mathSSE::Vec4F max(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_max_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator*(float a, mathSSE::Vec4F b) { return _mm_mul_ps(_mm_set1_ps(a), b.vec); }

inline mathSSE::Vec4F operator*(mathSSE::Vec4F b, float a) { return _mm_mul_ps(_mm_set1_ps(a), b.vec); }

inline mathSSE::Vec4F operator/(mathSSE::Vec4F b, float a) { return _mm_div_ps(b.vec, _mm_set1_ps(a)); }

inline float dot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_dot_ps;
  float s;
  _mm_store_ss(&s, _mm_dot_ps(a.vec, b.vec));
  return s;
}

inline mathSSE::Vec4F cross(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_cross_ps;
  return _mm_cross_ps(a.vec, b.vec);
}
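
// Sketch (hypothetical values): dot and cross on 3-vectors (lane 3 zero):
//   mathSSE::Vec4F u(1.f, 0.f, 0.f), w(0.f, 1.f, 0.f);
//   float d = dot(u, w);             // 0: orthogonal
//   mathSSE::Vec4F n = cross(u, w);  // (0, 0, 1, 0)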

inline float dotxy(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  mathSSE::Vec4F mul = a * b;
#ifdef __SSE3__
  mul = hadd(mul, mul);
#else
  __m128 swp = _mm_shuffle_ps(mul.vec, mul.vec, _MM_SHUFFLE(2, 3, 0, 1));
  mul.vec = _mm_add_ps(mul.vec, swp);
#endif
  // lane 0 now holds x1*x2 + y1*y2
  float s;
  _mm_store_ss(&s, mul.vec);
  return s;
}

//
// float operations, 2d (implemented via the 4d ones)
//
inline mathSSE::Vec4F operator-(mathSSE::Vec2F a) { return -mathSSE::Vec4F(a); }

inline mathSSE::Vec4F operator+(mathSSE::Vec2F a, mathSSE::Vec2F b) { return mathSSE::Vec4F(a) + mathSSE::Vec4F(b); }

inline mathSSE::Vec4F operator-(mathSSE::Vec2F a, mathSSE::Vec2F b) { return mathSSE::Vec4F(a) - mathSSE::Vec4F(b); }

inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, mathSSE::Vec2F b) { return mathSSE::Vec4F(a) * mathSSE::Vec4F(b); }

inline mathSSE::Vec4F operator/(mathSSE::Vec2F a, mathSSE::Vec2F b) { return mathSSE::Vec4F(a) / mathSSE::Vec4F(b); }

inline mathSSE::Vec4F min(mathSSE::Vec2F a, mathSSE::Vec2F b) { return min(mathSSE::Vec4F(a), mathSSE::Vec4F(b)); }

inline mathSSE::Vec4F max(mathSSE::Vec2F a, mathSSE::Vec2F b) { return ::max(mathSSE::Vec4F(a), mathSSE::Vec4F(b)); }

inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, float s) { return s * mathSSE::Vec4F(a); }

inline mathSSE::Vec4F operator*(float s, mathSSE::Vec2F a) { return s * mathSSE::Vec4F(a); }

inline mathSSE::Vec4F operator/(mathSSE::Vec2F a, float s) { return mathSSE::Vec4F(a) / s; }

inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__((pure));

inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b) { return a.arr[0] * b.arr[0] + a.arr[1] * b.arr[1]; }

inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__((pure));

inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) { return a.arr[0] * b.arr[1] - a.arr[1] * b.arr[0]; }
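
// Sketch (hypothetical values): the scalar 2d cross is the z component of
// the corresponding 3d cross product:
//   mathSSE::Vec2F a(1.f, 2.f), b(3.f, 4.f);
//   float z = cross(a, b);  // 1*4 - 2*3 = -2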

//
// double operations, 2d
//

inline mathSSE::Vec2D::Vec2(Vec4D v4) {
#ifdef CMS_USE_AVX2
  vec = _mm256_castpd256_pd128(v4.vec);  // take the low (x, y) half
#else
  vec = v4.vec[0];
#endif
}

inline mathSSE::Vec2D::Vec2(Vec2F ivec) {
  arr[0] = ivec.arr[0];
  arr[1] = ivec.arr[1];
}

inline mathSSE::Vec2D operator-(mathSSE::Vec2D a) {
  const __m128d neg = _mm_set_pd(-0.0, -0.0);
  return _mm_xor_pd(a.vec, neg);
}

inline mathSSE::Vec2D operator&(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_and_pd(a.vec, b.vec); }
inline mathSSE::Vec2D operator|(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_or_pd(a.vec, b.vec); }
inline mathSSE::Vec2D operator^(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_xor_pd(a.vec, b.vec); }
inline mathSSE::Vec2D andnot(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_andnot_pd(a.vec, b.vec); }

#ifdef __SSE3__
inline mathSSE::Vec2D hadd(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_hadd_pd(a.vec, b.vec); }
#endif

inline mathSSE::Vec2D operator+(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_add_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator-(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_sub_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator*(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_mul_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator/(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_div_pd(a.vec, b.vec); }

inline mathSSE::Vec2D min(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_min_pd(a.vec, b.vec); }

inline mathSSE::Vec2D max(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_max_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator*(double a, mathSSE::Vec2D b) { return _mm_mul_pd(_mm_set1_pd(a), b.vec); }

inline mathSSE::Vec2D operator*(mathSSE::Vec2D b, double a) { return _mm_mul_pd(_mm_set1_pd(a), b.vec); }

inline mathSSE::Vec2D operator/(mathSSE::Vec2D b, double a) { return _mm_div_pd(b.vec, _mm_set1_pd(a)); }

inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__((pure));

inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  __m128d res = _mm_mul_pd(a.vec, b.vec);
#ifdef __SSE3__
  res = _mm_hadd_pd(res, res);
#else
  res = _mm_add_sd(_mm_shuffle_pd(res, res, 1), res);
#endif
  double s;
  _mm_store_sd(&s, res);
  return s;
}

inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__((pure));

inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  __m128d res = _mm_shuffle_pd(b.vec, b.vec, 1);  // (y2, x2)
  res = _mm_mul_pd(a.vec, res);                   // (x1*y2, y1*x2)
  res = _mm_sub_sd(res, _mm_shuffle_pd(res, res, 1));
  double s;
  _mm_store_sd(&s, res);
  return s;
}

#ifndef CMS_USE_AVX2
// double operations, 3d

#ifdef __SSE3__
// consistent with AVX...
inline mathSSE::Vec4D hadd(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(hadd(mathSSE::Vec2D(a.vec[0]), mathSSE::Vec2D(b.vec[0])),
                        hadd(mathSSE::Vec2D(a.vec[1]), mathSSE::Vec2D(b.vec[1])));
}
#endif

inline bool operator==(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return _mm_movemask_pd(_mm_cmpeq_pd(a.vec[0], b.vec[0])) == 0x3 &&
         _mm_movemask_pd(_mm_cmpeq_pd(a.vec[1], b.vec[1])) == 0x3;
}

inline mathSSE::Vec4D operator-(mathSSE::Vec4D a) {
  const __m128d neg = _mm_set_pd(-0.0, -0.0);
  return mathSSE::Vec4D(_mm_xor_pd(a.vec[0], neg), _mm_xor_pd(a.vec[1], neg));
}

inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_add_pd(a.vec[0], b.vec[0]), _mm_add_pd(a.vec[1], b.vec[1]));
}

inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_sub_pd(a.vec[0], b.vec[0]), _mm_sub_pd(a.vec[1], b.vec[1]));
}
inline mathSSE::Vec4D operator*(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_mul_pd(a.vec[0], b.vec[0]), _mm_mul_pd(a.vec[1], b.vec[1]));
}
inline mathSSE::Vec4D operator/(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_div_pd(a.vec[0], b.vec[0]), _mm_div_pd(a.vec[1], b.vec[1]));
}

inline mathSSE::Vec4D min(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  using namespace mathSSE;
  return mathSSE::Vec4D(::min(mathSSE::Vec2D(a.vec[0]), mathSSE::Vec2D(b.vec[0])),
                        ::min(mathSSE::Vec2D(a.vec[1]), mathSSE::Vec2D(b.vec[1])));
}

inline mathSSE::Vec4D max(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  using namespace mathSSE;
  return mathSSE::Vec4D(::max(mathSSE::Vec2D(a.vec[0]), mathSSE::Vec2D(b.vec[0])),
                        ::max(mathSSE::Vec2D(a.vec[1]), mathSSE::Vec2D(b.vec[1])));
}

inline mathSSE::Vec4D operator*(double a, mathSSE::Vec4D b) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res, b.vec[0]), _mm_mul_pd(res, b.vec[1]));
}

inline mathSSE::Vec4D operator*(mathSSE::Vec4D b, double a) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res, b.vec[0]), _mm_mul_pd(res, b.vec[1]));
}

inline mathSSE::Vec4D operator/(mathSSE::Vec4D b, double a) {
  a = 1. / a;  // one reciprocal plus two multiplies instead of two divisions
  return a * b;
}

// mixed algebra (would create ambiguities...)
/*
inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4F b) {
  return a + mathSSE::Vec4D(b);
}
inline mathSSE::Vec4D operator+(mathSSE::Vec4D b, mathSSE::Vec4F a) {
  return a + mathSSE::Vec4D(b);
}
inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4F b) {
  return a - mathSSE::Vec4D(b);
}
inline mathSSE::Vec4D operator-(mathSSE::Vec4D b, mathSSE::Vec4F a) {
  return a - mathSSE::Vec4D(b);
}
*/

inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__((pure));

inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  // lane 0: x1*x2 + z1*z2; the w lanes never enter the sum
  __m128d res = _mm_add_sd(_mm_mul_pd(a.vec[0], b.vec[0]), _mm_mul_sd(a.vec[1], b.vec[1]));
  res = _mm_add_sd(_mm_unpackhi_pd(res, res), res);  // add y1*y2
  double s;
  _mm_store_sd(&s, res);
  return s;
}

inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__((pure));

inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  const __m128i neg = _mm_set_epi64x(0, 0x8000000000000000);
  // lh.z * rh.x , lh.z * rh.y
  __m128d l1 = _mm_mul_pd(_mm_unpacklo_pd(a.vec[1], a.vec[1]), b.vec[0]);
  // rh.z * lh.x , rh.z * lh.y
  __m128d l2 = _mm_mul_pd(_mm_unpacklo_pd(b.vec[1], b.vec[1]), a.vec[0]);
  __m128d m1 = _mm_sub_pd(l1, l2);                // l1 - l2
  m1 = _mm_shuffle_pd(m1, m1, 1);                 // switch the elements
  m1 = __m128d(_mm_xor_si128(__m128i(m1), neg));  // change the sign of the first element
  // lh.x * rh.y , lh.y * rh.x
  l1 = _mm_mul_pd(a.vec[0], _mm_shuffle_pd(b.vec[0], b.vec[0], 1));
  // lh.x * rh.y - lh.y * rh.x
  __m128d m2 = _mm_sub_sd(l1, _mm_unpackhi_pd(l1, l1));
  return mathSSE::Vec4D(m1, m2);
}

inline double __attribute__((always_inline)) __attribute__((pure)) dotxy(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return dot(a.xy(), b.xy());
}

#endif  // CMS_USE_AVX2

// sqrt: specializations of the generic mathSSE::sqrt defined at the top
namespace mathSSE {
  template <>
  inline Vec4F sqrt(Vec4F v) {
    return _mm_sqrt_ps(v.vec);
  }
  template <>
  inline Vec2F sqrt(Vec2F v) {
    return sqrt(Vec4F(v));
  }
  template <>
  inline Vec2D sqrt(Vec2D v) {
    return _mm_sqrt_pd(v.vec);
  }
#ifdef CMS_USE_AVX2
  template <>
  inline Vec4D sqrt(Vec4D v) {
    return _mm256_sqrt_pd(v.vec);
  }
#else
  template <>
  inline Vec4D sqrt(Vec4D v) {
    return Vec4D(_mm_sqrt_pd(v.vec[0]), _mm_sqrt_pd(v.vec[1]));
  }
#endif
}  // namespace mathSSE
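
// Sketch (hypothetical values): the specializations apply element-wise:
//   mathSSE::Vec4F v(1.f, 4.f, 9.f, 16.f);
//   mathSSE::Vec4F r = mathSSE::sqrt(v);  // (1, 2, 3, 4)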

#endif  // CMS_USE_SSE

#include <iosfwd>
std::ostream &operator<<(std::ostream &out, mathSSE::Vec2D const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::Vec2F const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::Vec4F const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::Vec4D const &v);

std::ostream &operator<<(std::ostream &out, mathSSE::As3D<float> const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::As3D<double> const &v);

#endif  // DataFormat_Math_SSEVec_H