#ifndef DataFormat_Math_SSEVec_H
#define DataFormat_Math_SSEVec_H

#if !defined(__arm__) && !defined(__aarch64__) && !defined(__MIC__) && !defined(__powerpc64__) && \
    !defined(__PPC64__) && !defined(__powerpc__) && !defined(__NVCC__)
#if defined(__GNUC__)
#include <x86intrin.h>
#define CMS_USE_SSE
#ifdef __AVX2__
#define CMS_USE_AVX2
#endif
#endif
#endif

#include <cmath>

namespace mathSSE {
  template <typename T>
  inline T sqrt(T t) {
    return std::sqrt(t);
  }
}  // namespace mathSSE
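
// The scalar sqrt above is the generic fallback; when CMS_USE_SSE is defined,
// SIMD overloads for the vector types below (Vec4F, Vec2D, Vec4D, ...) are
// provided near the end of this header.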

namespace mathSSE {
#ifdef CMS_USE_SSE

  inline __m128 __attribute__((always_inline)) __attribute__((pure)) _mm_dot_ps(__m128 v1, __m128 v2) {
#ifdef __SSE4_1__
    return _mm_dp_ps(v1, v2, 0xff);
#else
    __m128 mul = _mm_mul_ps(v1, v2);
#ifdef __SSE3__
    mul = _mm_hadd_ps(mul, mul);
    return _mm_hadd_ps(mul, mul);
#else
    __m128 swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 0, 3, 2));
    mul = _mm_add_ps(mul, swp);
    swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
    return _mm_add_ps(mul, swp);
#endif
#endif
  }
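
  // All four lanes of the result hold the same 4-component dot product; the
  // scalar equivalent is
  //   v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2] + v1[3]*v2[3].
  // SSE4.1 does this in a single DPPS; SSE3 reduces with two horizontal adds;
  // plain SSE needs two shuffle+add rounds.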
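
  // Cross product via shuffles: two shuffle/multiply pairs build the partial
  // products, the subtraction yields
  //   (a.y*b.z - a.z*b.y, a.x*b.z - a.z*b.x, a.x*b.y - a.y*b.x, 0),
  // and the final XOR flips the sign of the middle (y) lane to obtain the
  // right-handed result. The w lanes of the inputs cancel in the subtraction.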
  inline __m128 __attribute__((always_inline)) __attribute__((pure)) _mm_cross_ps(__m128 v1, __m128 v2) {
    // (z2, z2, x1, w1)
    __m128 v3 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 0, 2, 2));
    // (y1, x1, y2, w2)
    __m128 v4 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 1, 0, 1));
    __m128 v5 = _mm_mul_ps(v3, v4);

    // (z1, z1, x2, w2)
    v3 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 0, 2, 2));
    // (y2, x2, y1, w1)
    v4 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 1, 0, 1));
    v3 = _mm_mul_ps(v3, v4);

    // flip the sign of the y lane
    const __m128i neg = _mm_set_epi32(0, 0, 0x80000000, 0);
    __m128i ret = __m128i(_mm_sub_ps(v5, v3));
    return __m128(_mm_xor_si128(ret, neg));
  }
#endif  // CMS_USE_SSE

#ifdef CMS_USE_AVX2
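  // Dot product of two 4-double vectors: after the in-lane horizontal add,
  // adding the swapped 128-bit halves leaves the full sum in every lane.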
  inline __m256d __attribute__((always_inline)) __attribute__((pure)) _mm256_dot_pd(__m256d v1, __m256d v2) {
    __m256d mul = _mm256_mul_pd(v1, v2);
    mul = _mm256_hadd_pd(mul, mul);
    __m256d tmp = _mm256_permute2f128_pd(mul, mul, 1);
    return _mm256_add_pd(mul, tmp);
  }

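  // AVX analogue of _mm_cross_ps: cross-lane permutes build the two operand
  // orders, the subtraction leaves (yz - zy, xz - zx, xy - yx, 0), and the
  // XOR flips the sign of the y lane.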
  inline __m256d __attribute__((always_inline)) __attribute__((pure)) _mm256_cross_pd(__m256d v1, __m256d v2) {
    __m256d v3 = _mm256_permute2f128_pd(v2, v1, (2 << 4) + 1);
    v3 = _mm256_permute_pd(v3, 0);

    __m256d v4 = _mm256_permute2f128_pd(v1, v2, (2 << 4));
    v4 = _mm256_permute_pd(v4, 5);

    __m256d v5 = _mm256_mul_pd(v3, v4);

    v3 = _mm256_permute2f128_pd(v1, v2, (2 << 4) + 1);
    v3 = _mm256_permute_pd(v3, 0);

    v4 = _mm256_permute2f128_pd(v2, v1, (2 << 4));
    v4 = _mm256_permute_pd(v4, 5);

    v3 = _mm256_mul_pd(v3, v4);
    __m256d ret = _mm256_sub_pd(v5, v3);
    const __m256i neg = _mm256_set_epi64x(0, 0, 0x8000000000000000, 0);
    return __m256d(_mm256_xor_si256(__m256i(ret), neg));
  }

#endif

  template <typename T>
  struct OldVec {
    T theX;
    T theY;
    T theZ;
    T theW;
  } __attribute__((aligned(16)));
#ifdef CMS_USE_AVX2
  template <>
  struct OldVec<double> {
    double theX;
    double theY;
    double theZ;
    double theW;
  } __attribute__((aligned(32)));
#endif

  template <typename T>
  union Vec4;

  template <typename T>
  union Vec2 {
    Vec2() {
      arr[0] = 0;
      arr[1] = 0;
    }
    Vec2(T f1, T f2) {
      arr[0] = f1;
      arr[1] = f2;
    }
    explicit Vec2(T f1) {
      arr[0] = f1;
      arr[1] = f1;
    }

    void set(T f1, T f2) {
      arr[0] = f1;
      arr[1] = f2;
    }

    template <int N>
    Vec2 get1() const {
      return Vec2(arr[N], arr[N]);
    }

    template <typename U>
    Vec2(Vec2<U> v) {
      arr[0] = v[0];
      arr[1] = v[1];
    }

    inline Vec2(Vec4<T> v4);

    T &operator[](unsigned int n) { return arr[n]; }

    T operator[](unsigned int n) const { return arr[n]; }

    T arr[2];
  };

  template <typename T>
  union Vec4 {
    Vec4() {
      arr[0] = 0;
      arr[1] = 0;
      arr[2] = 0;
      arr[3] = 0;
    }
    Vec4(T f1, T f2, T f3, T f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }
    explicit Vec4(T f1) { set1(f1); }
    void set(T f1, T f2, T f3, T f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }
    void set1(T f1) {
      arr[0] = f1;
      arr[1] = f1;
      arr[2] = f1;
      arr[3] = f1;
    }
    template <int N>
    Vec4 get1() const {
      return Vec4(arr[N], arr[N], arr[N], arr[N]);
    }

    Vec2<T> xy() const { return Vec2<T>(arr[0], arr[1]); }
    Vec2<T> zw() const { return Vec2<T>(arr[2], arr[3]); }

    T __attribute__((aligned(16))) arr[4];
    OldVec<T> o;
  };

  template <typename T>
  inline Vec2<T>::Vec2(Vec4<T> v4) {
    arr[0] = v4[0];
    arr[1] = v4[1];
  }

#ifdef CMS_USE_SSE

  template <>
  union Vec2<double>;
  template <>
  union Vec4<double>;

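  // Mask2/Mask4 overlay an integer bit pattern on a SIMD register; the setMask
  // methods below use them to load an exact bit pattern into a vector, e.g.
  // for use with the bitwise operators (&, |, ^, andnot) defined later.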
  template <typename T>
  union Mask2 {};
  template <typename T>
  union Mask4 {};

  template <>
  union Mask4<float> {
    __m128 vec;
    unsigned int __attribute__((aligned(16))) mask[4];
    Mask4() { vec = _mm_setzero_ps(); }
    Mask4(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };

#ifdef CMS_USE_AVX2
  template <>
  union Mask4<double> {
    __m256d vec;
    unsigned long long __attribute__((aligned(32))) mask[4];
    Mask4() { vec = _mm256_setzero_pd(); }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };
#else
  template <>
  union Mask4<double> {
    __m128d vec[2];
    unsigned long long __attribute__((aligned(16))) mask[4];
    Mask4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };
#endif

  template <>
  union Mask2<double> {
    __m128d vec;
    unsigned long long __attribute__((aligned(16))) mask[2];
    Mask2() { vec = _mm_setzero_pd(); }
    Mask2(unsigned long long m1, unsigned long long m2) {
      mask[0] = m1;
      mask[1] = m2;
    }
  };

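  // SSE specialization: vec, arr and o alias the same 16 bytes, so element
  // access through operator[] and whole-register intrinsics stay in sync.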
  template <>
  union Vec4<float> {
    typedef __m128 nativeType;
    __m128 vec;
    float __attribute__((aligned(16))) arr[4];
    OldVec<float> o;

    Vec4(__m128 ivec) : vec(ivec) {}

    Vec4(OldVec<float> const &ivec) : o(ivec) {}

    Vec4() { vec = _mm_setzero_ps(); }

    inline Vec4(Vec4<double> ivec);

    explicit Vec4(float f1) { set1(f1); }

    Vec4(float f1, float f2, float f3, float f4 = 0) { vec = _mm_set_ps(f4, f3, f2, f1); }

    Vec4(Vec2<float> ivec0, Vec2<float> ivec1) {
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
      arr[2] = ivec1.arr[0];
      arr[3] = ivec1.arr[1];
    }

    Vec4(Vec2<float> ivec0, float f3, float f4 = 0) {
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
      arr[2] = f3;
      arr[3] = f4;
    }

    Vec4(Vec2<float> ivec0) {
      vec = _mm_setzero_ps();
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
    }

    void setMask(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      Mask4<float> mask(m1, m2, m3, m4);
      vec = mask.vec;
    }

    void set(float f1, float f2, float f3, float f4 = 0) { vec = _mm_set_ps(f4, f3, f2, f1); }

    void set1(float f1) { vec = _mm_set1_ps(f1); }

    template <int N>
    Vec4 get1() const {
      return _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(N, N, N, N));
    }

    float &operator[](unsigned int n) { return arr[n]; }

    float operator[](unsigned int n) const { return arr[n]; }

    Vec2<float> xy() const { return Vec2<float>(arr[0], arr[1]); }
    Vec2<float> zw() const { return Vec2<float>(arr[2], arr[3]); }
  };

  template <>
  union Vec2<double> {
    typedef __m128d nativeType;
    __m128d vec;
    double __attribute__((aligned(16))) arr[2];

    Vec2(__m128d ivec) : vec(ivec) {}

    Vec2() { vec = _mm_setzero_pd(); }

    inline Vec2(Vec2<float> ivec);

    Vec2(double f1, double f2) { vec = _mm_set_pd(f2, f1); }

    explicit Vec2(double f1) { set1(f1); }

    inline Vec2(Vec4<double> v4);

    void setMask(unsigned long long m1, unsigned long long m2) {
      Mask2<double> mask(m1, m2);
      vec = mask.vec;
    }

    void set(double f1, double f2) { vec = _mm_set_pd(f2, f1); }

    void set1(double f1) { vec = _mm_set1_pd(f1); }

    template <int N>
    Vec2 get1() const {
      return Vec2(arr[N], arr[N]);
    }

    double &operator[](unsigned int n) { return arr[n]; }
    double operator[](unsigned int n) const { return arr[n]; }
  };

#ifdef CMS_USE_AVX2
}  // namespace mathSSE
#include "private/AVXVec.h"

namespace mathSSE {
#else
  template <>
  union Vec4<double> {
    __m128d vec[2];
    double __attribute__((aligned(16))) arr[4];
    OldVec<double> o;

    Vec4(__m128d ivec[]) {
      vec[0] = ivec[0];
      vec[1] = ivec[1];
    }

    Vec4(__m128d ivec0, __m128d ivec1) {
      vec[0] = ivec0;
      vec[1] = ivec1;
    }

    Vec4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }

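    // Widening conversion from floats: each _mm_cvtps_pd converts the two low
    // float lanes, so the shuffle moves z,w down to feed the second half.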
    Vec4(Vec4<float> ivec) {
      vec[0] = _mm_cvtps_pd(ivec.vec);
      vec[1] = _mm_cvtps_pd(_mm_shuffle_ps(ivec.vec, ivec.vec, _MM_SHUFFLE(1, 0, 3, 2)));
    }

    explicit Vec4(double f1) { set1(f1); }

    Vec4(double f1, double f2, double f3, double f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }

    Vec4(Vec2<double> ivec0, Vec2<double> ivec1) {
      vec[0] = ivec0.vec;
      vec[1] = ivec1.vec;
    }

    Vec4(Vec2<double> ivec0, double f3, double f4 = 0) {
      vec[0] = ivec0.vec;
      arr[2] = f3;
      arr[3] = f4;
    }

    Vec4(Vec2<double> ivec0) {
      vec[0] = ivec0.vec;
      vec[1] = _mm_setzero_pd();
    }

    Vec4(OldVec<double> const &ivec) : o(ivec) {}

    void setMask(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      Mask4<double> mask(m1, m2, m3, m4);
      vec[0] = mask.vec[0];
      vec[1] = mask.vec[1];
    }

    void set(double f1, double f2, double f3, double f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }

    void set1(double f1) { vec[0] = vec[1] = _mm_set1_pd(f1); }

    template <int N>
    Vec4 get1() const {
      return Vec4(arr[N], arr[N], arr[N], arr[N]);
    }

    double &operator[](unsigned int n) { return arr[n]; }

    double operator[](unsigned int n) const { return arr[n]; }

    Vec2<double> xy() const { return vec[0]; }
    Vec2<double> zw() const { return vec[1]; }
  };

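  // Narrowing conversion: each _mm_cvtpd_ps packs a double pair into the two
  // low float lanes; the shuffle concatenates the two pairs.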
  inline Vec4<float>::Vec4(Vec4<double> ivec) {
    vec = _mm_cvtpd_ps(ivec.vec[0]);
    __m128 v2 = _mm_cvtpd_ps(ivec.vec[1]);
    vec = _mm_shuffle_ps(vec, v2, _MM_SHUFFLE(1, 0, 1, 0));
  }
#endif  // CMS_USE_AVX2

#endif  // CMS_USE_SSE

  typedef Vec2<float> Vec2F;
  typedef Vec4<float> Vec4F;
  typedef Vec4<float> Vec3F;
  typedef Vec2<double> Vec2D;
  typedef Vec4<double> Vec3D;
  typedef Vec4<double> Vec4D;
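
  // Usage sketch (values are illustrative; an SSE build is assumed). Vec3F and
  // Vec3D are the 4-wide types with the w lane used as zero padding:
  //   mathSSE::Vec3F a(1.f, 2.f, 3.f), b(4.f, 5.f, 6.f);
  //   float d = dot(a, b);             // 32
  //   mathSSE::Vec3F c = cross(a, b);  // (-3, 6, -3)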

  // As3D wraps a Vec4 by const reference; the stream operators declared at the
  // end of this header print it as a 3-vector (x, y, z only).
  template <typename T>
  struct As3D {
    Vec4<T> const &v;
    As3D(Vec4<T> const &iv) : v(iv) {}
  };

  template <typename T>
  inline As3D<T> as3D(Vec4<T> const &v) {
    return v;
  }

}  // namespace mathSSE

#ifdef CMS_USE_SSE

// float operations

inline bool operator==(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_movemask_ps(_mm_cmpeq_ps(a.vec, b.vec)) == 0xf;
}

inline mathSSE::Vec4F cmpeq(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_cmpeq_ps(a.vec, b.vec); }

inline mathSSE::Vec4F cmpgt(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_cmpgt_ps(a.vec, b.vec); }

#ifdef __SSE3__
inline mathSSE::Vec4F hadd(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_hadd_ps(a.vec, b.vec); }
#endif

inline mathSSE::Vec4F operator-(mathSSE::Vec4F a) {
  const __m128 neg = _mm_set_ps(-0.0, -0.0, -0.0, -0.0);
  return _mm_xor_ps(a.vec, neg);
}

inline mathSSE::Vec4F operator&(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_and_ps(a.vec, b.vec); }
inline mathSSE::Vec4F operator|(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_or_ps(a.vec, b.vec); }
inline mathSSE::Vec4F operator^(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_xor_ps(a.vec, b.vec); }
inline mathSSE::Vec4F andnot(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_andnot_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator+(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_add_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator-(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_sub_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator*(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_mul_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator/(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_div_ps(a.vec, b.vec); }

inline mathSSE::Vec4F min(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_min_ps(a.vec, b.vec); }

inline mathSSE::Vec4F max(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_max_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator*(float a, mathSSE::Vec4F b) { return _mm_mul_ps(_mm_set1_ps(a), b.vec); }

inline mathSSE::Vec4F operator*(mathSSE::Vec4F b, float a) { return _mm_mul_ps(_mm_set1_ps(a), b.vec); }

inline mathSSE::Vec4F operator/(mathSSE::Vec4F b, float a) { return _mm_div_ps(b.vec, _mm_set1_ps(a)); }

inline float dot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_dot_ps;
  float s;
  _mm_store_ss(&s, _mm_dot_ps(a.vec, b.vec));
  return s;
}

inline mathSSE::Vec4F cross(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_cross_ps;
  return _mm_cross_ps(a.vec, b.vec);
}

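// Dot product of just the x and y lanes; the horizontal add leaves x*x' + y*y'
// in lane 0, which _mm_store_ss extracts.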
inline float dotxy(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  mathSSE::Vec4F mul = a * b;
#ifdef __SSE3__
  mul = hadd(mul, mul);
#else
  __m128 swp = _mm_shuffle_ps(mul.vec, mul.vec, _MM_SHUFFLE(2, 3, 0, 1));
  mul.vec = _mm_add_ps(mul.vec, swp);
#endif
  float s;
  _mm_store_ss(&s, mul.vec);
  return s;
}

// Vec2F operations, implemented by promoting to Vec4F

inline mathSSE::Vec4F operator-(mathSSE::Vec2F a) { return -mathSSE::Vec4F(a); }

inline mathSSE::Vec4F operator+(mathSSE::Vec2F a, mathSSE::Vec2F b) { return mathSSE::Vec4F(a) + mathSSE::Vec4F(b); }

inline mathSSE::Vec4F operator-(mathSSE::Vec2F a, mathSSE::Vec2F b) { return mathSSE::Vec4F(a) - mathSSE::Vec4F(b); }

inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, mathSSE::Vec2F b) { return mathSSE::Vec4F(a) * mathSSE::Vec4F(b); }

inline mathSSE::Vec4F operator/(mathSSE::Vec2F a, mathSSE::Vec2F b) { return mathSSE::Vec4F(a) / mathSSE::Vec4F(b); }

inline mathSSE::Vec4F min(mathSSE::Vec2F a, mathSSE::Vec2F b) { return min(mathSSE::Vec4F(a), mathSSE::Vec4F(b)); }

inline mathSSE::Vec4F max(mathSSE::Vec2F a, mathSSE::Vec2F b) { return ::max(mathSSE::Vec4F(a), mathSSE::Vec4F(b)); }

inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, float s) { return s * mathSSE::Vec4F(a); }

inline mathSSE::Vec4F operator*(float s, mathSSE::Vec2F a) { return s * mathSSE::Vec4F(a); }

inline mathSSE::Vec4F operator/(mathSSE::Vec2F a, float s) { return mathSSE::Vec4F(a) / s; }

inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__((pure));

inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b) { return a.arr[0] * b.arr[0] + a.arr[1] * b.arr[1]; }

inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__((pure));

inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) { return a.arr[0] * b.arr[1] - a.arr[1] * b.arr[0]; }

// double operations

inline mathSSE::Vec2D::Vec2(Vec4D v4) {
#ifdef CMS_USE_AVX2
  vec = _mm256_castpd256_pd128(v4.vec);
#else
  vec = v4.vec[0];
#endif
}

inline mathSSE::Vec2D::Vec2(Vec2F ivec) {
  arr[0] = ivec.arr[0];
  arr[1] = ivec.arr[1];
}

inline mathSSE::Vec2D operator-(mathSSE::Vec2D a) {
  const __m128d neg = _mm_set_pd(-0.0, -0.0);
  return _mm_xor_pd(a.vec, neg);
}

inline mathSSE::Vec2D operator&(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_and_pd(a.vec, b.vec); }
inline mathSSE::Vec2D operator|(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_or_pd(a.vec, b.vec); }
inline mathSSE::Vec2D operator^(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_xor_pd(a.vec, b.vec); }
inline mathSSE::Vec2D andnot(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_andnot_pd(a.vec, b.vec); }

#ifdef __SSE3__
inline mathSSE::Vec2D hadd(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_hadd_pd(a.vec, b.vec); }
#endif

inline mathSSE::Vec2D operator+(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_add_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator-(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_sub_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator*(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_mul_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator/(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_div_pd(a.vec, b.vec); }

inline mathSSE::Vec2D min(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_min_pd(a.vec, b.vec); }

inline mathSSE::Vec2D max(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_max_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator*(double a, mathSSE::Vec2D b) { return _mm_mul_pd(_mm_set1_pd(a), b.vec); }

inline mathSSE::Vec2D operator*(mathSSE::Vec2D b, double a) { return _mm_mul_pd(_mm_set1_pd(a), b.vec); }

inline mathSSE::Vec2D operator/(mathSSE::Vec2D b, double a) { return _mm_div_pd(b.vec, _mm_set1_pd(a)); }

inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__((pure));

inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  __m128d res = _mm_mul_pd(a.vec, b.vec);
#ifdef __SSE3__
  res = _mm_hadd_pd(res, res);
#else
  res = _mm_add_sd(_mm_shuffle_pd(res, res, 1), res);
#endif
  double s;
  _mm_store_sd(&s, res);
  return s;
}

inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__((pure));

inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  // a.x * b.y - a.y * b.x
  __m128d res = _mm_shuffle_pd(b.vec, b.vec, 1);
  res = _mm_mul_pd(a.vec, res);
  res = _mm_sub_sd(res, _mm_shuffle_pd(res, res, 1));
  double s;
  _mm_store_sd(&s, res);
  return s;
}

#ifndef CMS_USE_AVX2

// Vec3D/Vec4D operations on the pair-of-__m128d layout

#ifdef __SSE3__

inline mathSSE::Vec4D hadd(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(hadd(mathSSE::Vec2D(a.vec[0]), mathSSE::Vec2D(b.vec[0])),
                        hadd(mathSSE::Vec2D(a.vec[1]), mathSSE::Vec2D(b.vec[1])));
}
#endif

inline bool operator==(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return _mm_movemask_pd(_mm_cmpeq_pd(a.vec[0], b.vec[0])) == 0x3 &&
         _mm_movemask_pd(_mm_cmpeq_pd(a.vec[1], b.vec[1])) == 0x3;
}

inline mathSSE::Vec4D operator-(mathSSE::Vec4D a) {
  const __m128d neg = _mm_set_pd(-0.0, -0.0);
  return mathSSE::Vec4D(_mm_xor_pd(a.vec[0], neg), _mm_xor_pd(a.vec[1], neg));
}

inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_add_pd(a.vec[0], b.vec[0]), _mm_add_pd(a.vec[1], b.vec[1]));
}

inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_sub_pd(a.vec[0], b.vec[0]), _mm_sub_pd(a.vec[1], b.vec[1]));
}
inline mathSSE::Vec4D operator*(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_mul_pd(a.vec[0], b.vec[0]), _mm_mul_pd(a.vec[1], b.vec[1]));
}
inline mathSSE::Vec4D operator/(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_div_pd(a.vec[0], b.vec[0]), _mm_div_pd(a.vec[1], b.vec[1]));
}

inline mathSSE::Vec4D min(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  using namespace mathSSE;
  return mathSSE::Vec4D(::min(mathSSE::Vec2D(a.vec[0]), mathSSE::Vec2D(b.vec[0])),
                        ::min(mathSSE::Vec2D(a.vec[1]), mathSSE::Vec2D(b.vec[1])));
}

inline mathSSE::Vec4D max(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  using namespace mathSSE;
  return mathSSE::Vec4D(::max(mathSSE::Vec2D(a.vec[0]), mathSSE::Vec2D(b.vec[0])),
                        ::max(mathSSE::Vec2D(a.vec[1]), mathSSE::Vec2D(b.vec[1])));
}

inline mathSSE::Vec4D operator*(double a, mathSSE::Vec4D b) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res, b.vec[0]), _mm_mul_pd(res, b.vec[1]));
}

inline mathSSE::Vec4D operator*(mathSSE::Vec4D b, double a) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res, b.vec[0]), _mm_mul_pd(res, b.vec[1]));
}

inline mathSSE::Vec4D operator/(mathSSE::Vec4D b, double a) {
  a = 1. / a;
  return a * b;
}

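// 3-component dot product: sums the x, y and z lanes only, so the w lanes of
// the inputs are ignored (unlike the float version, which sums all four lanes
// and relies on w being zero).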
inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__((pure));

inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  __m128d res = _mm_add_sd(_mm_mul_pd(a.vec[0], b.vec[0]), _mm_mul_sd(a.vec[1], b.vec[1]));
  res = _mm_add_sd(_mm_unpackhi_pd(res, res), res);
  double s;
  _mm_store_sd(&s, res);
  return s;
}

inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__((pure));

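// 3-component cross product on the {__m128d, __m128d} layout: m1 ends up
// holding (x, y) after the swap and sign flip, m2 holds z in its low lane.
// Note that the high (w) lane of the result is left unspecified, not zeroed.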
inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  const __m128i neg = _mm_set_epi64x(0, 0x8000000000000000);
  // (a.z, a.z) * (b.x, b.y)
  __m128d l1 = _mm_mul_pd(_mm_unpacklo_pd(a.vec[1], a.vec[1]), b.vec[0]);
  // (b.z, b.z) * (a.x, a.y)
  __m128d l2 = _mm_mul_pd(_mm_unpacklo_pd(b.vec[1], b.vec[1]), a.vec[0]);
  __m128d m1 = _mm_sub_pd(l1, l2);
  m1 = _mm_shuffle_pd(m1, m1, 1);
  m1 = __m128d(_mm_xor_si128(__m128i(m1), neg));
  // (a.x, a.y) * (b.y, b.x)
  l1 = _mm_mul_pd(a.vec[0], _mm_shuffle_pd(b.vec[0], b.vec[0], 1));
  // z = a.x * b.y - a.y * b.x
  __m128d m2 = _mm_sub_sd(l1, _mm_unpackhi_pd(l1, l1));
  return mathSSE::Vec4D(m1, m2);
}

inline double __attribute__((always_inline)) __attribute__((pure)) dotxy(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return dot(a.xy(), b.xy());
}

#endif  // !CMS_USE_AVX2

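// SIMD specializations of the generic mathSSE::sqrt declared at the top.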
namespace mathSSE {
  template <>
  inline Vec4F sqrt(Vec4F v) {
    return _mm_sqrt_ps(v.vec);
  }
  template <>
  inline Vec2F sqrt(Vec2F v) {
    return sqrt(Vec4F(v));
  }
  template <>
  inline Vec2D sqrt(Vec2D v) {
    return _mm_sqrt_pd(v.vec);
  }
#ifdef CMS_USE_AVX2
  template <>
  inline Vec4D sqrt(Vec4D v) {
    return _mm256_sqrt_pd(v.vec);
  }
#else
  template <>
  inline Vec4D sqrt(Vec4D v) {
    return Vec4D(_mm_sqrt_pd(v.vec[0]), _mm_sqrt_pd(v.vec[1]));
  }
#endif
}  // namespace mathSSE

#endif  // CMS_USE_SSE

#include <iosfwd>
std::ostream &operator<<(std::ostream &out, mathSSE::Vec2D const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::Vec2F const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::Vec4F const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::Vec4D const &v);

std::ostream &operator<<(std::ostream &out, mathSSE::As3D<float> const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::As3D<double> const &v);

#endif  // DataFormat_Math_SSEVec_H