// File indexing completed on 2024-07-30 05:19:43
0001 #ifndef DataFormat_Math_SSEVec_H
0002 #define DataFormat_Math_SSEVec_H
0003
// Enable the x86 SIMD implementations only where x86 intrinsics exist and the
// compiler can consume them (nvcc and the listed non-x86 architectures cannot).
#if !defined(__arm__) && !defined(__aarch64__) && !defined(__MIC__) && !defined(__powerpc64__) && \
    !defined(__PPC64__) && !defined(__powerpc__) && !defined(__NVCC__) && !defined(__riscv)
#if defined(__GNUC__)
#include <x86intrin.h>
#define CMS_USE_SSE
#ifdef __SSE4_1__
// NOTE(review): intentionally empty — CMS_USE_SSE4 (tested in _mm_dot_ps below)
// is never defined anywhere in this header, so the SSE4.1 _mm_dp_ps path stays
// disabled. Presumably the hadd/shuffle sequence was measured to be at least as
// fast; confirm before enabling the dpps branch.
#endif
#ifdef __AVX2__
#define CMS_USE_AVX2
#endif
#endif
#endif
0018
0019 #include <cmath>
0020
namespace mathSSE {
  // Scalar fallback that simply forwards to std::sqrt; specializations for
  // the SIMD vector types are provided further down in this header.
  template <typename T>
  inline T sqrt(T x) {
    return std::sqrt(x);
  }
}  // namespace mathSSE
0027
0028 namespace mathSSE {
0029 #ifdef CMS_USE_SSE
0030
  // Horizontal dot product of two 4-float vectors; the sum of all four
  // lane products ends up (at least) in the lowest lane of the result.
  inline __m128 __attribute__((always_inline)) __attribute__((pure)) _mm_dot_ps(__m128 v1, __m128 v2) {
#ifdef CMS_USE_SSE4
    // NOTE(review): CMS_USE_SSE4 is never defined in this header, so this
    // single-instruction branch is dead code — presumably kept disabled on
    // purpose (dpps can be slower than the hadd sequence). Confirm before use.
    return _mm_dp_ps(v1, v2, 0xff);
#else
    // Lane-wise multiply, then reduce the four partial products.
    __m128 mul = _mm_mul_ps(v1, v2);
#ifdef __SSE3__
    // Two horizontal adds leave the full sum in every lane.
    mul = _mm_hadd_ps(mul, mul);
    return _mm_hadd_ps(mul, mul);
#else
    // SSE2 fallback: add the swapped 64-bit halves, then the swapped 32-bit
    // pairs — every lane ends up holding the total.
    __m128 swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 0, 3, 2));
    mul = _mm_add_ps(mul, swp);
    swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
    return _mm_add_ps(mul, swp);
#endif
#endif
  }
0049
0050
  // 3D cross product v1 x v2 on the x,y,z lanes of two 4-float vectors.
  // Two shuffled lane pairings are multiplied and subtracted; the masks are
  // chosen so that afterwards only one lane has the wrong sign, which is
  // then flipped with an integer xor on that lane's sign bit.
  inline __m128 __attribute__((always_inline)) __attribute__((pure)) _mm_cross_ps(__m128 v1, __m128 v2) {
    // First operand pairing: lanes of v2 and v1 per _MM_SHUFFLE(3, 0, 2, 2).
    __m128 v3 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 0, 2, 2));
    // Second operand pairing: lanes of v1 and v2 per _MM_SHUFFLE(3, 1, 0, 1).
    __m128 v4 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 1, 0, 1));
    // Lane-wise products of the first pairing.
    __m128 v5 = _mm_mul_ps(v3, v4);
    // Same pairings with the roles of v1 and v2 exchanged.
    v3 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 0, 2, 2));
    v4 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 1, 0, 1));
    v3 = _mm_mul_ps(v3, v4);
    // The difference is the cross product up to the sign of lane 1 (y);
    // restore it by xor-ing only that lane's sign bit (0x80000000).
    const __m128i neg = _mm_set_epi32(0, 0, 0x80000000, 0);
    __m128i ret = __m128i(_mm_sub_ps(v5, v3));
    return __m128(_mm_xor_si128(ret, neg));
  }
0070 #endif
0071
#ifdef CMS_USE_AVX2
  // Horizontal dot product of two 4-double vectors: lane-wise multiply,
  // horizontal add inside each 128-bit half, then add the swapped halves so
  // the full sum appears in every lane.
  inline __m256d __attribute__((always_inline)) __attribute__((pure)) _mm256_dot_pd(__m256d v1, __m256d v2) {
    __m256d mul = _mm256_mul_pd(v1, v2);
    mul = _mm256_hadd_pd(mul, mul);
    // Swap the two 128-bit halves and add them in.
    __m256d tmp = _mm256_permute2f128_pd(mul, mul, 1);
    return _mm256_add_pd(mul, tmp);
  }

  // 3D cross product on the x,y,z lanes of 4-double vectors, mirroring the
  // SSE float version above: two lane pairings are built with half swaps
  // (permute2f128) plus in-half permutes, multiplied and subtracted, and the
  // one mis-signed lane is fixed with an integer xor.
  inline __m256d __attribute__((always_inline)) __attribute__((pure)) _mm256_cross_pd(__m256d v1, __m256d v2) {
    __m256d v3 = _mm256_permute2f128_pd(v2, v1, (2 << 4) + 1);
    v3 = _mm256_permute_pd(v3, 0);

    __m256d v4 = _mm256_permute2f128_pd(v1, v2, (2 << 4));
    v4 = _mm256_permute_pd(v4, 5);

    __m256d v5 = _mm256_mul_pd(v3, v4);

    // Same pairings with the operands exchanged.
    v3 = _mm256_permute2f128_pd(v1, v2, (2 << 4) + 1);
    v3 = _mm256_permute_pd(v3, 0);

    v4 = _mm256_permute2f128_pd(v2, v1, (2 << 4));
    v4 = _mm256_permute_pd(v4, 5);

    v3 = _mm256_mul_pd(v3, v4);
    __m256d ret = _mm256_sub_pd(v5, v3);
    // Flip the sign of lane 1 only (bit 63 of element 1).
    const __m256i neg = _mm256_set_epi64x(0, 0, 0x8000000000000000, 0);
    return __m256d(_mm256_xor_si256(__m256i(ret), neg));
  }

#endif
0102
  // Plain x,y,z,w quadruplet kept layout-compatible with the legacy vector
  // classes; overlaid inside the Vec4 unions below for cheap interop.
  template <typename T>
  struct OldVec {
    T theX;
    T theY;
    T theZ;
    T theW;
  } __attribute__((aligned(16)));
#ifdef CMS_USE_AVX2
  // With AVX2 the double variant must match __m256d's 32-byte alignment.
  template <>
  struct OldVec<double> {
    double theX;
    double theY;
    double theZ;
    double theW;
  } __attribute__((aligned(32)));
#endif
0119
0120 template <typename T>
0121 union Vec4;
0122
0123 template <typename T>
0124 union Vec2 {
0125 Vec2() {
0126 arr[0] = 0;
0127 arr[1] = 0;
0128 }
0129 Vec2(T f1, T f2) {
0130 arr[0] = f1;
0131 arr[1] = f2;
0132 }
0133 explicit Vec2(T f1) {
0134 arr[0] = f1;
0135 arr[1] = f1;
0136 }
0137
0138 void set(T f1, T f2) {
0139 arr[0] = f1;
0140 arr[1] = f2;
0141 }
0142
0143 template <int N>
0144 Vec2 get1() const {
0145 return Vec2(arr[N], arr[N]);
0146 }
0147
0148
0149
0150
0151
0152
0153
0154 template <typename U>
0155 Vec2(Vec2<U> v) {
0156 arr[0] = v[0];
0157 arr[1] = v[1];
0158 }
0159
0160 inline Vec2(Vec4<T> v4);
0161
0162 T &operator[](unsigned int n) { return arr[n]; }
0163
0164 T operator[](unsigned int n) const { return arr[n]; }
0165
0166 T arr[2];
0167 };
0168
0169 template <typename T>
0170 union Vec4 {
0171 Vec4() {
0172 arr[0] = 0;
0173 arr[1] = 0;
0174 arr[2] = 0;
0175 arr[3] = 0;
0176 }
0177 Vec4(float f1, float f2, float f3, float f4 = 0) {
0178 arr[0] = f1;
0179 arr[1] = f2;
0180 arr[2] = f3;
0181 arr[3] = f4;
0182 }
0183 explicit Vec4(float f1) { set1(f1); }
0184 void set(float f1, float f2, float f3, float f4 = 0) {
0185 arr[0] = f1;
0186 arr[1] = f2;
0187 arr[2] = f3;
0188 arr[3] = f4;
0189 }
0190 void set1(float f1) {
0191 arr[0] = f1;
0192 arr[1] = f1;
0193 arr[2] = f1;
0194 arr[3] = f1;
0195 }
0196 template <int N>
0197 Vec4 get1() const {
0198 return Vec4(arr[N], arr[N], arr[N], arr[N]);
0199 }
0200
0201
0202
0203
0204
0205
0206 Vec2<T> xy() const { return Vec2<T>(arr[0], arr[1]); }
0207 Vec2<T> zw() const { return Vec2<T>(arr[2], arr[3]); }
0208
0209 T __attribute__((aligned(16))) arr[4];
0210 OldVec<T> o;
0211 };
0212
  // Out-of-line Vec2 constructor: keep only the first two components of v4.
  template <typename T>
  inline Vec2<T>::Vec2(Vec4<T> v4) {
    arr[0] = v4[0];
    arr[1] = v4[1];
  }
0218
#ifdef CMS_USE_SSE

  // Declare the SSE specializations so the generic templates above are not
  // instantiated for double.
  template <>
  union Vec2<double>;
  template <>
  union Vec4<double>;

  // Bit-mask helpers; the primary templates are intentionally empty — only
  // the specializations below are usable.
  template <typename T>
  union Mask2 {};
  template <typename T>
  union Mask4 {};
0230
  // Four 32-bit masks overlaid on an __m128, for building bit patterns to
  // and/or/xor with float vectors.
  template <>
  union Mask4<float> {
    __m128 vec;
    unsigned int __attribute__((aligned(16))) mask[4];
    Mask4() { vec = _mm_setzero_ps(); }
    Mask4(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };
0243
#ifdef CMS_USE_AVX2
  // Four 64-bit masks overlaid on a single __m256d.
  template <>
  union Mask4<double> {
    __m256d vec;
    unsigned long long __attribute__((aligned(32))) mask[4];
    Mask4() { vec = _mm256_setzero_pd(); }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };
#else
  // Without AVX2 a 4-double mask is a pair of __m128d halves.
  template <>
  union Mask4<double> {
    __m128d vec[2];
    unsigned long long __attribute__((aligned(16))) mask[4];
    Mask4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };
#endif
0274
  // Two 64-bit masks overlaid on an __m128d.
  template <>
  union Mask2<double> {
    __m128d vec;
    unsigned long long __attribute__((aligned(16))) mask[2];
    Mask2() { vec = _mm_setzero_pd(); }
    Mask2(unsigned long long m1, unsigned long long m2) {
      mask[0] = m1;
      mask[1] = m2;
    }
  };
0285
  // SSE specialization: four floats in one __m128, with a scalar-array view
  // and an OldVec overlay for element access and legacy interop.
  template <>
  union Vec4<float> {
    typedef __m128 nativeType;
    __m128 vec;
    float __attribute__((aligned(16))) arr[4];
    OldVec<float> o;

    // Wrap a native register; implicit on purpose so intrinsic results
    // convert straight back into Vec4.
    Vec4(__m128 ivec) : vec(ivec) {}

    Vec4(OldVec<float> const &ivec) : o(ivec) {}

    // Zero-initialize.
    Vec4() { vec = _mm_setzero_ps(); }

    // Narrowing conversion from double; defined after Vec4<double>.
    inline Vec4(Vec4<double> ivec);

    // Broadcast one scalar into all four lanes.
    explicit Vec4(float f1) { set1(f1); }

    // Note the reversed argument order of _mm_set_ps (highest lane first).
    Vec4(float f1, float f2, float f3, float f4 = 0) { vec = _mm_set_ps(f4, f3, f2, f1); }

    // Concatenate two 2-vectors.
    Vec4(Vec2<float> ivec0, Vec2<float> ivec1) {
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
      arr[2] = ivec1.arr[0];
      arr[3] = ivec1.arr[1];
    }

    Vec4(Vec2<float> ivec0, float f3, float f4 = 0) {
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
      arr[2] = f3;
      arr[3] = f4;
    }

    // Promote a 2-vector; lanes 2 and 3 are zeroed.
    Vec4(Vec2<float> ivec0) {
      vec = _mm_setzero_ps();
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
    }

    // Load an arbitrary 4x32-bit pattern (for use with the bit operators).
    void setMask(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      Mask4<float> mask(m1, m2, m3, m4);
      vec = mask.vec;
    }

    void set(float f1, float f2, float f3, float f4 = 0) { vec = _mm_set_ps(f4, f3, f2, f1); }

    void set1(float f1) { vec = _mm_set1_ps(f1); }

    // Broadcast lane N into all four lanes.
    template <int N>
    Vec4 get1() const {
      return _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(N, N, N, N));
    }

    float &operator[](unsigned int n) { return arr[n]; }

    float operator[](unsigned int n) const { return arr[n]; }

    // Views of the low and high lane pairs.
    Vec2<float> xy() const { return Vec2<float>(arr[0], arr[1]); }
    Vec2<float> zw() const { return Vec2<float>(arr[2], arr[3]); }
  };
0347
  // SSE specialization: two doubles in one __m128d.
  template <>
  union Vec2<double> {
    typedef __m128d nativeType;
    __m128d vec;
    double __attribute__((aligned(16))) arr[2];

    // Wrap a native register (implicit on purpose).
    Vec2(__m128d ivec) : vec(ivec) {}

    // Zero-initialize.
    Vec2() { vec = _mm_setzero_pd(); }

    // Widening conversion from float; defined later in this header.
    inline Vec2(Vec2<float> ivec);

    // Note the reversed argument order of _mm_set_pd (high lane first).
    Vec2(double f1, double f2) { vec = _mm_set_pd(f2, f1); }

    // Broadcast one scalar into both lanes.
    explicit Vec2(double f1) { set1(f1); }

    // Keep the low two lanes of a 4-vector; defined later in this header.
    inline Vec2(Vec4<double> v4);

    // Load an arbitrary 2x64-bit pattern (for use with the bit operators).
    void setMask(unsigned long long m1, unsigned long long m2) {
      Mask2<double> mask(m1, m2);
      vec = mask.vec;
    }

    void set(double f1, double f2) { vec = _mm_set_pd(f2, f1); }

    void set1(double f1) { vec = _mm_set1_pd(f1); }

    // Broadcast lane N into both lanes.
    template <int N>
    Vec2 get1() const {
      return Vec2(arr[N], arr[N]);
    }

    double &operator[](unsigned int n) { return arr[n]; }
    double operator[](unsigned int n) const { return arr[n]; }
  };
0388
#ifdef CMS_USE_AVX2
}  // namespace mathSSE
// The AVX2 Vec4<double> implementation lives in a separate header that
// provides its own mathSSE declarations; reopen the namespace afterwards.
#include "private/AVXVec.h"

namespace mathSSE {
#else
  // SSE-only Vec4<double>: a pair of __m128d registers (lanes 0,1 and 2,3).
  template <>
  union Vec4<double> {
    __m128d vec[2];
    double __attribute__((aligned(16))) arr[4];
    OldVec<double> o;

    Vec4(__m128d ivec[]) {
      vec[0] = ivec[0];
      vec[1] = ivec[1];
    }

    Vec4(__m128d ivec0, __m128d ivec1) {
      vec[0] = ivec0;
      vec[1] = ivec1;
    }

    // Zero-initialize.
    Vec4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }

    // Widening conversion from float: convert the low pair, then the
    // swapped high pair.
    Vec4(Vec4<float> ivec) {
      vec[0] = _mm_cvtps_pd(ivec.vec);
      vec[1] = _mm_cvtps_pd(_mm_shuffle_ps(ivec.vec, ivec.vec, _MM_SHUFFLE(1, 0, 3, 2)));
    }

    // Broadcast one scalar into all four lanes.
    explicit Vec4(double f1) { set1(f1); }

    Vec4(double f1, double f2, double f3, double f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }

    // Concatenate two 2-vectors.
    Vec4(Vec2<double> ivec0, Vec2<double> ivec1) {
      vec[0] = ivec0.vec;
      vec[1] = ivec1.vec;
    }

    Vec4(Vec2<double> ivec0, double f3, double f4 = 0) {
      vec[0] = ivec0.vec;
      arr[2] = f3;
      arr[3] = f4;
    }

    // Promote a 2-vector; lanes 2 and 3 are zeroed.
    Vec4(Vec2<double> ivec0) {
      vec[0] = ivec0.vec;
      vec[1] = _mm_setzero_pd();
    }

    Vec4(OldVec<double> const &ivec) : o(ivec) {}

    // Load an arbitrary 4x64-bit pattern (for use with the bit operators).
    void setMask(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      Mask4<double> mask(m1, m2, m3, m4);
      vec[0] = mask.vec[0];
      vec[1] = mask.vec[1];
    }

    void set(double f1, double f2, double f3, double f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }

    void set1(double f1) { vec[0] = vec[1] = _mm_set1_pd(f1); }

    // Broadcast lane N into all four lanes.
    template <int N>
    Vec4 get1() const {
      return Vec4(arr[N], arr[N], arr[N], arr[N]);
    }

    double &operator[](unsigned int n) { return arr[n]; }

    double operator[](unsigned int n) const { return arr[n]; }

    // Views of the low and high register halves.
    Vec2<double> xy() const { return vec[0]; }
    Vec2<double> zw() const { return vec[1]; }
  };

  // Narrowing conversion double -> float: convert each half to the low two
  // float lanes, then merge the two low pairs into one register.
  inline Vec4<float>::Vec4(Vec4<double> ivec) {
    vec = _mm_cvtpd_ps(ivec.vec[0]);
    __m128 v2 = _mm_cvtpd_ps(ivec.vec[1]);
    vec = _mm_shuffle_ps(vec, v2, _MM_SHUFFLE(1, 0, 1, 0));
  }
#endif  // CMS_USE_AVX2

#endif  // CMS_USE_SSE
0485
  typedef Vec2<float> Vec2F;
  typedef Vec4<float> Vec4F;
  // 3D vectors reuse the 4-component storage; the fourth lane is spare.
  typedef Vec4<float> Vec3F;
  typedef Vec2<double> Vec2D;
  typedef Vec4<double> Vec3D;
  typedef Vec4<double> Vec4D;
0492
0493 template <typename T>
0494 struct As3D {
0495 Vec4<T> const &v;
0496 As3D(Vec4<T> const &iv) : v(iv) {}
0497 };
0498
0499 template <typename T>
0500 inline As3D<T> as3D(Vec4<T> const &v) {
0501 return v;
0502 }
0503
0504 }
0505
0506 #ifdef CMS_USE_SSE
0507
0508
0509
// --- Vec4F: comparisons, bitwise and arithmetic operators, dot/cross ---

// True only if all four lanes compare equal.
inline bool operator==(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_movemask_ps(_mm_cmpeq_ps(a.vec, b.vec)) == 0xf;
}

// Lane-wise compares returning an all-ones/all-zeros mask per lane.
inline mathSSE::Vec4F cmpeq(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_cmpeq_ps(a.vec, b.vec); }

inline mathSSE::Vec4F cmpgt(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_cmpgt_ps(a.vec, b.vec); }

#ifdef __SSE3__
// Horizontal pairwise add of a and b.
inline mathSSE::Vec4F hadd(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_hadd_ps(a.vec, b.vec); }
#endif

// Lane-wise negation via a sign-bit xor.
inline mathSSE::Vec4F operator-(mathSSE::Vec4F a) {
  const __m128 neg = _mm_set_ps(-0.0, -0.0, -0.0, -0.0);
  return _mm_xor_ps(a.vec, neg);
}

// Lane-wise bit operations (useful with setMask / the cmp results).
inline mathSSE::Vec4F operator&(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_and_ps(a.vec, b.vec); }
inline mathSSE::Vec4F operator|(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_or_ps(a.vec, b.vec); }
inline mathSSE::Vec4F operator^(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_xor_ps(a.vec, b.vec); }
inline mathSSE::Vec4F andnot(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_andnot_ps(a.vec, b.vec); }

// Lane-wise arithmetic.
inline mathSSE::Vec4F operator+(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_add_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator-(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_sub_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator*(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_mul_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator/(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_div_ps(a.vec, b.vec); }

// Lane-wise min/max.
inline mathSSE::Vec4F min(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_min_ps(a.vec, b.vec); }

inline mathSSE::Vec4F max(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_max_ps(a.vec, b.vec); }

// Scalar broadcast multiply / divide.
inline mathSSE::Vec4F operator*(float a, mathSSE::Vec4F b) { return _mm_mul_ps(_mm_set1_ps(a), b.vec); }

inline mathSSE::Vec4F operator*(mathSSE::Vec4F b, float a) { return _mm_mul_ps(_mm_set1_ps(a), b.vec); }

inline mathSSE::Vec4F operator/(mathSSE::Vec4F b, float a) { return _mm_div_ps(b.vec, _mm_set1_ps(a)); }

// Dot product over all four lanes (callers are expected to keep the w lane
// at zero when treating Vec4F as a 3-vector).
inline float dot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_dot_ps;
  float s;
  _mm_store_ss(&s, _mm_dot_ps(a.vec, b.vec));
  return s;
}

// Alias of dot(): the 4-lane sum equals the 3D dot when w is zero.
inline float dot3(mathSSE::Vec4F a, mathSSE::Vec4F b) { return dot(a, b); }

// 3D cross product of the x,y,z lanes.
inline mathSSE::Vec4F cross(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_cross_ps;
  return _mm_cross_ps(a.vec, b.vec);
}

// Dot product of the x,y lanes only.
inline float dotxy(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  mathSSE::Vec4F mul = a * b;
#ifdef __SSE3__
  mul = hadd(mul, mul);
#else
  // Add the swapped 32-bit pairs so lane 0 holds x*x' + y*y'.
  __m128 swp = _mm_shuffle_ps(mul.vec, mul.vec, _MM_SHUFFLE(2, 3, 0, 1));
  mul.vec = _mm_add_ps(mul.vec, swp);
#endif
  float s;
  _mm_store_ss(&s, mul.vec);
  return s;
}
0576
0577
0578
0579
// --- Vec2F operators: implemented by promoting to Vec4F (upper lanes spare) ---

inline mathSSE::Vec4F operator-(mathSSE::Vec2F a) { return -mathSSE::Vec4F(a); }

inline mathSSE::Vec4F operator+(mathSSE::Vec2F a, mathSSE::Vec2F b) { return mathSSE::Vec4F(a) + mathSSE::Vec4F(b); }

inline mathSSE::Vec4F operator-(mathSSE::Vec2F a, mathSSE::Vec2F b) { return mathSSE::Vec4F(a) - mathSSE::Vec4F(b); }

inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, mathSSE::Vec2F b) { return mathSSE::Vec4F(a) * mathSSE::Vec4F(b); }

inline mathSSE::Vec4F operator/(mathSSE::Vec2F a, mathSSE::Vec2F b) { return mathSSE::Vec4F(a) / mathSSE::Vec4F(b); }

inline mathSSE::Vec4F min(mathSSE::Vec2F a, mathSSE::Vec2F b) { return min(mathSSE::Vec4F(a), mathSSE::Vec4F(b)); }

inline mathSSE::Vec4F max(mathSSE::Vec2F a, mathSSE::Vec2F b) { return ::max(mathSSE::Vec4F(a), mathSSE::Vec4F(b)); }

// Scalar broadcast multiply / divide.
inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, float s) { return s * mathSSE::Vec4F(a); }

inline mathSSE::Vec4F operator*(float s, mathSSE::Vec2F a) { return s * mathSSE::Vec4F(a); }

inline mathSSE::Vec4F operator/(mathSSE::Vec2F a, float s) { return mathSSE::Vec4F(a) / s; }

// 2D dot product, computed with scalar arithmetic.
inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__((pure));

inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b) { return a.arr[0] * b.arr[0] + a.arr[1] * b.arr[1]; }

// 2D cross product (the z component): a.x*b.y - a.y*b.x.
inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) __attribute__((always_inline)) __attribute__((pure));

inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) { return a.arr[0] * b.arr[1] - a.arr[1] * b.arr[0]; }
0607
0608
0609
0610
0611
// Vec2D from Vec4D: keep the low two lanes.
inline mathSSE::Vec2D::Vec2(Vec4D v4) {
#ifdef CMS_USE_AVX2
  vec = _mm256_castpd256_pd128(v4.vec);
#else
  vec = v4.vec[0];
#endif
}

// Vec2D from Vec2F: widen the float components to double.
inline mathSSE::Vec2D::Vec2(Vec2F ivec) {
  arr[0] = ivec.arr[0];
  arr[1] = ivec.arr[1];
}
0624
// --- Vec2D (double SSE) operators ---

// Lane-wise negation via a sign-bit xor.
inline mathSSE::Vec2D operator-(mathSSE::Vec2D a) {
  const __m128d neg = _mm_set_pd(-0.0, -0.0);
  return _mm_xor_pd(a.vec, neg);
}

// Lane-wise bit operations.
inline mathSSE::Vec2D operator&(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_and_pd(a.vec, b.vec); }
inline mathSSE::Vec2D operator|(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_or_pd(a.vec, b.vec); }
inline mathSSE::Vec2D operator^(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_xor_pd(a.vec, b.vec); }
inline mathSSE::Vec2D andnot(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_andnot_pd(a.vec, b.vec); }

#ifdef __SSE3__
// Horizontal pairwise add of a and b.
inline mathSSE::Vec2D hadd(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_hadd_pd(a.vec, b.vec); }
#endif

// Lane-wise arithmetic.
inline mathSSE::Vec2D operator+(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_add_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator-(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_sub_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator*(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_mul_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator/(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_div_pd(a.vec, b.vec); }

// Lane-wise min/max.
inline mathSSE::Vec2D min(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_min_pd(a.vec, b.vec); }

inline mathSSE::Vec2D max(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_max_pd(a.vec, b.vec); }

// Scalar broadcast multiply / divide.
inline mathSSE::Vec2D operator*(double a, mathSSE::Vec2D b) { return _mm_mul_pd(_mm_set1_pd(a), b.vec); }

inline mathSSE::Vec2D operator*(mathSSE::Vec2D b, double a) { return _mm_mul_pd(_mm_set1_pd(a), b.vec); }

inline mathSSE::Vec2D operator/(mathSSE::Vec2D b, double a) { return _mm_div_pd(b.vec, _mm_set1_pd(a)); }

// 2D dot product: lane-wise multiply, then add the two lanes.
inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__((pure));

inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  __m128d res = _mm_mul_pd(a.vec, b.vec);
#ifdef __SSE3__
  res = _mm_hadd_pd(res, res);
#else
  // SSE2 fallback: add the swapped lanes into lane 0.
  res = _mm_add_sd(_mm_shuffle_pd(res, res, 1), res);
#endif
  double s;
  _mm_store_sd(&s, res);
  return s;
}

// 2D cross product (the z component): a.x*b.y - a.y*b.x.
inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) __attribute__((always_inline)) __attribute__((pure));

inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  // Swap b's lanes so the multiply yields (a.x*b.y, a.y*b.x), then subtract.
  __m128d res = _mm_shuffle_pd(b.vec, b.vec, 1);
  res = _mm_mul_pd(a.vec, res);
  res = _mm_sub_sd(res, _mm_shuffle_pd(res, res, 1));
  double s;
  _mm_store_sd(&s, res);
  return s;
}
0681
#ifndef CMS_USE_AVX2

// --- Vec4D without AVX2: every operation is applied to the two __m128d halves ---

#ifdef __SSE3__
// Horizontal pairwise add, half by half.
inline mathSSE::Vec4D hadd(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(hadd(mathSSE::Vec2D(a.vec[0]), mathSSE::Vec2D(b.vec[0])),
                        hadd(mathSSE::Vec2D(a.vec[1]), mathSSE::Vec2D(b.vec[1])));
}
#endif

// True only if all four lanes compare equal.
inline bool operator==(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return _mm_movemask_pd(_mm_cmpeq_pd(a.vec[0], b.vec[0])) == 0x3 &&
         _mm_movemask_pd(_mm_cmpeq_pd(a.vec[1], b.vec[1])) == 0x3;
}

// Lane-wise negation via a sign-bit xor.
inline mathSSE::Vec4D operator-(mathSSE::Vec4D a) {
  const __m128d neg = _mm_set_pd(-0.0, -0.0);
  return mathSSE::Vec4D(_mm_xor_pd(a.vec[0], neg), _mm_xor_pd(a.vec[1], neg));
}

// Lane-wise arithmetic, half by half.
inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_add_pd(a.vec[0], b.vec[0]), _mm_add_pd(a.vec[1], b.vec[1]));
}

inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_sub_pd(a.vec[0], b.vec[0]), _mm_sub_pd(a.vec[1], b.vec[1]));
}
inline mathSSE::Vec4D operator*(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_mul_pd(a.vec[0], b.vec[0]), _mm_mul_pd(a.vec[1], b.vec[1]));
}
inline mathSSE::Vec4D operator/(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_div_pd(a.vec[0], b.vec[0]), _mm_div_pd(a.vec[1], b.vec[1]));
}

// Lane-wise min/max, delegating to the Vec2D overloads.
inline mathSSE::Vec4D min(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  using namespace mathSSE;
  return mathSSE::Vec4D(::min(mathSSE::Vec2D(a.vec[0]), mathSSE::Vec2D(b.vec[0])),
                        ::min(mathSSE::Vec2D(a.vec[1]), mathSSE::Vec2D(b.vec[1])));
}

inline mathSSE::Vec4D max(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  using namespace mathSSE;
  return mathSSE::Vec4D(::max(mathSSE::Vec2D(a.vec[0]), mathSSE::Vec2D(b.vec[0])),
                        ::max(mathSSE::Vec2D(a.vec[1]), mathSSE::Vec2D(b.vec[1])));
}

// Scalar broadcast multiply / divide.
inline mathSSE::Vec4D operator*(double a, mathSSE::Vec4D b) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res, b.vec[0]), _mm_mul_pd(res, b.vec[1]));
}

inline mathSSE::Vec4D operator*(mathSSE::Vec4D b, double a) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res, b.vec[0]), _mm_mul_pd(res, b.vec[1]));
}

inline mathSSE::Vec4D operator/(mathSSE::Vec4D b, double a) {
  // One scalar division, then a broadcast multiply.
  a = 1. / a;
  return a * b;
}

// 3D dot product: only lanes 0-2 contribute (mul_sd drops the w product).
inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__((pure));

inline double __attribute__((always_inline)) __attribute__((pure)) dot3(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return dot(a, b);
}

inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  // res.lane0 = x*x' + z*z', res.lane1 = y*y'; then fold lane1 into lane0.
  __m128d res = _mm_add_sd(_mm_mul_pd(a.vec[0], b.vec[0]), _mm_mul_sd(a.vec[1], b.vec[1]));
  res = _mm_add_sd(_mm_unpackhi_pd(res, res), res);
  double s;
  _mm_store_sd(&s, res);
  return s;
}

// 3D cross product across the two register halves.
inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__((pure));

inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  const __m128i neg = _mm_set_epi64x(0, 0x8000000000000000);
  // Products involving the broadcast z lanes; their difference (after a lane
  // swap) yields the x,y components of the result.
  __m128d l1 = _mm_mul_pd(_mm_unpacklo_pd(a.vec[1], a.vec[1]), b.vec[0]);
  __m128d l2 = _mm_mul_pd(_mm_unpacklo_pd(b.vec[1], b.vec[1]), a.vec[0]);
  __m128d m1 = _mm_sub_pd(l1, l2);
  m1 = _mm_shuffle_pd(m1, m1, 1);
  // Restore the sign of lane 0 by xor-ing its sign bit.
  m1 = __m128d(_mm_xor_si128(__m128i(m1), neg));
  // z component from the low halves: a.x*b.y - a.y*b.x.
  l1 = _mm_mul_pd(a.vec[0], _mm_shuffle_pd(b.vec[0], b.vec[0], 1));
  __m128d m2 = _mm_sub_sd(l1, _mm_unpackhi_pd(l1, l1));
  return mathSSE::Vec4D(m1, m2);
}

// Dot product of the x,y lanes only.
inline double __attribute__((always_inline)) __attribute__((pure)) dotxy(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return dot(a.xy(), b.xy());
}

#endif
0797
0798
namespace mathSSE {
  // sqrt specializations mapping onto the hardware square-root instructions.
  template <>
  inline Vec4F sqrt(Vec4F v) {
    return _mm_sqrt_ps(v.vec);
  }
  template <>
  inline Vec2F sqrt(Vec2F v) {
    // Promote to Vec4F; only the first two lanes are meaningful.
    return sqrt(Vec4F(v));
  }
  template <>
  inline Vec2D sqrt(Vec2D v) {
    return _mm_sqrt_pd(v.vec);
  }
#ifdef CMS_USE_AVX2
  template <>
  inline Vec4D sqrt(Vec4D v) {
    return _mm256_sqrt_pd(v.vec);
  }
#else
  // Without AVX2, apply sqrt to each __m128d half.
  template <>
  inline Vec4D sqrt(Vec4D v) {
    return Vec4D(_mm_sqrt_pd(v.vec[0]), _mm_sqrt_pd(v.vec[1]));
  }
#endif
}  // namespace mathSSE
0824
0825 #endif
0826
#include <iosfwd>
// Stream printers, declared here and defined out of line.
std::ostream &operator<<(std::ostream &out, mathSSE::Vec2D const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::Vec2F const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::Vec4F const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::Vec4D const &v);

// Printers for the 3-component As3D views.
std::ostream &operator<<(std::ostream &out, mathSSE::As3D<float> const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::As3D<double> const &v);
0835
0836 #endif