// Vec4 (AVX double specialization) — "Macros" / "Line Code" and the run of
// line numbers below were navigation residue from the HTML code-browser page
// this header was extracted from; kept here only as a comment.
#ifndef DataFormat_Math_AVXVec_H
#define DataFormat_Math_AVXVec_H

// in principle it should not be used alone
// only as part of SSEVec
namespace mathSSE {

  template <>
  union Vec4<double> {
    typedef __m256d nativeType;
    __m256d vec;                                  // AVX register view
    double __attribute__((aligned(32))) arr[4];   // scalar element view (same bytes as vec)
    OldVec<double> o;                             // legacy layout view

    // Wrap a native AVX register.
    Vec4(__m256d ivec) : vec(ivec) {}

    // Construct from the legacy representation.
    Vec4(OldVec<double> const& ivec) : o(ivec) {}

    // Zero-initialize all four lanes.
    Vec4() { vec = _mm256_setzero_pd(); }

    // Widen a float vector to double, lane by lane.
    inline Vec4(Vec4<float> ivec) { vec = _mm256_cvtps_pd(ivec.vec); }

    // Broadcast f1 to all four lanes.
    explicit Vec4(double f1) { set1(f1); }

    // Element-wise construction; the fourth lane defaults to 0.
    Vec4(double f1, double f2, double f3, double f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }

    // Concatenate two 128-bit halves: lanes {ivec0[0], ivec0[1], ivec1[0], ivec1[1]}.
    Vec4(Vec2<double> ivec0, Vec2<double> ivec1) { vec = _mm256_set_m128d(ivec1.vec, ivec0.vec); }

    // Low half from ivec0, high half from the two scalars (f4 defaults to 0).
    // Build the register from fully defined inputs: the previous version did
    // _mm256_insertf128_pd(vec, ...) on the not-yet-initialized `vec`, reading
    // an indeterminate value (UB, flagged by -Wuninitialized).
    Vec4(Vec2<double> ivec0, double f3, double f4 = 0) {
      vec = _mm256_set_m128d(_mm_set_pd(f4, f3), ivec0.vec);
    }

    // Low half from ivec0, high half zeroed.
    Vec4(Vec2<double> ivec0) {
      vec = _mm256_setzero_pd();
      vec = _mm256_insertf128_pd(vec, ivec0.vec, 0);
    }

    // for masking
    void setMask(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      Mask4<double> mask(m1, m2, m3, m4);
      vec = mask.vec;
    }

    // Set all four lanes (note _mm256_set_pd takes its arguments high-to-low).
    void set(double f1, double f2, double f3, double f4 = 0) { vec = _mm256_set_pd(f4, f3, f2, f1); }

    // Broadcast f1 to all four lanes.
    void set1(double f1) { vec = _mm256_set1_pd(f1); }

    // Return a vector with lane N broadcast to every lane.
    template <int N>
    Vec4 get1() const {
      return _mm256_set1_pd(arr[N]);  //FIXME
    }
    /*
    Vec4 get1(unsigned int n) const { 
      return _mm256_set1_pd(arr[n]); //FIXME
    }
    */
    double& operator[](unsigned int n) { return arr[n]; }

    double operator[](unsigned int n) const { return arr[n]; }

    // Low 128-bit half: lanes {0, 1}.
    Vec2<double> xy() const { return Vec2<double>(_mm256_castpd256_pd128(vec)); }
    // High 128-bit half: lanes {2, 3}.
    Vec2<double> zw() const { return Vec2<double>(_mm256_castpd256_pd128(_mm256_permute2f128_pd(vec, vec, 1))); }
  };

  // Narrowing conversion: round each double lane to float. Defined out of
  // line because Vec4<double> must be complete at this point.
  inline Vec4<float>::Vec4(Vec4<double> ivec) { vec = _mm256_cvtpd_ps(ivec.vec); }
}  // namespace mathSSE

// True only when every one of the four lanes compares equal
// (ordered-signaling predicate, as in cmpeq below).
inline bool operator==(mathSSE::Vec4<double> a, mathSSE::Vec4<double> b) {
  const __m256d eq = _mm256_cmp_pd(a.vec, b.vec, _CMP_EQ_OS);
  const int laneMask = _mm256_movemask_pd(eq);
  return laneMask == 0xf;
}

// Lane-wise equality: each lane is all-ones where a == b, all-zeros otherwise.
inline mathSSE::Vec4<double> cmpeq(mathSSE::Vec4<double> a, mathSSE::Vec4<double> b) {
  const __m256d eq = _mm256_cmp_pd(a.vec, b.vec, _CMP_EQ_OS);
  return eq;
}

// Lane-wise greater-than: each lane is all-ones where a > b, all-zeros otherwise.
inline mathSSE::Vec4<double> cmpgt(mathSSE::Vec4<double> a, mathSSE::Vec4<double> b) {
  const __m256d gt = _mm256_cmp_pd(a.vec, b.vec, _CMP_GT_OS);
  return gt;
}

// Horizontal add of adjacent pairs within each 128-bit half:
// result = {a0+a1, b0+b1, a2+a3, b2+b3}.
inline mathSSE::Vec4<double> hadd(mathSSE::Vec4<double> a, mathSSE::Vec4<double> b) {
  const __m256d pairSums = _mm256_hadd_pd(a.vec, b.vec);
  return pairSums;
}

// Lane-wise negation: flip the sign bit of every lane by XOR-ing with -0.0.
// _mm256_set1_pd broadcasts the constant — the idiomatic form of the previous
// _mm256_set_pd call with the same value repeated four times.
inline mathSSE::Vec4<double> operator-(mathSSE::Vec4<double> a) {
  const __m256d neg = _mm256_set1_pd(-0.0);
  return _mm256_xor_pd(a.vec, neg);
}

// Bitwise AND of the raw lane bits.
inline mathSSE::Vec4<double> operator&(mathSSE::Vec4<double> a, mathSSE::Vec4<double> b) {
  const __m256d bits = _mm256_and_pd(a.vec, b.vec);
  return bits;
}
// Bitwise OR of the raw lane bits.
inline mathSSE::Vec4<double> operator|(mathSSE::Vec4<double> a, mathSSE::Vec4<double> b) {
  const __m256d bits = _mm256_or_pd(a.vec, b.vec);
  return bits;
}
// Bitwise XOR of the raw lane bits.
inline mathSSE::Vec4<double> operator^(mathSSE::Vec4<double> a, mathSSE::Vec4<double> b) {
  const __m256d bits = _mm256_xor_pd(a.vec, b.vec);
  return bits;
}
// Bitwise (~a) & b, matching the operand order of the andnot instruction.
inline mathSSE::Vec4<double> andnot(mathSSE::Vec4<double> a, mathSSE::Vec4<double> b) {
  const __m256d bits = _mm256_andnot_pd(a.vec, b.vec);
  return bits;
}

// Lane-wise sum.
inline mathSSE::Vec4<double> operator+(mathSSE::Vec4<double> a, mathSSE::Vec4<double> b) {
  const __m256d sum = _mm256_add_pd(a.vec, b.vec);
  return sum;
}

// Lane-wise difference a - b.
inline mathSSE::Vec4<double> operator-(mathSSE::Vec4<double> a, mathSSE::Vec4<double> b) {
  const __m256d diff = _mm256_sub_pd(a.vec, b.vec);
  return diff;
}

// Lane-wise product.
inline mathSSE::Vec4<double> operator*(mathSSE::Vec4<double> a, mathSSE::Vec4<double> b) {
  const __m256d prod = _mm256_mul_pd(a.vec, b.vec);
  return prod;
}

// Lane-wise quotient a / b.
inline mathSSE::Vec4<double> operator/(mathSSE::Vec4<double> a, mathSSE::Vec4<double> b) {
  const __m256d quot = _mm256_div_pd(a.vec, b.vec);
  return quot;
}

// Scale every lane of b by the scalar a.
inline mathSSE::Vec4<double> operator*(double a, mathSSE::Vec4<double> b) {
  const __m256d scale = _mm256_set1_pd(a);
  return _mm256_mul_pd(scale, b.vec);
}

// Scale every lane of b by the scalar a (commuted overload).
inline mathSSE::Vec4<double> operator*(mathSSE::Vec4<double> b, double a) {
  const __m256d scale = _mm256_set1_pd(a);
  return _mm256_mul_pd(scale, b.vec);
}

// Divide every lane of b by the scalar a.
inline mathSSE::Vec4<double> operator/(mathSSE::Vec4<double> b, double a) {
  const __m256d divisor = _mm256_set1_pd(a);
  return _mm256_div_pd(b.vec, divisor);
}

// Full 4-lane dot product via the project-level AVX helper; lane 0 of the
// helper's result holds the scalar sum.
inline double __attribute__((always_inline)) __attribute__((pure)) dot(mathSSE::Vec4<double> a,
                                                                       mathSSE::Vec4<double> b) {
  using mathSSE::_mm256_dot_pd;
  mathSSE::Vec4<double> result(_mm256_dot_pd(a.vec, b.vec));
  return result.arr[0];
}

// 3D cross product, delegated to the project-level AVX helper.
inline mathSSE::Vec4<double> __attribute__((always_inline)) __attribute__((pure)) cross(mathSSE::Vec4<double> a,
                                                                                        mathSSE::Vec4<double> b) {
  using mathSSE::_mm256_cross_pd;
  const __m256d result = _mm256_cross_pd(a.vec, b.vec);
  return result;
}

// Dot product restricted to the x/y lanes: multiply lane-wise, then a
// horizontal add of adjacent pairs leaves a.x*b.x + a.y*b.y in lane 0.
inline double __attribute__((always_inline)) __attribute__((pure)) dotxy(mathSSE::Vec4<double> a,
                                                                         mathSSE::Vec4<double> b) {
  mathSSE::Vec4<double> prod = a * b;
  prod = hadd(prod, prod);
  return prod.arr[0];
}

#endif