Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2023-03-17 11:22:31

0001 #ifndef RecoTracker_MkFitCore_src_Matriplex_MatriplexCommon_h
0002 #define RecoTracker_MkFitCore_src_Matriplex_MatriplexCommon_h
0003 
0004 #include <cstring>
0005 
0006 // Use intrinsics version of code when available, done via CPP flags.
0007 // #define  MPLEX_USE_INTRINSICS
0008 
0009 //==============================================================================
0010 // Intrinsics -- preamble
0011 //==============================================================================
0012 
0013 #if defined(__x86_64__)
0014 #include "immintrin.h"
0015 #else
0016 #include <cstdlib>
0017 #endif
0018 
// Intrinsics are opt-in via MPLEX_USE_INTRINSICS (see flag above); when
// enabled, the best available ISA extension reported by the compiler selects
// the vector type and the LD/ST/ADD/MUL/FMA macro set below.
#if defined(MPLEX_USE_INTRINSICS)
// This seems unnecessary: __AVX__ is usually defined for all higher ISA extensions
#if defined(__AVX__) || defined(__AVX512F__)

#define MPLEX_INTRINSICS

#endif

// ---- AVX-512F branch: 512-bit vectors (16 floats), hardware FMA + gathers ----
#if defined(__AVX512F__)

typedef __m512 IntrVec_t;
#define MPLEX_INTRINSICS_WIDTH_BYTES 64
#define MPLEX_INTRINSICS_WIDTH_BITS 512
#define AVX512_INTRINSICS
#define GATHER_INTRINSICS
// Loads 16 32-bit gather indices; _mm512_load_epi32 is the aligned load.
#define GATHER_IDX_LOAD(name, arr) __m512i name = _mm512_load_epi32(arr);

// NOTE: the macros below reference free variables N and n, which must be in
// scope at the expansion site (the Matriplex kernel loops). Loads/stores are
// the aligned variants, so &a[i * N + n] must meet the vector alignment.
#define LD(a, i) _mm512_load_ps(&a[i * N + n])
#define ST(a, i, r) _mm512_store_ps(&a[i * N + n], r)
#define ADD(a, b) _mm512_add_ps(a, b)
#define MUL(a, b) _mm512_mul_ps(a, b)
#define FMA(a, b, v) _mm512_fmadd_ps(a, b, v)

// ---- AVX2 + FMA branch: 256-bit vectors (8 floats), hardware FMA + gathers ----
#elif defined(__AVX2__) && defined(__FMA__)

typedef __m256 IntrVec_t;
#define MPLEX_INTRINSICS_WIDTH_BYTES 32
#define MPLEX_INTRINSICS_WIDTH_BITS 256
#define AVX2_INTRINSICS
#define GATHER_INTRINSICS
// Previously used _mm256_load_epi32(arr) here, but that's part of AVX-512F, not AVX2
#define GATHER_IDX_LOAD(name, arr) __m256i name = _mm256_load_si256(reinterpret_cast<const __m256i *>(arr));

// Same N/n free-variable convention as the AVX-512 branch above.
#define LD(a, i) _mm256_load_ps(&a[i * N + n])
#define ST(a, i, r) _mm256_store_ps(&a[i * N + n], r)
#define ADD(a, b) _mm256_add_ps(a, b)
#define MUL(a, b) _mm256_mul_ps(a, b)
#define FMA(a, b, v) _mm256_fmadd_ps(a, b, v)

// ---- Plain AVX branch: 256-bit vectors, no hardware FMA, no gather macros ----
#elif defined(__AVX__)

typedef __m256 IntrVec_t;
#define MPLEX_INTRINSICS_WIDTH_BYTES 32
#define MPLEX_INTRINSICS_WIDTH_BITS 256
#define AVX_INTRINSICS

#define LD(a, i) _mm256_load_ps(&a[i * N + n])
#define ST(a, i, r) _mm256_store_ps(&a[i * N + n], r)
#define ADD(a, b) _mm256_add_ps(a, b)
#define MUL(a, b) _mm256_mul_ps(a, b)
// #define FMA(a, b, v)  { __m256 temp = _mm256_mul_ps(a, b); v = _mm256_add_ps(temp, v); }
// Software fallback FMA for plain AVX (no __FMA__): computes a * b + v with a
// separate multiply and add intrinsic. Unlike a true fused multiply-add, this
// performs two rounding steps, so results may differ from _mm256_fmadd_ps in
// the last bit.
inline __m256 FMA(const __m256 &a, const __m256 &b, const __m256 &v) {
  return _mm256_add_ps(_mm256_mul_ps(a, b), v);
}

#endif  // ISA dispatch: __AVX512F__ / __AVX2__+__FMA__ / __AVX__

#endif  // MPLEX_USE_INTRINSICS
0078 
// ASSUME_ALIGNED(a, b): tell the optimizer that pointer 'a' is aligned to 'b'
// bytes, enabling aligned vector loads/stores without a runtime check.
#ifdef __INTEL_COMPILER
#define ASSUME_ALIGNED(a, b) __assume_aligned(a, b)
#else
// GCC/Clang spelling: __builtin_assume_aligned returns a void* copy of the
// pointer carrying the alignment guarantee, so it is cast back to the original
// type and reassigned to 'a'.
#define ASSUME_ALIGNED(a, b) a = static_cast<decltype(a)>(__builtin_assume_aligned(a, b))
#endif
0084 
namespace Matriplex {
  // Index type used throughout Matriplex for dimensions and element offsets.
  typedef int idx_t;

  // Declaration only; the definition is not visible in this header.
  // Presumably checks/reports the alignment of address 'adr', labeling output
  // with 'pref' -- verify against the implementation file.
  void align_check(const char *pref, void *adr);
}  // namespace Matriplex
0090 
0091 #endif