File indexing completed on 2024-04-06 12:15:46
0001 #include <Eigen/Dense>
0002
0003 #include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h"
0004
0005 template <int32_t S>
0006 struct MySoA {
0007
0008 static constexpr int32_t stride() { return S; }
0009
0010 eigenSoA::ScalarSoA<float, S> a;
0011 eigenSoA::ScalarSoA<float, S> b;
0012 };
0013
0014 using V = MySoA<128>;
0015
0016 __global__ void testBasicSoA(float* p) {
0017 using namespace eigenSoA;
0018
0019 assert(!isPowerOf2(0));
0020 assert(isPowerOf2(1));
0021 assert(isPowerOf2(1024));
0022 assert(!isPowerOf2(1026));
0023
0024 using M3 = Eigen::Matrix<float, 3, 3>;
0025
0026 __shared__ eigenSoA::MatrixSoA<M3, 64> m;
0027
0028 int first = threadIdx.x + blockIdx.x * blockDim.x;
0029 if (0 == first)
0030 printf("before %f\n", p[0]);
0031
0032
0033 int n = 64;
0034 for (int i = first; i < n; i += blockDim.x * gridDim.x) {
0035 m[i].setZero();
0036 m[i](0, 0) = p[i];
0037 m[i](1, 1) = p[i + 64];
0038 m[i](2, 2) = p[i + 64 * 2];
0039 }
0040 __syncthreads();
0041
0042 for (int i = first; i < n; i += blockDim.x * gridDim.x)
0043 m[i] = m[i].inverse().eval();
0044 __syncthreads();
0045
0046 for (int i = first; i < n; i += blockDim.x * gridDim.x) {
0047 p[i] = m[63 - i](0, 0);
0048 p[i + 64] = m[63 - i](1, 1);
0049 p[i + 64 * 2] = m[63 - i](2, 2);
0050 }
0051
0052 if (0 == first)
0053 printf("after %f\n", p[0]);
0054 }
0055
0056 #include <cassert>
0057 #include <iostream>
0058 #include <memory>
0059 #include <random>
0060
0061 #ifdef __CUDACC__
0062 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
0063 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0064 #endif
0065
0066 int main() {
0067 #ifdef __CUDACC__
0068 cms::cudatest::requireDevices();
0069 #endif
0070
0071 float p[1024];
0072
0073 std::uniform_real_distribution<float> rgen(0.01, 0.99);
0074 std::mt19937 eng;
0075
0076 for (auto& r : p)
0077 r = rgen(eng);
0078 for (int i = 0, n = 64 * 3; i < n; ++i)
0079 assert(p[i] > 0 && p[i] < 1.);
0080
0081 std::cout << p[0] << std::endl;
0082 #ifdef __CUDACC__
0083 float* p_d;
0084 cudaCheck(cudaMalloc(&p_d, 1024 * 4));
0085 cudaCheck(cudaMemcpy(p_d, p, 1024 * 4, cudaMemcpyDefault));
0086 testBasicSoA<<<1, 1024>>>(p_d);
0087 cudaCheck(cudaGetLastError());
0088 cudaCheck(cudaMemcpy(p, p_d, 1024 * 4, cudaMemcpyDefault));
0089 cudaCheck(cudaDeviceSynchronize());
0090 #else
0091 testBasicSoA(p);
0092 #endif
0093
0094 std::cout << p[0] << std::endl;
0095
0096 for (int i = 0, n = 64 * 3; i < n; ++i)
0097 assert(p[i] > 1.);
0098
0099 std::cout << "END" << std::endl;
0100 return 0;
0101 }