Warning, /DataFormats/Math/test/cudaMathTest.cu is written in an unsupported language. File is not indexed.
0001 /**
0002 * Derived from the nVIDIA CUDA 8.0 samples by
0003 *
0004 * Eyal Rozenberg <E.Rozenberg@cwi.nl>
0005 *
0006 * The derivation is specifically permitted in the nVIDIA CUDA Samples EULA
0007 * and the deriver is the owner of this code according to the EULA.
0008 *
0009 * Use this reasonably. If you want to discuss licensing formalities, please
0010 * contact the author.
0011 *
0012 * Modified by VinInn for testing math funcs
0013 */
0014
0015 /* to run test
0016 foreach f ( $CMSSW_BASE/test/$SCRAM_ARCH/DFM_Vector* )
0017 echo $f; $f
0018 end
0019 */
0020
0021 #include <algorithm>
0022 #include <cassert>
0023 #include <chrono>
0024 #include <iomanip>
0025 #include <iostream>
0026 #include <memory>
0027 #include <random>
0028 #include <stdexcept>
0029
0030 #ifdef __CUDACC__
0031 #define inline __host__ __device__ inline
0032 #include <vdt/sin.h>
0033 #undef inline
0034 #else
0035 #include <vdt/sin.h>
0036 #endif
0037
0038 #include "DataFormats/Math/interface/approx_log.h"
0039 #include "DataFormats/Math/interface/approx_exp.h"
0040 #include "DataFormats/Math/interface/approx_atan2.h"
0041 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
0042 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0043 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
0044 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
0045
// Host-side Mersenne-Twister engines and a uniform distribution on [0, 1)
// used to fill the input vectors (default-seeded, so runs are reproducible).
std::mt19937 eng;
std::mt19937 eng2;  // NOTE(review): appears unused in this file — confirm before removing
std::uniform_real_distribution<float> rgen(0., 1.);
0049
0050 constexpr float myExp(float x) { return unsafe_expf<6>(x); }
0051
0052 constexpr float myLog(float x) { return unsafe_logf<6>(x); }
0053
0054 __host__ __device__ inline float mySin(float x) { return vdt::fast_sinf(x); }
0055
0056 constexpr int USEEXP = 0, USESIN = 1, USELOG = 2;
0057
// Applies the compile-time-selected math function to x, optionally adding y.
// USE picks exp/sin/log (see USEEXP/USESIN/USELOG); ADDY folds in the second
// operand. constexpr so the same code runs on host and (with relaxed-constexpr)
// on device — presumably why the explicit qualifiers below are commented out.
template <int USE, bool ADDY = false>
// __host__ __device__
constexpr float testFunc(float x, float y) {
  // USE is a template parameter, so only one arm is live per instantiation.
  float const value = (USE == USEEXP) ? myExp(x) : (USE == USESIN) ? mySin(x) : myLog(x);
  return ADDY ? value + y : value;
}
0070
// Element-wise kernel: C[i] = testFunc<USE, ADDY>(A[i], B[i]).
// Expects a 1-D launch with at least numElements threads in total.
template <int USE, bool ADDY>
__global__ void vectorOp(const float *A, const float *B, float *C, int numElements) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Guard the tail block: the grid may overshoot numElements.
  if (idx >= numElements)
    return;
  C[idx] = testFunc<USE, ADDY>(A[idx], B[idx]);
}
0078
// Host reference implementation: C[i] = testFunc<USE, ADDY>(A[i], B[i])
// for every i in [0, numElements). Mirrors the vectorOp kernel.
template <int USE, bool ADDY>
void vectorOpH(const float *A, const float *B, float *C, int numElements) {
  std::transform(A, A + numElements, B, C, [](float a, float b) { return testFunc<USE, ADDY>(a, b); });
}
0085
// One benchmark/validation pass for the function selected by USE/ADDY:
// fills two random input vectors, runs testFunc element-wise on the GPU
// (twice) and on the host (twice), prints timings, then compares the GPU
// result against a freshly computed host reference, bit pattern by bit
// pattern (ULP-style integer diff) as well as in float space.
template <int USE, bool ADDY = false>
void go() {
  auto start = std::chrono::high_resolution_clock::now();
  // Zero-length duration used as a stopwatch accumulator below.
  // NOTE(review): delta is never reset between sections, so every printed
  // figure also includes the time of all preceding timed sections — confirm
  // whether cumulative timing is intended here.
  auto delta = start - start;

  int numElements = 200000;
  size_t size = numElements * sizeof(float);
  std::cout << "[Vector of " << numElements << " elements]\n";

  // Host buffers: inputs A/B, GPU result C, host-computed result C2
  // (C2 is only written for timing; the verification below recomputes
  // the reference directly via testFunc).
  auto h_A = std::make_unique<float[]>(numElements);
  auto h_B = std::make_unique<float[]>(numElements);
  auto h_C = std::make_unique<float[]>(numElements);
  auto h_C2 = std::make_unique<float[]>(numElements);

  // Fill both inputs with uniform [0, 1) values from the shared engine.
  std::generate(h_A.get(), h_A.get() + numElements, [&]() { return rgen(eng); });
  std::generate(h_B.get(), h_B.get() + numElements, [&]() { return rgen(eng); });

  // --- timed: device allocation + host-to-device copies ---
  delta -= (std::chrono::high_resolution_clock::now() - start);
  auto d_A = cms::cuda::make_device_unique<float[]>(numElements, nullptr);
  auto d_B = cms::cuda::make_device_unique<float[]>(numElements, nullptr);
  auto d_C = cms::cuda::make_device_unique<float[]>(numElements, nullptr);

  cudaCheck(cudaMemcpy(d_A.get(), h_A.get(), size, cudaMemcpyHostToDevice));
  cudaCheck(cudaMemcpy(d_B.get(), h_B.get(), size, cudaMemcpyHostToDevice));
  delta += (std::chrono::high_resolution_clock::now() - start);
  std::cout << "cuda alloc+copy took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
            << std::endl;

  // Launch the Vector OP CUDA Kernel
  int threadsPerBlock = 256;
  // Ceiling division so the grid covers the tail when numElements is not
  // a multiple of threadsPerBlock (the kernel bounds-checks the overshoot).
  int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
  std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads\n";

  // First launch — presumably includes one-time warm-up/JIT cost.
  // NOTE(review): no explicit synchronize before reading the clock, so this
  // likely measures launch overhead rather than kernel execution — confirm
  // whether cms::cuda::launch synchronizes internally.
  delta -= (std::chrono::high_resolution_clock::now() - start);
  cms::cuda::launch(
      vectorOp<USE, ADDY>, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements);
  delta += (std::chrono::high_resolution_clock::now() - start);
  std::cout << "cuda computation took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
            << std::endl;

  // Second, warmed-up launch with the same arguments.
  delta -= (std::chrono::high_resolution_clock::now() - start);
  cms::cuda::launch(
      vectorOp<USE, ADDY>, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements);
  delta += (std::chrono::high_resolution_clock::now() - start);
  std::cout << "cuda computation took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
            << std::endl;

  // --- timed: copy the GPU result back (blocking cudaMemcpy also drains
  // the pending kernel work) ---
  delta -= (std::chrono::high_resolution_clock::now() - start);
  cudaCheck(cudaMemcpy(h_C.get(), d_C.get(), size, cudaMemcpyDeviceToHost));
  delta += (std::chrono::high_resolution_clock::now() - start);
  std::cout << "cuda copy back took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
            << std::endl;

  // on host now...
  delta -= (std::chrono::high_resolution_clock::now() - start);
  vectorOpH<USE, ADDY>(h_A.get(), h_B.get(), h_C2.get(), numElements);
  delta += (std::chrono::high_resolution_clock::now() - start);
  std::cout << "host computation took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
            << std::endl;

  // Second host pass, matching the two device launches.
  delta -= (std::chrono::high_resolution_clock::now() - start);
  vectorOpH<USE, ADDY>(h_A.get(), h_B.get(), h_C2.get(), numElements);
  delta += (std::chrono::high_resolution_clock::now() - start);
  std::cout << "host computation took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
            << std::endl;

  // Verify that the result vector is correct
  double ave = 0;       // sum (then mean) of integer bit-pattern diffs
  int maxDiff = 0;      // largest integer bit-pattern diff seen
  long long ndiff = 0;  // count of elements that differ at all
  double fave = 0;      // sum (then mean) of absolute float diffs
  float fmaxDiff = 0;   // largest absolute float diff seen
  for (int i = 0; i < numElements; ++i) {
    // binary32 is a float/int32 punning union: comparing i32 values gives
    // a ULP-like distance between host and device results.
    approx_math::binary32 g, c;
    g.f = testFunc<USE, ADDY>(h_A[i], h_B[i]);
    c.f = h_C[i];
    auto diff = std::abs(g.i32 - c.i32);
    maxDiff = std::max(diff, maxDiff);
    ave += diff;
    if (diff != 0)
      ++ndiff;
    auto fdiff = std::abs(g.f - c.f);
    fave += fdiff;
    fmaxDiff = std::max(fdiff, fmaxDiff);
    // if (diff>7)
    //   std::cerr << "Large diff at element " << i << ' ' << diff << ' ' << std::hexfloat
    //   << g.f << "!=" << c.f << "\n";
  }
  std::cout << "ndiff ave, max " << ndiff << ' ' << ave / numElements << ' ' << maxDiff << std::endl;
  std::cout << "float ave, max " << fave / numElements << ' ' << fmaxDiff << std::endl;
  // PASSED only on bit-exact agreement; mismatches are reported above but
  // do not fail the test run.
  if (!ndiff) {
    std::cout << "Test PASSED\n";
    std::cout << "SUCCESS" << std::endl;
  }
  // NOTE(review): return value deliberately (?) unchecked — a cudaCheck here
  // would surface any asynchronous kernel error.
  cudaDeviceSynchronize();
}
0182
// Entry point: skips (via requireDevices) unless a CUDA device is present,
// then exercises each supported function: exp, sin, log, and log with the
// second operand added.
int main() {
  cms::cudatest::requireDevices();

  try {
    go<USEEXP>();
    go<USESIN>();
    go<USELOG>();
    go<USELOG, true>();
  } catch (std::runtime_error const &ex) {
    // cudaCheck and cms::cuda helpers report failures as std::runtime_error.
    std::cerr << "CUDA or std runtime error: " << ex.what() << std::endl;
    return EXIT_FAILURE;
  } catch (...) {
    std::cerr << "A non-CUDA error occurred" << std::endl;
    return EXIT_FAILURE;
  }

  return EXIT_SUCCESS;
}