Warning, /DataFormats/Math/test/cudaMathTest.cu is written in an unsupported language. File is not indexed.
0001 /**
0002 * Derived from the nVIDIA CUDA 8.0 samples by
0003 *
0004 * Eyal Rozenberg <E.Rozenberg@cwi.nl>
0005 *
0006 * The derivation is specifically permitted in the nVIDIA CUDA Samples EULA
0007 * and the deriver is the owner of this code according to the EULA.
0008 *
0009 * Use this reasonably. If you want to discuss licensing formalities, please
0010 * contact the author.
0011 *
0012 * Modified by VinInn for testing math funcs
0013 */
0014
0015 /* to run test
0016 foreach f ( $CMSSW_BASE/test/$SCRAM_ARCH/DFM_Vector* )
0017 echo $f; $f
0018 end
0019 */
0020
0021 #include <algorithm>
0022 #include <cassert>
0023 #include <chrono>
0024 #include <iomanip>
0025 #include <iostream>
0026 #include <memory>
0027 #include <random>
0028 #include <stdexcept>
0029
0030 #ifdef __CUDACC__
0031 #define inline __host__ __device__ inline
0032 #include <vdt/sin.h>
0033 #undef inline
0034 #else
0035 #include <vdt/sin.h>
0036 #endif
0037
0038 #include "DataFormats/Math/interface/approx_log.h"
0039 #include "DataFormats/Math/interface/approx_exp.h"
0040 #include "DataFormats/Math/interface/approx_atan2.h"
0041 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
0042 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0043 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
0044 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
0045
// Host-side Mersenne-Twister engines and a uniform distribution on [0, 1)
// used to fill the input vectors (default-seeded, so runs are reproducible).
std::mt19937 eng;
std::mt19937 eng2;  // NOTE(review): appears unused in this file — confirm before removing
std::uniform_real_distribution<float> rgen(0., 1.);
0049
0050 constexpr float myExp(float x) { return unsafe_expf<6>(x); }
0051
0052 constexpr float myLog(float x) { return unsafe_logf<6>(x); }
0053
0054 __host__ __device__ inline float mySin(float x) { return vdt::fast_sinf(x); }
0055
0056 constexpr int USEEXP = 0, USESIN = 1, USELOG = 2;
0057
// Applies the compile-time-selected math function to x, optionally adding y.
// USE picks exp/sin/log (see USEEXP/USESIN/USELOG); ADDY folds in the second
// operand. constexpr so the same code runs on host and (with relaxed-constexpr)
// on device — presumably why the explicit qualifiers below are commented out.
template <int USE, bool ADDY = false>
// __host__ __device__
constexpr float testFunc(float x, float y) {
  // USE is a template parameter, so only one arm is live per instantiation.
  float const value = (USE == USEEXP) ? myExp(x) : (USE == USESIN) ? mySin(x) : myLog(x);
  return ADDY ? value + y : value;
}
0070
// Element-wise kernel: C[i] = testFunc<USE, ADDY>(A[i], B[i]).
// Expects a 1-D launch with at least numElements threads in total.
template <int USE, bool ADDY>
__global__ void vectorOp(const float *A, const float *B, float *C, int numElements) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Guard the tail block: the grid may overshoot numElements.
  if (idx >= numElements)
    return;
  C[idx] = testFunc<USE, ADDY>(A[idx], B[idx]);
}
0078
// Host reference implementation: C[i] = testFunc<USE, ADDY>(A[i], B[i])
// for every i in [0, numElements). Mirrors the vectorOp kernel.
template <int USE, bool ADDY>
void vectorOpH(const float *A, const float *B, float *C, int numElements) {
  std::transform(A, A + numElements, B, C, [](float a, float b) { return testFunc<USE, ADDY>(a, b); });
}
0085
// One benchmark/validation pass for the function selected by USE/ADDY:
// fills two random input vectors, runs testFunc element-wise on the GPU
// (twice) and on the host (twice), prints timings, then compares the GPU
// result against a freshly computed host reference, bit pattern by bit
// pattern (ULP-style integer diff) as well as in float space.
template <int USE, bool ADDY = false>
void go() {
  auto start = std::chrono::high_resolution_clock::now();
  // Zero-length duration used as a stopwatch accumulator below.
  // NOTE(review): delta is never reset between sections, so every printed
  // figure also includes the time of all preceding timed sections — confirm
  // whether cumulative timing is intended here.
  auto delta = start - start;

  int numElements = 200000;
  size_t size = numElements * sizeof(float);
  std::cout << "[Vector of " << numElements << " elements]\n";

  // Host buffers: inputs A/B, GPU result C, host-computed result C2
  // (C2 is only written for timing; the verification below recomputes
  // the reference directly via testFunc).
  auto h_A = std::make_unique<float[]>(numElements);
  auto h_B = std::make_unique<float[]>(numElements);
  auto h_C = std::make_unique<float[]>(numElements);
  auto h_C2 = std::make_unique<float[]>(numElements);

  // Fill both inputs with uniform [0, 1) values from the shared engine.
  std::generate(h_A.get(), h_A.get() + numElements, [&]() { return rgen(eng); });
  std::generate(h_B.get(), h_B.get() + numElements, [&]() { return rgen(eng); });

  // --- timed: device allocation + host-to-device copies ---
  delta -= (std::chrono::high_resolution_clock::now() - start);
  auto d_A = cms::cuda::make_device_unique<float[]>(numElements, nullptr);
  auto d_B = cms::cuda::make_device_unique<float[]>(numElements, nullptr);
  auto d_C = cms::cuda::make_device_unique<float[]>(numElements, nullptr);

  cudaCheck(cudaMemcpy(d_A.get(), h_A.get(), size, cudaMemcpyHostToDevice));
  cudaCheck(cudaMemcpy(d_B.get(), h_B.get(), size, cudaMemcpyHostToDevice));
  delta += (std::chrono::high_resolution_clock::now() - start);
  std::cout << "cuda alloc+copy took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
            << std::endl;

  // Launch the Vector OP CUDA Kernel
  int threadsPerBlock = 256;
  // Ceiling division so the grid covers the tail when numElements is not
  // a multiple of threadsPerBlock (the kernel bounds-checks the overshoot).
  int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
  std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads\n";

  // First launch — presumably includes one-time warm-up/JIT cost.
  // NOTE(review): no explicit synchronize before reading the clock, so this
  // likely measures launch overhead rather than kernel execution — confirm
  // whether cms::cuda::launch synchronizes internally.
  delta -= (std::chrono::high_resolution_clock::now() - start);
  cms::cuda::launch(
      vectorOp<USE, ADDY>, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements);
  delta += (std::chrono::high_resolution_clock::now() - start);
  std::cout << "cuda computation took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
            << std::endl;

  // Second, warmed-up launch with the same arguments.
  delta -= (std::chrono::high_resolution_clock::now() - start);
  cms::cuda::launch(
      vectorOp<USE, ADDY>, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements);
  delta += (std::chrono::high_resolution_clock::now() - start);
  std::cout << "cuda computation took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
            << std::endl;

  // --- timed: copy the GPU result back (blocking cudaMemcpy also drains
  // the pending kernel work) ---
  delta -= (std::chrono::high_resolution_clock::now() - start);
  cudaCheck(cudaMemcpy(h_C.get(), d_C.get(), size, cudaMemcpyDeviceToHost));
  delta += (std::chrono::high_resolution_clock::now() - start);
  std::cout << "cuda copy back took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
            << std::endl;

  // on host now...
  delta -= (std::chrono::high_resolution_clock::now() - start);
  vectorOpH<USE, ADDY>(h_A.get(), h_B.get(), h_C2.get(), numElements);
  delta += (std::chrono::high_resolution_clock::now() - start);
  std::cout << "host computation took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
            << std::endl;

  // Second host pass, matching the two device launches.
  delta -= (std::chrono::high_resolution_clock::now() - start);
  vectorOpH<USE, ADDY>(h_A.get(), h_B.get(), h_C2.get(), numElements);
  delta += (std::chrono::high_resolution_clock::now() - start);
  std::cout << "host computation took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
            << std::endl;

  // Verify that the result vector is correct
  double ave = 0;       // sum (then mean) of integer bit-pattern diffs
  int maxDiff = 0;      // largest integer bit-pattern diff seen
  long long ndiff = 0;  // count of elements that differ at all
  double fave = 0;      // sum (then mean) of absolute float diffs
  float fmaxDiff = 0;   // largest absolute float diff seen
  for (int i = 0; i < numElements; ++i) {
    // binary32 is a float/int32 punning union: comparing i32 values gives
    // a ULP-like distance between host and device results.
    approx_math::binary32 g, c;
    g.f = testFunc<USE, ADDY>(h_A[i], h_B[i]);
    c.f = h_C[i];
    auto diff = std::abs(g.i32 - c.i32);
    maxDiff = std::max(diff, maxDiff);
    ave += diff;
    if (diff != 0)
      ++ndiff;
    auto fdiff = std::abs(g.f - c.f);
    fave += fdiff;
    fmaxDiff = std::max(fdiff, fmaxDiff);
    // if (diff>7)
    //   std::cerr << "Large diff at element " << i << ' ' << diff << ' ' << std::hexfloat
    //   << g.f << "!=" << c.f << "\n";
  }
  std::cout << "ndiff ave, max " << ndiff << ' ' << ave / numElements << ' ' << maxDiff << std::endl;
  std::cout << "float ave, max " << fave / numElements << ' ' << fmaxDiff << std::endl;
  // PASSED only on bit-exact agreement; mismatches are reported above but
  // do not fail the test run.
  if (!ndiff) {
    std::cout << "Test PASSED\n";
    std::cout << "SUCCESS" << std::endl;
  }
  // NOTE(review): return value deliberately (?) unchecked — a cudaCheck here
  // would surface any asynchronous kernel error.
  cudaDeviceSynchronize();
}
0182
// Entry point: skips (via requireDevices) unless a CUDA device is present,
// then exercises each supported function: exp, sin, log, and log with the
// second operand added.
int main() {
  cms::cudatest::requireDevices();

  try {
    go<USEEXP>();
    go<USESIN>();
    go<USELOG>();
    go<USELOG, true>();
  } catch (std::runtime_error const &ex) {
    // cudaCheck and cms::cuda helpers report failures as std::runtime_error.
    std::cerr << "CUDA or std runtime error: " << ex.what() << std::endl;
    return EXIT_FAILURE;
  } catch (...) {
    std::cerr << "A non-CUDA error occurred" << std::endl;
    return EXIT_FAILURE;
  }

  return EXIT_SUCCESS;
}