Math/test/cudaAtan2Test.cu

0001 /**
0002  * Derived from the nVIDIA CUDA 8.0 samples by
0003  *
0004  *   Eyal Rozenberg <E.Rozenberg@cwi.nl>
0005  *
0006  * The derivation is specifically permitted in the nVIDIA CUDA Samples EULA
0007  * and the deriver is the owner of this code according to the EULA.
0008  *
0009  * Use this reasonably. If you want to discuss licensing formalities, please
0010  * contact the author.
0011  *
0012  *  Modified by VinInn for testing math funcs
0013  */
0014
0015 /* to run test
0016 foreach f ( $CMSSW_BASE/test/$SCRAM_ARCH/DFM_Vector* )
0017 echo $f; $f
0018 end
0019 */
0020
0021 #include <algorithm>
0022 #include <cassert>
0023 #include <chrono>
0024 #include <iomanip>
0025 #include <iostream>
0026 #include <memory>
0027 #include <random>
0028 #include <stdexcept>
0029
0030 #include "DataFormats/Math/interface/approx_atan2.h"
0031 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
0032 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0033 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
0034 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
0035
0036 constexpr float xmin = -100.001;  // avoid 0
0037 constexpr float incr = 0.04;
0038 constexpr int Nsteps = 2. * std::abs(xmin) / incr;
0039
0040 template <int DEGREE>
0041 __global__ void diffAtan(int *diffs) {
0042   auto mdiff = &diffs[0];
0043   auto idiff = &diffs[1];
0044   auto sdiff = &diffs[2];
0045
0046   int i = blockDim.x * blockIdx.x + threadIdx.x;
0047   int j = blockDim.y * blockIdx.y + threadIdx.y;
0048
0049   auto x = xmin + incr * i;
0050   auto y = xmin + incr * j;
0051
0052   auto approx = unsafe_atan2f<DEGREE>(y, x);
0053   auto iapprox = unsafe_atan2i<DEGREE>(y, x);
0054   auto sapprox = unsafe_atan2s<DEGREE>(y, x);
0055   auto std = std::atan2(y, x);
0056   auto fd = std::abs(std - approx);
0057   atomicMax(mdiff, int(fd * 1.e7));
0058   atomicMax(idiff, std::abs(phi2int(std) - iapprox));
0059   short dd = std::abs(phi2short(std) - sapprox);
0060   atomicMax(sdiff, int(dd));
0061 }
0062
0063 template <int DEGREE>
0064 void go() {
0065   auto start = std::chrono::high_resolution_clock::now();
0066   auto delta = start - start;
0067
0068   // atan2
0069   delta -= (std::chrono::high_resolution_clock::now() - start);
0070
0071   auto diff_d = cms::cuda::make_device_unique<int[]>(3, nullptr);
0072
0073   int diffs[3];
0074   cudaCheck(cudaMemset(diff_d.get(), 0, 3 * 4));
0075
0076   // Launch the diff CUDA Kernel
0077   dim3 threadsPerBlock(32, 32, 1);
0078   dim3 blocksPerGrid(
0079       (Nsteps + threadsPerBlock.x - 1) / threadsPerBlock.x, (Nsteps + threadsPerBlock.y - 1) / threadsPerBlock.y, 1);
0080   std::cout << "CUDA kernel 'diff' launch with " << blocksPerGrid.x << " blocks of " << threadsPerBlock.y
0081             << " threads\n";
0082
0083   cms::cuda::launch(diffAtan<DEGREE>, {blocksPerGrid, threadsPerBlock}, diff_d.get());
0084
0085   cudaCheck(cudaMemcpy(diffs, diff_d.get(), 3 * 4, cudaMemcpyDeviceToHost));
0086   delta += (std::chrono::high_resolution_clock::now() - start);
0087
0088   float mdiff = diffs[0] * 1.e-7;
0089   int idiff = diffs[1];
0090   int sdiff = diffs[2];
0091
0092   std::cout << "for degree " << DEGREE << " max diff is " << mdiff << ' ' << idiff << ' ' << int2phi(idiff) << ' '
0093             << sdiff << ' ' << short2phi(sdiff) << std::endl;
0094   std::cout << "cuda computation took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
0095             << std::endl;
0096 }
0097
0098 int main() {
0099   cms::cudatest::requireDevices();
0100
0101   try {
0102     go<3>();
0103     go<5>();
0104     go<7>();
0105     go<9>();
0106   } catch (std::runtime_error &ex) {
0107     std::cerr << "CUDA or std runtime error: " << ex.what() << std::endl;
0108     exit(EXIT_FAILURE);
0109   } catch (...) {
0110     std::cerr << "A non-CUDA error occurred" << std::endl;
0111     exit(EXIT_FAILURE);
0112   }
0113
0114   return EXIT_SUCCESS;
0115 }