Line Code
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
/**
 * Derived from the nVIDIA CUDA 8.0 samples by
 *
 *   Eyal Rozenberg <E.Rozenberg@cwi.nl>
 *
 * The derivation is specifically permitted in the nVIDIA CUDA Samples EULA
 * and the deriver is the owner of this code according to the EULA.
 *
 * Use this reasonably. If you want to discuss licensing formalities, please
 * contact the author.
 *
 *  Modified by VinInn for testing math funcs
 */

/* to run test
foreach f ( $CMSSW_BASE/test/$SCRAM_ARCH/DFM_Vector* )
echo $f; $f
end
*/

#include <algorithm>
#include <cassert>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <memory>
#include <random>
#include <stdexcept>

#include "DataFormats/Math/interface/approx_atan2.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"

constexpr float xmin = -100.001;  // avoid 0
constexpr float incr = 0.04;
constexpr int Nsteps = 2. * std::abs(xmin) / incr;

template <int DEGREE>
__global__ void diffAtan(int *diffs) {
  auto mdiff = &diffs[0];
  auto idiff = &diffs[1];
  auto sdiff = &diffs[2];

  int i = blockDim.x * blockIdx.x + threadIdx.x;
  int j = blockDim.y * blockIdx.y + threadIdx.y;

  auto x = xmin + incr * i;
  auto y = xmin + incr * j;

  auto approx = unsafe_atan2f<DEGREE>(y, x);
  auto iapprox = unsafe_atan2i<DEGREE>(y, x);
  auto sapprox = unsafe_atan2s<DEGREE>(y, x);
  auto std = std::atan2(y, x);
  auto fd = std::abs(std - approx);
  atomicMax(mdiff, int(fd * 1.e7));
  atomicMax(idiff, std::abs(phi2int(std) - iapprox));
  short dd = std::abs(phi2short(std) - sapprox);
  atomicMax(sdiff, int(dd));
}

template <int DEGREE>
void go() {
  auto start = std::chrono::high_resolution_clock::now();
  auto delta = start - start;

  // atan2
  delta -= (std::chrono::high_resolution_clock::now() - start);

  auto diff_d = cms::cuda::make_device_unique<int[]>(3, nullptr);

  int diffs[3];
  cudaCheck(cudaMemset(diff_d.get(), 0, 3 * 4));

  // Launch the diff CUDA Kernel
  dim3 threadsPerBlock(32, 32, 1);
  dim3 blocksPerGrid(
      (Nsteps + threadsPerBlock.x - 1) / threadsPerBlock.x, (Nsteps + threadsPerBlock.y - 1) / threadsPerBlock.y, 1);
  std::cout << "CUDA kernel 'diff' launch with " << blocksPerGrid.x << " blocks of " << threadsPerBlock.y
            << " threads\n";

  cms::cuda::launch(diffAtan<DEGREE>, {blocksPerGrid, threadsPerBlock}, diff_d.get());

  cudaCheck(cudaMemcpy(diffs, diff_d.get(), 3 * 4, cudaMemcpyDeviceToHost));
  delta += (std::chrono::high_resolution_clock::now() - start);

  float mdiff = diffs[0] * 1.e-7;
  int idiff = diffs[1];
  int sdiff = diffs[2];

  std::cout << "for degree " << DEGREE << " max diff is " << mdiff << ' ' << idiff << ' ' << int2phi(idiff) << ' '
            << sdiff << ' ' << short2phi(sdiff) << std::endl;
  std::cout << "cuda computation took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
            << std::endl;
}

int main() {
  cms::cudatest::requireDevices();

  try {
    go<3>();
    go<5>();
    go<7>();
    go<9>();
  } catch (std::runtime_error &ex) {
    std::cerr << "CUDA or std runtime error: " << ex.what() << std::endl;
    exit(EXIT_FAILURE);
  } catch (...) {
    std::cerr << "A non-CUDA error occurred" << std::endl;
    exit(EXIT_FAILURE);
  }

  return EXIT_SUCCESS;
}