CUDADevice/test/testDeviceAddition.cu

0001 #include <cstddef>
0002 #include <cstdint>
0003 #include <random>
0004 #include <vector>
0005
0006 #define CATCH_CONFIG_MAIN
0007 #include <catch.hpp>
0008
0009 #include <cuda_runtime.h>
0010
0011 #include "HeterogeneousTest/CUDADevice/interface/DeviceAddition.h"
0012 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0013 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
0014
// Test-only kernel entry point.
//
// Hands its four arguments, unchanged, to the device function
// cms::cudatest::add_vectors_f (presumably declared in the DeviceAddition.h
// header included above -- confirm there). The wrapper itself performs no
// indexing and no bounds checks; how the elements are distributed over the
// launched threads is entirely up to add_vectors_f.
//
//   in1, in2  input buffers, only read through these pointers
//   out       output buffer
//   size      forwarded as-is; the test below passes the number of floats
//             held by each buffer
__global__ void kernel_add_vectors_f(const float* __restrict__ in1, const float* __restrict__ in2,
                                     float* __restrict__ out, size_t size) {
  cms::cudatest::add_vectors_f(in1, in2, out, size);
}
0021
// Catch2 test case: fills two host vectors with gaussian random floats, adds
// them on the GPU through kernel_add_vectors_f, copies the result back and
// compares every element with the sum computed on the host.
TEST_CASE("HeterogeneousTest/CUDADevice test", "[cudaTestDeviceAddition]") {
  // NOTE(review): presumably bails out when no usable CUDA device is present;
  // see requireDevices.h for the exact behaviour.
  cms::cudatest::requireDevices();

  // random number generator with a gaussian distribution
  std::random_device rd{};
  std::default_random_engine rand{rd()};
  std::normal_distribution<float> dist{0.f, 1.f};

  // absolute tolerance used when comparing device and host sums
  constexpr float epsilon = 0.000001f;

  // buffer size, in elements and in bytes
  constexpr size_t size = 1024 * 1024;
  constexpr size_t bytes = size * sizeof(float);

  // allocate input and output host buffers; std::vector value-initialises its
  // elements, so out_h starts out filled with zeros
  std::vector<float> in1_h(size);
  std::vector<float> in2_h(size);
  std::vector<float> out_h(size);

  // fill the input buffers with random data
  for (size_t i = 0; i < size; ++i) {
    in1_h[i] = dist(rand);
    in2_h[i] = dist(rand);
  }

  SECTION("Test add_vectors_f") {
    // allocate input and output buffers on the device
    float* in1_d = nullptr;
    float* in2_d = nullptr;
    float* out_d = nullptr;
    REQUIRE_NOTHROW(cudaCheck(cudaMalloc(&in1_d, bytes)));
    REQUIRE_NOTHROW(cudaCheck(cudaMalloc(&in2_d, bytes)));
    REQUIRE_NOTHROW(cudaCheck(cudaMalloc(&out_d, bytes)));

    // copy the input data to the device
    REQUIRE_NOTHROW(cudaCheck(cudaMemcpy(in1_d, in1_h.data(), bytes, cudaMemcpyHostToDevice)));
    REQUIRE_NOTHROW(cudaCheck(cudaMemcpy(in2_d, in2_h.data(), bytes, cudaMemcpyHostToDevice)));

    // fill the output buffer with zeros
    REQUIRE_NOTHROW(cudaCheck(cudaMemset(out_d, 0, bytes)));

    // launch the 1-dimensional kernel for vector addition: 32 blocks of 32
    // threads for `size` elements, so add_vectors_f has to process more than
    // one element per thread for the final comparison to succeed
    kernel_add_vectors_f<<<32, 32>>>(in1_d, in2_d, out_d, size);
    // launch-configuration errors are reported here ...
    REQUIRE_NOTHROW(cudaCheck(cudaGetLastError()));
    // ... while errors raised during kernel execution are reported by the
    // synchronisation, before the results are touched
    REQUIRE_NOTHROW(cudaCheck(cudaDeviceSynchronize()));

    // copy the results from the device to the host
    REQUIRE_NOTHROW(cudaCheck(cudaMemcpy(out_h.data(), out_d, bytes, cudaMemcpyDeviceToHost)));

    // release the device buffers; the results now live in out_h.
    // A failing REQUIRE above leaves the section early, so this is the
    // success-path cleanup only.
    REQUIRE_NOTHROW(cudaCheck(cudaFree(in1_d)));
    REQUIRE_NOTHROW(cudaCheck(cudaFree(in2_d)));
    REQUIRE_NOTHROW(cudaCheck(cudaFree(out_d)));

    // check the results against the host-side reference
    for (size_t i = 0; i < size; ++i) {
      float sum = in1_h[i] + in2_h[i];
      CHECK_THAT(out_h[i], Catch::Matchers::WithinAbs(sum, epsilon));
    }
  }
}