CUDAOpaque/src/DeviceAdditionOpaque.cc

0001 #include <cstddef>
0002
0003 #include <cuda_runtime.h>
0004
0005 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0006 #include "HeterogeneousTest/CUDAOpaque/interface/DeviceAdditionOpaque.h"
0007 #include "HeterogeneousTest/CUDAWrapper/interface/DeviceAdditionWrapper.h"
0008
0009 namespace cms::cudatest {
0010
0011   void opaque_add_vectors_f(const float* in1_h, const float* in2_h, float* out_h, size_t size) {
0012     // allocate input and output buffers on the device
0013     float* in1_d;
0014     float* in2_d;
0015     float* out_d;
0016     cudaCheck(cudaMalloc(&in1_d, size * sizeof(float)));
0017     cudaCheck(cudaMalloc(&in2_d, size * sizeof(float)));
0018     cudaCheck(cudaMalloc(&out_d, size * sizeof(float)));
0019
0020     // copy the input data to the device
0021     cudaCheck(cudaMemcpy(in1_d, in1_h, size * sizeof(float), cudaMemcpyHostToDevice));
0022     cudaCheck(cudaMemcpy(in2_d, in2_h, size * sizeof(float), cudaMemcpyHostToDevice));
0023
0024     // fill the output buffer with zeros
0025     cudaCheck(cudaMemset(out_d, 0, size * sizeof(float)));
0026
0027     // launch the 1-dimensional kernel for vector addition
0028     wrapper_add_vectors_f(in1_d, in2_d, out_d, size);
0029
0030     // copy the results from the device to the host
0031     cudaCheck(cudaMemcpy(out_h, out_d, size * sizeof(float), cudaMemcpyDeviceToHost));
0032
0033     // wait for all the operations to complete
0034     cudaCheck(cudaDeviceSynchronize());
0035
0036     // free the input and output buffers on the device
0037     cudaCheck(cudaFree(in1_d));
0038     cudaCheck(cudaFree(in2_d));
0039     cudaCheck(cudaFree(out_d));
0040   }
0041
0042   void opaque_add_vectors_d(const double* in1_h, const double* in2_h, double* out_h, size_t size) {
0043     // allocate input and output buffers on the device
0044     double* in1_d;
0045     double* in2_d;
0046     double* out_d;
0047     cudaCheck(cudaMalloc(&in1_d, size * sizeof(double)));
0048     cudaCheck(cudaMalloc(&in2_d, size * sizeof(double)));
0049     cudaCheck(cudaMalloc(&out_d, size * sizeof(double)));
0050
0051     // copy the input data to the device
0052     cudaCheck(cudaMemcpy(in1_d, in1_h, size * sizeof(double), cudaMemcpyHostToDevice));
0053     cudaCheck(cudaMemcpy(in2_d, in2_h, size * sizeof(double), cudaMemcpyHostToDevice));
0054
0055     // fill the output buffer with zeros
0056     cudaCheck(cudaMemset(out_d, 0, size * sizeof(double)));
0057
0058     // launch the 1-dimensional kernel for vector addition
0059     wrapper_add_vectors_d(in1_d, in2_d, out_d, size);
0060
0061     // copy the results from the device to the host
0062     cudaCheck(cudaMemcpy(out_h, out_d, size * sizeof(double), cudaMemcpyDeviceToHost));
0063
0064     // wait for all the operations to complete
0065     cudaCheck(cudaDeviceSynchronize());
0066
0067     // free the input and output buffers on the device
0068     cudaCheck(cudaFree(in1_d));
0069     cudaCheck(cudaFree(in2_d));
0070     cudaCheck(cudaFree(out_d));
0071   }
0072
0073 }  // namespace cms::cudatest