Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2021-03-09 04:50:20

0001 #include "catch.hpp"
0002 
0003 #include "CUDADataFormats/Common/interface/Product.h"
0004 #include "FWCore/Concurrency/interface/WaitingTask.h"
0005 #include "FWCore/ParameterSet/interface/ParameterSet.h"
0006 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
0007 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
0008 #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
0009 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0010 #include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h"
0011 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
0012 #include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
0013 #include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
0014 #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
0015 #include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h"
0016 
0017 #include "test_ScopedContextKernels.h"
0018 
0019 namespace cms::cudatest {
0020   class TestScopedContext {
0021   public:
0022     static cuda::ScopedContextProduce make(int dev, bool createEvent) {
0023       cms::cuda::SharedEventPtr event;
0024       if (createEvent) {
0025         event = cms::cuda::getEventCache().get();
0026       }
0027       return cuda::ScopedContextProduce(dev, cms::cuda::getStreamCache().get(), std::move(event));
0028     }
0029   };
0030 }  // namespace cms::cudatest
0031 
0032 namespace {
0033   std::unique_ptr<cms::cuda::Product<int*>> produce(int device, int* d, int* h) {
0034     auto ctx = cms::cudatest::TestScopedContext::make(device, true);
0035     cudaCheck(cudaMemcpyAsync(d, h, sizeof(int), cudaMemcpyHostToDevice, ctx.stream()));
0036     cms::cudatest::testScopedContextKernels_single(d, ctx.stream());
0037     return ctx.wrap(d);
0038   }
0039 }  // namespace
0040 
0041 TEST_CASE("Use of cms::cuda::ScopedContext", "[CUDACore]") {
0042   if (not cms::cudatest::testDevices()) {
0043     return;
0044   }
0045 
0046   constexpr int defaultDevice = 0;
0047   {
0048     auto ctx = cms::cudatest::TestScopedContext::make(defaultDevice, true);
0049 
0050     SECTION("Construct from device ID") { REQUIRE(cms::cuda::currentDevice() == defaultDevice); }
0051 
0052     SECTION("Wrap T to cms::cuda::Product<T>") {
0053       std::unique_ptr<cms::cuda::Product<int>> dataPtr = ctx.wrap(10);
0054       REQUIRE(dataPtr.get() != nullptr);
0055       REQUIRE(dataPtr->device() == ctx.device());
0056       REQUIRE(dataPtr->stream() == ctx.stream());
0057     }
0058 
0059     SECTION("Construct from from cms::cuda::Product<T>") {
0060       std::unique_ptr<cms::cuda::Product<int>> dataPtr = ctx.wrap(10);
0061       const auto& data = *dataPtr;
0062 
0063       cms::cuda::ScopedContextProduce ctx2{data};
0064       REQUIRE(cms::cuda::currentDevice() == data.device());
0065       REQUIRE(ctx2.stream() == data.stream());
0066 
0067       // Second use of a product should lead to new stream
0068       cms::cuda::ScopedContextProduce ctx3{data};
0069       REQUIRE(cms::cuda::currentDevice() == data.device());
0070       REQUIRE(ctx3.stream() != data.stream());
0071     }
0072 
0073     SECTION("Storing state in cms::cuda::ContextState") {
0074       cms::cuda::ContextState ctxstate;
0075       {  // acquire
0076         std::unique_ptr<cms::cuda::Product<int>> dataPtr = ctx.wrap(10);
0077         const auto& data = *dataPtr;
0078         tbb::task_group group;
0079         edm::WaitingTaskWithArenaHolder dummy{group, edm::make_waiting_task([](std::exception_ptr const* iPtr) {})};
0080         cms::cuda::ScopedContextAcquire ctx2{data, std::move(dummy), ctxstate};
0081       }
0082 
0083       {  // produce
0084         cms::cuda::ScopedContextProduce ctx2{ctxstate};
0085         REQUIRE(cms::cuda::currentDevice() == ctx.device());
0086         REQUIRE(ctx2.stream() == ctx.stream());
0087       }
0088     }
0089 
0090     SECTION("Joining multiple CUDA streams") {
0091       cms::cuda::ScopedSetDevice setDeviceForThisScope(defaultDevice);
0092 
0093       // Mimick a producer on the first CUDA stream
0094       int h_a1 = 1;
0095       auto d_a1 = cms::cuda::make_device_unique<int>(nullptr);
0096       auto wprod1 = produce(defaultDevice, d_a1.get(), &h_a1);
0097 
0098       // Mimick a producer on the second CUDA stream
0099       int h_a2 = 2;
0100       auto d_a2 = cms::cuda::make_device_unique<int>(nullptr);
0101       auto wprod2 = produce(defaultDevice, d_a2.get(), &h_a2);
0102 
0103       REQUIRE(wprod1->stream() != wprod2->stream());
0104 
0105       // Mimick a third producer "joining" the two streams
0106       cms::cuda::ScopedContextProduce ctx2{*wprod1};
0107 
0108       auto prod1 = ctx2.get(*wprod1);
0109       auto prod2 = ctx2.get(*wprod2);
0110 
0111       auto d_a3 = cms::cuda::make_device_unique<int>(nullptr);
0112       cms::cudatest::testScopedContextKernels_join(prod1, prod2, d_a3.get(), ctx2.stream());
0113       cudaCheck(cudaStreamSynchronize(ctx2.stream()));
0114       REQUIRE(wprod2->isAvailable());
0115       REQUIRE(cms::cuda::eventWorkHasCompleted(wprod2->event()));
0116 
0117       h_a1 = 0;
0118       h_a2 = 0;
0119       int h_a3 = 0;
0120 
0121       cudaCheck(cudaMemcpyAsync(&h_a1, d_a1.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
0122       cudaCheck(cudaMemcpyAsync(&h_a2, d_a2.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
0123       cudaCheck(cudaMemcpyAsync(&h_a3, d_a3.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
0124 
0125       REQUIRE(h_a1 == 2);
0126       REQUIRE(h_a2 == 4);
0127       REQUIRE(h_a3 == 6);
0128     }
0129   }
0130 
0131   cudaCheck(cudaSetDevice(defaultDevice));
0132   cudaCheck(cudaDeviceSynchronize());
0133   // Note: CUDA resources are cleaned up by the destructors of the global cache objects
0134 }