CUDAUtilities/src/EventCache.cc

0001 #include "FWCore/Utilities/interface/thread_safety_macros.h"
0002 #include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
0003 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0004 #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
0005 #include "HeterogeneousCore/CUDAUtilities/interface/deviceCount.h"
0006 #include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h"
0007 #include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h"
0008
0009 namespace cms::cuda {
0010   void EventCache::Deleter::operator()(cudaEvent_t event) const {
0011     if (device_ != -1) {
0012       ScopedSetDevice deviceGuard{device_};
0013       cudaCheck(cudaEventDestroy(event));
0014     }
0015   }
0016
0017   // EventCache should be constructed by the first call to
0018   // getEventCache() only if we have CUDA devices present
0019   EventCache::EventCache() : cache_(deviceCount()) {}
0020
0021   SharedEventPtr EventCache::get() {
0022     const auto dev = currentDevice();
0023     auto event = makeOrGet(dev);
0024     // captured work has completed, or a just-created event
0025     if (eventWorkHasCompleted(event.get())) {
0026       return event;
0027     }
0028
0029     // Got an event with incomplete captured work. Try again until we
0030     // get a completed (or a just-created) event. Need to keep all
0031     // incomplete events until a completed event is found in order to
0032     // avoid ping-pong with an incomplete event.
0033     std::vector<SharedEventPtr> ptrs{std::move(event)};
0034     bool completed;
0035     do {
0036       event = makeOrGet(dev);
0037       completed = eventWorkHasCompleted(event.get());
0038       if (not completed) {
0039         ptrs.emplace_back(std::move(event));
0040       }
0041     } while (not completed);
0042     return event;
0043   }
0044
0045   SharedEventPtr EventCache::makeOrGet(int dev) {
0046     return cache_[dev].makeOrGet([dev]() {
0047       cudaEvent_t event;
0048       // it should be a bit faster to ignore timings
0049       // cudaEventBlockingSync is needed to let the thread calling
0050       // cudaEventSynchronize() to sleep instead of spinning the CPU
0051       cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming | cudaEventBlockingSync));
0052       return std::unique_ptr<BareEvent, Deleter>(event, Deleter{dev});
0053     });
0054   }
0055
0056   void EventCache::clear() {
0057     // Reset the contents of the caches, but leave an
0058     // edm::ReusableObjectHolder alive for each device. This is needed
0059     // mostly for the unit tests, where the function-static
0060     // EventCache lives through multiple tests (and go through
0061     // multiple shutdowns of the framework).
0062     cache_.clear();
0063     cache_.resize(deviceCount());
0064   }
0065
0066   EventCache& getEventCache() {
0067     // the public interface is thread safe
0068     CMS_THREAD_SAFE static EventCache cache;
0069     return cache;
0070   }
0071 }  // namespace cms::cuda