#ifndef HeterogeneousCore_CUDACore_ESProduct_h
#define HeterogeneousCore_CUDACore_ESProduct_h

#include <atomic>
#include <cassert>
#include <mutex>
#include <vector>

#include "FWCore/Utilities/interface/thread_safety_macros.h"
#include "HeterogeneousCore/CUDAServices/interface/numberOfDevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
#include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
#include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h"

namespace cms {
  namespace cuda {
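    // ESProduct<T> holds one copy of the EventSetup data T per CUDA
    // device. The data is transferred on demand the first time it is
    // requested on a given device; concurrent requests are coordinated
    // with a per-device mutex, an atomic "filled" flag, and a CUDA event.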
    template <typename T>
    class ESProduct {
    public:
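      // The constructor acquires one CUDA event per device from the event
      // cache; the event is later used to signal when the transfer of the
      // data to that device has completed. ScopedSetDevice restores the
      // current device automatically when it goes out of scope.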
      ESProduct() : gpuDataPerDevice_(numberOfDevices()) {
        if (not gpuDataPerDevice_.empty()) {
          cms::cuda::ScopedSetDevice scopedDevice;
          for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) {
            scopedDevice.set(i);
            gpuDataPerDevice_[i].m_event = getEventCache().get();
          }
        }
      }

      ~ESProduct() = default;

      // transferAsync should be a function of (T&, cudaStream_t) that
      // enqueues the asynchronous transfers (and possibly kernels as
      // well) to the given CUDA stream.
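      //
      // A minimal usage sketch (the payload type DeviceData, its members,
      // and the host-side copy hostData are hypothetical; the real callers
      // live in the ESProducers elsewhere in CMSSW):
      //
      //   cms::cuda::ESProduct<DeviceData> product;
      //   ...
      //   DeviceData const& d = product.dataForCurrentDeviceAsync(
      //       stream, [&hostData](DeviceData& data, cudaStream_t stream) {
      //         cudaCheck(cudaMemcpyAsync(
      //             data.buffer, hostData.buffer, hostData.size, cudaMemcpyHostToDevice, stream));
      //       });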
      template <typename F>
      const T& dataForCurrentDeviceAsync(cudaStream_t cudaStream, F transferAsync) const {
        int device = currentDevice();
        auto& data = gpuDataPerDevice_[device];

        // If the GPU data has already been filled, it can be returned
        // immediately; otherwise this thread may need to fill it
        if (not data.m_filled.load()) {
          // The data is not yet filled, so take the lock
          std::scoped_lock<std::mutex> lk{data.m_mutex};

          if (data.m_filled.load()) {
            // Another thread marked the data as filled while we were
            // acquiring the mutex, so we're free to return it
            return data.m_data;
          }

          if (data.m_fillingStream != nullptr) {
            // Another thread is already filling the data

            // Check first whether the event recorded for the transfer has occurred
            if (eventWorkHasCompleted(data.m_event.get())) {
              // It has, so the data is accessible from all CUDA streams on
              // the device. Set the 'filled' flag for all subsequent calls
              // and return the value
              auto should_be_false = data.m_filled.exchange(true);
              assert(not should_be_false);
              data.m_fillingStream = nullptr;
            } else if (data.m_fillingStream != cudaStream) {
              // The filling is still ongoing. For a different CUDA stream,
              // add a wait on that stream and return the value. Subsequent
              // work queued on the stream will wait for the event to
              // occur (i.e. for the transfer to finish).
              cudaCheck(cudaStreamWaitEvent(cudaStream, data.m_event.get(), 0),
                        "Failed to make a stream wait for an event");
            }
            // else: the filling is still ongoing, but on this same CUDA
            // stream (which would be a bit strange, but is fine), so we
            // can just return, as all subsequent work should be enqueued
            // to the same CUDA stream (or the stream will be explicitly
            // synchronized by the caller)
          } else {
            // Now we can be sure that the data is not yet on the GPU and
            // that this thread is the first to try to fill it.
            transferAsync(data.m_data, cudaStream);
            assert(data.m_fillingStream == nullptr);
            data.m_fillingStream = cudaStream;
            // Record an event in the cudaStream to mark the readiness of the
            // EventSetup data on the GPU, so that other streams can check for it
            cudaCheck(cudaEventRecord(data.m_event.get(), cudaStream));
            // Now that the filling has been enqueued to the cudaStream, the
            // GPU data can be returned immediately: all subsequent work must
            // either be enqueued to the same cudaStream, or the caller must
            // synchronize the cudaStream explicitly
          }
        }

        return data.m_data;
      }

    private:
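      // Per-device state: m_mutex protects m_event, m_fillingStream and
      // m_data, while the atomic m_filled provides a lock-free fast path
      // once the transfer has completed.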
      struct Item {
        mutable std::mutex m_mutex;
        CMS_THREAD_GUARD(m_mutex) mutable SharedEventPtr m_event;
        // non-null if some thread is already filling the data
        // (cudaStream_t is just a pointer)
        CMS_THREAD_GUARD(m_mutex) mutable cudaStream_t m_fillingStream = nullptr;
        mutable std::atomic<bool> m_filled = false;  // quick check whether the data has already been filled
        CMS_THREAD_GUARD(m_mutex) mutable T m_data;
      };

      std::vector<Item> gpuDataPerDevice_;
    };
  }  // namespace cuda
}  // namespace cms

#endif  // HeterogeneousCore_CUDACore_ESProduct_h