#ifndef HeterogeneousCore_CUDAUtilities_src_CachingDeviceAllocator_h
#define HeterogeneousCore_CUDAUtilities_src_CachingDeviceAllocator_h

/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * Forked to CMSSW by Matti Kortelainen
 */

/******************************************************************************
 * Simple caching allocator for device memory allocations. The allocator is
 * thread-safe and capable of managing device allocations on multiple devices.
 ******************************************************************************/

#include <cmath>
#include <map>
#include <set>
#include <mutex>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/deviceAllocatorStatus.h"

/// CUB namespace (the CMSSW fork lives in \p notcub to avoid clashing with CUB proper)
namespace notcub {

  /**
 * \addtogroup UtilMgmt
 * @{
 */

  /******************************************************************************
 * CachingDeviceAllocator (host use)
 ******************************************************************************/

  /**
 * \brief A simple caching allocator for device memory allocations.
 *
 * \par Overview
 * The allocator is thread-safe and stream-safe and is capable of managing cached
 * device allocations on multiple devices.  It behaves as follows:
 *
 * \par
 * - Allocations from the allocator are associated with an \p active_stream.  Once freed,
 *   the allocation becomes available immediately for reuse within the \p active_stream
 *   with which it was associated during allocation, and it becomes available for
 *   reuse within other streams when all prior work submitted to \p active_stream has completed.
 * - Allocations are categorized and cached by bin size.  A new allocation request of
 *   a given size will only consider cached allocations within the corresponding bin.
 * - Bin limits progress geometrically in accordance with the growth factor
 *   \p bin_growth provided during construction.  Unused device allocations within
 *   a larger bin cache are not reused for allocation requests that categorize to
 *   smaller bin sizes.
 * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
 *   (\p bin_growth ^ \p min_bin).
 * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
 *   bin and are simply freed when they are deallocated instead of being returned
 *   to a bin-cache.
 * - If the total storage of cached allocations on a given device would exceed
 *   \p max_cached_bytes, allocations for that device are simply freed when they are
 *   deallocated instead of being returned to their bin-cache.
 *
 * \par
 * For example, the default-constructed CachingDeviceAllocator is configured with:
 * - \p bin_growth          = 8
 * - \p min_bin             = 3
 * - \p max_bin             = 7
 * - \p max_cached_bytes    = 6MB - 1B
 *
 * \par
 * which delineates five bin-sizes (512B, 4KB, 32KB, 256KB, and 2MB)
 * and sets a maximum of 6,291,455 cached bytes per device.
 */
  struct CachingDeviceAllocator {
    //---------------------------------------------------------------------
    // Constants
    //---------------------------------------------------------------------

    /// Out-of-bounds bin
    static const unsigned int INVALID_BIN = (unsigned int)-1;

    /// Invalid size
    static const size_t INVALID_SIZE = (size_t)-1;

#ifndef DOXYGEN_SHOULD_SKIP_THIS  // Do not document

    /// Invalid device ordinal
    static const int INVALID_DEVICE_ORDINAL = -1;

    //---------------------------------------------------------------------
    // Type definitions and helper types
    //---------------------------------------------------------------------

    /**
     * Descriptor for device memory allocations
     */
    struct BlockDescriptor {
      void *d_ptr;                     // Device pointer
      size_t bytes;                    // Size of allocation in bytes
      size_t bytesRequested;           // CMS: requested allocation size (for monitoring only)
      unsigned int bin;                // Bin enumeration
      int device;                      // device ordinal
      cudaStream_t associated_stream;  // Associated stream
      cudaEvent_t ready_event;  // Signal when associated stream has run to the point at which this block was freed

      // Constructor (suitable for searching maps for a specific block, given its pointer and device)
      BlockDescriptor(void *d_ptr, int device)
          : d_ptr(d_ptr),
            bytes(0),
            bytesRequested(0),  // CMS
            bin(INVALID_BIN),
            device(device),
            associated_stream(nullptr),
            ready_event(nullptr) {}

      // Constructor (suitable for searching maps for a range of suitable blocks, given a device)
      BlockDescriptor(int device)
          : d_ptr(nullptr),
            bytes(0),
            bytesRequested(0),  // CMS
            bin(INVALID_BIN),
            device(device),
            associated_stream(nullptr),
            ready_event(nullptr) {}

      // Comparison functor for comparing device pointers
      static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) {
        if (a.device == b.device)
          return (a.d_ptr < b.d_ptr);
        else
          return (a.device < b.device);
      }

      // Comparison functor for comparing allocation sizes
      static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) {
        if (a.device == b.device)
          return (a.bytes < b.bytes);
        else
          return (a.device < b.device);
      }
    };

    /// BlockDescriptor comparator function interface
    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);

    // CMS: Moved TotalBytes to deviceAllocatorStatus.h

    /// Set type for cached blocks (ordered by size)
    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;

    /// Set type for live blocks (ordered by ptr)
    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;

    /// Map type of device ordinals to the number of bytes cached by each device
    // CMS: Moved definition to deviceAllocatorStatus.h
    using GpuCachedBytes = cms::cuda::allocator::GpuCachedBytes;

    //---------------------------------------------------------------------
    // Utility functions
    //---------------------------------------------------------------------

    /**
     * Integer pow function for unsigned base and exponent
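     *
     * Computes base^exp by exponentiation-by-squaring in O(log exp) steps.
     * A worked example (illustrative only):
     * \code
     * IntPow(8, 3);  // == 512, the smallest default bin
     * IntPow(8, 7);  // == 2097152, the largest default bin (2MB)
     * \endcode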
     */
    static unsigned int IntPow(unsigned int base, unsigned int exp) {
      unsigned int retval = 1;
      while (exp > 0) {
        if (exp & 1) {
          retval = retval * base;  // multiply the result by the current base
        }
        base = base * base;  // square the base
        exp = exp >> 1;      // divide the exponent in half
      }
      return retval;
    }

    /**
     * Round up to the nearest power of the given base, returning the exponent and the rounded size
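     *
     * A worked example (illustrative only): rounding 300 bytes up in base 8
     * \code
     * unsigned int power;
     * size_t rounded_bytes;
     * NearestPowerOf(power, rounded_bytes, 8, 300);  // power == 3, rounded_bytes == 512
     * \endcode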
     */
    void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value) {
      power = 0;
      rounded_bytes = 1;

      if (value * base < value) {
        // Overflow
        power = sizeof(size_t) * 8;
        rounded_bytes = size_t(0) - 1;
        return;
      }

      while (rounded_bytes < value) {
        rounded_bytes *= base;
        power++;
      }
    }

    //---------------------------------------------------------------------
    // Fields
    //---------------------------------------------------------------------

    // CMS: use std::mutex instead of cub::Mutex, declare mutable
    mutable std::mutex mutex;  /// Mutex for thread-safety

    unsigned int bin_growth;  /// Geometric growth factor for bin-sizes
    unsigned int min_bin;     /// Minimum bin enumeration
    unsigned int max_bin;     /// Maximum bin enumeration

    size_t min_bin_bytes;     /// Minimum bin size
    size_t max_bin_bytes;     /// Maximum bin size
    size_t max_cached_bytes;  /// Maximum aggregate cached bytes per device

    const bool
        skip_cleanup;  /// Whether or not to skip a call to FreeAllCached() when destructor is called.  (The CUDA runtime may have already shut down for statically declared allocators)
    bool debug;        /// Whether or not to print (de)allocation events to stdout

    GpuCachedBytes cached_bytes;  /// Map of device ordinal to aggregate cached bytes on that device
    CachedBlocks cached_blocks;   /// Set of cached device allocations available for reuse
    BusyBlocks live_blocks;       /// Set of live device allocations currently in use

#endif  // DOXYGEN_SHOULD_SKIP_THIS

    //---------------------------------------------------------------------
    // Methods
    //---------------------------------------------------------------------

    /**
     * \brief Constructor.
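     *
     * A configuration sketch (illustrative only): bins growing by powers of 4
     * from 4B (4^1) to 4MB (4^11), caching at most 32MB per device:
     * \code
     * notcub::CachingDeviceAllocator allocator(4, 1, 11, 32 * 1024 * 1024);
     * \endcode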
     */
    CachingDeviceAllocator(
        unsigned int bin_growth,                 ///< Geometric growth factor for bin-sizes
        unsigned int min_bin = 1,                ///< Minimum bin (default is bin_growth ^ 1)
        unsigned int max_bin = INVALID_BIN,      ///< Maximum bin (default is no max bin)
        size_t max_cached_bytes = INVALID_SIZE,  ///< Maximum aggregate cached bytes per device (default is no limit)
        bool skip_cleanup =
            false,  ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate)
        bool debug = false)  ///< Whether or not to print (de)allocation events to stdout (default is no stdout output)
        : bin_growth(bin_growth),
          min_bin(min_bin),
          max_bin(max_bin),
          min_bin_bytes(IntPow(bin_growth, min_bin)),
          max_bin_bytes(IntPow(bin_growth, max_bin)),
          max_cached_bytes(max_cached_bytes),
          skip_cleanup(skip_cleanup),
          debug(debug),
          cached_blocks(BlockDescriptor::SizeCompare),
          live_blocks(BlockDescriptor::PtrCompare) {}

    /**
     * \brief Default constructor.
     *
     * Configured with:
     * \par
     * - \p bin_growth          = 8
     * - \p min_bin             = 3
     * - \p max_bin             = 7
     * - \p max_cached_bytes    = ((\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes
     *
     * which delineates five bin-sizes (512B, 4KB, 32KB, 256KB, and 2MB) and
     * sets a maximum of 6,291,455 cached bytes per device.
     */
    CachingDeviceAllocator(bool skip_cleanup = false, bool debug = false)
        : bin_growth(8),
          min_bin(3),
          max_bin(7),
          min_bin_bytes(IntPow(bin_growth, min_bin)),
          max_bin_bytes(IntPow(bin_growth, max_bin)),
          max_cached_bytes((max_bin_bytes * 3) - 1),
          skip_cleanup(skip_cleanup),
          debug(debug),
          cached_blocks(BlockDescriptor::SizeCompare),
          live_blocks(BlockDescriptor::PtrCompare) {}

    /**
     * \brief Sets the limit on the number of bytes this allocator is allowed to cache per device.
     *
     * Changing the ceiling of cached bytes does not cause any allocations (in-use or
     * cached-in-reserve) to be freed.  See \p FreeAllCached().
     */
    cudaError_t SetMaxCachedBytes(size_t max_cached_bytes) {
      // Lock
      // CMS: use RAII instead of (un)locking explicitly
      std::unique_lock mutex_locker(mutex);

      if (debug)
        // CMS: use raw printf
        printf("Changing max_cached_bytes (%lld -> %lld)\n",
               (long long)this->max_cached_bytes,
               (long long)max_cached_bytes);

      this->max_cached_bytes = max_cached_bytes;

      // Unlock (redundant, kept for style uniformity)
      mutex_locker.unlock();

      return cudaSuccess;
    }

    /**
     * \brief Provides a suitable allocation of device memory for the given size on the specified device.
     *
     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
     * with which it was associated during allocation, and it becomes available for reuse within other
     * streams when all prior work submitted to \p active_stream has completed.
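     *
     * \par
     * A minimal sketch (illustrative only; assumes an \p allocator instance and
     * a \p stream created on device 1):
     * \code
     * void *d_buf = nullptr;
     * // Allocate at least 1MB on device 1, associated with `stream`;
     * // the request lands in the 2MB bin (8^7 bytes).
     * cudaCheck(allocator.DeviceAllocate(1, &d_buf, 1 << 20, stream));
     * // ... launch kernels using d_buf on `stream` ...
     * cudaCheck(allocator.DeviceFree(1, d_buf));
     * \endcode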
     */
    cudaError_t DeviceAllocate(
        int device,                            ///< [in] Device on which to place the allocation
        void **d_ptr,                          ///< [out] Reference to pointer to the allocation
        size_t bytes,                          ///< [in] Minimum number of bytes for the allocation
        cudaStream_t active_stream = nullptr)  ///< [in] The stream to be associated with this allocation
    {
      // CMS: use RAII instead of (un)locking explicitly
      std::unique_lock<std::mutex> mutex_locker(mutex, std::defer_lock);
      *d_ptr = nullptr;
      int entrypoint_device = INVALID_DEVICE_ORDINAL;
      cudaError_t error = cudaSuccess;

      if (device == INVALID_DEVICE_ORDINAL) {
        // CMS: throw exception on error
        cudaCheck(error = cudaGetDevice(&entrypoint_device));
        device = entrypoint_device;
      }

      // Create a block descriptor for the requested allocation
      bool found = false;
      BlockDescriptor search_key(device);
      search_key.bytesRequested = bytes;  // CMS
      search_key.associated_stream = active_stream;
      NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);

      if (search_key.bin > max_bin) {
        // Bin is greater than our maximum bin: allocate the request
        // exactly and give out-of-bounds bin.  It will not be cached
        // for reuse when returned.
        search_key.bin = INVALID_BIN;
        search_key.bytes = bytes;
      } else {
        // Search for a suitable cached allocation: lock
        mutex_locker.lock();

        if (search_key.bin < min_bin) {
          // Bin is less than minimum bin: round up
          search_key.bin = min_bin;
          search_key.bytes = min_bin_bytes;
        }

        // Iterate through the range of cached blocks on the same device in the same bin
        CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
        while ((block_itr != cached_blocks.end()) && (block_itr->device == device) &&
               (block_itr->bin == search_key.bin)) {
          // To prevent races with reusing blocks returned by the host but still
          // in use by the device, only consider cached blocks that are
          // either (from the active stream) or (from an idle stream)
          if ((active_stream == block_itr->associated_stream) ||
              (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) {
            // Reuse existing cache block.  Insert into live blocks.
            found = true;
            search_key = *block_itr;
            search_key.associated_stream = active_stream;
            live_blocks.insert(search_key);

            // Remove from free blocks
            cached_bytes[device].free -= search_key.bytes;
            cached_bytes[device].live += search_key.bytes;
            cached_bytes[device].liveRequested += search_key.bytesRequested;  // CMS

            if (debug)
              // CMS: improved debug message
              // CMS: use raw printf
              printf(
                  "\tDevice %d reused cached block at %p (%lld bytes) for stream %lld, event %lld (previously "
                  "associated with stream %lld, event %lld).\n",
                  device,
                  search_key.d_ptr,
                  (long long)search_key.bytes,
                  (long long)search_key.associated_stream,
                  (long long)search_key.ready_event,
                  (long long)block_itr->associated_stream,
                  (long long)block_itr->ready_event);

            cached_blocks.erase(block_itr);

            break;
          }
          block_itr++;
        }

        // Done searching: unlock
        mutex_locker.unlock();
      }

      // Allocate the block if necessary
      if (!found) {
        // Set runtime's current device to specified device (entrypoint may not be set)
        if (device != entrypoint_device) {
          // CMS: throw exception on error
          cudaCheck(error = cudaGetDevice(&entrypoint_device));
          cudaCheck(error = cudaSetDevice(device));
        }

        // Attempt to allocate
        // CMS: silently ignore errors and retry or pass them to the caller
        if ((error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) {
          // The allocation attempt failed: free all cached blocks on device and retry
          if (debug)
            // CMS: use raw printf
            printf(
                "\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations\n",
                device,
                (long long)search_key.bytes,
                (long long)search_key.associated_stream);

          error = cudaSuccess;  // Reset the error we will return
          cudaGetLastError();   // Reset CUDART's error

          // Lock
          mutex_locker.lock();

          // Iterate the range of free blocks on the same device
          BlockDescriptor free_key(device);
          CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key);

          while ((block_itr != cached_blocks.end()) && (block_itr->device == device)) {
            // No need to worry about synchronization with the device: cudaFree is
            // blocking and will synchronize across all kernels executing
            // on the current device

            // Free device memory and destroy stream event.
            // CMS: silently ignore errors and pass them to the caller
            if ((error = cudaFree(block_itr->d_ptr)))
              break;
            if ((error = cudaEventDestroy(block_itr->ready_event)))
              break;

            // Reduce balance and erase entry
            cached_bytes[device].free -= block_itr->bytes;

            if (debug)
              // CMS: use raw printf
              printf(
                  "\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks "
                  "(%lld bytes) outstanding.\n",
                  device,
                  (long long)block_itr->bytes,
                  (long long)cached_blocks.size(),
                  (long long)cached_bytes[device].free,
                  (long long)live_blocks.size(),
                  (long long)cached_bytes[device].live);

            // Erase the entry and advance via the iterator returned by multiset::erase;
            // incrementing the erased iterator would be undefined behavior
            block_itr = cached_blocks.erase(block_itr);
          }

          // Unlock
          mutex_locker.unlock();

          // Return under error
          if (error)
            return error;

          // Try to allocate again
          // CMS: throw exception on error
          cudaCheck(error = cudaMalloc(&search_key.d_ptr, search_key.bytes));
        }

        // Create ready event
        // CMS: throw exception on error
        cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));

        // Insert into live blocks
        mutex_locker.lock();
        live_blocks.insert(search_key);
        cached_bytes[device].live += search_key.bytes;
        cached_bytes[device].liveRequested += search_key.bytesRequested;  // CMS
        mutex_locker.unlock();

        if (debug)
          // CMS: improved debug message
          // CMS: use raw printf
          printf("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld, event %lld).\n",
                 device,
                 search_key.d_ptr,
                 (long long)search_key.bytes,
                 (long long)search_key.associated_stream,
                 (long long)search_key.ready_event);

        // Attempt to revert back to previous device if necessary
        if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) {
          // CMS: throw exception on error
          cudaCheck(error = cudaSetDevice(entrypoint_device));
        }
      }

      // Copy device pointer to output parameter
      *d_ptr = search_key.d_ptr;

      if (debug)
        // CMS: use raw printf
        printf("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding (%lld bytes).\n",
               (long long)cached_blocks.size(),
               (long long)cached_bytes[device].free,
               (long long)live_blocks.size(),
               (long long)cached_bytes[device].live);

      return error;
    }

    /**
     * \brief Provides a suitable allocation of device memory for the given size on the current device.
     *
     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
     * with which it was associated during allocation, and it becomes available for reuse within other
     * streams when all prior work submitted to \p active_stream has completed.
     */
    cudaError_t DeviceAllocate(
        void **d_ptr,                          ///< [out] Reference to pointer to the allocation
        size_t bytes,                          ///< [in] Minimum number of bytes for the allocation
        cudaStream_t active_stream = nullptr)  ///< [in] The stream to be associated with this allocation
    {
      return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream);
    }

    /**
     * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator.
     *
     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
     * with which it was associated during allocation, and it becomes available for reuse within other
     * streams when all prior work submitted to \p active_stream has completed.
     */
    cudaError_t DeviceFree(int device, void *d_ptr) {
      int entrypoint_device = INVALID_DEVICE_ORDINAL;
      cudaError_t error = cudaSuccess;
      // CMS: use RAII instead of (un)locking explicitly
      std::unique_lock<std::mutex> mutex_locker(mutex, std::defer_lock);

      if (device == INVALID_DEVICE_ORDINAL) {
        // CMS: throw exception on error
        cudaCheck(error = cudaGetDevice(&entrypoint_device));
        device = entrypoint_device;
      }

      // Lock
      mutex_locker.lock();

      // Find corresponding block descriptor
      bool recached = false;
      BlockDescriptor search_key(d_ptr, device);
      BusyBlocks::iterator block_itr = live_blocks.find(search_key);
      if (block_itr != live_blocks.end()) {
        // Remove from live blocks
        search_key = *block_itr;
        live_blocks.erase(block_itr);
        cached_bytes[device].live -= search_key.bytes;
        cached_bytes[device].liveRequested -= search_key.bytesRequested;  // CMS

        // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
        if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) {
          // Insert returned allocation into free blocks
          recached = true;
          cached_blocks.insert(search_key);
          cached_bytes[device].free += search_key.bytes;

          if (debug)
            // CMS: improved debug message
            // CMS: use raw printf
            printf(
                "\tDevice %d returned %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available "
                "blocks cached (%lld bytes), %lld live blocks outstanding (%lld bytes).\n",
                device,
                (long long)search_key.bytes,
                d_ptr,
                (long long)search_key.associated_stream,
                (long long)search_key.ready_event,
                (long long)cached_blocks.size(),
                (long long)cached_bytes[device].free,
                (long long)live_blocks.size(),
                (long long)cached_bytes[device].live);
        }
      }

      // First set to specified device (entrypoint may not be set)
      if (device != entrypoint_device) {
        // CMS: throw exception on error
        cudaCheck(error = cudaGetDevice(&entrypoint_device));
        cudaCheck(error = cudaSetDevice(device));
      }

      if (recached) {
        // Insert the ready event in the associated stream (must have current device set properly)
        // CMS: throw exception on error
        cudaCheck(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream));
      }

      // Unlock
      mutex_locker.unlock();

      if (!recached) {
        // Free the allocation from the runtime and cleanup the event.
        // CMS: throw exception on error
        cudaCheck(error = cudaFree(d_ptr));
        cudaCheck(error = cudaEventDestroy(search_key.ready_event));

        if (debug)
          // CMS: improved debug message
          printf(
              "\tDevice %d freed %lld bytes at %p from associated stream %lld, event %lld.\n\t\t  %lld available "
              "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
              device,
              (long long)search_key.bytes,
              d_ptr,
              (long long)search_key.associated_stream,
              (long long)search_key.ready_event,
              (long long)cached_blocks.size(),
              (long long)cached_bytes[device].free,
              (long long)live_blocks.size(),
              (long long)cached_bytes[device].live);
      }

      // Reset device
      if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) {
        // CMS: throw exception on error
        cudaCheck(error = cudaSetDevice(entrypoint_device));
      }

      return error;
    }

    /**
     * \brief Frees a live allocation of device memory on the current device, returning it to the allocator.
     *
     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
     * with which it was associated during allocation, and it becomes available for reuse within other
     * streams when all prior work submitted to \p active_stream has completed.
     */
    cudaError_t DeviceFree(void *d_ptr) { return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr); }

    /**
     * \brief Frees all cached device allocations on all devices
     */
    cudaError_t FreeAllCached() {
      cudaError_t error = cudaSuccess;
      int entrypoint_device = INVALID_DEVICE_ORDINAL;
      int current_device = INVALID_DEVICE_ORDINAL;
      // CMS: use RAII instead of (un)locking explicitly
      std::unique_lock<std::mutex> mutex_locker(mutex);

      while (!cached_blocks.empty()) {
        // Get first block
        CachedBlocks::iterator begin = cached_blocks.begin();

        // Get entry-point device ordinal if necessary
        if (entrypoint_device == INVALID_DEVICE_ORDINAL) {
          // CMS: silently ignore errors and pass them to the caller
          if ((error = cudaGetDevice(&entrypoint_device)))
            break;
        }

        // Set current device ordinal if necessary
        if (begin->device != current_device) {
          // CMS: silently ignore errors and pass them to the caller
          if ((error = cudaSetDevice(begin->device)))
            break;
          current_device = begin->device;
        }

        // Free device memory
        // CMS: silently ignore errors and pass them to the caller
        if ((error = cudaFree(begin->d_ptr)))
          break;
        if ((error = cudaEventDestroy(begin->ready_event)))
          break;

        // Reduce balance and erase entry
        cached_bytes[current_device].free -= begin->bytes;

        if (debug)
          printf(
              "\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
              "bytes) outstanding.\n",
              current_device,
              (long long)begin->bytes,
              (long long)cached_blocks.size(),
              (long long)cached_bytes[current_device].free,
              (long long)live_blocks.size(),
              (long long)cached_bytes[current_device].live);

        cached_blocks.erase(begin);
      }

      mutex_locker.unlock();

      // Attempt to revert back to entry-point device if necessary
      if (entrypoint_device != INVALID_DEVICE_ORDINAL) {
        // CMS: throw exception on error
        cudaCheck(error = cudaSetDevice(entrypoint_device));
      }

      return error;
    }

    // CMS: give access to cache allocation status
    GpuCachedBytes CacheStatus() const {
      std::unique_lock mutex_locker(mutex);
      return cached_bytes;
    }
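
    // A monitoring sketch (illustrative only): GpuCachedBytes maps a device
    // ordinal to its byte counters (see deviceAllocatorStatus.h), so a caller
    // can inspect the cache without touching the allocator's internals:
    //
    //   for (const auto &entry : allocator.CacheStatus()) {
    //     printf("device %d: %zu bytes free, %zu bytes live\n",
    //            entry.first, entry.second.free, entry.second.live);
    //   }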

    /**
     * \brief Destructor
     */
    // CMS: make the destructor not virtual
    ~CachingDeviceAllocator() {
      if (!skip_cleanup)
        FreeAllCached();
    }
  };

  /** @} */  // end group UtilMgmt

}  // namespace notcub

#endif