#ifndef HeterogeneousCore_CUDAUtilities_src_CachingHostAllocator_h
#define HeterogeneousCore_CUDAUtilities_src_CachingHostAllocator_h

/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * Modified to cache pinned host allocations by Matti Kortelainen
 */

/******************************************************************************
 * Simple caching allocator for pinned host memory allocations. The allocator is
 * thread-safe.
 ******************************************************************************/

#include <cmath>
#include <cstdio>
#include <map>
#include <mutex>
#include <set>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

/// Namespace for a locally modified copy of the CUB caching allocator
namespace notcub {

  /**
 * \addtogroup UtilMgmt
 * @{
 */

  /******************************************************************************
 * CachingHostAllocator (host use)
 ******************************************************************************/

  /**
 * \brief A simple caching allocator for pinned host memory allocations.
 *
 * \par Overview
 * The allocator is thread-safe.  It behaves as follows:
 *
 * The CUDA stream-safety of the original device allocator is presumably not
 * useful here, because reading from or writing to pinned host memory requires
 * synchronization anyway. The difference with respect to device memory is
 * that all operations on device memory are scheduled from the CPU via a CUDA
 * stream, while host memory can be operated on directly.
 *
 * \par
 * - Allocations are categorized and cached by bin size.  A new allocation request of
 *   a given size will only consider cached allocations within the corresponding bin.
 * - Bin limits progress geometrically in accordance with the growth factor
 *   \p bin_growth provided during construction.  Unused host allocations within
 *   a larger bin cache are not reused for allocation requests that categorize to
 *   smaller bin sizes.
 * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
 *   (\p bin_growth ^ \p min_bin).
 * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
 *   bin and are simply freed when they are deallocated instead of being returned
 *   to a bin-cache.
 * - %If the total storage of cached allocations would exceed
 *   \p max_cached_bytes, allocations are simply freed when they are
 *   deallocated instead of being returned to their bin-cache.
 *
 * \par
 * For example, the default-constructed CachingHostAllocator is configured with:
 * - \p bin_growth          = 8
 * - \p min_bin             = 3
 * - \p max_bin             = 7
 * - \p max_cached_bytes    = 6MB - 1B
 *
 * \par
 * which delineates five bin sizes: 512B, 4KB, 32KB, 256KB, and 2MB,
 * and sets a maximum of 6,291,455 cached bytes.
 *
 */
  struct CachingHostAllocator {
    //---------------------------------------------------------------------
    // Constants
    //---------------------------------------------------------------------

    /// Out-of-bounds bin
    static const unsigned int INVALID_BIN = (unsigned int)-1;

    /// Invalid size
    static const size_t INVALID_SIZE = (size_t)-1;

#ifndef DOXYGEN_SHOULD_SKIP_THIS  // Do not document

    /// Invalid device ordinal
    static const int INVALID_DEVICE_ORDINAL = -1;

    //---------------------------------------------------------------------
    // Type definitions and helper types
    //---------------------------------------------------------------------

    /**
     * Descriptor for pinned host memory allocations
     */
    struct BlockDescriptor {
      void *d_ptr;                     // Host pointer
      size_t bytes;                    // Size of allocation in bytes
      unsigned int bin;                // Bin enumeration
      int device;                      // Device ordinal
      cudaStream_t associated_stream;  // Associated stream
      cudaEvent_t ready_event;  // Signal when associated stream has run to the point at which this block was freed

      // Constructor (suitable for searching maps for a specific block, given its pointer)
      BlockDescriptor(void *d_ptr)
          : d_ptr(d_ptr),
            bytes(0),
            bin(INVALID_BIN),
            device(INVALID_DEVICE_ORDINAL),
            associated_stream(nullptr),
            ready_event(nullptr) {}

      // Constructor (suitable for searching maps for a range of suitable blocks)
      BlockDescriptor()
          : d_ptr(nullptr),
            bytes(0),
            bin(INVALID_BIN),
            device(INVALID_DEVICE_ORDINAL),
            associated_stream(nullptr),
            ready_event(nullptr) {}

      // Comparison functor for comparing host pointers
      static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) { return (a.d_ptr < b.d_ptr); }

      // Comparison functor for comparing allocation sizes
      static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) { return (a.bytes < b.bytes); }
    };

    /// BlockDescriptor comparator function interface
    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);

    class TotalBytes {
    public:
      size_t free;
      size_t live;
      TotalBytes() { free = live = 0; }
    };

    /// Set type for cached blocks (ordered by size)
    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;

    /// Set type for live blocks (ordered by ptr)
    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;

    //---------------------------------------------------------------------
    // Utility functions
    //---------------------------------------------------------------------

    /**
     * Integer pow function for unsigned base and exponent
     */
    static unsigned int IntPow(unsigned int base, unsigned int exp) {
      unsigned int retval = 1;
      while (exp > 0) {
        if (exp & 1) {
          retval = retval * base;  // multiply the result by the current base
        }
        base = base * base;  // square the base
        exp = exp >> 1;      // divide the exponent in half
      }
      return retval;
    }
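
    // For example, IntPow(8, 3) == 512 and IntPow(8, 7) == 2097152, i.e. the
    // default min_bin_bytes and max_bin_bytes computed in the constructors below.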

    /**
     * Round up to the nearest power of the given base
     */
    void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value) {
      power = 0;
      rounded_bytes = 1;

      if (value * base < value) {
        // Overflow
        power = sizeof(size_t) * 8;
        rounded_bytes = size_t(0) - 1;
        return;
      }

      while (rounded_bytes < value) {
        rounded_bytes *= base;
        power++;
      }
    }
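
    // For example, NearestPowerOf(power, rounded_bytes, 8, 1000) yields
    // power == 4 and rounded_bytes == 4096, since 8^3 = 512 < 1000 <= 8^4 = 4096.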

    //---------------------------------------------------------------------
    // Fields
    //---------------------------------------------------------------------

    std::mutex mutex;  ///< Mutex for thread-safety

    unsigned int bin_growth;  ///< Geometric growth factor for bin-sizes
    unsigned int min_bin;     ///< Minimum bin enumeration
    unsigned int max_bin;     ///< Maximum bin enumeration

    size_t min_bin_bytes;     ///< Minimum bin size
    size_t max_bin_bytes;     ///< Maximum bin size
    size_t max_cached_bytes;  ///< Maximum aggregate cached bytes

    const bool skip_cleanup;  ///< Whether or not to skip a call to FreeAllCached() when the destructor is called
                              ///< (the CUDA runtime may have already shut down for statically declared allocators)
    bool debug;               ///< Whether or not to print (de)allocation events to stdout

    TotalBytes cached_bytes;     ///< Aggregate cached bytes
    CachedBlocks cached_blocks;  ///< Set of cached pinned host allocations available for reuse
    BusyBlocks live_blocks;      ///< Set of live pinned host allocations currently in use

#endif  // DOXYGEN_SHOULD_SKIP_THIS

    //---------------------------------------------------------------------
    // Methods
    //---------------------------------------------------------------------

    /**
     * \brief Constructor.
     */
    CachingHostAllocator(
        unsigned int bin_growth,                 ///< Geometric growth factor for bin-sizes
        unsigned int min_bin = 1,                ///< Minimum bin (default is bin_growth ^ 1)
        unsigned int max_bin = INVALID_BIN,      ///< Maximum bin (default is no max bin)
        size_t max_cached_bytes = INVALID_SIZE,  ///< Maximum aggregate cached bytes (default is no limit)
        bool skip_cleanup = false,  ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate)
        bool debug = false)  ///< Whether or not to print (de)allocation events to stdout (default is no output)
        : bin_growth(bin_growth),
          min_bin(min_bin),
          max_bin(max_bin),
          min_bin_bytes(IntPow(bin_growth, min_bin)),
          max_bin_bytes(IntPow(bin_growth, max_bin)),
          max_cached_bytes(max_cached_bytes),
          skip_cleanup(skip_cleanup),
          debug(debug),
          cached_blocks(BlockDescriptor::SizeCompare),
          live_blocks(BlockDescriptor::PtrCompare) {}

    /**
     * \brief Default constructor.
     *
     * Configured with:
     * \par
     * - \p bin_growth          = 8
     * - \p min_bin             = 3
     * - \p max_bin             = 7
     * - \p max_cached_bytes    = ((\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes
     *
     * which delineates five bin sizes: 512B, 4KB, 32KB, 256KB, and 2MB, and
     * sets a maximum of 6,291,455 cached bytes
     */
    CachingHostAllocator(bool skip_cleanup = false, bool debug = false)
        : bin_growth(8),
          min_bin(3),
          max_bin(7),
          min_bin_bytes(IntPow(bin_growth, min_bin)),
          max_bin_bytes(IntPow(bin_growth, max_bin)),
          max_cached_bytes((max_bin_bytes * 3) - 1),
          skip_cleanup(skip_cleanup),
          debug(debug),
          cached_blocks(BlockDescriptor::SizeCompare),
          live_blocks(BlockDescriptor::PtrCompare) {}
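
    // A minimal usage sketch (illustrative caller-side code, not part of this
    // header; `allocator`, `stream`, and `h_ptr` are hypothetical names):
    //
    //   notcub::CachingHostAllocator allocator;  // default bins: 512B ... 2MB
    //   void *h_ptr = nullptr;
    //   cudaCheck(allocator.HostAllocate(&h_ptr, 100 * 1024, stream));  // served from the 256KB bin
    //   // ... use h_ptr, e.g. as the source of a cudaMemcpyAsync on `stream` ...
    //   cudaCheck(allocator.HostFree(h_ptr));  // recached for reuse, not freed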

    /**
     * \brief Sets the limit on the number of bytes this allocator is allowed to cache
     *
     * Changing the ceiling of cached bytes does not cause any allocations (in-use or
     * cached-in-reserve) to be freed.  See \p FreeAllCached().
     */
    void SetMaxCachedBytes(size_t max_cached_bytes) {
      // Lock
      std::unique_lock<std::mutex> mutex_locker(mutex);

      if (debug)
        printf("Changing max_cached_bytes (%lld -> %lld)\n",
               (long long)this->max_cached_bytes,
               (long long)max_cached_bytes);

      this->max_cached_bytes = max_cached_bytes;

      // Unlock (redundant, kept for style uniformity)
      mutex_locker.unlock();
    }

    /**
     * \brief Provides a suitable allocation of pinned host memory for the given size.
     *
     * Once freed, the allocation becomes available immediately for reuse.
     */
    cudaError_t HostAllocate(
        void **d_ptr,                          ///< [out] Reference to pointer to the allocation
        size_t bytes,                          ///< [in] Minimum number of bytes for the allocation
        cudaStream_t active_stream = nullptr)  ///< [in] The stream to be associated with this allocation
    {
      std::unique_lock<std::mutex> mutex_locker(mutex, std::defer_lock);
      *d_ptr = nullptr;
      int device = INVALID_DEVICE_ORDINAL;
      cudaError_t error = cudaSuccess;

      cudaCheck(error = cudaGetDevice(&device));

      // Create a block descriptor for the requested allocation
      bool found = false;
      BlockDescriptor search_key;
      search_key.device = device;
      search_key.associated_stream = active_stream;
      NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);

      if (search_key.bin > max_bin) {
        // Bin is greater than our maximum bin: allocate the request
        // exactly and give out-of-bounds bin.  It will not be cached
        // for reuse when returned.
        search_key.bin = INVALID_BIN;
        search_key.bytes = bytes;
      } else {
        // Search for a suitable cached allocation: lock
        mutex_locker.lock();

        if (search_key.bin < min_bin) {
          // Bin is less than minimum bin: round up
          search_key.bin = min_bin;
          search_key.bytes = min_bin_bytes;
        }

        // Iterate through the range of cached blocks in the same bin
        CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
        while ((block_itr != cached_blocks.end()) && (block_itr->bin == search_key.bin)) {
          // To prevent races with reusing blocks returned by the host but still
          // in use for transfers, only consider cached blocks that are from an idle stream
          if (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady) {
            // Reuse existing cache block.  Insert into live blocks.
            found = true;
            search_key = *block_itr;
            search_key.associated_stream = active_stream;
            if (search_key.device != device) {
              // If "associated" device changes, need to re-create the event on the right device
              cudaCheck(error = cudaSetDevice(search_key.device));
              cudaCheck(error = cudaEventDestroy(search_key.ready_event));
              cudaCheck(error = cudaSetDevice(device));
              cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));
              search_key.device = device;
            }

            live_blocks.insert(search_key);

            // Remove from free blocks
            cached_bytes.free -= search_key.bytes;
            cached_bytes.live += search_key.bytes;

            if (debug)
              printf(
                  "\tHost reused cached block at %p (%lld bytes) for stream %lld, event %lld on device %lld "
                  "(previously associated with stream %lld, event %lld).\n",
                  search_key.d_ptr,
                  (long long)search_key.bytes,
                  (long long)search_key.associated_stream,
                  (long long)search_key.ready_event,
                  (long long)search_key.device,
                  (long long)block_itr->associated_stream,
                  (long long)block_itr->ready_event);

            cached_blocks.erase(block_itr);

            break;
          }
          block_itr++;
        }

        // Done searching: unlock
        mutex_locker.unlock();
      }

      // Allocate the block if necessary
      if (!found) {
        // Attempt to allocate
        // TODO: eventually support allocation flags
        if ((error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)) ==
            cudaErrorMemoryAllocation) {
          // The allocation attempt failed: free all cached blocks on device and retry
          if (debug)
            printf(
                "\tHost failed to allocate %lld bytes for stream %lld on device %lld, retrying after freeing cached "
                "allocations",
                (long long)search_key.bytes,
                (long long)search_key.associated_stream,
                (long long)search_key.device);

          error = cudaSuccess;  // Reset the error we will return
          cudaGetLastError();   // Reset CUDART's error

          // Lock
          mutex_locker.lock();

          // Iterate the range of free blocks
          CachedBlocks::iterator block_itr = cached_blocks.begin();

          while ((block_itr != cached_blocks.end())) {
            // No need to worry about synchronization with the device: cudaFree is
            // blocking and will synchronize across all kernels executing
            // on the current device

            // Free pinned host memory.
            if ((error = cudaFreeHost(block_itr->d_ptr)))
              break;
            if ((error = cudaEventDestroy(block_itr->ready_event)))
              break;

            // Reduce balance and erase entry
            cached_bytes.free -= block_itr->bytes;

            if (debug)
              printf(
                  "\tHost freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
                  "bytes) outstanding.\n",
                  (long long)block_itr->bytes,
                  (long long)cached_blocks.size(),
                  (long long)cached_bytes.free,
                  (long long)live_blocks.size(),
                  (long long)cached_bytes.live);

            // Advance using the iterator returned by erase(); the erased
            // iterator itself is invalidated and must not be incremented.
            block_itr = cached_blocks.erase(block_itr);
          }

          // Unlock
          mutex_locker.unlock();

          // Return under error
          if (error)
            return error;

          // Try to allocate again
          cudaCheck(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault));
        }

        // Create ready event
        cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));

        // Insert into live blocks
        mutex_locker.lock();
        live_blocks.insert(search_key);
        cached_bytes.live += search_key.bytes;
        mutex_locker.unlock();

        if (debug)
          printf(
              "\tHost allocated new host block at %p (%lld bytes associated with stream %lld, event %lld on device "
              "%lld).\n",
              search_key.d_ptr,
              (long long)search_key.bytes,
              (long long)search_key.associated_stream,
              (long long)search_key.ready_event,
              (long long)search_key.device);
      }

      // Copy host pointer to output parameter
      *d_ptr = search_key.d_ptr;

      if (debug)
        printf("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding (%lld bytes).\n",
               (long long)cached_blocks.size(),
               (long long)cached_bytes.free,
               (long long)live_blocks.size(),
               (long long)cached_bytes.live);

      return error;
    }
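
    // Sketch of the reuse protocol (illustrative caller-side code; `allocator`,
    // `streamA`, and `d_buf` are hypothetical names): a block freed while its
    // associated stream may still be reading from it is handed out again only
    // once the ready_event recorded by HostFree() has completed, which is what
    // the cudaEventQuery() check above tests.
    //
    //   void *p = nullptr;
    //   cudaCheck(allocator.HostAllocate(&p, 4096, streamA));
    //   cudaCheck(cudaMemcpyAsync(d_buf, p, 4096, cudaMemcpyHostToDevice, streamA));
    //   cudaCheck(allocator.HostFree(p));  // records ready_event on streamA
    //   // A later HostAllocate() skips this block until streamA has progressed
    //   // past the recorded event.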

    /**
     * \brief Frees a live allocation of pinned host memory, returning it to the allocator.
     *
     * Once freed, the allocation becomes available immediately for reuse.
     */
    cudaError_t HostFree(void *d_ptr) {
      int entrypoint_device = INVALID_DEVICE_ORDINAL;
      cudaError_t error = cudaSuccess;

      // Lock
      std::unique_lock<std::mutex> mutex_locker(mutex);

      // Find corresponding block descriptor
      bool recached = false;
      BlockDescriptor search_key(d_ptr);
      BusyBlocks::iterator block_itr = live_blocks.find(search_key);
      if (block_itr != live_blocks.end()) {
        // Remove from live blocks
        search_key = *block_itr;
        live_blocks.erase(block_itr);
        cached_bytes.live -= search_key.bytes;

        // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
        if ((search_key.bin != INVALID_BIN) && (cached_bytes.free + search_key.bytes <= max_cached_bytes)) {
          // Insert returned allocation into free blocks
          recached = true;
          cached_blocks.insert(search_key);
          cached_bytes.free += search_key.bytes;

          if (debug)
            printf(
                "\tHost returned %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld "
                "available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
                (long long)search_key.bytes,
                (long long)search_key.associated_stream,
                (long long)search_key.ready_event,
                (long long)search_key.device,
                (long long)cached_blocks.size(),
                (long long)cached_bytes.free,
                (long long)live_blocks.size(),
                (long long)cached_bytes.live);
        }
      }

      cudaCheck(error = cudaGetDevice(&entrypoint_device));
      if (entrypoint_device != search_key.device) {
        cudaCheck(error = cudaSetDevice(search_key.device));
      }

      if (recached) {
        // Insert the ready event in the associated stream (must have current device set properly)
        cudaCheck(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream));
      }

      // Unlock
      mutex_locker.unlock();

      if (!recached) {
        // Free the allocation from the runtime and cleanup the event.
        cudaCheck(error = cudaFreeHost(d_ptr));
        cudaCheck(error = cudaEventDestroy(search_key.ready_event));

        if (debug)
          printf(
              "\tHost freed %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t  %lld available "
              "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
              (long long)search_key.bytes,
              (long long)search_key.associated_stream,
              (long long)search_key.ready_event,
              (long long)search_key.device,
              (long long)cached_blocks.size(),
              (long long)cached_bytes.free,
              (long long)live_blocks.size(),
              (long long)cached_bytes.live);
      }

      // Reset device
      if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != search_key.device)) {
        cudaCheck(error = cudaSetDevice(entrypoint_device));
      }

      return error;
    }

    /**
     * \brief Frees all cached pinned host allocations
     */
    cudaError_t FreeAllCached() {
      cudaError_t error = cudaSuccess;
      int entrypoint_device = INVALID_DEVICE_ORDINAL;
      int current_device = INVALID_DEVICE_ORDINAL;

      std::unique_lock<std::mutex> mutex_locker(mutex);

      while (!cached_blocks.empty()) {
        // Get first block
        CachedBlocks::iterator begin = cached_blocks.begin();

        // Get entry-point device ordinal if necessary
        if (entrypoint_device == INVALID_DEVICE_ORDINAL) {
          if ((error = cudaGetDevice(&entrypoint_device)))
            break;
        }

        // Set current device ordinal if necessary
        if (begin->device != current_device) {
          if ((error = cudaSetDevice(begin->device)))
            break;
          current_device = begin->device;
        }

        // Free host memory
        if ((error = cudaFreeHost(begin->d_ptr)))
          break;
        if ((error = cudaEventDestroy(begin->ready_event)))
          break;

        // Reduce balance and erase entry
        cached_bytes.free -= begin->bytes;

        if (debug)
          printf(
              "\tHost freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
              "bytes) outstanding.\n",
              (long long)begin->bytes,
              (long long)cached_blocks.size(),
              (long long)cached_bytes.free,
              (long long)live_blocks.size(),
              (long long)cached_bytes.live);

        cached_blocks.erase(begin);
      }

      mutex_locker.unlock();

      // Attempt to revert back to entry-point device if necessary
      if (entrypoint_device != INVALID_DEVICE_ORDINAL) {
        cudaCheck(error = cudaSetDevice(entrypoint_device));
      }

      return error;
    }

    /**
     * \brief Destructor
     */
    ~CachingHostAllocator() {
      if (!skip_cleanup)
        FreeAllCached();
    }
  };
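
  // Illustrative note (not part of this header): an allocator with static
  // storage duration should be constructed with skip_cleanup = true, so that
  // its destructor does not call into the CUDA runtime after the runtime may
  // already have shut down at program exit.
  //
  //   static notcub::CachingHostAllocator hostAllocator(true /*skip_cleanup*/);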

  /** @} */  // end group UtilMgmt

}  // namespace notcub

#endif  // HeterogeneousCore_CUDAUtilities_src_CachingHostAllocator_h