#ifndef HeterogeneousCore_CUDAUtilities_src_CachingHostAllocator_h
#define HeterogeneousCore_CUDAUtilities_src_CachingHostAllocator_h

/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * Modified to cache pinned host allocations by Matti Kortelainen
 */

/******************************************************************************
 * Simple caching allocator for pinned host memory allocations. The allocator is
 * thread-safe.
 ******************************************************************************/

#include <cmath>
#include <cstdio>
#include <map>
#include <mutex>
#include <set>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

/// Namespace for a locally modified copy of the CUB caching allocator
namespace notcub {

  /**
 * \addtogroup UtilMgmt
 * @{
 */

  /******************************************************************************
 * CachingHostAllocator (host use)
 ******************************************************************************/

  /**
 * \brief A simple caching allocator for pinned host memory allocations.
 *
 * \par Overview
 * The allocator is thread-safe.  It behaves as follows:
 *
 * The CUDA stream-safety of the original device allocator is presumably not
 * useful here, because reading from or writing to pinned host memory requires
 * synchronization anyway. The difference with respect to device memory is
 * that all operations on device memory are scheduled from the CPU via a CUDA
 * stream, while host memory can be operated on directly.
 *
 * \par
 * - Allocations are categorized and cached by bin size.  A new allocation request of
 *   a given size will only consider cached allocations within the corresponding bin.
 * - Bin limits progress geometrically in accordance with the growth factor
 *   \p bin_growth provided during construction.  Unused host allocations within
 *   a larger bin cache are not reused for allocation requests that categorize to
 *   smaller bin sizes.
 * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
 *   (\p bin_growth ^ \p min_bin).
 * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
 *   bin and are simply freed when they are deallocated instead of being returned
 *   to a bin-cache.
 * - %If the total storage of cached allocations would exceed
 *   \p max_cached_bytes, allocations are simply freed when they are
 *   deallocated instead of being returned to their bin-cache.
 *
 * \par
 * For example, the default-constructed CachingHostAllocator is configured with:
 * - \p bin_growth          = 8
 * - \p min_bin             = 3
 * - \p max_bin             = 7
 * - \p max_cached_bytes    = 6MB - 1B
 *
 * \par
 * which delineates five bin sizes: 512B, 4KB, 32KB, 256KB, and 2MB,
 * and sets a maximum of 6,291,455 cached bytes.
 *
 */
  struct CachingHostAllocator {
    //---------------------------------------------------------------------
    // Constants
    //---------------------------------------------------------------------

    /// Out-of-bounds bin
    static const unsigned int INVALID_BIN = (unsigned int)-1;

    /// Invalid size
    static const size_t INVALID_SIZE = (size_t)-1;

#ifndef DOXYGEN_SHOULD_SKIP_THIS  // Do not document

    /// Invalid device ordinal
    static const int INVALID_DEVICE_ORDINAL = -1;

    //---------------------------------------------------------------------
    // Type definitions and helper types
    //---------------------------------------------------------------------

    /**
     * Descriptor for pinned host memory allocations
     */
    struct BlockDescriptor {
      void *d_ptr;                     // Host pointer
      size_t bytes;                    // Size of allocation in bytes
      unsigned int bin;                // Bin enumeration
      int device;                      // Device ordinal
      cudaStream_t associated_stream;  // Associated stream
      cudaEvent_t ready_event;  // Signal when associated stream has run to the point at which this block was freed

      // Constructor (suitable for searching maps for a specific block, given its pointer)
      BlockDescriptor(void *d_ptr)
          : d_ptr(d_ptr),
            bytes(0),
            bin(INVALID_BIN),
            device(INVALID_DEVICE_ORDINAL),
            associated_stream(nullptr),
            ready_event(nullptr) {}

      // Constructor (suitable for searching maps for a range of suitable blocks)
      BlockDescriptor()
          : d_ptr(nullptr),
            bytes(0),
            bin(INVALID_BIN),
            device(INVALID_DEVICE_ORDINAL),
            associated_stream(nullptr),
            ready_event(nullptr) {}

      // Comparison functor for comparing host pointers
      static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) { return (a.d_ptr < b.d_ptr); }

      // Comparison functor for comparing allocation sizes
      static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) { return (a.bytes < b.bytes); }
    };

    /// BlockDescriptor comparator function interface
    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);

    class TotalBytes {
    public:
      size_t free;
      size_t live;
      TotalBytes() { free = live = 0; }
    };

    /// Set type for cached blocks (ordered by size)
    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;

    /// Set type for live blocks (ordered by ptr)
    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;

    //---------------------------------------------------------------------
    // Utility functions
    //---------------------------------------------------------------------

    /**
     * Integer pow function for unsigned base and exponent
     */
    static unsigned int IntPow(unsigned int base, unsigned int exp) {
      unsigned int retval = 1;
      while (exp > 0) {
        if (exp & 1) {
          retval = retval * base;  // multiply the result by the current base
        }
        base = base * base;  // square the base
        exp = exp >> 1;      // divide the exponent in half
      }
      return retval;
    }
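
    // For example, IntPow(8, 3) == 512 and IntPow(8, 7) == 2097152, i.e. the
    // default min_bin_bytes and max_bin_bytes computed in the constructors below.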

    /**
     * Round up to the nearest power of the given base
     */
    void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value) {
      power = 0;
      rounded_bytes = 1;

      if (value * base < value) {
        // Overflow
        power = sizeof(size_t) * 8;
        rounded_bytes = size_t(0) - 1;
        return;
      }

      while (rounded_bytes < value) {
        rounded_bytes *= base;
        power++;
      }
    }
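
    // For example, NearestPowerOf(power, rounded_bytes, 8, 1000) yields
    // power == 4 and rounded_bytes == 4096, since 8^3 = 512 < 1000 <= 8^4 = 4096.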

    //---------------------------------------------------------------------
    // Fields
    //---------------------------------------------------------------------

    std::mutex mutex;  ///< Mutex for thread-safety

    unsigned int bin_growth;  ///< Geometric growth factor for bin-sizes
    unsigned int min_bin;     ///< Minimum bin enumeration
    unsigned int max_bin;     ///< Maximum bin enumeration

    size_t min_bin_bytes;     ///< Minimum bin size
    size_t max_bin_bytes;     ///< Maximum bin size
    size_t max_cached_bytes;  ///< Maximum aggregate cached bytes

    const bool skip_cleanup;  ///< Whether or not to skip a call to FreeAllCached() when the destructor is called
                              ///< (the CUDA runtime may have already shut down for statically declared allocators)
    bool debug;               ///< Whether or not to print (de)allocation events to stdout

    TotalBytes cached_bytes;     ///< Aggregate cached bytes
    CachedBlocks cached_blocks;  ///< Set of cached pinned host allocations available for reuse
    BusyBlocks live_blocks;      ///< Set of live pinned host allocations currently in use

#endif  // DOXYGEN_SHOULD_SKIP_THIS

    //---------------------------------------------------------------------
    // Methods
    //---------------------------------------------------------------------

    /**
     * \brief Constructor.
     */
    CachingHostAllocator(
        unsigned int bin_growth,                 ///< Geometric growth factor for bin-sizes
        unsigned int min_bin = 1,                ///< Minimum bin (default is bin_growth ^ 1)
        unsigned int max_bin = INVALID_BIN,      ///< Maximum bin (default is no max bin)
        size_t max_cached_bytes = INVALID_SIZE,  ///< Maximum aggregate cached bytes (default is no limit)
        bool skip_cleanup = false,  ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate)
        bool debug = false)  ///< Whether or not to print (de)allocation events to stdout (default is no output)
        : bin_growth(bin_growth),
          min_bin(min_bin),
          max_bin(max_bin),
          min_bin_bytes(IntPow(bin_growth, min_bin)),
          max_bin_bytes(IntPow(bin_growth, max_bin)),
          max_cached_bytes(max_cached_bytes),
          skip_cleanup(skip_cleanup),
          debug(debug),
          cached_blocks(BlockDescriptor::SizeCompare),
          live_blocks(BlockDescriptor::PtrCompare) {}

    /**
     * \brief Default constructor.
     *
     * Configured with:
     * \par
     * - \p bin_growth          = 8
     * - \p min_bin             = 3
     * - \p max_bin             = 7
     * - \p max_cached_bytes    = ((\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes
     *
     * which delineates five bin sizes: 512B, 4KB, 32KB, 256KB, and 2MB, and
     * sets a maximum of 6,291,455 cached bytes
     */
    CachingHostAllocator(bool skip_cleanup = false, bool debug = false)
        : bin_growth(8),
          min_bin(3),
          max_bin(7),
          min_bin_bytes(IntPow(bin_growth, min_bin)),
          max_bin_bytes(IntPow(bin_growth, max_bin)),
          max_cached_bytes((max_bin_bytes * 3) - 1),
          skip_cleanup(skip_cleanup),
          debug(debug),
          cached_blocks(BlockDescriptor::SizeCompare),
          live_blocks(BlockDescriptor::PtrCompare) {}
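
    // A minimal usage sketch (illustrative caller-side code, not part of this
    // header; `allocator`, `stream`, and `h_ptr` are hypothetical names):
    //
    //   notcub::CachingHostAllocator allocator;  // default bins: 512B ... 2MB
    //   void *h_ptr = nullptr;
    //   cudaCheck(allocator.HostAllocate(&h_ptr, 100 * 1024, stream));  // served from the 256KB bin
    //   // ... use h_ptr, e.g. as the source of a cudaMemcpyAsync on `stream` ...
    //   cudaCheck(allocator.HostFree(h_ptr));  // recached for reuse, not freed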

    /**
     * \brief Sets the limit on the number of bytes this allocator is allowed to cache
     *
     * Changing the ceiling of cached bytes does not cause any allocations (in-use or
     * cached-in-reserve) to be freed.  See \p FreeAllCached().
     */
    void SetMaxCachedBytes(size_t max_cached_bytes) {
      // Lock
      std::unique_lock<std::mutex> mutex_locker(mutex);

      if (debug)
        printf("Changing max_cached_bytes (%lld -> %lld)\n",
               (long long)this->max_cached_bytes,
               (long long)max_cached_bytes);

      this->max_cached_bytes = max_cached_bytes;

      // Unlock (redundant, kept for style uniformity)
      mutex_locker.unlock();
    }

    /**
     * \brief Provides a suitable allocation of pinned host memory for the given size.
     *
     * Once freed, the allocation becomes available immediately for reuse.
     */
    cudaError_t HostAllocate(
        void **d_ptr,                          ///< [out] Reference to pointer to the allocation
        size_t bytes,                          ///< [in] Minimum number of bytes for the allocation
        cudaStream_t active_stream = nullptr)  ///< [in] The stream to be associated with this allocation
    {
      std::unique_lock<std::mutex> mutex_locker(mutex, std::defer_lock);
      *d_ptr = nullptr;
      int device = INVALID_DEVICE_ORDINAL;
      cudaError_t error = cudaSuccess;

      cudaCheck(error = cudaGetDevice(&device));

      // Create a block descriptor for the requested allocation
      bool found = false;
      BlockDescriptor search_key;
      search_key.device = device;
      search_key.associated_stream = active_stream;
      NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);

      if (search_key.bin > max_bin) {
        // Bin is greater than our maximum bin: allocate the request
        // exactly and give out-of-bounds bin.  It will not be cached
        // for reuse when returned.
        search_key.bin = INVALID_BIN;
        search_key.bytes = bytes;
      } else {
        // Search for a suitable cached allocation: lock
        mutex_locker.lock();

        if (search_key.bin < min_bin) {
          // Bin is less than minimum bin: round up
          search_key.bin = min_bin;
          search_key.bytes = min_bin_bytes;
        }

        // Iterate through the range of cached blocks in the same bin
        CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
        while ((block_itr != cached_blocks.end()) && (block_itr->bin == search_key.bin)) {
          // To prevent races with reusing blocks returned by the host but still
          // in use for transfers, only consider cached blocks that are from an idle stream
          if (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady) {
            // Reuse existing cache block.  Insert into live blocks.
            found = true;
            search_key = *block_itr;
            search_key.associated_stream = active_stream;
            if (search_key.device != device) {
              // If "associated" device changes, need to re-create the event on the right device
              cudaCheck(error = cudaSetDevice(search_key.device));
              cudaCheck(error = cudaEventDestroy(search_key.ready_event));
              cudaCheck(error = cudaSetDevice(device));
              cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));
              search_key.device = device;
            }

            live_blocks.insert(search_key);

            // Remove from free blocks
            cached_bytes.free -= search_key.bytes;
            cached_bytes.live += search_key.bytes;

            if (debug)
              printf(
                  "\tHost reused cached block at %p (%lld bytes) for stream %lld, event %lld on device %lld "
                  "(previously associated with stream %lld, event %lld).\n",
                  search_key.d_ptr,
                  (long long)search_key.bytes,
                  (long long)search_key.associated_stream,
                  (long long)search_key.ready_event,
                  (long long)search_key.device,
                  (long long)block_itr->associated_stream,
                  (long long)block_itr->ready_event);

            cached_blocks.erase(block_itr);

            break;
          }
          block_itr++;
        }

        // Done searching: unlock
        mutex_locker.unlock();
      }

      // Allocate the block if necessary
      if (!found) {
        // Attempt to allocate
        // TODO: eventually support allocation flags
        if ((error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)) ==
            cudaErrorMemoryAllocation) {
          // The allocation attempt failed: free all cached blocks on device and retry
          if (debug)
            printf(
                "\tHost failed to allocate %lld bytes for stream %lld on device %lld, retrying after freeing cached "
                "allocations",
                (long long)search_key.bytes,
                (long long)search_key.associated_stream,
                (long long)search_key.device);

          error = cudaSuccess;  // Reset the error we will return
          cudaGetLastError();   // Reset CUDART's error

          // Lock
          mutex_locker.lock();

          // Iterate the range of free blocks
          CachedBlocks::iterator block_itr = cached_blocks.begin();

          while ((block_itr != cached_blocks.end())) {
            // No need to worry about synchronization with the device: cudaFree is
            // blocking and will synchronize across all kernels executing
            // on the current device

            // Free pinned host memory.
            if ((error = cudaFreeHost(block_itr->d_ptr)))
              break;
            if ((error = cudaEventDestroy(block_itr->ready_event)))
              break;

            // Reduce balance and erase entry
            cached_bytes.free -= block_itr->bytes;

            if (debug)
              printf(
                  "\tHost freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
                  "bytes) outstanding.\n",
                  (long long)block_itr->bytes,
                  (long long)cached_blocks.size(),
                  (long long)cached_bytes.free,
                  (long long)live_blocks.size(),
                  (long long)cached_bytes.live);

            // Advance using the iterator returned by erase(); the erased
            // iterator itself is invalidated and must not be incremented.
            block_itr = cached_blocks.erase(block_itr);
          }

          // Unlock
          mutex_locker.unlock();

          // Return under error
          if (error)
            return error;

          // Try to allocate again
          cudaCheck(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault));
        }

        // Create ready event
        cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));

        // Insert into live blocks
        mutex_locker.lock();
        live_blocks.insert(search_key);
        cached_bytes.live += search_key.bytes;
        mutex_locker.unlock();

        if (debug)
          printf(
              "\tHost allocated new host block at %p (%lld bytes associated with stream %lld, event %lld on device "
              "%lld).\n",
              search_key.d_ptr,
              (long long)search_key.bytes,
              (long long)search_key.associated_stream,
              (long long)search_key.ready_event,
              (long long)search_key.device);
      }

      // Copy host pointer to output parameter
      *d_ptr = search_key.d_ptr;

      if (debug)
        printf("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding (%lld bytes).\n",
               (long long)cached_blocks.size(),
               (long long)cached_bytes.free,
               (long long)live_blocks.size(),
               (long long)cached_bytes.live);

      return error;
    }
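
    // Sketch of the reuse protocol (illustrative caller-side code; `allocator`,
    // `streamA`, and `d_buf` are hypothetical names): a block freed while its
    // associated stream may still be reading from it is handed out again only
    // once the ready_event recorded by HostFree() has completed, which is what
    // the cudaEventQuery() check above tests.
    //
    //   void *p = nullptr;
    //   cudaCheck(allocator.HostAllocate(&p, 4096, streamA));
    //   cudaCheck(cudaMemcpyAsync(d_buf, p, 4096, cudaMemcpyHostToDevice, streamA));
    //   cudaCheck(allocator.HostFree(p));  // records ready_event on streamA
    //   // A later HostAllocate() skips this block until streamA has progressed
    //   // past the recorded event.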

    /**
     * \brief Frees a live allocation of pinned host memory, returning it to the allocator.
     *
     * Once freed, the allocation becomes available immediately for reuse.
     */
    cudaError_t HostFree(void *d_ptr) {
      int entrypoint_device = INVALID_DEVICE_ORDINAL;
      cudaError_t error = cudaSuccess;

      // Lock
      std::unique_lock<std::mutex> mutex_locker(mutex);

      // Find corresponding block descriptor
      bool recached = false;
      BlockDescriptor search_key(d_ptr);
      BusyBlocks::iterator block_itr = live_blocks.find(search_key);
      if (block_itr != live_blocks.end()) {
        // Remove from live blocks
        search_key = *block_itr;
        live_blocks.erase(block_itr);
        cached_bytes.live -= search_key.bytes;

        // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
        if ((search_key.bin != INVALID_BIN) && (cached_bytes.free + search_key.bytes <= max_cached_bytes)) {
          // Insert returned allocation into free blocks
          recached = true;
          cached_blocks.insert(search_key);
          cached_bytes.free += search_key.bytes;

          if (debug)
            printf(
                "\tHost returned %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld "
                "available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
                (long long)search_key.bytes,
                (long long)search_key.associated_stream,
                (long long)search_key.ready_event,
                (long long)search_key.device,
                (long long)cached_blocks.size(),
                (long long)cached_bytes.free,
                (long long)live_blocks.size(),
                (long long)cached_bytes.live);
        }
      }

      cudaCheck(error = cudaGetDevice(&entrypoint_device));
      if (entrypoint_device != search_key.device) {
        cudaCheck(error = cudaSetDevice(search_key.device));
      }

      if (recached) {
        // Insert the ready event in the associated stream (must have current device set properly)
        cudaCheck(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream));
      }

      // Unlock
      mutex_locker.unlock();

      if (!recached) {
        // Free the allocation from the runtime and cleanup the event.
        cudaCheck(error = cudaFreeHost(d_ptr));
        cudaCheck(error = cudaEventDestroy(search_key.ready_event));

        if (debug)
          printf(
              "\tHost freed %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t  %lld available "
              "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
              (long long)search_key.bytes,
              (long long)search_key.associated_stream,
              (long long)search_key.ready_event,
              (long long)search_key.device,
              (long long)cached_blocks.size(),
              (long long)cached_bytes.free,
              (long long)live_blocks.size(),
              (long long)cached_bytes.live);
      }

      // Reset device
      if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != search_key.device)) {
        cudaCheck(error = cudaSetDevice(entrypoint_device));
      }

      return error;
    }

    /**
     * \brief Frees all cached pinned host allocations
     */
    cudaError_t FreeAllCached() {
      cudaError_t error = cudaSuccess;
      int entrypoint_device = INVALID_DEVICE_ORDINAL;
      int current_device = INVALID_DEVICE_ORDINAL;

      std::unique_lock<std::mutex> mutex_locker(mutex);

      while (!cached_blocks.empty()) {
        // Get first block
        CachedBlocks::iterator begin = cached_blocks.begin();

        // Get entry-point device ordinal if necessary
        if (entrypoint_device == INVALID_DEVICE_ORDINAL) {
          if ((error = cudaGetDevice(&entrypoint_device)))
            break;
        }

        // Set current device ordinal if necessary
        if (begin->device != current_device) {
          if ((error = cudaSetDevice(begin->device)))
            break;
          current_device = begin->device;
        }

        // Free host memory
        if ((error = cudaFreeHost(begin->d_ptr)))
          break;
        if ((error = cudaEventDestroy(begin->ready_event)))
          break;

        // Reduce balance and erase entry
        cached_bytes.free -= begin->bytes;

        if (debug)
          printf(
              "\tHost freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
              "bytes) outstanding.\n",
              (long long)begin->bytes,
              (long long)cached_blocks.size(),
              (long long)cached_bytes.free,
              (long long)live_blocks.size(),
              (long long)cached_bytes.live);

        cached_blocks.erase(begin);
      }

      mutex_locker.unlock();

      // Attempt to revert back to entry-point device if necessary
      if (entrypoint_device != INVALID_DEVICE_ORDINAL) {
        cudaCheck(error = cudaSetDevice(entrypoint_device));
      }

      return error;
    }

    /**
     * \brief Destructor
     */
    ~CachingHostAllocator() {
      if (!skip_cleanup)
        FreeAllCached();
    }
  };
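
  // Illustrative note (not part of this header): an allocator with static
  // storage duration should be constructed with skip_cleanup = true, so that
  // its destructor does not call into the CUDA runtime after the runtime may
  // already have shut down at program exit.
  //
  //   static notcub::CachingHostAllocator hostAllocator(true /*skip_cleanup*/);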

  /** @} */  // end group UtilMgmt

}  // namespace notcub

#endif  // HeterogeneousCore_CUDAUtilities_src_CachingHostAllocator_h