File indexing completed on 2024-10-07 04:59:44
0001 #ifndef HeterogenousCore_CUDAUtilities_src_CachingDeviceAllocator_h
0002 #define HeterogenousCore_CUDAUtilities_src_CachingDeviceAllocator_h
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038
0039
0040
0041 #include <cmath>
0042 #include <map>
0043 #include <set>
0044 #include <mutex>
0045
0046 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0047 #include "HeterogeneousCore/CUDAUtilities/interface/deviceAllocatorStatus.h"
0048
0049
0050 namespace notcub {
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
0072
0073
0074
0075
0076
0077
0078
0079
0080
0081
0082
0083
0084
0085
0086
0087
0088
0089
0090
0091
0092
0093
0094
0095
0096
0097
0098
0099
0100 struct CachingDeviceAllocator {
0101
0102
0103
0104
0105
0106 static const unsigned int INVALID_BIN = (unsigned int)-1;
0107
0108
0109 static const size_t INVALID_SIZE = (size_t)-1;
0110
0111 #ifndef DOXYGEN_SHOULD_SKIP_THIS
0112
0113
0114 static const int INVALID_DEVICE_ORDINAL = -1;
0115
0116
0117
0118
0119
0120
0121
0122
0123 struct BlockDescriptor {
0124 void *d_ptr;
0125 size_t bytes;
0126 size_t bytesRequested;
0127 unsigned int bin;
0128 int device;
0129 cudaStream_t associated_stream;
0130 cudaEvent_t ready_event;
0131
0132
0133 BlockDescriptor(void *d_ptr, int device)
0134 : d_ptr(d_ptr),
0135 bytes(0),
0136 bytesRequested(0),
0137 bin(INVALID_BIN),
0138 device(device),
0139 associated_stream(nullptr),
0140 ready_event(nullptr) {}
0141
0142
0143 BlockDescriptor(int device)
0144 : d_ptr(nullptr),
0145 bytes(0),
0146 bytesRequested(0),
0147 bin(INVALID_BIN),
0148 device(device),
0149 associated_stream(nullptr),
0150 ready_event(nullptr) {}
0151
0152
0153 static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) {
0154 if (a.device == b.device)
0155 return (a.d_ptr < b.d_ptr);
0156 else
0157 return (a.device < b.device);
0158 }
0159
0160
0161 static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) {
0162 if (a.device == b.device)
0163 return (a.bytes < b.bytes);
0164 else
0165 return (a.device < b.device);
0166 }
0167 };
0168
0169
0170 typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
0171
0172
0173
0174
0175 typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
0176
0177
0178 typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
0179
0180
0181
0182 using GpuCachedBytes = cms::cuda::allocator::GpuCachedBytes;
0183
0184
0185
0186
0187
0188
0189
0190
0191 static unsigned int IntPow(unsigned int base, unsigned int exp) {
0192 unsigned int retval = 1;
0193 while (exp > 0) {
0194 if (exp & 1) {
0195 retval = retval * base;
0196 }
0197 base = base * base;
0198 exp = exp >> 1;
0199 }
0200 return retval;
0201 }
0202
0203
0204
0205
0206 void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value) {
0207 power = 0;
0208 rounded_bytes = 1;
0209
0210 if (value * base < value) {
0211
0212 power = sizeof(size_t) * 8;
0213 rounded_bytes = size_t(0) - 1;
0214 return;
0215 }
0216
0217 while (rounded_bytes < value) {
0218 rounded_bytes *= base;
0219 power++;
0220 }
0221 }
0222
0223
0224
0225
0226
0227
0228 mutable std::mutex mutex;
0229
0230 unsigned int bin_growth;
0231 unsigned int min_bin;
0232 unsigned int max_bin;
0233
0234 size_t min_bin_bytes;
0235 size_t max_bin_bytes;
0236 size_t max_cached_bytes;
0237
0238 const bool
0239 skip_cleanup;
0240 bool debug;
0241
0242 GpuCachedBytes cached_bytes;
0243 CachedBlocks cached_blocks;
0244 BusyBlocks live_blocks;
0245
0246 #endif
0247
0248
0249
0250
0251
0252
0253
0254
0255 CachingDeviceAllocator(
0256 unsigned int bin_growth,
0257 unsigned int min_bin = 1,
0258 unsigned int max_bin = INVALID_BIN,
0259 size_t max_cached_bytes = INVALID_SIZE,
0260 bool skip_cleanup =
0261 false,
0262 bool debug = false)
0263 : bin_growth(bin_growth),
0264 min_bin(min_bin),
0265 max_bin(max_bin),
0266 min_bin_bytes(IntPow(bin_growth, min_bin)),
0267 max_bin_bytes(IntPow(bin_growth, max_bin)),
0268 max_cached_bytes(max_cached_bytes),
0269 skip_cleanup(skip_cleanup),
0270 debug(debug),
0271 cached_blocks(BlockDescriptor::SizeCompare),
0272 live_blocks(BlockDescriptor::PtrCompare) {}
0273
0274
0275
0276
0277
0278
0279
0280
0281
0282
0283
0284
0285
0286
0287 CachingDeviceAllocator(bool skip_cleanup = false, bool debug = false)
0288 : bin_growth(8),
0289 min_bin(3),
0290 max_bin(7),
0291 min_bin_bytes(IntPow(bin_growth, min_bin)),
0292 max_bin_bytes(IntPow(bin_growth, max_bin)),
0293 max_cached_bytes((max_bin_bytes * 3) - 1),
0294 skip_cleanup(skip_cleanup),
0295 debug(debug),
0296 cached_blocks(BlockDescriptor::SizeCompare),
0297 live_blocks(BlockDescriptor::PtrCompare) {}
0298
0299
0300
0301
0302
0303
0304
0305 cudaError_t SetMaxCachedBytes(size_t max_cached_bytes) {
0306
0307
0308 std::unique_lock mutex_locker(mutex);
0309
0310 if (debug)
0311
0312 printf("Changing max_cached_bytes (%lld -> %lld)\n",
0313 (long long)this->max_cached_bytes,
0314 (long long)max_cached_bytes);
0315
0316 this->max_cached_bytes = max_cached_bytes;
0317
0318
0319 mutex_locker.unlock();
0320
0321 return cudaSuccess;
0322 }
0323
0324
0325
0326
0327
0328
0329
0330
0331 cudaError_t DeviceAllocate(
0332 int device,
0333 void **d_ptr,
0334 size_t bytes,
0335 cudaStream_t active_stream = nullptr)
0336 {
0337
0338 std::unique_lock<std::mutex> mutex_locker(mutex, std::defer_lock);
0339 *d_ptr = nullptr;
0340 int entrypoint_device = INVALID_DEVICE_ORDINAL;
0341 cudaError_t error = cudaSuccess;
0342
0343 if (device == INVALID_DEVICE_ORDINAL) {
0344
0345 cudaCheck(error = cudaGetDevice(&entrypoint_device));
0346 device = entrypoint_device;
0347 }
0348
0349
0350 bool found = false;
0351 BlockDescriptor search_key(device);
0352 search_key.bytesRequested = bytes;
0353 search_key.associated_stream = active_stream;
0354 NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);
0355
0356 if (search_key.bin > max_bin) {
0357
0358
0359
0360 search_key.bin = INVALID_BIN;
0361 search_key.bytes = bytes;
0362 } else {
0363
0364 mutex_locker.lock();
0365
0366 if (search_key.bin < min_bin) {
0367
0368 search_key.bin = min_bin;
0369 search_key.bytes = min_bin_bytes;
0370 }
0371
0372
0373 CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
0374 while ((block_itr != cached_blocks.end()) && (block_itr->device == device) &&
0375 (block_itr->bin == search_key.bin)) {
0376
0377
0378
0379 if ((active_stream == block_itr->associated_stream) ||
0380 (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) {
0381
0382 found = true;
0383 search_key = *block_itr;
0384 search_key.associated_stream = active_stream;
0385 live_blocks.insert(search_key);
0386
0387
0388 cached_bytes[device].free -= search_key.bytes;
0389 cached_bytes[device].live += search_key.bytes;
0390 cached_bytes[device].liveRequested += search_key.bytesRequested;
0391
0392 if (debug)
0393
0394
0395 printf(
0396 "\tDevice %d reused cached block at %p (%lld bytes) for stream %lld, event %lld (previously "
0397 "associated with stream %lld, event %lld).\n",
0398 device,
0399 search_key.d_ptr,
0400 (long long)search_key.bytes,
0401 (long long)search_key.associated_stream,
0402 (long long)search_key.ready_event,
0403 (long long)block_itr->associated_stream,
0404 (long long)block_itr->ready_event);
0405
0406 cached_blocks.erase(block_itr);
0407
0408 break;
0409 }
0410 block_itr++;
0411 }
0412
0413
0414 mutex_locker.unlock();
0415 }
0416
0417
0418 if (!found) {
0419
0420 if (device != entrypoint_device) {
0421
0422 cudaCheck(error = cudaGetDevice(&entrypoint_device));
0423 cudaCheck(error = cudaSetDevice(device));
0424 }
0425
0426
0427
0428 if (cudaMalloc(&search_key.d_ptr, search_key.bytes) == cudaErrorMemoryAllocation) {
0429
0430 if (debug)
0431
0432 printf(
0433 "\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
0434 device,
0435 (long long)search_key.bytes,
0436 (long long)search_key.associated_stream);
0437
0438 error = cudaSuccess;
0439 cudaGetLastError();
0440
0441
0442 mutex_locker.lock();
0443
0444
0445 BlockDescriptor free_key(device);
0446 CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key);
0447
0448 while ((block_itr != cached_blocks.end()) && (block_itr->device == device)) {
0449
0450
0451
0452
0453
0454
0455 if ((error = cudaFree(block_itr->d_ptr)))
0456 break;
0457 if ((error = cudaEventDestroy(block_itr->ready_event)))
0458 break;
0459
0460
0461 cached_bytes[device].free -= block_itr->bytes;
0462
0463 if (debug)
0464
0465 printf(
0466 "\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks "
0467 "(%lld bytes) outstanding.\n",
0468 device,
0469 (long long)block_itr->bytes,
0470 (long long)cached_blocks.size(),
0471 (long long)cached_bytes[device].free,
0472 (long long)live_blocks.size(),
0473 (long long)cached_bytes[device].live);
0474
0475 cached_blocks.erase(block_itr);
0476
0477 block_itr++;
0478 }
0479
0480
0481 mutex_locker.unlock();
0482
0483
0484 if (error)
0485 return error;
0486
0487
0488
0489 cudaCheck(error = cudaMalloc(&search_key.d_ptr, search_key.bytes));
0490 }
0491
0492
0493
0494 cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));
0495
0496
0497 mutex_locker.lock();
0498 live_blocks.insert(search_key);
0499 cached_bytes[device].live += search_key.bytes;
0500 cached_bytes[device].liveRequested += search_key.bytesRequested;
0501 mutex_locker.unlock();
0502
0503 if (debug)
0504
0505
0506 printf("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld, event %lld).\n",
0507 device,
0508 search_key.d_ptr,
0509 (long long)search_key.bytes,
0510 (long long)search_key.associated_stream,
0511 (long long)search_key.ready_event);
0512
0513
0514 if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) {
0515
0516 cudaCheck(error = cudaSetDevice(entrypoint_device));
0517 }
0518 }
0519
0520
0521 *d_ptr = search_key.d_ptr;
0522
0523 if (debug)
0524
0525 printf("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
0526 (long long)cached_blocks.size(),
0527 (long long)cached_bytes[device].free,
0528 (long long)live_blocks.size(),
0529 (long long)cached_bytes[device].live);
0530
0531 return error;
0532 }
0533
0534
0535
0536
0537
0538
0539
0540
0541 cudaError_t DeviceAllocate(
0542 void **d_ptr,
0543 size_t bytes,
0544 cudaStream_t active_stream = nullptr)
0545 {
0546 return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream);
0547 }
0548
0549
0550
0551
0552
0553
0554
0555
0556 cudaError_t DeviceFree(int device, void *d_ptr) {
0557 int entrypoint_device = INVALID_DEVICE_ORDINAL;
0558 cudaError_t error = cudaSuccess;
0559
0560 std::unique_lock<std::mutex> mutex_locker(mutex, std::defer_lock);
0561
0562 if (device == INVALID_DEVICE_ORDINAL) {
0563
0564 cudaCheck(error = cudaGetDevice(&entrypoint_device));
0565 device = entrypoint_device;
0566 }
0567
0568
0569 mutex_locker.lock();
0570
0571
0572 bool recached = false;
0573 BlockDescriptor search_key(d_ptr, device);
0574 BusyBlocks::iterator block_itr = live_blocks.find(search_key);
0575 if (block_itr != live_blocks.end()) {
0576
0577 search_key = *block_itr;
0578 live_blocks.erase(block_itr);
0579 cached_bytes[device].live -= search_key.bytes;
0580 cached_bytes[device].liveRequested -= search_key.bytesRequested;
0581
0582
0583 if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) {
0584
0585 recached = true;
0586 cached_blocks.insert(search_key);
0587 cached_bytes[device].free += search_key.bytes;
0588
0589 if (debug)
0590
0591
0592 printf(
0593 "\tDevice %d returned %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available "
0594 "blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
0595 device,
0596 (long long)search_key.bytes,
0597 d_ptr,
0598 (long long)search_key.associated_stream,
0599 (long long)search_key.ready_event,
0600 (long long)cached_blocks.size(),
0601 (long long)cached_bytes[device].free,
0602 (long long)live_blocks.size(),
0603 (long long)cached_bytes[device].live);
0604 }
0605 }
0606
0607
0608 if (device != entrypoint_device) {
0609
0610 cudaCheck(error = cudaGetDevice(&entrypoint_device));
0611 cudaCheck(error = cudaSetDevice(device));
0612 }
0613
0614 if (recached) {
0615
0616
0617 cudaCheck(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream));
0618 }
0619
0620
0621 mutex_locker.unlock();
0622
0623 if (!recached) {
0624
0625
0626 cudaCheck(error = cudaFree(d_ptr));
0627 cudaCheck(error = cudaEventDestroy(search_key.ready_event));
0628
0629 if (debug)
0630
0631 printf(
0632 "\tDevice %d freed %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available "
0633 "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
0634 device,
0635 (long long)search_key.bytes,
0636 d_ptr,
0637 (long long)search_key.associated_stream,
0638 (long long)search_key.ready_event,
0639 (long long)cached_blocks.size(),
0640 (long long)cached_bytes[device].free,
0641 (long long)live_blocks.size(),
0642 (long long)cached_bytes[device].live);
0643 }
0644
0645
0646 if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) {
0647
0648 cudaCheck(error = cudaSetDevice(entrypoint_device));
0649 }
0650
0651 return error;
0652 }
0653
0654
0655
0656
0657
0658
0659
0660
0661 cudaError_t DeviceFree(void *d_ptr) { return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr); }
0662
0663
0664
0665
0666 cudaError_t FreeAllCached() {
0667 cudaError_t error = cudaSuccess;
0668 int entrypoint_device = INVALID_DEVICE_ORDINAL;
0669 int current_device = INVALID_DEVICE_ORDINAL;
0670
0671 std::unique_lock<std::mutex> mutex_locker(mutex);
0672
0673 while (!cached_blocks.empty()) {
0674
0675 CachedBlocks::iterator begin = cached_blocks.begin();
0676
0677
0678 if (entrypoint_device == INVALID_DEVICE_ORDINAL) {
0679
0680 if ((error = cudaGetDevice(&entrypoint_device)))
0681 break;
0682 }
0683
0684
0685 if (begin->device != current_device) {
0686
0687 if ((error = cudaSetDevice(begin->device)))
0688 break;
0689 current_device = begin->device;
0690 }
0691
0692
0693
0694 if ((error = cudaFree(begin->d_ptr)))
0695 break;
0696 if ((error = cudaEventDestroy(begin->ready_event)))
0697 break;
0698
0699
0700 cached_bytes[current_device].free -= begin->bytes;
0701
0702 if (debug)
0703 printf(
0704 "\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
0705 "bytes) outstanding.\n",
0706 current_device,
0707 (long long)begin->bytes,
0708 (long long)cached_blocks.size(),
0709 (long long)cached_bytes[current_device].free,
0710 (long long)live_blocks.size(),
0711 (long long)cached_bytes[current_device].live);
0712
0713 cached_blocks.erase(begin);
0714 }
0715
0716 mutex_locker.unlock();
0717
0718
0719 if (entrypoint_device != INVALID_DEVICE_ORDINAL) {
0720
0721 cudaCheck(error = cudaSetDevice(entrypoint_device));
0722 }
0723
0724 return error;
0725 }
0726
0727
0728 GpuCachedBytes CacheStatus() const {
0729 std::unique_lock mutex_locker(mutex);
0730 return cached_bytes;
0731 }
0732
0733
0734
0735
0736
0737 ~CachingDeviceAllocator() {
0738 if (!skip_cleanup)
0739 FreeAllCached();
0740 }
0741 };
0742
0743
0744
0745 }
0746
0747 #endif