File indexing completed on 2024-10-07 04:59:44
0001 #ifndef HeterogenousCore_CUDAUtilities_src_CachingHostAllocator_h
0002 #define HeterogenousCore_CUDAUtilities_src_CachingHostAllocator_h
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038
0039
0040
0041 #include <cmath>
0042 #include <map>
0043 #include <set>
0044 #include <mutex>
0045
0046 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0047
0048
0049 namespace notcub {
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
0072
0073
0074
0075
0076
0077
0078
0079
0080
0081
0082
0083
0084
0085
0086
0087
0088
0089
0090
0091
0092
0093
0094
0095
0096
0097
0098
0099
0100 struct CachingHostAllocator {
0101
0102
0103
0104
0105
0106 static const unsigned int INVALID_BIN = (unsigned int)-1;
0107
0108
0109 static const size_t INVALID_SIZE = (size_t)-1;
0110
0111 #ifndef DOXYGEN_SHOULD_SKIP_THIS
0112
0113
0114 static const int INVALID_DEVICE_ORDINAL = -1;
0115
0116
0117
0118
0119
0120
0121
0122
0123 struct BlockDescriptor {
0124 void *d_ptr;
0125 size_t bytes;
0126 unsigned int bin;
0127 int device;
0128 cudaStream_t associated_stream;
0129 cudaEvent_t ready_event;
0130
0131
0132 BlockDescriptor(void *d_ptr)
0133 : d_ptr(d_ptr),
0134 bytes(0),
0135 bin(INVALID_BIN),
0136 device(INVALID_DEVICE_ORDINAL),
0137 associated_stream(nullptr),
0138 ready_event(nullptr) {}
0139
0140
0141 BlockDescriptor()
0142 : d_ptr(nullptr),
0143 bytes(0),
0144 bin(INVALID_BIN),
0145 device(INVALID_DEVICE_ORDINAL),
0146 associated_stream(nullptr),
0147 ready_event(nullptr) {}
0148
0149
0150 static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) { return (a.d_ptr < b.d_ptr); }
0151
0152
0153 static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) { return (a.bytes < b.bytes); }
0154 };
0155
0156
0157 typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
0158
/**
 * Pair of running byte counters: bytes sitting in the cache (`free`) and
 * bytes currently handed out (`live`). Both start at zero.
 */
class TotalBytes {
public:
  size_t free;
  size_t live;
  TotalBytes() : free(0), live(0) {}
};
0165
0166
0167 typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
0168
0169
0170 typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
0171
0172
0173
0174
0175
0176
0177
0178
/**
 * Integer power by repeated squaring: returns base raised to exp.
 *
 * All arithmetic is done in `unsigned int`, so results that do not fit wrap
 * around modulo 2^(bits in unsigned int). IntPow(x, 0) is 1 for every x.
 */
static unsigned int IntPow(unsigned int base, unsigned int exp) {
  unsigned int result = 1;
  for (; exp != 0; exp >>= 1) {
    // Fold in the current square whenever the low exponent bit is set.
    if ((exp & 1u) != 0) {
      result *= base;
    }
    base *= base;
  }
  return result;
}
0190
0191
0192
0193
/**
 * Rounds `value` up to the nearest power of `base`.
 *
 * On return `rounded_bytes` is the smallest base^power that is >= value and
 * `power` is that exponent (power 0 / rounded_bytes 1 for value <= 1).
 *
 * If the result cannot be represented in size_t, the overflow sentinel is
 * returned instead: power = number of bits in size_t, rounded_bytes = the
 * maximum size_t value.
 *
 * Compared with the previous implementation this also returns the sentinel
 * (instead of looping forever or silently wrapping) when
 *   - base < 2 and value > 1, where the power sequence can never reach value;
 *   - the running product would overflow inside the loop, which the cheap
 *     `value * base < value` pre-check does not always detect.
 */
void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value) {
  const size_t max_size = size_t(0) - 1;
  const unsigned int overflow_power = sizeof(size_t) * 8;

  power = 0;
  rounded_bytes = 1;

  // Cheap early-out kept from the original code: wrap-around of value * base.
  if (value * base < value) {
    power = overflow_power;
    rounded_bytes = max_size;
    return;
  }

  while (rounded_bytes < value) {
    // Refuse to multiply when the product would wrap, or when the base can
    // never grow the product (base 0 or 1). The `base < 2` test comes first
    // so the division below never divides by zero.
    if (base < 2 || rounded_bytes > max_size / base) {
      power = overflow_power;
      rounded_bytes = max_size;
      return;
    }
    rounded_bytes *= base;
    ++power;
  }
}
0210
0211
0212
0213
0214
0215 std::mutex mutex;
0216
0217 unsigned int bin_growth;
0218 unsigned int min_bin;
0219 unsigned int max_bin;
0220
0221 size_t min_bin_bytes;
0222 size_t max_bin_bytes;
0223 size_t max_cached_bytes;
0224
0225 const bool
0226 skip_cleanup;
0227 bool debug;
0228
0229 TotalBytes cached_bytes;
0230 CachedBlocks cached_blocks;
0231 BusyBlocks live_blocks;
0232
0233 #endif
0234
0235
0236
0237
0238
0239
0240
0241
0242 CachingHostAllocator(
0243 unsigned int bin_growth,
0244 unsigned int min_bin = 1,
0245 unsigned int max_bin = INVALID_BIN,
0246 size_t max_cached_bytes = INVALID_SIZE,
0247 bool skip_cleanup =
0248 false,
0249 bool debug = false)
0250 : bin_growth(bin_growth),
0251 min_bin(min_bin),
0252 max_bin(max_bin),
0253 min_bin_bytes(IntPow(bin_growth, min_bin)),
0254 max_bin_bytes(IntPow(bin_growth, max_bin)),
0255 max_cached_bytes(max_cached_bytes),
0256 skip_cleanup(skip_cleanup),
0257 debug(debug),
0258 cached_blocks(BlockDescriptor::SizeCompare),
0259 live_blocks(BlockDescriptor::PtrCompare) {}
0260
0261
0262
0263
0264
0265
0266
0267
0268
0269
0270
0271
0272
0273
0274 CachingHostAllocator(bool skip_cleanup = false, bool debug = false)
0275 : bin_growth(8),
0276 min_bin(3),
0277 max_bin(7),
0278 min_bin_bytes(IntPow(bin_growth, min_bin)),
0279 max_bin_bytes(IntPow(bin_growth, max_bin)),
0280 max_cached_bytes((max_bin_bytes * 3) - 1),
0281 skip_cleanup(skip_cleanup),
0282 debug(debug),
0283 cached_blocks(BlockDescriptor::SizeCompare),
0284 live_blocks(BlockDescriptor::PtrCompare) {}
0285
0286
0287
0288
0289
0290
0291
0292 void SetMaxCachedBytes(size_t max_cached_bytes) {
0293
0294 std::unique_lock mutex_locker(mutex);
0295
0296 if (debug)
0297 printf("Changing max_cached_bytes (%lld -> %lld)\n",
0298 (long long)this->max_cached_bytes,
0299 (long long)max_cached_bytes);
0300
0301 this->max_cached_bytes = max_cached_bytes;
0302
0303
0304 mutex_locker.unlock();
0305 }
0306
0307
0308
0309
0310
0311
0312 cudaError_t HostAllocate(
0313 void **d_ptr,
0314 size_t bytes,
0315 cudaStream_t active_stream = nullptr)
0316 {
0317 std::unique_lock<std::mutex> mutex_locker(mutex, std::defer_lock);
0318 *d_ptr = nullptr;
0319 int device = INVALID_DEVICE_ORDINAL;
0320 cudaError_t error = cudaSuccess;
0321
0322 cudaCheck(error = cudaGetDevice(&device));
0323
0324
0325 bool found = false;
0326 BlockDescriptor search_key;
0327 search_key.device = device;
0328 search_key.associated_stream = active_stream;
0329 NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);
0330
0331 if (search_key.bin > max_bin) {
0332
0333
0334
0335 search_key.bin = INVALID_BIN;
0336 search_key.bytes = bytes;
0337 } else {
0338
0339 mutex_locker.lock();
0340
0341 if (search_key.bin < min_bin) {
0342
0343 search_key.bin = min_bin;
0344 search_key.bytes = min_bin_bytes;
0345 }
0346
0347
0348 CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
0349 while ((block_itr != cached_blocks.end()) && (block_itr->bin == search_key.bin)) {
0350
0351
0352 if (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady) {
0353
0354 found = true;
0355 search_key = *block_itr;
0356 search_key.associated_stream = active_stream;
0357 if (search_key.device != device) {
0358
0359 cudaCheck(error = cudaSetDevice(search_key.device));
0360 cudaCheck(error = cudaEventDestroy(search_key.ready_event));
0361 cudaCheck(error = cudaSetDevice(device));
0362 cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));
0363 search_key.device = device;
0364 }
0365
0366 live_blocks.insert(search_key);
0367
0368
0369 cached_bytes.free -= search_key.bytes;
0370 cached_bytes.live += search_key.bytes;
0371
0372 if (debug)
0373 printf(
0374 "\tHost reused cached block at %p (%lld bytes) for stream %lld, event %lld on device %lld "
0375 "(previously associated with stream %lld, event %lld).\n",
0376 search_key.d_ptr,
0377 (long long)search_key.bytes,
0378 (long long)search_key.associated_stream,
0379 (long long)search_key.ready_event,
0380 (long long)search_key.device,
0381 (long long)block_itr->associated_stream,
0382 (long long)block_itr->ready_event);
0383
0384 cached_blocks.erase(block_itr);
0385
0386 break;
0387 }
0388 block_itr++;
0389 }
0390
0391
0392 mutex_locker.unlock();
0393 }
0394
0395
0396 if (!found) {
0397
0398
0399 if (cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault) == cudaErrorMemoryAllocation) {
0400
0401 if (debug)
0402 printf(
0403 "\tHost failed to allocate %lld bytes for stream %lld on device %lld, retrying after freeing cached "
0404 "allocations",
0405 (long long)search_key.bytes,
0406 (long long)search_key.associated_stream,
0407 (long long)search_key.device);
0408
0409 error = cudaSuccess;
0410 cudaGetLastError();
0411
0412
0413 mutex_locker.lock();
0414
0415
0416 CachedBlocks::iterator block_itr = cached_blocks.begin();
0417
0418 while ((block_itr != cached_blocks.end())) {
0419
0420
0421
0422
0423
0424 if ((error = cudaFreeHost(block_itr->d_ptr)))
0425 break;
0426 if ((error = cudaEventDestroy(block_itr->ready_event)))
0427 break;
0428
0429
0430 cached_bytes.free -= block_itr->bytes;
0431
0432 if (debug)
0433 printf(
0434 "\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
0435 "bytes) outstanding.\n",
0436 (long long)block_itr->bytes,
0437 (long long)cached_blocks.size(),
0438 (long long)cached_bytes.free,
0439 (long long)live_blocks.size(),
0440 (long long)cached_bytes.live);
0441
0442 cached_blocks.erase(block_itr);
0443
0444 block_itr++;
0445 }
0446
0447
0448 mutex_locker.unlock();
0449
0450
0451 if (error)
0452 return error;
0453
0454
0455 cudaCheck(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault));
0456 }
0457
0458
0459 cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));
0460
0461
0462 mutex_locker.lock();
0463 live_blocks.insert(search_key);
0464 cached_bytes.live += search_key.bytes;
0465 mutex_locker.unlock();
0466
0467 if (debug)
0468 printf(
0469 "\tHost allocated new host block at %p (%lld bytes associated with stream %lld, event %lld on device "
0470 "%lld).\n",
0471 search_key.d_ptr,
0472 (long long)search_key.bytes,
0473 (long long)search_key.associated_stream,
0474 (long long)search_key.ready_event,
0475 (long long)search_key.device);
0476 }
0477
0478
0479 *d_ptr = search_key.d_ptr;
0480
0481 if (debug)
0482 printf("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
0483 (long long)cached_blocks.size(),
0484 (long long)cached_bytes.free,
0485 (long long)live_blocks.size(),
0486 (long long)cached_bytes.live);
0487
0488 return error;
0489 }
0490
0491
0492
0493
0494
0495
0496 cudaError_t HostFree(void *d_ptr) {
0497 int entrypoint_device = INVALID_DEVICE_ORDINAL;
0498 cudaError_t error = cudaSuccess;
0499
0500
0501 std::unique_lock<std::mutex> mutex_locker(mutex);
0502
0503
0504 bool recached = false;
0505 BlockDescriptor search_key(d_ptr);
0506 BusyBlocks::iterator block_itr = live_blocks.find(search_key);
0507 if (block_itr != live_blocks.end()) {
0508
0509 search_key = *block_itr;
0510 live_blocks.erase(block_itr);
0511 cached_bytes.live -= search_key.bytes;
0512
0513
0514 if ((search_key.bin != INVALID_BIN) && (cached_bytes.free + search_key.bytes <= max_cached_bytes)) {
0515
0516 recached = true;
0517 cached_blocks.insert(search_key);
0518 cached_bytes.free += search_key.bytes;
0519
0520 if (debug)
0521 printf(
0522 "\tHost returned %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld "
0523 "available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
0524 (long long)search_key.bytes,
0525 (long long)search_key.associated_stream,
0526 (long long)search_key.ready_event,
0527 (long long)search_key.device,
0528 (long long)cached_blocks.size(),
0529 (long long)cached_bytes.free,
0530 (long long)live_blocks.size(),
0531 (long long)cached_bytes.live);
0532 }
0533 }
0534
0535 cudaCheck(error = cudaGetDevice(&entrypoint_device));
0536 if (entrypoint_device != search_key.device) {
0537 cudaCheck(error = cudaSetDevice(search_key.device));
0538 }
0539
0540 if (recached) {
0541
0542 cudaCheck(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream));
0543 }
0544
0545
0546 mutex_locker.unlock();
0547
0548 if (!recached) {
0549
0550 cudaCheck(error = cudaFreeHost(d_ptr));
0551 cudaCheck(error = cudaEventDestroy(search_key.ready_event));
0552
0553 if (debug)
0554 printf(
0555 "\tHost freed %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld available "
0556 "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
0557 (long long)search_key.bytes,
0558 (long long)search_key.associated_stream,
0559 (long long)search_key.ready_event,
0560 (long long)search_key.device,
0561 (long long)cached_blocks.size(),
0562 (long long)cached_bytes.free,
0563 (long long)live_blocks.size(),
0564 (long long)cached_bytes.live);
0565 }
0566
0567
0568 if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != search_key.device)) {
0569 cudaCheck(error = cudaSetDevice(entrypoint_device));
0570 }
0571
0572 return error;
0573 }
0574
0575
0576
0577
0578 cudaError_t FreeAllCached() {
0579 cudaError_t error = cudaSuccess;
0580 int entrypoint_device = INVALID_DEVICE_ORDINAL;
0581 int current_device = INVALID_DEVICE_ORDINAL;
0582
0583 std::unique_lock<std::mutex> mutex_locker(mutex);
0584
0585 while (!cached_blocks.empty()) {
0586
0587 CachedBlocks::iterator begin = cached_blocks.begin();
0588
0589
0590 if (entrypoint_device == INVALID_DEVICE_ORDINAL) {
0591 if ((error = cudaGetDevice(&entrypoint_device)))
0592 break;
0593 }
0594
0595
0596 if (begin->device != current_device) {
0597 if ((error = cudaSetDevice(begin->device)))
0598 break;
0599 current_device = begin->device;
0600 }
0601
0602
0603 if ((error = cudaFreeHost(begin->d_ptr)))
0604 break;
0605 if ((error = cudaEventDestroy(begin->ready_event)))
0606 break;
0607
0608
0609 cached_bytes.free -= begin->bytes;
0610
0611 if (debug)
0612 printf(
0613 "\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
0614 "bytes) outstanding.\n",
0615 (long long)begin->bytes,
0616 (long long)cached_blocks.size(),
0617 (long long)cached_bytes.free,
0618 (long long)live_blocks.size(),
0619 (long long)cached_bytes.live);
0620
0621 cached_blocks.erase(begin);
0622 }
0623
0624 mutex_locker.unlock();
0625
0626
0627 if (entrypoint_device != INVALID_DEVICE_ORDINAL) {
0628 cudaCheck(error = cudaSetDevice(entrypoint_device));
0629 }
0630
0631 return error;
0632 }
0633
0634
0635
0636
0637 ~CachingHostAllocator() {
0638 if (!skip_cleanup)
0639 FreeAllCached();
0640 }
0641 };
0642
0643
0644
0645 }
0646
0647 #endif